In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
import warnings 
warnings.filterwarnings('ignore')
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s4e2/sample_submission.csv
/kaggle/input/playground-series-s4e2/train.csv
/kaggle/input/playground-series-s4e2/test.csv


In [2]:
def wrangle(filepath):
    """Load an obesity-dataset CSV and integer-encode its categorical columns.

    The same function can be used for the training file (which contains the
    ``NObeyesdad`` target) and for files without the target: any column listed
    in the encoding table that is absent from the CSV is simply skipped.

    Parameters
    ----------
    filepath : str, path-like or file-like
        Anything accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with each encoded column replaced by its integer
        code. A label that is missing from a column's mapping becomes NaN.
    """
    df = pd.read_csv(filepath)

    # Target classes, ordered from lowest to highest weight category
    obesity_order = {
        'Insufficient_Weight': 0,
        'Normal_Weight': 1,
        'Overweight_Level_I': 2,
        'Overweight_Level_II': 3,
        'Obesity_Type_I': 4,
        'Obesity_Type_II': 5,
        'Obesity_Type_III': 6,
    }

    # Consumption of alcohol (CALC); 'Always' is included so the codes agree
    # with the mapping used at prediction time
    calc_order = {
        'no': 0,
        'Sometimes': 1,
        'Frequently': 2,
        'Always': 3,
    }

    # Shared by every yes/no column (SCC, SMOKE, family history, FAVC)
    yes_no_order = {'no': 0, 'yes': 1}

    # Consumption of food between meals (CAEC)
    caec_order = {
        'no': 0,
        'Sometimes': 1,
        'Frequently': 2,
        'Always': 3,
    }

    # Means of transportation (MTRANS)
    transpot_order = {
        'Walking': 0,
        'Bike': 1,
        'Motorbike': 2,
        'Public_Transportation': 3,
        'Automobile': 4,
    }

    gender_order = {'Female': 0, 'Male': 1}

    # Column -> mapping table; one row per encoded column
    encodings = {
        'Gender': gender_order,
        'NObeyesdad': obesity_order,
        'CALC': calc_order,
        'SCC': yes_no_order,
        'SMOKE': yes_no_order,
        'family_history_with_overweight': yes_no_order,
        'FAVC': yes_no_order,
        'CAEC': caec_order,
        'MTRANS': transpot_order,
    }

    for column, mapping in encodings.items():
        # Skip columns the file does not have (e.g. no target in the test set)
        if column in df.columns:
            # `map` yields a numeric column directly; `replace` with a
            # str -> int dict relies on pandas' deprecated silent downcasting
            df[column] = df[column].map(mapping)

    return df

In [3]:
# Load and encode the training CSV, then preview the first rows
train_csv_path = "/kaggle/input/playground-series-s4e2/train.csv"
df = wrangle(train_csv_path)
df.head()

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,1,24.443011,1.699998,81.66995,1,1,2.0,2.983297,1,0,2.763573,0,0.0,0.976473,1,3,3
1,1,0,18.0,1.56,57.0,1,1,2.0,3.0,2,0,2.0,0,1.0,1.0,0,4,1
2,2,0,18.0,1.71146,50.165754,1,1,1.880534,1.411685,1,0,1.910378,0,0.866045,1.673584,0,3,0
3,3,0,20.952737,1.71073,131.274851,1,1,3.0,3.0,1,0,1.674061,0,1.467863,0.780199,1,3,6
4,4,1,31.641081,1.914186,93.798055,1,1,2.679664,1.971472,1,0,1.979848,0,1.967973,0.931721,1,3,3


In [4]:
# Separate the encoded target from the features; `id` and the target itself
# are excluded from the feature matrix
target_column = 'NObeyesdad'
y = df[target_column]
X = df.drop(columns=['id', target_column])

In [5]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [6]:

# Build a native LightGBM Dataset from the training split, flagging the
# integer-encoded columns below as categorical.
# NOTE(review): `train_data` is not referenced by any later cell visible here;
# the grid search is fitted on X_train / y_train directly — confirm it is needed.
categorical_columns = ['Gender', 'CAEC', 'SMOKE', 'SCC', 'MTRANS']
train_data = lgb.Dataset(
    X_train,
    label=y_train,
    categorical_feature=categorical_columns,
)



In [7]:

# Hyperparameter grid for the search: the first three entries are fixed to a
# single value, the last three are each tried at three settings (27 combinations)
param_grid = dict(
    objective=['multiclass'],
    num_class=[7],
    boosting_type=['gbdt'],
    num_leaves=[15, 31, 50],
    learning_rate=[0.05, 0.1, 0.2],
    feature_fraction=[0.8, 0.9, 1.0],
)

In [8]:
# Base estimator with default settings; the grid supplies the values to try
lgb_model = lgb.LGBMClassifier()

# Exhaustive search over param_grid, scored by accuracy with 5-fold
# cross-validation on the training split
grid_search = GridSearchCV(
    estimator=lgb_model,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    verbose=0,
)
grid_search.fit(X_train, y_train)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006795 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2039
[LightGBM] [Info] Number of data points in the train set: 13284, number of used features: 16
[LightGBM] [Info] Start training from score -2.117182
[LightGBM] [Info] Start training from score -1.911577
[LightGBM] [Info] Start training from score -2.145728
[LightGBM] [Info] Start training from score -2.112191
[LightGBM] [Info] Start training from score -1.948397
[LightGBM] [Info] Start training from score -1.857563
[LightGBM] [Info] Start training from score -1.633359
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003083 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2047
[LightGBM] [Info] Number of data points in the train set: 13285, number o

In [9]:
# Keep the winning parameter combination from the fitted grid search; it is
# used below to build the final model and is printed with the accuracy
best_params = grid_search.best_params_

In [10]:
# Build a fresh classifier from the best parameters and fit it on the full
# training split (X_train / y_train).
# NOTE(review): GridSearchCV refits the best estimator by default, so
# `grid_search.best_estimator_` may already be an equivalent model — confirm
# whether this second fit is needed.
best_lgb_model = lgb.LGBMClassifier(**best_params)
best_lgb_model.fit(X_train, y_train)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003903 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2043
[LightGBM] [Info] Number of data points in the train set: 16606, number of used features: 16
[LightGBM] [Info] Start training from score -2.117117
[LightGBM] [Info] Start training from score -1.911230
[LightGBM] [Info] Start training from score -2.145531
[LightGBM] [Info] Start training from score -2.112625
[LightGBM] [Info] Start training from score -1.948141
[LightGBM] [Info] Start training from score -1.857720
[LightGBM] [Info] Start training from score -1.633574


In [11]:
# Predict classes for the held-out split (X_test from train_test_split, not
# the competition test file) so they can be scored against y_test
y_pred = best_lgb_model.predict(X_test)



In [12]:

# Score the hold-out predictions and report them with the chosen parameters
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Best Hyperparameters: {best_params}')
print(f'Accuracy: {accuracy}')

Best Hyperparameters: {'boosting_type': 'gbdt', 'feature_fraction': 0.9, 'learning_rate': 0.1, 'num_class': 7, 'num_leaves': 15, 'objective': 'multiclass'}
Accuracy: 0.9053468208092486


# TEST

In [13]:
def prediction(filepath):
    """Predict obesity classes for the rows of a feature CSV.

    Reads the CSV, drops the ``id`` column, integer-encodes the categorical
    feature columns and passes the result to the module-level
    ``best_lgb_model``, which must already be fitted when this is called.

    Parameters
    ----------
    filepath : str, path-like or file-like
        Anything accepted by ``pd.read_csv``. The file must contain ``id`` and
        every encoded feature column; a missing column raises ``KeyError``.

    Returns
    -------
    The output of ``best_lgb_model.predict`` for the encoded rows, i.e. class
    labels in the integer encoding the model's training target used.
    """
    df = pd.read_csv(filepath)

    # Consumption of alcohol (CALC)
    calc_order = {
        'no': 0,
        'Sometimes': 1,
        'Frequently': 2,
        'Always': 3,
    }

    # Shared by every yes/no column (SCC, SMOKE, family history, FAVC)
    yes_no_order = {'no': 0, 'yes': 1}

    # Consumption of food between meals (CAEC)
    caec_order = {
        'no': 0,
        'Sometimes': 1,
        'Frequently': 2,
        'Always': 3,
    }

    # Means of transportation (MTRANS)
    transpot_order = {
        'Walking': 0,
        'Bike': 1,
        'Motorbike': 2,
        'Public_Transportation': 3,
        'Automobile': 4,
    }

    gender_order = {'Female': 0, 'Male': 1}

    # Column -> mapping table; these codes must match the ones applied to the
    # training features
    encodings = {
        'Gender': gender_order,
        'CALC': calc_order,
        'SCC': yes_no_order,
        'SMOKE': yes_no_order,
        'family_history_with_overweight': yes_no_order,
        'FAVC': yes_no_order,
        'CAEC': caec_order,
        'MTRANS': transpot_order,
    }

    # Drop id: the model's feature matrix was built without it
    df = df.drop(columns=['id'])

    for column, mapping in encodings.items():
        # `map` yields a numeric column directly; `replace` with a str -> int
        # dict relies on pandas' deprecated silent downcasting. Labels missing
        # from the mapping become NaN.
        df[column] = df[column].map(mapping)

    # Prediction
    return best_lgb_model.predict(df)

In [14]:
# The assignment below rebinds `prediction` from the function to its result,
# so a second run of this cell would try to call the result and fail.
# Keep a separate handle on the function while the name still refers to it;
# `prediction` still ends up holding the test-set predictions as before.
if callable(prediction):
    predict_from_csv = prediction
prediction = predict_from_csv("/kaggle/input/playground-series-s4e2/test.csv")

