# Infos

**id** : A unique identifier for each individual in the dataset.

**Gender** : The individual's gender, indicating whether they are male or female.

**Age** : The age of the individual, representing their age in years.

**Height** : The height of the individual, typically measured in meters.

**Weight** : The weight of the individual, typically measured in kilograms.

**family_history_with_overweight** : Indicates whether there is a family history of overweight for the individual (yes/no).

**FAVC** : Stands for "Frequent consumption of high caloric food," indicating whether the individual frequently consumes high-calorie foods (yes/no).

**FCVC** : Stands for "Frequency of consuming vegetables," representing how often the individual consumes vegetables.

**NCP** : Stands for "Number of main meals," indicating the number of main meals the individual consumes daily.

**CAEC** : Stands for "Consumption of food between meals," representing the frequency of consuming food between meals.

**SMOKE** : Indicates whether the individual smokes or not (yes/no).

**CH2O** : Represents the amount of water consumption for the individual.

**SCC** : Stands for "Calories consumption monitoring," indicating whether the individual monitors their calorie consumption (yes/no).

**FAF** : Stands for "Physical activity frequency," representing the frequency of the individual's physical activities.

**TUE** : Stands for "Time using technology devices," indicating the amount of time the individual spends using technology devices.

**CALC** : Stands for "Consumption of alcohol," representing the frequency of alcohol consumption.

**MTRANS** : Stands for "Mode of transportation," indicating the mode of transportation the individual uses.

**NObeyesdad** : The target variable, representing the obesity risk category of the individual. It has multiple classes such as 'Overweight_Level_II', 'Normal_Weight', 'Insufficient_Weight', 'Obesity_Type_III', 'Obesity_Type_II', 'Overweight_Level_I', and 'Obesity_Type_I'.

# Import

In [1]:
import pandas as pd
import numpy as np
import json

from modules.features_encoding import FeaturesEncoding

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, FunctionTransformer, LabelEncoder
from sklearn.model_selection import cross_val_score

from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer

import optuna
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings('ignore')


In [2]:
# Load the training set; the 'id' column is used as the DataFrame index.
train = pd.read_csv('data/train.csv', index_col = 'id')

# Exploration

In [3]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0_level_0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


In [4]:
# Show each column's dtype and non-null count.
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20758 entries, 0 to 20757
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          20758 non-null  object 
 1   Age                             20758 non-null  float64
 2   Height                          20758 non-null  float64
 3   Weight                          20758 non-null  float64
 4   family_history_with_overweight  20758 non-null  object 
 5   FAVC                            20758 non-null  object 
 6   FCVC                            20758 non-null  float64
 7   NCP                             20758 non-null  float64
 8   CAEC                            20758 non-null  object 
 9   SMOKE                           20758 non-null  object 
 10  CH2O                            20758 non-null  float64
 11  SCC                             20758 non-null  object 
 12  FAF                             20758

In [5]:
def report(data):
    """Build a per-column summary of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` with the columns ``type``, ``count``,
        ``nunique``, ``%unique``, ``null``, ``%null``, ``min`` and ``max``.
        The two percentage columns are relative to ``len(data)``.
    """
    n_rows = len(data)
    unique_counts = data.nunique()
    null_counts = data.isnull().sum()

    summary = pd.DataFrame(
        {
            'type': data.dtypes,
            'count': data.count(),
            'nunique': unique_counts,
            '%unique': unique_counts / n_rows * 100,
            'null': null_counts,
            '%null': null_counts / n_rows * 100,
            'min': data.min(),
            'max': data.max(),
        },
        index=data.columns,
    )
    return summary
# Display the per-column summary for the training set.
report(train)

Unnamed: 0,type,count,nunique,%unique,null,%null,min,max
Gender,object,20758,2,0.009635,0,0.0,Female,Male
Age,float64,20758,1703,8.204066,0,0.0,14.0,61.0
Height,float64,20758,1833,8.83033,0,0.0,1.45,1.975663
Weight,float64,20758,1979,9.533674,0,0.0,39.0,165.057269
family_history_with_overweight,object,20758,2,0.009635,0,0.0,no,yes
FAVC,object,20758,2,0.009635,0,0.0,no,yes
FCVC,float64,20758,934,4.49947,0,0.0,1.0,3.0
NCP,float64,20758,689,3.319202,0,0.0,1.0,4.0
CAEC,object,20758,4,0.01927,0,0.0,Always,no
SMOKE,object,20758,2,0.009635,0,0.0,no,yes


In [6]:
# Descriptive statistics, transposed so that each feature is a row.
train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,20758.0,23.841804,5.688072,14.0,20.0,22.815416,26.0,61.0
Height,20758.0,1.700245,0.087312,1.45,1.631856,1.7,1.762887,1.975663
Weight,20758.0,87.887768,26.379443,39.0,66.0,84.064875,111.600553,165.057269
FCVC,20758.0,2.445908,0.533218,1.0,2.0,2.393837,3.0,3.0
NCP,20758.0,2.761332,0.705375,1.0,3.0,3.0,3.0,4.0
CH2O,20758.0,2.029418,0.608467,1.0,1.792022,2.0,2.549617,3.0
FAF,20758.0,0.981747,0.838302,0.0,0.008013,1.0,1.587406,3.0
TUE,20758.0,0.616756,0.602113,0.0,0.0,0.573887,1.0,2.0


# Transformers

In [7]:
# Columns that are standardised before modelling.
numeric_features = [
    'Age', 'Height', 'Weight',
    'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE',
]

# Standardise the columns listed above and forward every other column
# unchanged (remainder='passthrough').
FeatureScaler = ColumnTransformer(
    [('scaler', StandardScaler(), numeric_features)],
    remainder='passthrough',
)

# Optuna

In [8]:
# Search / cross-validation configuration shared by every study below.
SEED = 42
SPLITS = 5
TRIALS = 200

# Feature matrix: every column of the training set except the target.
X = train.drop(columns='NObeyesdad')

# Target: class labels encoded as integers; `lb` is kept so the encoding
# can be inverted later.
lb = LabelEncoder()
y = lb.fit_transform(train['NObeyesdad'])

# Shuffled, seeded stratified folds reused by every cross_val_score call.
SKF = StratifiedKFold(n_splits=SPLITS, shuffle=True, random_state=SEED)

## XGB

In [9]:
# Settings applied to every XGBoost trial; these are not tuned.
params_xgb = dict(random_state=SEED, tree_method='hist')


def xgb_objective(trial):
    """Optuna objective for XGBoost.

    Samples a hyper-parameter set from ``trial``, builds the pipeline
    ``FeaturesEncoding -> FeatureScaler -> XGBClassifier`` and returns the
    mean accuracy obtained with ``cross_val_score`` on ``X``/``y`` using the
    ``SKF`` folds.
    """
    tuned = {
        'eta': trial.suggest_float('eta', .001, .3, log=True),
        'max_depth': trial.suggest_int('max_depth', 2, 30),
        'subsample': trial.suggest_float('subsample', .5, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', .1, 1),
        'min_child_weight': trial.suggest_float('min_child_weight', .1, 20, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', .01, 20, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', .01, 10, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 10, 500),
    }
    # Fixed settings are merged last so they take precedence over tuned ones.
    classifier = XGBClassifier(**{**tuned, **params_xgb})

    pipeline = make_pipeline(
        FunctionTransformer(FeaturesEncoding),
        FeatureScaler,
        classifier,
    )
    fold_scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=SKF)
    return np.mean(fold_scores)

In [10]:
# Run the XGBoost search, print the best trial and save its parameters.
xgb_study = optuna.create_study(direction='maximize')
xgb_study.optimize(
    xgb_objective,
    n_trials=TRIALS,
    n_jobs=-1,
    show_progress_bar=True,
)
print("")
print(f'scores : {xgb_study.best_value}, params : {xgb_study.best_params} ')
with open('json/xgb.json', 'w') as out_file:
    json.dump(xgb_study.best_params, out_file, indent=4)

[I 2024-02-09 18:57:35,117] A new study created in memory with name: no-name-f8b1bdef-66be-46a2-914c-5a12e335a715


  0%|          | 0/200 [00:00<?, ?it/s]

## LGBM

In [None]:
# Settings applied to every LightGBM trial; these are not tuned.
params_lgbm = dict(boosting_type='gbdt', random_state=SEED)


def lgbm_objective(trial):
    """Optuna objective for LightGBM.

    Samples a hyper-parameter set from ``trial``, builds the pipeline
    ``FeaturesEncoding -> FeatureScaler -> LGBMClassifier`` and returns the
    mean accuracy obtained with ``cross_val_score`` on ``X``/``y`` using the
    ``SKF`` folds.
    """
    # NOTE(review): LightGBM only applies `subsample` when `subsample_freq`
    # is > 0, and it is not set here - confirm whether tuning `subsample`
    # is meant to have an effect.
    tuned = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'max_depth': trial.suggest_int('max_depth', 5, 20),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 20),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
    }
    # Fixed settings are merged last so they take precedence over tuned ones.
    classifier = LGBMClassifier(**{**tuned, **params_lgbm})

    pipeline = make_pipeline(
        FunctionTransformer(FeaturesEncoding),
        FeatureScaler,
        classifier,
    )
    fold_scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=SKF)
    return np.mean(fold_scores)

In [None]:
# Run the LightGBM search, print the best trial and save its parameters.
lgbm_study = optuna.create_study(direction='maximize')
lgbm_study.optimize(
    lgbm_objective,
    n_trials=TRIALS,
    n_jobs=-1,
    show_progress_bar=True,
)
print("")
print(f'scores : {lgbm_study.best_value}, params : {lgbm_study.best_params} ')
with open('json/lgbm.json', 'w') as out_file:
    json.dump(lgbm_study.best_params, out_file, indent=4)

## CatBoost

In [None]:
# Settings applied to every CatBoost trial; these are not tuned.
params_cat = {
    'thread_count': 4,
    'eval_metric': 'AUC',
    'loss_function': 'MultiClass',
    'random_seed': SEED,
    'verbose': False,
    # Column positions in the array produced by FeatureScaler after bmi():
    #   0-7   scaled numeric_features
    #   8-15  passthrough string columns, in their original order: Gender,
    #         family_history_with_overweight, FAVC, CAEC, SMOKE, SCC, CALC,
    #         MTRANS
    #   16    BMI (numeric, added by bmi())
    # The previous indices 9-16 skipped Gender and flagged the float BMI
    # column as categorical.
    'cat_features': [8, 9, 10, 11, 12, 13, 14, 15],
}
def bmi(df):
    """Return a copy of ``df`` with a ``BMI`` column added.

    ``BMI`` is computed as ``Weight / Height ** 2``. The input frame is
    left unmodified.
    """
    return df.assign(BMI=df['Weight'] / df['Height'] ** 2)

def cat_objective(trial):
    """Optuna objective for CatBoost.

    Samples a hyper-parameter set from ``trial``, builds the pipeline
    ``bmi -> FeatureScaler -> CatBoostClassifier`` and returns the mean
    accuracy obtained with ``cross_val_score`` on ``X``/``y`` using the
    ``SKF`` folds.
    """
    tuned = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
        'iterations': trial.suggest_int('iterations', 50, 300),
        'border_count': trial.suggest_int('border_count', 32, 255),
    }
    # Fixed settings are merged last so they take precedence over tuned ones.
    classifier = CatBoostClassifier(**{**tuned, **params_cat})

    pipeline = make_pipeline(
        FunctionTransformer(bmi),
        FeatureScaler,
        classifier,
    )
    fold_scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=SKF)
    return np.mean(fold_scores)

In [None]:
# Run the CatBoost search, print the best trial and save its parameters.
cat_study = optuna.create_study(direction='maximize')
cat_study.optimize(
    cat_objective,
    n_trials=TRIALS,
    n_jobs=-1,
    show_progress_bar=True,
)
print("")
print(f'scores : {cat_study.best_value}, params : {cat_study.best_params}')
with open('json/cat.json', 'w') as out_file:
    json.dump(cat_study.best_params, out_file, indent=4)

## RF

In [None]:
# Settings applied to every random-forest trial; these are not tuned.
params_rf = {
    'random_state': SEED,
}


def rf_objective(trial):
    """Optuna objective for a random forest.

    Samples a hyper-parameter set from ``trial``, builds the pipeline
    ``FeaturesEncoding -> FeatureScaler -> RandomForestClassifier`` and
    returns the mean accuracy obtained with ``cross_val_score`` on ``X``/``y``
    using the ``SKF`` folds.
    """
    # Candidate values shared by max_depth and max_leaf_nodes; None leaves
    # the corresponding limit unset.
    liste = [None] + list(range(2, 50))
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 500),
        'criterion': trial.suggest_categorical("criterion", ["gini", "entropy", "log_loss"]),
        # Each tuned parameter needs its own trial name. This one was
        # previously registered as 'max_leaf_nodes', so Optuna handed back
        # the same value for both parameters and 'max_depth' never appeared
        # in best_params.
        'max_depth': trial.suggest_categorical('max_depth', liste),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 30),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 2, 30),
        'min_weight_fraction_leaf': trial.suggest_float('min_weight_fraction_leaf', 0, .5),
        'max_features': trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
        'max_leaf_nodes': trial.suggest_categorical('max_leaf_nodes', liste),
        'min_impurity_decrease': trial.suggest_float('min_impurity_decrease', 1e-9, 1e-2, log=True),
        'bootstrap': trial.suggest_categorical("bootstrap", [True, False]),
        'class_weight': trial.suggest_categorical("class_weight", ["balanced", "balanced_subsample", None]),
        # Fixed settings go last so they take precedence over tuned ones.
        **params_rf,
    }

    optuna_model = make_pipeline(
        FunctionTransformer(FeaturesEncoding),
        FeatureScaler,
        RandomForestClassifier(**params),
    )

    optuna_score = cross_val_score(optuna_model, X, y, scoring='accuracy', cv=SKF)

    return np.mean(optuna_score)

In [None]:
# Run the random-forest search, print the best trial and save its parameters.
rf_study = optuna.create_study(direction='maximize')
rf_study.optimize(
    rf_objective,
    n_trials=TRIALS,
    n_jobs=-1,
    show_progress_bar=True,
)
print("")
print(f'scores : {rf_study.best_value}, params : {rf_study.best_params} ')
with open('json/rf.json', 'w') as out_file:
    json.dump(rf_study.best_params, out_file, indent=4)

## Summary

In [None]:
# (Re)load the model definitions from modules.model.
# Deleting the names and importing again does not re-execute a module that is
# already cached in sys.modules, so an explicit reload is used when the module
# was imported earlier in this kernel session.
# NOTE(review): presumably modules.model reads the json/*.json files written
# by the studies above - confirm.
import importlib
import sys

if 'modules.model' in sys.modules:
    importlib.reload(sys.modules['modules.model'])
from modules.model import XGB, LGBM, CAT, RF

In [None]:
# Cross-validate each model on the same folds and print the mean and
# standard deviation of its accuracy.
candidates = {'xgb ': XGB, 'lgbm': LGBM, 'cat ': CAT, 'rf  ': RF}
for name, model in candidates.items():
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=SKF, n_jobs=-1)
    mean_score = np.mean(scores)
    std_score = np.std(scores)
    print(f'{name} - Mean score  : {mean_score:.5f} ± {std_score:.5f}')