In [30]:
import pandas as pd
import numpy as np

In [31]:
train = pd.read_csv('/content/sample_data/train.csv')

In [32]:
train = train.drop(columns=['id']).dropna()

In [33]:
train['Age'] = train['Age'].apply(lambda x: round(x))

In [34]:
train['BMI'] = train['Weight'] / (train['Height'])**2

In [35]:
columns= ['Age', 'FCVC', 'NCP']
for col in columns:
    train[col] = train[col].apply(lambda x: round(x))

In [36]:
age_bins = [0, 12, 19, 35, 60, 100]
age_labels = ['Child', 'Teen', 'Young Adult', 'Middle-Aged Adult', 'Senior']

train['Age'] = pd.cut(train['Age'], bins=age_bins, labels=age_labels, right=False)
train['Age'] = train['Age'].astype(object)

In [37]:
train.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad,BMI
0,Male,Young Adult,1.699998,81.66995,yes,yes,2,3,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II,28.259565
1,Female,Teen,1.56,57.0,yes,yes,2,3,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight,23.422091
2,Female,Teen,1.71146,50.165754,yes,yes,2,1,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight,17.126706
3,Female,Young Adult,1.71073,131.274851,yes,yes,3,3,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III,44.855798
4,Male,Young Adult,1.914186,93.798055,yes,yes,3,2,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II,25.599151


In [38]:
train.dtypes

Unnamed: 0,0
Gender,object
Age,object
Height,float64
Weight,float64
family_history_with_overweight,object
FAVC,object
FCVC,int64
NCP,int64
CAEC,object
SMOKE,object


In [39]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.compose import ColumnTransformer

In [40]:
St = StandardScaler()
for col in train:
    if train[col].dtype == 'int64' or train[col].dtype == 'float64':
        train[col] = St.fit_transform(train[col].values.reshape(-1, 1))

In [41]:
le = LabelEncoder()
for col in train.columns:
    if train[col].dtype == 'object':
        train[col] = le.fit_transform(train[col])

In [42]:
x = train.drop(columns=['NObeyesdad'])
y = train['NObeyesdad']

In [43]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# **Desicion Tree**

In [44]:
dt = DecisionTreeClassifier()
param_DT = {
    'criterion': ['gini', 'entropy'],  # Splitting criteria
    'max_depth': [None, 10, 20, 30, 40],  # Maximum depth of the tree
    'min_samples_split': [10, 20, 30],  # Minimum samples required to split an internal node
    'min_samples_leaf': [1, 5, 10],  # Minimum samples required to be at a leaf node
    'max_features': [None, 'sqrt', 'log2'],  # Number of features to consider for the best split
    'splitter': ['best', 'random']  # Strategy used to split at each node
}

dt_classifier = DecisionTreeClassifier(random_state=42)

dt_cv = RandomizedSearchCV(estimator=dt,
                           param_distributions=param_DT,
                           refit=True,
                           scoring='accuracy',
                           n_iter=30,
                           cv=3,
                           verbose=2,
                           random_state=42,
                           n_jobs=-1)

dt_cv.fit(x_train, y_train)

best_params = dt_cv.best_params_
print(f"Best Parameters: {best_params}")

Fitting 3 folds for each of 30 candidates, totalling 90 fits
Best Parameters: {'splitter': 'best', 'min_samples_split': 30, 'min_samples_leaf': 5, 'max_features': None, 'max_depth': None, 'criterion': 'gini'}


In [45]:
best_dt = DecisionTreeClassifier(**best_params, random_state=42)
best_dt.fit(x_train, y_train)

In [46]:
y_pred12 = dt_cv.predict(x_train)

accuracy = accuracy_score(y_train, y_pred12)
print(f"Accuracy on the test set: {accuracy}")

cm = confusion_matrix(y_train, y_pred12)
print(f"Confusion Matrix:\n{cm}")

report = classification_report(y_train, y_pred12)
print(f"Classification Report:\n{report}")

Accuracy on the test set: 0.9028062146212212
Confusion Matrix:
[[1867  121    0    1    0    6    4]
 [ 136 2151    6    0    0  139   24]
 [   1    2 2051   98   19   42  154]
 [   0    0   85 2492    1    1   12]
 [   1    0    3    3 3232    3    0]
 [  19  158   39    0    0 1527  200]
 [   2   33  133    8    0  160 1672]]
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.93      0.93      1999
           1       0.87      0.88      0.87      2456
           2       0.89      0.87      0.88      2367
           3       0.96      0.96      0.96      2591
           4       0.99      1.00      1.00      3242
           5       0.81      0.79      0.80      1943
           6       0.81      0.83      0.82      2008

    accuracy                           0.90     16606
   macro avg       0.89      0.89      0.89     16606
weighted avg       0.90      0.90      0.90     16606



In [47]:
y_pred1 = dt_cv.predict(x_test)

accuracy = accuracy_score(y_test, y_pred1)
print(f"Accuracy on the test set: {accuracy}")

cm = confusion_matrix(y_test, y_pred1)
print(f"Confusion Matrix:\n{cm}")

report = classification_report(y_test, y_pred1)
print(f"Classification Report:\n{report}")

Accuracy on the test set: 0.8694605009633911
Confusion Matrix:
[[472  50   0   0   0   1   1]
 [ 41 531   0   0   0  46   8]
 [  1   2 434  32   4  18  52]
 [  0   0  33 620   1   0   3]
 [  0   0   1   1 801   0   1]
 [  2  44   9   0   0 361  68]
 [  1  11  38   5   1  67 391]]
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.90      0.91       524
           1       0.83      0.85      0.84       626
           2       0.84      0.80      0.82       543
           3       0.94      0.94      0.94       657
           4       0.99      1.00      0.99       804
           5       0.73      0.75      0.74       484
           6       0.75      0.76      0.75       514

    accuracy                           0.87      4152
   macro avg       0.86      0.86      0.86      4152
weighted avg       0.87      0.87      0.87      4152



In [48]:
param_DT1 = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10],
    'min_samples_split': [10, 20, 30],
    'min_samples_leaf': [1, 5, 10],
    'max_features': [None, 'sqrt'],
    'splitter': ['best', 'random']
}


dt_grid = GridSearchCV(estimator=dt,
                       param_grid=param_DT1,
                       scoring='accuracy',
                       cv=2,
                       refit=True,
                       verbose=2,
                       n_jobs=-1)

dt_grid.fit(x_train, y_train)

best_params1 = dt_grid.best_params_
print(f"Best Parameters: {best_params1}")

best_dt = dt_grid.best_estimator_

Fitting 2 folds for each of 144 candidates, totalling 288 fits
Best Parameters: {'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 5, 'min_samples_split': 30, 'splitter': 'best'}


In [49]:
best_dt = DecisionTreeClassifier(**best_params1, random_state=42)
best_dt.fit(x_train, y_train)

In [50]:
y_pred22 = dt_grid.predict(x_train)

accuracy = accuracy_score(y_train, y_pred22)
print(f"Accuracy on the test set: {accuracy}")

cm = confusion_matrix(y_train, y_pred22)
print(f"Confusion Matrix:\n{cm}")

report = classification_report(y_train, y_pred22)
print(f"Classification Report:\n{report}")

Accuracy on the test set: 0.8996748163314464
Confusion Matrix:
[[1899   89    1    1    0    7    2]
 [ 182 2125    4    0    0  120   25]
 [   1    3 2065   95   11   35  157]
 [   0    0   80 2498    0    3   10]
 [   1    0    2    3 3233    3    0]
 [  18  194   41    0    0 1474  216]
 [   0   36  145    5    0  176 1646]]
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.95      0.93      1999
           1       0.87      0.87      0.87      2456
           2       0.88      0.87      0.88      2367
           3       0.96      0.96      0.96      2591
           4       1.00      1.00      1.00      3242
           5       0.81      0.76      0.78      1943
           6       0.80      0.82      0.81      2008

    accuracy                           0.90     16606
   macro avg       0.89      0.89      0.89     16606
weighted avg       0.90      0.90      0.90     16606



In [51]:
y_pred2 = dt_grid.predict(x_test)

accuracy = accuracy_score(y_test, y_pred2)
print(f"Accuracy on the test set: {accuracy}")

cm = confusion_matrix(y_test, y_pred2)
print(f"Confusion Matrix:\n{cm}")

report = classification_report(y_test, y_pred2)
print(f"Classification Report:\n{report}")

Accuracy on the test set: 0.8788535645472062
Confusion Matrix:
[[485  36   0   0   0   2   1]
 [ 48 521   0   0   0  47  10]
 [  2   1 461  17   1  12  49]
 [  0   0  24 630   2   0   1]
 [  0   0   1   1 802   0   0]
 [  1  53   9   0   0 354  67]
 [  0   8  38   2   0  70 396]]
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.93      0.92       524
           1       0.84      0.83      0.84       626
           2       0.86      0.85      0.86       543
           3       0.97      0.96      0.96       657
           4       1.00      1.00      1.00       804
           5       0.73      0.73      0.73       484
           6       0.76      0.77      0.76       514

    accuracy                           0.88      4152
   macro avg       0.87      0.87      0.87      4152
weighted avg       0.88      0.88      0.88      4152



# **Light GBM**

In [54]:
lgb_classifier = LGBMClassifier(random_state=42)

param_LGBM = {
    'objective': ['multiclass'],
    'metric': ['multi_logloss'],
    'boosting_type': ['gbdt'],
    'learning_rate': [0.1],
    'n_estimators': [100, 200],
    'num_leaves': [31, 63],
    'max_depth': [5, 10],
    'min_data_in_leaf': [20, 50],
    'subsample': [0.7, 1.0],
    'colsample_bytree': [0.7],
}

random = RandomizedSearchCV(estimator=lgb_classifier,
                            param_distributions=param_LGBM,
                            n_iter=30,
                            cv=3,
                            refit=True,
                            scoring='accuracy',
                            verbose=1,
                            random_state=42)

random.fit(x_train, y_train)
print("Best hyperparameters found by RandomizedSearchCV:", random.best_params_)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002251 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1564
[LightGBM] [Info] Number of data points in the train set: 11070, number of used features: 17
[LightGBM] [Info] Start training from score -2.117557
[LightGBM] [Info] Start training from score -1.910763
[LightGBM] [Info] Start training from score -1.948081
[LightGBM] [Info] Start training from score -1.857274
[LightGBM] [Info] Start training from score -1.633668
[LightGBM] [Info] Start training from score -2.145728
[LightGBM] [Info] Start training from score -2.113063
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002019 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1564
[LightGB

In [56]:
y_pred3 = random.predict(x_train)
print("Accuracy:", accuracy_score(y_train, y_pred3))
print("\nClassification Report:")
print(classification_report(y_train, y_pred3))

Accuracy: 0.9848247621341684

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1999
           1       0.98      0.98      0.98      2456
           2       0.99      0.99      0.99      2367
           3       1.00      1.00      1.00      2591
           4       1.00      1.00      1.00      3242
           5       0.97      0.94      0.96      1943
           6       0.96      0.97      0.97      2008

    accuracy                           0.98     16606
   macro avg       0.98      0.98      0.98     16606
weighted avg       0.98      0.98      0.98     16606



In [58]:
y_pred32 = random.predict(x_test)
print("Accuracy:", accuracy_score(y_test, y_pred32))
print("\nClassification Report:")
print(classification_report(y_test, y_pred32))

Accuracy: 0.9010115606936416

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.93      0.94       524
           1       0.87      0.89      0.88       626
           2       0.86      0.87      0.86       543
           3       0.98      0.96      0.97       657
           4       1.00      1.00      1.00       804
           5       0.79      0.79      0.79       484
           6       0.80      0.81      0.80       514

    accuracy                           0.90      4152
   macro avg       0.89      0.89      0.89      4152
weighted avg       0.90      0.90      0.90      4152



# **XGBoost**

In [59]:
xgb = XGBClassifier(
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)

param_dist = {
    'n_estimators': np.arange(100, 500, 100),
    'max_depth': np.arange(5, 15, 2),
    'learning_rate': np.linspace(0.01, 0.2, 5),
    'subsample': np.linspace(0.7, 1.0, 3),
    'colsample_bytree': np.linspace(0.7, 1.0, 3),
    'min_child_weight': np.arange(1, 6),
    'gamma': np.linspace(0, 0.3, 5)
}

random_search = RandomizedSearchCV(estimator=xgb, param_distributions=param_dist, n_iter=30, cv=5, scoring='accuracy', verbose=1)
random_search.fit(x_train, y_train)

print("Best Parameters:", random_search.best_params_)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Best Parameters: {'subsample': 0.7, 'n_estimators': 300, 'min_child_weight': 1, 'max_depth': 5, 'learning_rate': 0.0575, 'gamma': 0.075, 'colsample_bytree': 0.7}


# **Random Forest**

In [None]:
RF = RandomForestClassifier()

n_estimators = [int(x) for x in np.linspace(start=100, stop=500, num=5)]
max_depth = [10, 20, 30]
min_samples_split = [None,10, 20, 30]
min_samples_leaf = [2, 4, 6]
max_features = ['sqrt', 'log2', None]
bootstrap = [True,False]

random_CV = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'max_features': max_features,
    'bootstrap': bootstrap
}


random_search = RandomizedSearchCV(
    estimator=RF,
    param_distributions=random_CV,
    n_iter=30,
    scoring='accuracy',
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

random_search.fit(x_train, y_train)

best_params = random_search.best_params_
print("Best Parameters:", best_params)

In [None]:
y_pred1 = random_search.predict(x_train)
print("Accuracy:", accuracy_score(y_train, y_pred1))
print("\nClassification Report:")
print(classification_report(y_train, y_pred1))

In [None]:
y_pred12 = random_search.predict(x_test)
print("Accuracy:", accuracy_score(y_test, y_pred12))
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap=plt.cm.Blues)
print("Confusion Matrix:", confusion_matrix)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

In [None]:

param_dist = {
    'n_estimators': np.arange(100, 1001, 100),  # Number of trees in the forest
    'max_depth': [None] + list(np.arange(5, 21, 5)),  # Maximum depth of the tree
    'min_samples_split': np.arange(2, 11, 2),  # Minimum samples required to split a node
    'min_samples_leaf': np.arange(1, 11, 2),  # Minimum samples required at a leaf node
    'max_features': ['sqrt', 'log2', None],  # Number of features to consider when looking for the best split
    'bootstrap': bootstrap  # Whether bootstrap samples are used when building trees
}