In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier, Pool
from tqdm import tqdm
import optuna

from utility import make_BMI

In [2]:
data = pd.read_csv('../data/train.csv', index_col='id')

data.head()

Unnamed: 0_level_0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


In [3]:
data['BMI'] = make_BMI(data)
group_1 = ['Normal_Weight', 'Overweight_Level_I']
group_2 = ['Overweight_Level_II', 'Obesity_Type_I']
data['NObeyesdad'] = data['NObeyesdad'].apply(lambda x: 'group_1' if x in group_1 else x)
data['NObeyesdad'] = data['NObeyesdad'].apply(lambda x: 'group_2' if x in group_2 else x)

In [4]:
def objective(trial):
    X_train, X_valid, y_train, y_valid = train_test_split(
        data.drop(['NObeyesdad'], axis=1),
        data['NObeyesdad'],
        test_size=0.20,
        shuffle=True
    )

    param = {
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100)
    }

    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    cat = ['CAEC', 'CALC', 'MTRANS', 'Gender','family_history_with_overweight','FAVC', 'SMOKE', 'SCC']
    
    clf = CatBoostClassifier(**param, iterations=1200, logging_level='Silent')
    train = Pool(X_train, y_train, cat_features=cat)
    valid = Pool(X_valid, y_valid, cat_features=cat)

    clf.fit(train, eval_set=valid, verbose=False, early_stopping_rounds=100)

    accuracy = accuracy_score(y_valid, clf.predict(X_valid))
    return accuracy

In [5]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50, timeout=600)

[I 2024-03-03 18:11:05,648] A new study created in memory with name: no-name-4275a77a-b9f1-494d-a5c1-daa290976d8f
[I 2024-03-03 18:11:11,473] Trial 0 finished with value: 0.9325626204238922 and parameters: {'colsample_bylevel': 0.037918961338759466, 'depth': 1, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'learning_rate': 0.08609592768400551, 'min_data_in_leaf': 69, 'subsample': 0.3914819170886671}. Best is trial 0 with value: 0.9325626204238922.
[I 2024-03-03 18:12:23,895] Trial 1 finished with value: 0.9311175337186898 and parameters: {'colsample_bylevel': 0.0765957508148663, 'depth': 9, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'learning_rate': 0.01667454639798476, 'min_data_in_leaf': 82, 'subsample': 0.5960643282810605}. Best is trial 0 with value: 0.9325626204238922.
[I 2024-03-03 18:15:12,124] Trial 2 finished with value: 0.9241329479768786 and parameters: {'colsample_bylevel': 0.09739980903922457, 'depth': 10, 'boosting_type': 'Ordered', 'bootstrap_t

In [6]:
print('Best hyperparameters:', study.best_params)
print('Best Accuracy:', study.best_value)

Best hyperparameters: {'colsample_bylevel': 0.07756904546131699, 'depth': 12, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'learning_rate': 0.03959067593716367, 'min_data_in_leaf': 61, 'subsample': 0.2397278976336545}
Best Accuracy: 0.9426782273603083


In [7]:
data = pd.read_csv('../data/train.csv', index_col='id')
data['BMI'] = make_BMI(data)

group_1 = ['Normal_Weight', 'Overweight_Level_I']
group_2 = ['Overweight_Level_II', 'Obesity_Type_I']

data1 = data[data['NObeyesdad'].isin(group_1)]
data2 = data[data['NObeyesdad'].isin(group_2)]

In [8]:
def objective1(trial):
    X_train, X_valid, y_train, y_valid = train_test_split(
        data1.drop(['NObeyesdad'], axis=1),
        data1['NObeyesdad'],
        test_size=0.20,
        shuffle=True
    )

    param = {
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
        #"used_ram_limit": "3gb",
    }

    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    cat = ['CAEC', 'CALC', 'MTRANS', 'Gender','family_history_with_overweight','FAVC', 'SMOKE', 'SCC']
    
    clf = CatBoostClassifier(**param, iterations=1200, logging_level='Silent')
    train = Pool(X_train, y_train, cat_features=cat)
    valid = Pool(X_valid, y_valid, cat_features=cat)

    clf.fit(train, eval_set=valid, verbose=False, early_stopping_rounds=100)

    accuracy = accuracy_score(y_valid, clf.predict(X_valid))
    return accuracy

In [9]:
study = optuna.create_study(direction="maximize")
study.optimize(objective1, n_trials=50, timeout=600)

[I 2024-03-03 18:21:53,533] A new study created in memory with name: no-name-d3431733-b43c-4aa9-b450-8f8271c516e6
[I 2024-03-03 18:22:17,160] Trial 0 finished with value: 0.9137931034482759 and parameters: {'colsample_bylevel': 0.0984579103126481, 'depth': 8, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'learning_rate': 0.0031328922055669033, 'min_data_in_leaf': 2, 'subsample': 0.2815735873803755}. Best is trial 0 with value: 0.9137931034482759.
[I 2024-03-03 18:22:37,395] Trial 1 finished with value: 0.8901996370235935 and parameters: {'colsample_bylevel': 0.045112406831238794, 'depth': 10, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'learning_rate': 0.0010434927693086936, 'min_data_in_leaf': 5, 'bagging_temperature': 7.023682712974014}. Best is trial 0 with value: 0.9137931034482759.
[I 2024-03-03 18:22:45,662] Trial 2 finished with value: 0.9292196007259528 and parameters: {'colsample_bylevel': 0.058400339418249715, 'depth': 3, 'boosting_type': 'Plain', '

In [10]:
print('Best hyperparameters:', study.best_params)
print('Best Accuracy:', study.best_value)

Best hyperparameters: {'colsample_bylevel': 0.04820386818974805, 'depth': 11, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS', 'learning_rate': 0.0451441303740223, 'min_data_in_leaf': 99}
Best Accuracy: 0.9437386569872959


In [11]:
def objective2(trial):
    X_train, X_valid, y_train, y_valid = train_test_split(
        data2.drop(['NObeyesdad'], axis=1),
        data2['NObeyesdad'],
        test_size=0.20,
        shuffle=True
    )

    param = {
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
        #"used_ram_limit": "3gb",
    }

    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    cat = ['CAEC', 'CALC', 'MTRANS', 'Gender','family_history_with_overweight','FAVC', 'SMOKE', 'SCC']
    
    clf = CatBoostClassifier(**param, iterations=1200, logging_level='Silent')
    train = Pool(X_train, y_train, cat_features=cat)
    valid = Pool(X_valid, y_valid, cat_features=cat)

    clf.fit(train, eval_set=valid, verbose=False, early_stopping_rounds=100)

    accuracy = accuracy_score(y_valid, clf.predict(X_valid))
    return accuracy

In [12]:
study = optuna.create_study(direction="maximize")
study.optimize(objective2, n_trials=50, timeout=600)

[I 2024-03-03 18:32:05,022] A new study created in memory with name: no-name-c1162f71-941c-44b1-b7d9-d51063876535
[I 2024-03-03 18:32:19,396] Trial 0 finished with value: 0.8969641214351426 and parameters: {'colsample_bylevel': 0.025783766363779388, 'depth': 9, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'learning_rate': 0.0029083396692749066, 'min_data_in_leaf': 12, 'bagging_temperature': 4.319444004605371}. Best is trial 0 with value: 0.8969641214351426.
[I 2024-03-03 18:32:35,699] Trial 1 finished with value: 0.8666053357865685 and parameters: {'colsample_bylevel': 0.04914590802117809, 'depth': 12, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'learning_rate': 0.0038984215294082607, 'min_data_in_leaf': 62, 'subsample': 0.5496822269941858}. Best is trial 0 with value: 0.8969641214351426.
[I 2024-03-03 18:32:48,782] Trial 2 finished with value: 0.890524379024839 and parameters: {'colsample_bylevel': 0.02987276590061378, 'depth': 6, 'boosting_type': 'Ordered'

In [13]:
print('Best hyperparameters:', study.best_params)
print('Best Accuracy:', study.best_value)

Best hyperparameters: {'colsample_bylevel': 0.07694033036630527, 'depth': 7, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS', 'learning_rate': 0.05672521081338127, 'min_data_in_leaf': 16}
Best Accuracy: 0.9319227230910764


Обучим модели на всех данных с подобранными параметрами

In [14]:
data = pd.read_csv('../data/train.csv', index_col='id')

data['BMI'] = make_BMI(data)
group_1 = ['Normal_Weight', 'Overweight_Level_I']
group_2 = ['Overweight_Level_II', 'Obesity_Type_I']

data1 = data[data['NObeyesdad'].isin(group_1)]
data2 = data[data['NObeyesdad'].isin(group_2)]

data['NObeyesdad'] = data['NObeyesdad'].apply(lambda x: 'group_1' if x in group_1 else x)
data['NObeyesdad'] = data['NObeyesdad'].apply(lambda x: 'group_2' if x in group_2 else x)

In [15]:
X, y = data.drop(['NObeyesdad'], axis=1), data['NObeyesdad']
X1, y1 = data1.drop(['NObeyesdad'], axis=1), data1['NObeyesdad']
X2, y2 = data2.drop(['NObeyesdad'], axis=1), data2['NObeyesdad']
cat = ['CAEC', 'CALC', 'MTRANS', 'Gender','family_history_with_overweight','FAVC', 'SMOKE', 'SCC']

param = {'colsample_bylevel': 0.05544803743067026, 'depth': 5, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS', 'learning_rate': 0.026603996554542788, 'min_data_in_leaf': 99}

clf = CatBoostClassifier(**param, iterations=1200, logging_level='Silent')
train = Pool(X, y, cat_features=cat)

clf.fit(train, verbose=False, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x2268d124100>

In [16]:
param = {'colsample_bylevel': 0.09274856703825257, 'depth': 3, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS', 'learning_rate': 0.03441999288614816, 'min_data_in_leaf': 75}

clf1 = CatBoostClassifier(**param, iterations=1200, logging_level='Silent')
train = Pool(X1, y1, cat_features=cat)

clf1.fit(train, verbose=False, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x2268d0bfb80>

In [17]:
param = {'colsample_bylevel': 0.0710621910493227, 'depth': 9, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS', 'learning_rate': 0.059084737806393804, 'min_data_in_leaf': 44}

clf2 = CatBoostClassifier(**param, iterations=1200, logging_level='Silent')
train = Pool(X2, y2, cat_features=cat)

clf2.fit(train, verbose=False, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x2268d0bf670>

In [18]:
test = pd.read_csv('../data/test.csv', index_col='id')

test['BMI'] = make_BMI(test)

test['pred'] = clf.predict(test)
index_group_1 = test[test['pred'] == 'group_1'].index
index_group_2 = test[test['pred'] == 'group_2'].index
test.loc[index_group_1, 'pred'] = clf1.predict(test.loc[index_group_1].drop('pred', axis=1))
test.loc[index_group_2, 'pred'] = clf2.predict(test.loc[index_group_2].drop('pred', axis=1))

pd.Series(test['pred'], index=test.index).to_csv('../preds/pred3.csv')

test['pred']

id
20758        Obesity_Type_II
20759     Overweight_Level_I
20760       Obesity_Type_III
20761         Obesity_Type_I
20762       Obesity_Type_III
                ...         
34593    Overweight_Level_II
34594          Normal_Weight
34595    Insufficient_Weight
34596          Normal_Weight
34597        Obesity_Type_II
Name: pred, Length: 13840, dtype: object

Благодаря файнтюнингу точность увеличилась на 1 процент.

Score: 0.89342

Необходимо более точно посмотреть на объекты, на которых модель ошибается.
Вероятно, стот добавить данные из оригинального датасета, и, возможно улучшит качество приведение датасета к формату оригинальных значений.

*Далее, исследовав данные оригинального датасета было решено оставить количественные данные как есть, в связи с тем, что и в оригинальных данных присутствуют такие же искажения.*