In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import PolynomialFeatures
from catboost import CatBoostClassifier, Pool
from tqdm import tqdm
import optuna

from utility import make_BMI

In [7]:
# Stack the competition training set and the original obesity dataset into one frame.
# train.csv is indexed by its 'id' column while ObesityDataSet.csv gets a default
# 0..n-1 index, so a plain concat can leave the same row label on several rows.
# ignore_index=True renumbers the rows so every label is unique.
data = pd.concat(
    [pd.read_csv('../data/train.csv', index_col='id'),
     pd.read_csv('../data/ObesityDataSet.csv')],
    ignore_index=True,
)
data.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


In [8]:
# Append the 'BMI' column computed by the project helper make_BMI (from utility).
# NOTE(review): presumably weight / height**2 — confirm in utility.py.
data = data.assign(BMI=make_BMI(data))

In [9]:
# Separate the label column from the predictors.
target = data['NObeyesdad']
data = data.drop(columns='NObeyesdad')

# Group the remaining column names by dtype: float64 columns are the numeric
# features, object columns are the ones passed to CatBoost as cat_features.
column_dtypes = data.dtypes
numerical = column_dtypes.index[(column_dtypes == 'float64').to_numpy()]
categorical = column_dtypes.index[(column_dtypes == 'object').to_numpy()]

In [10]:
# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.20, random_state=3, shuffle=True
)

# Baseline: CatBoost with default hyper-parameters on the raw features plus BMI.
# NOTE(review): the hold-out split is also passed as eval_set and is scored again
# afterwards — confirm this is intended, since it can make the score optimistic.
clf = CatBoostClassifier()
clf.fit(
    X_train,
    y_train,
    cat_features=categorical.tolist(),
    eval_set=(X_test, y_test),
    silent=True,
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x2af21dd24a0>

![График обучения](https://s49klg.storage.yandex.net/rdisk/0cdf49dc775239bd71ea36fda142c55cba0a81ed7eebe3921d924369f242cd66/65e4d647/LyPBJX7DrmJ2vc9Oq453nZeVLXHTOlTefLLhSbJ6E5pHPcVol2ZdDYSssDdRq9Zpwl93l9M87l3mwtp9IboChQ==?uid=285904750&filename=poly1.png&disposition=inline&hash=&limit=0&content_type=image%2Fpng&owner_uid=285904750&fsize=25761&hid=6eeaa39a00c4952facf53948b5fff04d&media_type=image&tknv=v2&etag=20017e94dce1534d633f2735d4449f2c&rtoken=aG2Vx09uyPNz&force_default=yes&ycrid=na-394d82032f572a0c8b37e7da05a8f033-downloader8e&ts=612c7069cdfc0&s=b5aee07c3d7fed21a207faffe8db753e9d6fdb0df4899ce543d76da84353942e&pb=U2FsdGVkX19CsK5G8ZyThnAAOLxBXaew5fGcrtLIJ8CWMdcTMefjU4Hcw2LHvqOpohDOPq7qPUyAVkCHDftlQeVO--X7zJn8aFiXoRA_6VI)

In [11]:
# Score the baseline model on the held-out rows (the same split that was passed as eval_set during fit).
clf.score(X_test, y_test)

0.9142982072584172

In [12]:
# Rank the baseline model's features by importance, highest first.
# data.columns provides the labels; X_train was split from data, so the order matches.
baseline_importance = pd.Series(clf.get_feature_importance(), index=data.columns)
baseline_importance.sort_values(ascending=False)

BMI                               24.787682
Weight                            19.694717
Gender                             8.503540
FCVC                               6.886816
Age                                6.793040
Height                             4.956285
FAF                                4.647806
CALC                               4.545625
TUE                                3.841696
NCP                                3.390900
CH2O                               3.215668
CAEC                               3.038721
MTRANS                             2.900810
family_history_with_overweight     1.484375
FAVC                               1.005225
SCC                                0.287928
SMOKE                              0.019165
dtype: float64

In [13]:
# Expand the numeric columns into polynomial / interaction terms up to degree 3
# (include_bias=False leaves out the constant column).
poly = PolynomialFeatures(degree=3, include_bias=False)
poly_values = poly.fit_transform(data[numerical])

# Re-attach the categorical columns. The expanded frame gets a fresh 0..n-1 index,
# so the categorical frame has to be renumbered the same way for the column-wise
# concat to line up row by row. reset_index(drop=True) does this whatever the index
# is called; reset_index().drop('index', axis=1) only works for an unnamed index.
data_poly = pd.concat(
    [pd.DataFrame(poly_values, columns=poly.get_feature_names_out()),
     data[categorical].reset_index(drop=True)],
    axis=1,
)

# Same 80/20 split settings as before, now on the polynomial feature frame.
X_train, X_test, y_train, y_test = train_test_split(
    data_poly, target, test_size=0.20, random_state=3, shuffle=True
)

# Refit a default CatBoost model on all polynomial terms plus the categorical columns.
# NOTE(review): the hold-out split is also passed as eval_set and is scored again
# afterwards — confirm this is intended, since it can make the score optimistic.
clf = CatBoostClassifier()
clf.fit(
    X_train,
    y_train,
    cat_features=categorical.tolist(),
    eval_set=(X_test, y_test),
    silent=True,
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x2af1e0a8250>

![График обучения](https://s124vlx.storage.yandex.net/rdisk/1225faf086c9e347fc29a2e5106fe88d2def2dea20844e907271e06fcac1a43c/65e4d66c/LyPBJX7DrmJ2vc9Oq453nXgwsRPcm0Ja0xbA3PtwWw1A9AVDL_0XNT9VdHtrrY5QoKHbx9sID3QqZBshi_tHGw==?uid=285904750&filename=poly2.png&disposition=inline&hash=&limit=0&content_type=image%2Fpng&owner_uid=285904750&fsize=26238&hid=5079cf045b354ba31d60a9ba2139add0&media_type=image&tknv=v2&etag=3f3bf09700183dd9c22f1c9bb17b8b22&rtoken=VnvX5eyjWXKV&force_default=yes&ycrid=na-5a1b768e7423509830a40e3b70949b1a-downloader22h&ts=612c708d17300&s=08fe305af20cc58c8475241c8f3e6ebe2c789d808dc461bae5d5a564422fbf5c&pb=U2FsdGVkX180mjDpIhan4F9VRP4rbUgSIEprZha0Y7ZpRPqrNQIj224bxKhW6YR313tsJpGB_Vhb0eHPtNQ7YQck6BtR5W90Gq-nGAg7kwQ)

In [14]:
# Score the model trained on the full degree-3 polynomial feature set on its held-out rows.
clf.score(X_test, y_test)

0.9132050721469174

In [15]:
# Show the 20 most important features of the polynomial-feature model.
poly_importance = pd.Series(clf.get_feature_importance(), index=data_poly.columns)
poly_importance.sort_values(ascending=False).head(20)

FCVC^2 BMI                        5.951403
BMI                               5.288599
BMI^3                             4.470819
BMI^2                             3.958332
Gender                            3.866886
Age Weight^2                      3.804339
Weight BMI^2                      3.650678
CALC                              2.607524
FCVC NCP BMI                      2.211375
Height BMI^2                      2.191078
CAEC                              1.934735
MTRANS                            1.780727
Height Weight BMI                 1.518303
Height^2 BMI                      1.246001
Height^2 Weight                   1.172016
Weight^3                          1.139362
family_history_with_overweight    1.108562
Weight^2                          1.050118
Height Weight^2                   1.028610
FCVC BMI^2                        0.952382
dtype: float64

In [16]:
# Hand-picked polynomial terms: the highest-ranked generated features in the
# importance table of the full degree-3 model.
new_features = [
    'FCVC^2 BMI',
    'BMI^3',
    'BMI^2',
    'Age Weight^2',
    'Weight BMI^2',
    'FCVC NCP BMI',
    'Height BMI^2',
]

# Original features plus only the selected polynomial terms. data_poly carries a
# 0..n-1 index, so data is renumbered the same way before the column-wise concat.
# reset_index(drop=True) discards the old index directly, instead of materialising
# it as an 'index' column and dropping it (which fails when the index has a name).
data1 = pd.concat([data.reset_index(drop=True), data_poly[new_features]], axis=1)

# Same 80/20 split settings again, on the original features plus selected terms.
X_train, X_test, y_train, y_test = train_test_split(
    data1, target, test_size=0.20, random_state=3, shuffle=True
)

# Refit a default CatBoost model on the reduced feature set.
# NOTE(review): the hold-out split is also passed as eval_set and is scored again
# afterwards — confirm this is intended, since it can make the score optimistic.
clf = CatBoostClassifier()
clf.fit(
    X_train,
    y_train,
    cat_features=categorical.tolist(),
    eval_set=(X_test, y_test),
    silent=True,
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x2af1e9b24d0>

![График обучения](https://s370vlx.storage.yandex.net/rdisk/cc792c9c18909e4b15a8bfe412c5137a0945bac94de422a32da13f5840acebd9/65e4d69b/LyPBJX7DrmJ2vc9Oq453neqWXhwRvypFatyDn7SY09unLLCXEtb8wtOgzABTYJqkKWOCH4ExoiCne6sEPmlQlw==?uid=285904750&filename=poly3.png&disposition=inline&hash=&limit=0&content_type=image%2Fpng&owner_uid=285904750&fsize=26598&hid=1a433e430a2030c0b9fc643f3d12030c&media_type=image&tknv=v2&etag=b9ff9bda0f5a88c2c33df4cd13d92bd0&rtoken=cDFQ76NQ0B95&force_default=yes&ycrid=na-9128f93ba65e8c6105fe2e3692ec8a44-downloader8e&ts=612c70b9e9cc0&s=e2f84040b0f4814edfe912fa13c3cba242c322f74e2b7c749b0427fbba3f928e&pb=U2FsdGVkX18f19G1wubV-_f6rKVzyWJ7LZYQMoGkyqVg2adVk1dDHKz4luUEOwz6HorplskL3JN-EtnZJWTvxpgP3x1IyXoWAuZgdDvLwE8)

In [17]:
# Score the model trained on the original features plus the selected polynomial terms on its held-out rows.
clf.score(X_test, y_test)

0.9138609532138172

In [18]:
# Rank all features of the reduced model (original + selected terms) by importance.
selected_importance = pd.Series(clf.get_feature_importance(), index=data1.columns)
selected_importance.sort_values(ascending=False)

Height BMI^2                      8.561711
FCVC NCP BMI                      7.471396
Weight                            7.205958
BMI                               6.883801
BMI^3                             5.802823
FCVC^2 BMI                        5.790555
Gender                            5.560532
BMI^2                             5.417147
Age                               5.302355
Age Weight^2                      4.781932
Height                            4.516521
FAF                               4.348877
Weight BMI^2                      4.325519
CALC                              4.061414
TUE                               3.587946
CH2O                              3.408009
MTRANS                            2.918503
FCVC                              2.832646
CAEC                              2.429794
NCP                               2.150735
family_history_with_overweight    1.402920
FAVC                              0.936164
SCC                               0.284581
SMOKE      

In [19]:
# Put the predictions next to the true labels for the hold-out rows.
# Copy first: `preds = X_test` would only alias the frame, so the two new columns
# would be written into X_test itself. Any later clf.predict / clf.score on X_test
# (including a re-run of this cell) would then see columns the model was not
# trained on.
preds = X_test.copy()
preds['predict'] = clf.predict(X_test)
preds['true'] = y_test.values

# Keep only the misclassified rows, with just the two label columns.
mistakes = preds.loc[preds['predict'] != preds['true'], ['predict', 'true']]

# Number of errors per true class.
mistakes['true'].value_counts()

Overweight_Level_II    101
Overweight_Level_I      86
Obesity_Type_I          78
Normal_Weight           77
Insufficient_Weight     30
Obesity_Type_II         18
Obesity_Type_III         4
Name: true, dtype: int64

In [20]:
# Break the errors down into (true class, predicted class) pairs, most frequent first.
confusion_pairs = mistakes.groupby('true')['predict'].value_counts()
confusion_pairs.sort_values(ascending=False)

true                 predict            
Overweight_Level_II  Overweight_Level_I     51
Normal_Weight        Overweight_Level_I     39
Overweight_Level_I   Normal_Weight          36
                     Overweight_Level_II    34
Overweight_Level_II  Obesity_Type_I         33
Obesity_Type_I       Overweight_Level_II    33
Normal_Weight        Insufficient_Weight    32
Obesity_Type_I       Obesity_Type_II        28
Insufficient_Weight  Normal_Weight          25
Obesity_Type_I       Overweight_Level_I     14
Overweight_Level_II  Normal_Weight          13
Obesity_Type_II      Obesity_Type_I         12
Overweight_Level_I   Obesity_Type_I         11
Normal_Weight        Overweight_Level_II     6
Overweight_Level_I   Insufficient_Weight     5
Obesity_Type_II      Overweight_Level_II     4
Overweight_Level_II  Obesity_Type_II         4
Insufficient_Weight  Overweight_Level_I      3
Obesity_Type_III     Obesity_Type_I          2
                     Overweight_Level_I      2
Obesity_Type_I     