In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from tqdm import tqdm

from utility import make_BMI

In [2]:
# Read the training set; the `id` column becomes the row index.
data = pd.read_csv(filepath_or_buffer='../data/train.csv', index_col='id')

# Preview the first five rows.
data.head(n=5)

Unnamed: 0_level_0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


In [3]:
# Columns that are one-hot encoded with the first level dropped.
dummy_columns = ['Gender', 'family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC']

# Add the BMI column produced by `make_BMI` (imported from `utility`), drop the
# raw Height/Weight columns, then encode the columns listed above.
data = pd.get_dummies(
    data.assign(BMI=make_BMI(data)).drop(columns=['Height', 'Weight']),
    columns=dummy_columns,
    drop_first=True,
)

# Remaining string-valued columns; passed to CatBoost as `cat_features`.
cat = ['CAEC', 'CALC', 'MTRANS']

In [4]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across cells.
X_train, X_valid, y_train, y_valid = train_test_split(
    data.drop(columns='NObeyesdad'),
    data['NObeyesdad'],
    test_size=0.20,
    random_state=3,
    shuffle=True,
)

In [5]:
# Distinct class labels, in order of first appearance in the training labels.
target = y_train.unique().tolist()
target

['Insufficient_Weight',
 'Overweight_Level_I',
 'Obesity_Type_I',
 'Overweight_Level_II',
 'Normal_Weight',
 'Obesity_Type_II',
 'Obesity_Type_III']

Обучим отдельную бинарную модель («класс против остальных») для каждого класса. После этого будем выбирать тот класс, в котором соответствующая модель уверена больше всего.

In [6]:
# One-vs-rest: train one binary CatBoost model per class and record its
# accuracy on the hold-out set.
scores, models = {}, {}

# The seed is fixed, so the split is identical for every class: compute it
# once instead of once per loop iteration.
X_train, X_valid, y_train, y_valid = train_test_split(data.drop(['NObeyesdad'], axis=1),
                                                      data['NObeyesdad'],
                                                      test_size=0.20,
                                                      random_state=3,
                                                      shuffle=True)

for t in tqdm(target):
    # Binarise the labels: keep class `t`, collapse every other class into
    # 'other'. Separate names keep the multi-class y_train / y_valid intact.
    y_train_bin = y_train.where(y_train == t, 'other')
    y_valid_bin = y_valid.where(y_valid == t, 'other')

    ovr_model = CatBoostClassifier()
    ovr_model.fit(X_train, y_train_bin, cat_features=cat, silent=True)

    models[t] = ovr_model
    scores[t] = ovr_model.score(X_valid, y_valid_bin)

# Per-class binary accuracy, worst first.
pd.Series(scores).sort_values()

100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [03:18<00:00, 28.31s/it]


Overweight_Level_II    0.945087
Overweight_Level_I     0.947977
Obesity_Type_I         0.957129
Normal_Weight          0.962187
Obesity_Type_II        0.982418
Insufficient_Weight    0.983141
Obesity_Type_III       0.999037
dtype: float64

In [7]:
# Recreate the multi-class hold-out split (same seed as before).
X_train, X_valid, y_train, y_valid = train_test_split(data.drop(['NObeyesdad'], axis=1),
                                                  data['NObeyesdad'],
                                                  test_size=0.20,
                                                  random_state=3,
                                                  shuffle=True)

# One column per class: the probability that the class's own one-vs-rest model
# assigns to that class. The column is looked up through `classes_` rather
# than hard-coded as 0, so the result does not depend on how the class name
# happens to sort relative to 'other'.
# The frame keeps a default RangeIndex (row i corresponds to X_valid.iloc[i]).
predictions = pd.DataFrame(
    {
        t: model.predict_proba(X_valid)[:, list(model.classes_).index(t)]
        for t, model in models.items()
    },
    columns=target,
)

In [8]:
# Predicted class for each row = the column holding the highest probability.
predicted = predictions.idxmax(axis=1)

# `predictions` and `y_valid` carry different indexes (RangeIndex vs. `id`),
# so compare them strictly by position.
assert len(predicted) == len(y_valid), "predictions and y_valid must have the same length"
accuracy = float((predicted.to_numpy() == y_valid.to_numpy()).mean())

accuracy

0.8877649325626205

0.884 — базовая точность. Использование отдельной модели для каждого класса повышает точность примерно на 0.004 (до 0.888), что не выглядит существенным улучшением.
Можно попробовать обучить отдельную модель поверх этих предсказаний (стекинг) и посмотреть на результат.

In [9]:
# Second-level model: learns the final class from the per-class one-vs-rest
# probabilities computed on the hold-out set.
clf = CatBoostClassifier()
clf.fit(X=predictions, y=y_valid, silent=True)

<catboost.core.CatBoostClassifier at 0x292ce517af0>

In [10]:
test = pd.read_csv('../data/test.csv', index_col='id')

# Same feature engineering as applied to the training frame.
test['BMI'] = make_BMI(test)
test = test.drop(['Height', 'Weight'], axis=1)
test = pd.get_dummies(test, columns=['Gender', 
                                     'family_history_with_overweight',
                                     'FAVC',
                                     'SMOKE',
                                     'SCC'], drop_first=True)
# get_dummies ran independently on train and test, so enforce the training
# column set and order. An indicator column missing from test is filled
# with 0; this is a no-op when the columns already match.
test = test.reindex(columns=data.columns.drop('NObeyesdad'), fill_value=0)

# Per-class probabilities from the one-vs-rest models. The positive-class
# column is looked up through `classes_` instead of being hard-coded as 0.
predictions = pd.DataFrame(
    {
        t: model.predict_proba(test)[:, list(model.classes_).index(t)]
        for t, model in models.items()
    },
    columns=target,
)

# Final label from the stacking model; the result is indexed as 2-D below,
# so take its first column.
pred = clf.predict(predictions)

pd.Series(pred[:, 0], index=test.index).to_csv('../preds/pred1.csv')

У модели, обученной на предсказаниях других моделей, качество хуже. Public score: 0.87789.

Попробуем объединить в группы соседние классы (учитывая, что таргет ранговый), на которых точность была наименьшей, чтобы затем обучить отдельные модели, различающие два класса внутри каждой группы.

In [11]:
# Recreate the multi-class hold-out split (20% validation, fixed seed).
X_train, X_valid, y_train, y_valid = train_test_split(
    data.drop(columns='NObeyesdad'),
    data['NObeyesdad'],
    test_size=0.20,
    random_state=3,
    shuffle=True,
)
def transform(x):
    """Map a class label to its merged group name.

    'Normal_Weight' and 'Overweight_Level_I' become 'group_1';
    'Overweight_Level_II' and 'Obesity_Type_I' become 'group_2'.
    Any other value is returned unchanged.
    """
    merged_groups = (
        ('group_1', ['Normal_Weight', 'Overweight_Level_I']),
        ('group_2', ['Overweight_Level_II', 'Obesity_Type_I']),
    )
    for group_name, members in merged_groups:
        if x in members:
            return group_name
    return x

# Relabel both splits with the merged groups.
y_train = y_train.map(transform)
y_valid = y_valid.map(transform)

# First-stage classifier over the merged label set; the last expression
# displays its accuracy on the hold-out split.
clf1 = CatBoostClassifier()
clf1.fit(X=X_train, y=y_train, cat_features=cat, silent=True)
clf1.score(X_valid, y_valid)

0.9236512524084779

In [12]:
# Rows belonging to each merged group, used to train the second-stage models.
group_1_labels = ['Normal_Weight', 'Overweight_Level_I']
group_2_labels = ['Overweight_Level_II', 'Obesity_Type_I']

g1 = data.loc[data['NObeyesdad'].isin(group_1_labels)]
g2 = data.loc[data['NObeyesdad'].isin(group_2_labels)]

In [13]:
# Second-stage model for group 1: trained only on rows of `g1`, with a 20%
# hold-out split and the same fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    g1.drop(columns='NObeyesdad'),
    g1['NObeyesdad'],
    test_size=0.20,
    random_state=3,
    shuffle=True,
)

clf2_1 = CatBoostClassifier()
clf2_1.fit(X=X_train, y=y_train, cat_features=cat, silent=True)
clf2_1.score(X_valid, y_valid)

0.9219600725952813

In [14]:
# Second-stage model for group 2: trained only on rows of `g2`, with a 20%
# hold-out split and the same fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    g2.drop(columns='NObeyesdad'),
    g2['NObeyesdad'],
    test_size=0.20,
    random_state=3,
    shuffle=True,
)

clf2_2 = CatBoostClassifier()
clf2_2.fit(X=X_train, y=y_train, cat_features=cat, silent=True)
clf2_2.score(X_valid, y_valid)

0.9116835326586936

In [15]:
test = pd.read_csv('../data/test.csv', index_col='id')

# Same feature engineering as applied to the training frame.
test['BMI'] = make_BMI(test)
test = test.drop(['Height', 'Weight'], axis=1)
test = pd.get_dummies(test, columns=['Gender', 
                                     'family_history_with_overweight',
                                     'FAVC',
                                     'SMOKE',
                                     'SCC'], drop_first=True)
# get_dummies ran independently on train and test, so enforce the training
# column set and order. An indicator column missing from test is filled
# with 0; this is a no-op when the columns already match.
test = test.reindex(columns=data.columns.drop('NObeyesdad'), fill_value=0)

# Stage 1: coarse prediction (original classes plus 'group_1' / 'group_2').
# np.ravel flattens the classifier output whatever its shape, so the column
# assignment never depends on pandas accepting a 2-D array.
stage1_pred = pd.Series(np.ravel(clf1.predict(test)), index=test.index, name='pred')

# Stage 2: rows assigned to a merged group are re-classified by that group's
# dedicated model. Predictions live in a separate Series until the end, so the
# feature frame passed to the models never contains a 'pred' column.
final_pred = stage1_pred.copy()
for group_label, group_model in (('group_1', clf2_1), ('group_2', clf2_2)):
    group_index = stage1_pred[stage1_pred == group_label].index
    if len(group_index) == 0:
        # No row fell into this group; skip calling predict on an empty frame.
        continue
    final_pred.loc[group_index] = np.ravel(group_model.predict(test.loc[group_index]))

test['pred'] = final_pred

test['pred'].to_csv('../preds/pred2.csv')

test['pred']

id
20758        Obesity_Type_II
20759     Overweight_Level_I
20760       Obesity_Type_III
20761         Obesity_Type_I
20762       Obesity_Type_III
                ...         
34593    Overweight_Level_II
34594          Normal_Weight
34595    Insufficient_Weight
34596          Normal_Weight
34597        Obesity_Type_II
Name: pred, Length: 13840, dtype: object

Результат не сильно выше, чем у базовой модели. Score: 0.88403.