# 1. Importing libraries

In [158]:
import pandas as pd
from category_encoders import HashingEncoder 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from xgboost import XGBClassifier

import optuna

from sklearn.metrics import accuracy_score, f1_score

import warnings
warnings.filterwarnings('ignore')

from catboost import CatBoostClassifier

# 2. Reading data from csv files.

In [159]:
training_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

In [160]:
training_data.head()

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


# 3. Preprocessing data with HashEncoder and Standard Scaler.

In [161]:
def preprocessing(train: pd.DataFrame, test: pd.DataFrame) -> pd.DataFrame:
    '''
    Function for preparing data for classification. 
    '''
    
    # clearing data of duplicates and NaNs.
    train = train.dropna()
    train = train.drop_duplicates().reset_index(drop=True)

    # splitting data to features and target.
    train, target = train.drop(["id", "NObeyesdad"], axis=1), train["NObeyesdad"]

    le = LabelEncoder()
    target = le.fit_transform(target)

    # split df on numerical and categorical features.
    num_cols = list(set(train.select_dtypes("number").columns))
    cat_cols = list(set(train.select_dtypes("object").columns))

    # encoding categorical columns with WOE.
    hash_encoder = HashingEncoder(cols=cat_cols) 
    train[cat_cols] = hash_encoder.fit_transform(train[cat_cols], target)
    test[cat_cols] = hash_encoder.transform(test[cat_cols])

    # scaling numerical features.
    scaler = StandardScaler()
    train[num_cols] = scaler.fit_transform(train[num_cols])
    test[num_cols] = scaler.transform(test[num_cols])

    return train, target, test, le.classes_

In [162]:
train, target, test, classes = preprocessing(train=training_data, test=test_data)

In [163]:
train.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
0,0,0.105699,-0.002828,-0.235713,2,2,-0.836279,0.314684,0,0,1.206594,2,-1.171141,0.597438,1,1
1,0,-1.027052,-1.606291,-1.170931,4,0,-0.836279,0.338364,0,0,-0.048349,3,0.021775,0.636513,1,0
2,0,-1.027052,0.128451,-1.430012,4,1,-1.060332,-1.913423,0,0,-0.195644,2,-0.138022,1.755239,1,0
3,0,-0.507929,0.12009,1.64477,3,2,1.039171,0.338364,0,0,-0.584035,2,0.579896,0.271455,1,0
4,0,1.371197,2.450367,0.224054,2,2,0.438397,-1.119801,0,0,-0.081469,2,1.176486,0.523111,1,1


In [164]:
test.head()

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
0,20758,0,0.537644,1.695675,1.24177,2,2,0.924049,0.338364,0,0,1.308584,2,-0.150721,-1.024344,1,1
1,20759,0,-0.49962,-1.148152,-0.829748,3,2,-0.836279,-2.497077,0,0,1.595165,2,0.021775,-1.024344,1,0
2,20760,0,0.379434,-0.651587,0.898933,3,2,1.039171,0.338364,0,0,0.973714,2,-1.171141,-0.608296,1,0
3,20761,0,-0.503267,-1.685011,0.598259,2,2,-0.836279,0.307045,0,0,1.244138,2,-1.057992,-1.024344,1,1
4,20762,0,0.379434,-0.834373,0.642469,3,2,1.039171,0.338364,0,0,1.025738,2,-1.171141,0.206466,1,0


# 4. Hyperparameter tuning for CatBoost with Optuna.

In [171]:
def objective(trial, data=train, target=target):
    '''
    Objective function for hyperparam-tuning.
    '''
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.15,random_state=42)
    param = {
        'iterations': trial.suggest_int('iterations', 100, 1500),
        'depth': trial.suggest_int('depth', 3, 15),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
        'eval_metric': 'Accuracy',
        'random_seed': 42,
        'task_type': 'GPU',        
    }
    model = CatBoostClassifier(**param)  
    
    model.fit(train_x,train_y, eval_set=[(test_x,test_y)], early_stopping_rounds=200, verbose=False)
    
    preds = model.predict(test_x)
    
    f1 = f1_score(test_y, preds, average='weighted')
    
    return f1

In [172]:
# model hyperparams tuning.

study = optuna.create_study(direction='maximize', study_name='Catboost_study')
study.optimize(objective, n_trials=300)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[I 2024-02-13 15:47:59,053] A new study created in memory with name: Catboost_study
[I 2024-02-13 15:52:46,634] Trial 0 finished with value: 0.8927525004727773 and parameters: {'iterations': 676, 'depth': 14, 'learning_rate': 0.010628444782045178}. Best is trial 0 with value: 0.8927525004727773.
[I 2024-02-13 15:53:00,875] Trial 1 finished with value: 0.9004790763899027 and parameters: {'iterations': 1440, 'depth': 6, 'learning_rate': 0.05299323004022556}. Best is trial 1 with value: 0.9004790763899027.
[I 2024-02-13 15:54:55,455] Trial 2 finished with value: 0.8915988824239701 and parameters: {'iterations': 1174, 'depth': 14, 'learning_rate': 0.08906597448038002}. Best is trial 1 with value: 0.9004790763899027.
[I 2024-02-13 15:55:25,719] Trial 3 finished with value: 0.8964756590375865 and parameters: {'iterations': 630, 'depth': 10, 'learning_rate': 0.011193803359304052}. Best is trial 1 with value: 0.9004790763899027.
[I 2024-02-13 15:55:36,609] Trial 4 finished with value: 0.897779

Number of finished trials: 300
Best trial: {'iterations': 1354, 'depth': 6, 'learning_rate': 0.0933244870266498}


# 5. Submission to csv.

In [174]:
# mapping from label to source values from train target.

classes = {0: 'Insufficient_Weight',
 1: 'Normal_Weight',
 2: 'Obesity_Type_I',
 3: 'Obesity_Type_II',
 4: 'Obesity_Type_III',
 5: 'Overweight_Level_I',
 6: 'Overweight_Level_II'}

In [179]:
def submission(model: object, test_data: pd.DataFrame, classes: dict):
    model.fit(train, target)
    predictions = model.predict(test_data.drop(columns=['id'], axis=1))
    submission = pd.DataFrame({'id': test_data['id'], 
                               'NObeyesdad': [classes[pred[0]] for pred in predictions]})
    submission.to_csv('submission.csv', index=False)

In [180]:
# Best model fitting and submitting.

cat = CatBoostClassifier(**study.best_trial.params)
submission(cat, test, classes)

0:	learn: 1.6379868	total: 30.9ms	remaining: 41.9s
1:	learn: 1.4559435	total: 56.5ms	remaining: 38.2s
2:	learn: 1.3190852	total: 83.2ms	remaining: 37.5s
3:	learn: 1.1976402	total: 111ms	remaining: 37.4s
4:	learn: 1.1111888	total: 138ms	remaining: 37.3s
5:	learn: 1.0322470	total: 164ms	remaining: 36.8s
6:	learn: 0.9682852	total: 188ms	remaining: 36.1s
7:	learn: 0.9102198	total: 210ms	remaining: 35.4s
8:	learn: 0.8579997	total: 230ms	remaining: 34.4s
9:	learn: 0.8159344	total: 250ms	remaining: 33.6s
10:	learn: 0.7789825	total: 276ms	remaining: 33.7s
11:	learn: 0.7456689	total: 388ms	remaining: 43.4s
12:	learn: 0.7152925	total: 415ms	remaining: 42.8s
13:	learn: 0.6869570	total: 451ms	remaining: 43.1s
14:	learn: 0.6628208	total: 472ms	remaining: 42.2s
15:	learn: 0.6395644	total: 495ms	remaining: 41.4s
16:	learn: 0.6170516	total: 520ms	remaining: 40.9s
17:	learn: 0.5982516	total: 540ms	remaining: 40s
18:	learn: 0.5802437	total: 561ms	remaining: 39.4s
19:	learn: 0.5650778	total: 586ms	remain

# 6. Please upvote my notebook if you like it! Thanks and best wishes for your future projects.