# Import

In [11]:
import pandas as pd
import numpy as np
import json

from modules.features_encoding import FeaturesEncoding

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, FunctionTransformer, LabelEncoder
from sklearn.model_selection import cross_val_score

from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer

import optuna
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings('ignore')


In [None]:
# Load the training table from disk, using its `id` column as the row index.
train = pd.read_csv(
    'data/train.csv',
    index_col='id',
)

# Optuna

In [8]:
# Work on a copy so that re-running this cell never mutates the loaded `train` frame.
X = train.copy()

# `DataFrame.pop` requires the label of the column to remove; calling it with
# no argument raises TypeError, so the target column must be named explicitly.
# NOTE(review): 'NObeyesdad' is the presumed name of the label column in
# data/train.csv — confirm against the file header before running.
TARGET = 'NObeyesdad'

# Encode the class labels as integers; `lb` is kept so predictions can be
# mapped back with `lb.inverse_transform`.
lb = LabelEncoder()
y = lb.fit_transform(X.pop(TARGET))

SEED = 42      # random seed shared by the CV splitter and the models
SPLITS = 5     # number of stratified folds
TRIALS = 200   # Optuna trials per study
SKF = StratifiedKFold(n_splits = SPLITS, random_state = SEED, shuffle = True)

## XGB

In [9]:
# Fixed XGBoost settings that are merged into every trial's sampled parameters.
params_xgb = dict(
    objective='multi:softmax',
    booster='gbtree',
    random_state=SEED,
    tree_method='hist',
)
def xgb_objective(trial):
    """Optuna objective for XGBoost: mean cross-validated accuracy.

    Samples one hyper-parameter set from `trial`, merges in the fixed
    `params_xgb` settings, and scores the resulting pipeline on `X`, `y`
    with the shared `SKF` folds.

    Args:
        trial: Optuna trial object used to draw the hyper-parameters.

    Returns:
        Mean of the per-fold accuracy scores.
    """
    params = {
        # 'max_depth' is suggested exactly once. The dict used to list it
        # twice with ranges 0-30 and 0-24; Optuna hands back the value already
        # sampled for a name within a trial, so 0-30 was the effective range
        # and is the one kept here.
        'max_depth': trial.suggest_int('max_depth', 0, 30),
        'grow_policy': trial.suggest_categorical('grow_policy', ["depthwise", "lossguide"]),
        'n_estimators': trial.suggest_int('n_estimators', 100, 2000),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True),
        'gamma': trial.suggest_float('gamma', 1e-9, 1.0),
        'subsample': trial.suggest_float('subsample', 0.25, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.25, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 30),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-9, 10.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-9, 10.0, log=True),
        # Fixed settings go last so they win over any sampled key of the same name.
        **params_xgb,
    }

    # NOTE(review): `FeatureScaler` is not defined in the visible cells; it
    # must exist in the kernel before this objective is called — confirm.
    optuna_model = make_pipeline(
        FunctionTransformer(FeaturesEncoding),
        FeatureScaler,
        XGBClassifier(**params),
    )

    optuna_score = cross_val_score(optuna_model, X, y, scoring='accuracy', cv=SKF, n_jobs=-1)

    return np.mean(optuna_score)

In [10]:
# The search is disabled by default; change the guard to True to re-run it.
if False:
    xgb_study = optuna.create_study(direction='maximize')
    xgb_study.optimize(
        xgb_objective,
        n_trials=TRIALS,
        n_jobs=-1,
        show_progress_bar=True,
    )
    print("")
    print(f'scores : {xgb_study.best_value}, params : {xgb_study.best_params} ')
    # Save the best sampled parameters as JSON.
    with open('json/xgb.json', 'w') as fh:
        json.dump(xgb_study.best_params, fh, indent=4)

## LGBM

In [11]:
# Fixed LightGBM settings that are merged into every trial's sampled parameters.
params_lgbm = dict(
    objective="multiclass",
    metric="multi_logloss",
    boosting_type="gbdt",
    num_class=7,
    random_state=SEED,
)
def lgbm_objective(trial):
    """Optuna objective for LightGBM: mean cross-validated accuracy.

    Draws a hyper-parameter set from `trial`, combines it with the fixed
    `params_lgbm` settings, and evaluates the pipeline on `X`, `y` using the
    shared `SKF` folds.

    Args:
        trial: Optuna trial object used to draw the hyper-parameters.

    Returns:
        Mean of the per-fold accuracy scores.
    """
    # NOTE(review): LightGBM documents that bagging (`subsample`) only takes
    # effect when `subsample_freq` is non-zero; it is not set here — confirm
    # whether tuning `subsample` is intended to have an effect.
    sampled = {
        'learning_rate': trial.suggest_float('learning_rate', .001, .1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 10, 1000),
        'max_depth': trial.suggest_int('max_depth', 2, 20),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 20),
        'subsample': trial.suggest_float('subsample', .5, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.1, 20, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.1, 10, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'min_child_weight': trial.suggest_float('min_child_weight', .1, 15, log=True),
    }
    # Fixed settings are applied after the sampled ones, so they take precedence.
    params = {**sampled, **params_lgbm}

    pipeline = make_pipeline(
        FunctionTransformer(FeaturesEncoding),
        FeatureScaler,
        LGBMClassifier(**params),
    )

    fold_scores = cross_val_score(
        pipeline, X, y,
        scoring='accuracy',
        cv=SKF,
        n_jobs=-1,
    )
    return np.mean(fold_scores)

In [12]:
# The search is disabled by default; change the guard to True to re-run it.
if False:
    lgbm_study = optuna.create_study(direction='maximize')
    lgbm_study.optimize(
        lgbm_objective,
        n_trials=TRIALS,
        n_jobs=-1,
        show_progress_bar=True,
    )
    print("")
    print(f'scores : {lgbm_study.best_value}, params : {lgbm_study.best_params} ')
    # Save the best sampled parameters as JSON.
    with open('json/lgbm.json', 'w') as fh:
        json.dump(lgbm_study.best_params, fh, indent=4)

## CatBoost

In [13]:
# Fixed CatBoost settings that are merged into every trial's sampled parameters.
params_cat = dict(
    thread_count=4,
    eval_metric='AUC',
    loss_function='MultiClass',
    random_seed=SEED,
    verbose=False,
    # Positional indices (8 through 15) of the columns CatBoost treats as categorical.
    cat_features=list(range(8, 16)),
)
def bmi(df):
    """Return a copy of `df` with a `BMI` column equal to Weight / Height**2.

    The input frame is left unmodified.
    """
    return df.assign(BMI=df['Weight'] / df['Height'] ** 2)

def cat_objective(trial):
    """Optuna objective for CatBoost: mean cross-validated accuracy.

    Draws a hyper-parameter set from `trial`, combines it with the fixed
    `params_cat` settings, and evaluates a pipeline that first adds the BMI
    column (via `bmi`) on `X`, `y` using the shared `SKF` folds.

    Args:
        trial: Optuna trial object used to draw the hyper-parameters.

    Returns:
        Mean of the per-fold accuracy scores.
    """
    sampled = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.01, 10),
        'iterations': trial.suggest_int('iterations', 50, 1000),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'border_count': trial.suggest_int('border_count', 32, 255),
    }
    # Fixed settings are applied after the sampled ones, so they take precedence.
    params = {**sampled, **params_cat}

    pipeline = make_pipeline(
        FunctionTransformer(bmi),
        FeatureScaler,
        CatBoostClassifier(**params),
    )

    fold_scores = cross_val_score(
        pipeline, X, y,
        scoring='accuracy',
        cv=SKF,
        n_jobs=-1,
    )
    return np.mean(fold_scores)

In [14]:
# The search is disabled by default; change the guard to True to re-run it.
if False:
    cat_study = optuna.create_study(direction='maximize')
    cat_study.optimize(
        cat_objective,
        n_trials=TRIALS,
        n_jobs=-1,
        show_progress_bar=True,
    )
    print("")
    print(f'scores : {cat_study.best_value}, params : {cat_study.best_params}')
    # Save the best sampled parameters as JSON.
    with open('json/cat.json', 'w') as fh:
        json.dump(cat_study.best_params, fh, indent=4)

## RF

In [15]:
# Fixed RandomForest settings that are merged into every trial's sampled parameters.
params_rf = dict(random_state=SEED)
def rf_objective(trial):
    """Optuna objective for RandomForest: mean cross-validated accuracy.

    Samples one hyper-parameter set from `trial`, merges in the fixed
    `params_rf` settings, and scores the resulting pipeline on `X`, `y`
    with the shared `SKF` folds.

    Args:
        trial: Optuna trial object used to draw the hyper-parameters.

    Returns:
        Mean of the per-fold accuracy scores.
    """
    # Candidate values shared by `max_depth` and `max_leaf_nodes`:
    # None (unlimited) or an integer from 2 to 49.
    size_choices = [None] + list(range(2, 50))
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 500),
        'criterion': trial.suggest_categorical("criterion", ["gini", "entropy", "log_loss"]),
        # Sampled under its own name. It used to be registered as
        # 'max_leaf_nodes', which tied it to that parameter's value and kept
        # it out of `best_params`.
        'max_depth': trial.suggest_categorical('max_depth', size_choices),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 30),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 2, 30),
        'min_weight_fraction_leaf': trial.suggest_float('min_weight_fraction_leaf', 0, .5),
        'max_features': trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
        'max_leaf_nodes': trial.suggest_categorical('max_leaf_nodes', size_choices),
        'min_impurity_decrease': trial.suggest_float('min_impurity_decrease', 1e-9, 1e-2, log=True),
        'bootstrap': trial.suggest_categorical("bootstrap", [True, False]),
        'class_weight': trial.suggest_categorical("class_weight", ["balanced", "balanced_subsample", None]),
        # Fixed settings go last so they win over any sampled key of the same name.
        **params_rf,
    }

    # NOTE(review): `FeatureScaler` is not defined in the visible cells; it
    # must exist in the kernel before this objective is called — confirm.
    optuna_model = make_pipeline(
        FunctionTransformer(FeaturesEncoding),
        FeatureScaler,
        RandomForestClassifier(**params),
    )

    optuna_score = cross_val_score(optuna_model, X, y, scoring='accuracy', cv=SKF, n_jobs=-1)

    return np.mean(optuna_score)

In [16]:
# The search is disabled by default; change the guard to True to re-run it.
if False:
    rf_study = optuna.create_study(direction='maximize')
    rf_study.optimize(
        rf_objective,
        n_trials=TRIALS,
        n_jobs=-1,
        show_progress_bar=True,
    )
    print("")
    print(f'scores : {rf_study.best_value}, params : {rf_study.best_params} ')
    # Save the best sampled parameters as JSON.
    with open('json/rf.json', 'w') as fh:
        json.dump(rf_study.best_params, fh, indent=4)

## Summary

In [20]:
# Drop any model objects left over from an earlier run of this cell before
# importing them again. A missing name is the only failure expected here, so
# catch NameError alone instead of silencing every possible exception.
try:
    del XGB, LGBM, CAT, RF
except NameError:
    pass

# NOTE(review): a plain `import` reuses the module cached in `sys.modules`;
# if the goal is to pick up edits to modules/model.py without restarting the
# kernel, `importlib.reload` would be needed — confirm intent.
from modules.model import XGB, LGBM, CAT, RF

In [21]:
# Score each imported model on the shared stratified folds and report the
# mean and standard deviation of its accuracy.
for name, model in {'xgb ': XGB, 'lgbm': LGBM, 'cat ': CAT, 'rf  ': RF}.items():
    scores = cross_val_score(
        model,
        X,
        y,
        scoring='accuracy',
        cv=SKF,
        n_jobs=-1,
    )
    print(f'{name} - Mean score  : {np.mean(scores):.5f} ± {np.std(scores):.5f}')

xgb  - Mean score  : 0.90934 ± 0.00332
lgbm - Mean score  : 0.90890 ± 0.00270
cat  - Mean score  : 0.90722 ± 0.00562
rf   - Mean score  : 0.88111 ± 0.00520
