In [None]:
import pandas as pd
import numpy as np

import sys
sys.path.append('/home/nico/Projets/kaggle_competitions/playground-series-s4e8')
from variables import * 


In [None]:
# Load the training set, then replace the raw target labels by their encoded
# values through `dict_label_target` (not defined in this notebook; presumably
# provided by the star import of `variables` -- verify).
train = pd.read_parquet(
    '/home/nico/Projets/kaggle_competitions/playground-series-s4e8/data/train.parquet'
)
train[TARGET_COL] = train[TARGET_COL].map(dict_label_target)
train.head()

## FEATURE ENGINEERING WITH TARGET ENCODING

In [None]:
def get_dict_target_encoding(df,column,target) :
    """Build a target-encoding lookup for one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both ``column`` and ``target``.
    column : str
        Name of the column whose distinct values become the dictionary keys.
    target : str
        Name of the column that is averaged within each group.

    Returns
    -------
    dict
        ``{value of column: mean of target over the rows with that value}``.
        Missing values in ``column`` are kept as their own group
        (``dropna=False``), so they get an entry as well.
    """
    # Select the target first so only that column is averaged per group.
    per_value_mean = df.groupby(column, dropna=False)[target].mean()
    return per_value_mean.to_dict()

In [None]:
# Step 1: turn missing categories into the explicit label 'NA' so that they
# are encoded like any other category value.
for cat_col in CAT_COLS:
    train[cat_col] = train[cat_col].fillna('NA')

# Step 2: one {category value -> mean target} dictionary per categorical column.
# NOTE(review): the encodings are fitted on the whole `train` frame, i.e. before
# the train/hold-out split done in a later cell, so hold-out rows contribute to
# their own encoding -- confirm this is acceptable.
dict_target_encoding = {
    cat_col: get_dict_target_encoding(train, cat_col, TARGET_COL)
    for cat_col in CAT_COLS
}

In [None]:
# Replace every categorical value by its target-encoded number.
# This overwrites the columns in place: re-running the cell on already encoded
# values would look the numbers up in the dictionaries instead of the labels.
for feature in CAT_COLS:
    mapping = dict_target_encoding[feature]
    train[feature] = train[feature].map(mapping)
train.head()

## XGBOOST 

In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, train_test_split

from sklearn.metrics import matthews_corrcoef

In [None]:
# Both outputs are full frames (features AND target column).
# train_size=0.25 sends 25% of the rows to X_train (used for the grid search);
# the remaining 75% end up in X_test (used as hold-out set).
X_train, X_test = train_test_split(
    train,
    train_size=0.25,
    random_state=42,
)

In [None]:
# Model inputs: continuous columns first, then the (target-encoded) categorical
# ones. CONT_COLS and CAT_COLS are not defined in this notebook; presumably they
# come from the star import of `variables` -- verify.
FEATURES_COL = CONT_COLS + CAT_COLS

In [None]:
# Hyper-parameter grid explored by the grid search: 3 x 3 x 3 = 27 combinations.
# For a quick smoke run, shrink every list to a single value,
# e.g. max_depth=[5], learning_rate=[0.01], subsample=[0.7].
param_grid = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.1, 0.01, 0.001],
    subsample=[0.5, 0.7, 1],
)

In [None]:
# The hold-out evaluation further down reports the Matthews correlation
# coefficient, so the grid search must rank hyper-parameters with that same
# metric instead of plain accuracy; otherwise the "best" model is selected for
# a criterion different from the one that is actually measured.
from sklearn.metrics import make_scorer

xgb_model = xgb.XGBClassifier(random_state=42)
grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    cv=2,  # 2-fold cross-validation for every parameter combination
    scoring=make_scorer(matthews_corrcoef),
    verbose=3,
)

In [None]:
# Run the search on the training part only; X_train is a full frame, so the
# feature columns and the target column are both sliced out of it.
grid_search.fit(
    X_train[FEATURES_COL],
    X_train[TARGET_COL],
)

In [None]:
# Keep a handle on the winning estimator (exposed by GridSearchCV because
# `refit` is left at its default), then report the selected hyper-parameters
# and their mean cross-validated score.
bst_estimator = grid_search.best_estimator_
print("Best set of hyperparameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

In [None]:
# Score the selected model on the hold-out rows with the Matthews correlation
# coefficient.
preds = bst_estimator.predict(X_test[FEATURES_COL])
score = matthews_corrcoef(y_true=X_test[TARGET_COL], y_pred=preds)
print('MCC:', score)

## Submission

In [None]:
# Load the rows for which a prediction has to be submitted.
test = pd.read_parquet(
    '/home/nico/Projets/kaggle_competitions/playground-series-s4e8/data/test.parquet'
)

In [None]:
# Encode the test categoricals with the dictionaries fitted on the training data.
for feature in CAT_COLS:
    encoding = dict_target_encoding[feature]
    # Values absent from the dictionary map to NaN; they fall back to the
    # plain (unweighted) average of the per-category encodings of that column.
    fallback = np.mean(list(encoding.values()))
    test[feature] = test[feature].fillna('NA').map(encoding).fillna(fallback)

In [None]:
# Fill missing continuous values of the test set with that column's own mean.
# NOTE(review): no visible cell imputes the continuous columns of `train`;
# confirm that treating train and test differently here is intended.
for num_col in CONT_COLS:
    column_mean = test[num_col].mean()  # NaN entries are skipped by default
    test[num_col] = test[num_col].fillna(column_mean)

In [None]:
# Predict the encoded target for every test row, store it, then translate the
# stored codes through `dict_target_label` (not defined in this notebook;
# presumably the inverse label mapping from `variables` -- verify).
predicted_codes = bst_estimator.predict(test[FEATURES_COL])
test['class'] = predicted_codes
test['class'] = test['class'].map(dict_target_label)

In [None]:
# Sanity check: how many test rows were assigned to each predicted label.
test['class'].value_counts()

In [None]:
# Write the submission file: only the `id` and `class` columns, without the
# DataFrame index.
test.to_csv('submission.csv', columns=['id', 'class'], index=False)