In [64]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

In [65]:
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

In [66]:
df_train.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,target
0,0,A,I,A,B,B,BI,A,S,Q,...,0.759439,0.795549,0.681917,0.621672,0.592184,0.791921,0.815254,0.965006,0.665915,0
1,1,A,I,A,A,E,BI,K,W,AD,...,0.386385,0.541366,0.388982,0.357778,0.600044,0.408701,0.399353,0.927406,0.493729,0
2,2,A,K,A,A,E,BI,A,E,BM,...,0.343255,0.616352,0.793687,0.552877,0.352113,0.388835,0.412303,0.292696,0.549452,0
3,3,A,K,A,C,E,BI,A,Y,AD,...,0.831147,0.807807,0.800032,0.619147,0.221789,0.897617,0.633669,0.760318,0.934242,0
4,4,A,I,G,B,E,BI,C,G,Q,...,0.338818,0.277308,0.610578,0.128291,0.578764,0.279167,0.351103,0.357084,0.32896,1


In [67]:
df_train = df_train.drop('id', axis=1)

In [68]:
def convert_to_number(df):
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = df[col].astype('category').cat.codes
            #normalize the data
            df[col] = df[col]/df[col].max()
    return df

In [69]:
df_train = convert_to_number(df_train)

In [70]:
X_train, X_test, y_train, y_test = train_test_split(df_train.drop('target', axis=1).values, df_train['target'].values, test_size=0.2, random_state=42)    

In [71]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [72]:
xgb = XGBClassifier()
xgb.fit(X_train, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [73]:
#check the accuracy of the model
y_pred = xgb.predict(X_test)
accuracy_score(y_test, y_pred)

0.8481666666666666

In [84]:
#find the best parameters using optuna
from optuna import study, distributions

In [85]:
def objective(trial):
    xgb = XGBClassifier(
        n_estimators=trial.suggest_int('n_estimators', 10, 1000),
        max_depth=trial.suggest_int('max_depth', 1, 10),
        learning_rate=trial.suggest_uniform('learning_rate', 0.01, 0.1),
    )
    xgb.fit(X_train, y_train)
    y_pred = xgb.predict(X_test)
    return 1 - accuracy_score(y_test, y_pred)


In [86]:
study = study.create_study()
study.optimize(objective, n_trials=100)

[32m[I 2022-06-21 15:09:37,647][0m A new study created in memory with name: no-name-5311c163-a642-426e-988f-b6c4e17c081a[0m
[32m[I 2022-06-21 15:09:45,202][0m Trial 0 finished with value: 0.15393333333333337 and parameters: {'n_estimators': 120, 'max_depth': 7, 'learning_rate': 0.034936889330977765}. Best is trial 0 with value: 0.15393333333333337.[0m
[32m[I 2022-06-21 15:10:23,404][0m Trial 1 finished with value: 0.15078333333333338 and parameters: {'n_estimators': 813, 'max_depth': 6, 'learning_rate': 0.06093755472717522}. Best is trial 1 with value: 0.15078333333333338.[0m
[32m[I 2022-06-21 15:10:59,848][0m Trial 2 finished with value: 0.1508166666666667 and parameters: {'n_estimators': 614, 'max_depth': 7, 'learning_rate': 0.02574701917784459}. Best is trial 1 with value: 0.15078333333333338.[0m
[32m[I 2022-06-21 15:11:49,469][0m Trial 3 finished with value: 0.15085000000000004 and parameters: {'n_estimators': 995, 'max_depth': 6, 'learning_rate': 0.011965412606623621

In [5]:
data = df_train.sample(frac=0.9, random_state=786)
data_unseen = df_train.drop(data.index)

data.reset_index(drop=True, inplace=True)
data_unseen.reset_index(drop=True, inplace=True)

print('Data for Modeling: ' + str(data.shape))
print('Unseen Data For Predictions: ' + str(data_unseen.shape))

Data for Modeling: (270000, 32)
Unseen Data For Predictions: (30000, 32)


In [7]:
from pycaret.classification import *
exp_mclf101 = setup(data = data, target = 'target', session_id=123, use_gpu = True) 

Unnamed: 0,Description,Value
0,session_id,123
1,Target,target
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(270000, 32)"
5,Missing Values,False
6,Numeric Features,12
7,Categorical Features,19
8,Ordinal Features,False
9,High Cardinality Features,False


In [8]:
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.848,0.8884,0.6172,0.7617,0.6819,0.5834,0.589,1.075
catboost,CatBoost Classifier,0.8478,0.8882,0.61,0.7659,0.6791,0.581,0.5875,5.368
xgboost,Extreme Gradient Boosting,0.8474,0.8883,0.6176,0.7595,0.6812,0.5823,0.5877,3.574
rf,Random Forest Classifier,0.8467,0.8852,0.5864,0.7782,0.6688,0.5716,0.5813,19.314
et,Extra Trees Classifier,0.8457,0.8818,0.5921,0.7704,0.6696,0.5712,0.5796,30.574
gbc,Gradient Boosting Classifier,0.8437,0.8807,0.5967,0.7596,0.6683,0.568,0.5751,132.059
ridge,Ridge Classifier,0.8436,0.0,0.594,0.7612,0.6673,0.5671,0.5746,0.804
lda,Linear Discriminant Analysis,0.843,0.8789,0.6152,0.7457,0.6742,0.5721,0.5767,12.285
ada,Ada Boost Classifier,0.841,0.8762,0.5924,0.7525,0.6629,0.5608,0.5676,26.962
dt,Decision Tree Classifier,0.7759,0.7139,0.5825,0.5746,0.5785,0.426,0.426,12.719


In [10]:
tuned_best = tune_model(best)

IntProgress(value=0, description='Processing: ', max=7)

Unnamed: 0,Fold,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC


Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
