In [1]:
import pandas as pd
import numpy as np
import random
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from imblearn.under_sampling import  TomekLinks
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek



In [2]:
n=10000
skip = sorted(random.sample(range(1, 595212), 595212-n))
data = pd.read_csv('data/train.csv', skiprows=skip)
data.head()

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,313,0,1,1,5,1,0,0,1,0,...,3,1,4,7,1,0,0,0,0,0
1,586,0,2,1,3,0,0,0,0,0,...,6,5,2,6,0,0,0,0,0,0
2,846,0,7,1,9,0,0,1,0,0,...,9,3,5,5,0,1,1,0,1,0
3,1021,0,1,1,9,0,0,1,0,0,...,7,1,1,8,0,0,1,0,1,0
4,1046,0,7,1,5,0,6,0,0,1,...,3,0,1,10,0,0,0,0,1,0


In [3]:
x = data.iloc[:, 2:]
x.head()

Unnamed: 0,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,1,1,5,1,0,0,1,0,0,0,...,3,1,4,7,1,0,0,0,0,0
1,2,1,3,0,0,0,0,0,1,0,...,6,5,2,6,0,0,0,0,0,0
2,7,1,9,0,0,1,0,0,0,0,...,9,3,5,5,0,1,1,0,1,0
3,1,1,9,0,0,1,0,0,0,0,...,7,1,1,8,0,0,1,0,1,0
4,7,1,5,0,6,0,0,1,0,0,...,3,0,1,10,0,0,0,0,1,0


In [4]:
y = data['target']

In [5]:
cat_col = [i for i in range(0, len(x.columns)) if 'cat' in x.columns[i]]
cat_col

[1, 3, 4, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]

In [6]:
tl = TomekLinks()
x_tl, y_tl = tl.fit_resample(x,y)

In [7]:
x_train, x_test, y_train, y_test = train_test_split(x_tl, y_tl)
model = CatBoostClassifier(iterations=300, learning_rate=0.1, depth=6, eval_metric='Accuracy', verbose=10)
model.fit(x_train, y_train, cat_col, eval_set=(x_test, y_test))

0:	learn: 0.9638424	test: 0.9615540	best: 0.9615540 (0)	total: 177ms	remaining: 52.8s
10:	learn: 0.9638424	test: 0.9615540	best: 0.9615540 (0)	total: 481ms	remaining: 12.6s
20:	learn: 0.9638424	test: 0.9615540	best: 0.9615540 (0)	total: 742ms	remaining: 9.86s
30:	learn: 0.9638424	test: 0.9615540	best: 0.9615540 (0)	total: 1.02s	remaining: 8.82s
40:	learn: 0.9638424	test: 0.9615540	best: 0.9615540 (0)	total: 1.43s	remaining: 9.06s
50:	learn: 0.9639773	test: 0.9615540	best: 0.9615540 (0)	total: 1.83s	remaining: 8.93s
60:	learn: 0.9639773	test: 0.9615540	best: 0.9615540 (0)	total: 2.22s	remaining: 8.71s
70:	learn: 0.9639773	test: 0.9615540	best: 0.9615540 (0)	total: 2.56s	remaining: 8.27s
80:	learn: 0.9639773	test: 0.9615540	best: 0.9615540 (0)	total: 2.97s	remaining: 8.03s
90:	learn: 0.9639773	test: 0.9615540	best: 0.9615540 (0)	total: 3.33s	remaining: 7.65s
100:	learn: 0.9641123	test: 0.9615540	best: 0.9615540 (0)	total: 3.71s	remaining: 7.32s
110:	learn: 0.9643821	test: 0.9615540	best:

<catboost.core.CatBoostClassifier at 0x2438b8bee30>

In [8]:
y_pred = model.predict(x_test)

In [9]:
cm =  confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[2376,    0],
       [  95,    0]], dtype=int64)

In [10]:
print(classification_report(y_true=y_test, y_pred=y_pred, zero_division=True))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      2376
           1       1.00      0.00      0.00        95

    accuracy                           0.96      2471
   macro avg       0.98      0.50      0.49      2471
weighted avg       0.96      0.96      0.94      2471



In [11]:
sm = SMOTE()
x_sm, y_sm = sm.fit_resample(x,y)

In [12]:
x_train, x_test, y_train, y_test = train_test_split(x_sm, y_sm)
model = CatBoostClassifier(iterations=300, learning_rate=0.1, depth=6, eval_metric='Accuracy', verbose=10)
model.fit(x_train, y_train, cat_col, eval_set=(x_test, y_test))

0:	learn: 0.8007610	test: 0.8003735	best: 0.8003735 (0)	total: 62.4ms	remaining: 18.7s
10:	learn: 0.9296437	test: 0.9279934	best: 0.9279934 (10)	total: 712ms	remaining: 18.7s
20:	learn: 0.9547561	test: 0.9491596	best: 0.9491596 (20)	total: 1.33s	remaining: 17.6s
30:	learn: 0.9624351	test: 0.9580826	best: 0.9580826 (30)	total: 1.9s	remaining: 16.5s
40:	learn: 0.9675545	test: 0.9626479	best: 0.9626479 (40)	total: 2.55s	remaining: 16.1s
50:	learn: 0.9703217	test: 0.9661756	best: 0.9661756 (49)	total: 3.1s	remaining: 15.1s
60:	learn: 0.9725355	test: 0.9711558	best: 0.9711558 (59)	total: 3.77s	remaining: 14.8s
70:	learn: 0.9753718	test: 0.9726084	best: 0.9726084 (70)	total: 4.52s	remaining: 14.6s
80:	learn: 0.9772397	test: 0.9744760	best: 0.9746835 (79)	total: 5.13s	remaining: 13.9s
90:	learn: 0.9777240	test: 0.9759286	best: 0.9759286 (90)	total: 5.74s	remaining: 13.2s
100:	learn: 0.9786233	test: 0.9767587	best: 0.9767587 (100)	total: 6.39s	remaining: 12.6s
110:	learn: 0.9789000	test: 0.977

<catboost.core.CatBoostClassifier at 0x243f8e3b160>

In [13]:
y_pred = model.predict(x_test)

In [14]:
cm =  confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[2391,    5],
       [  91, 2332]], dtype=int64)

In [15]:
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      2396
           1       1.00      0.96      0.98      2423

    accuracy                           0.98      4819
   macro avg       0.98      0.98      0.98      4819
weighted avg       0.98      0.98      0.98      4819



In [16]:
smt = SMOTETomek()
x_smt, y_smt = smt.fit_resample(x,y)

In [17]:
x_train, x_test, y_train, y_test = train_test_split(x_smt, y_smt)
model = CatBoostClassifier(iterations=300, learning_rate=0.1, depth=6, eval_metric='Accuracy', verbose=10)
model.fit(x_train, y_train, cat_col, eval_set=(x_test, y_test))

0:	learn: 0.7841577	test: 0.7879228	best: 0.7879228 (0)	total: 64.9ms	remaining: 19.4s
10:	learn: 0.9322034	test: 0.9367089	best: 0.9367089 (10)	total: 665ms	remaining: 17.5s
20:	learn: 0.9517122	test: 0.9587051	best: 0.9587051 (20)	total: 1.29s	remaining: 17.2s
30:	learn: 0.9625735	test: 0.9655530	best: 0.9659680 (28)	total: 1.97s	remaining: 17.1s
40:	learn: 0.9695607	test: 0.9711558	best: 0.9711558 (40)	total: 2.56s	remaining: 16.1s
50:	learn: 0.9733656	test: 0.9730234	best: 0.9730234 (50)	total: 3.18s	remaining: 15.5s
60:	learn: 0.9759253	test: 0.9763436	best: 0.9763436 (59)	total: 3.77s	remaining: 14.8s
70:	learn: 0.9769630	test: 0.9769662	best: 0.9771737 (67)	total: 4.41s	remaining: 14.2s
80:	learn: 0.9779315	test: 0.9782112	best: 0.9782112 (77)	total: 5.05s	remaining: 13.7s
90:	learn: 0.9787617	test: 0.9780037	best: 0.9788338 (88)	total: 5.67s	remaining: 13s
100:	learn: 0.9793151	test: 0.9788338	best: 0.9788338 (88)	total: 6.27s	remaining: 12.4s
110:	learn: 0.9798686	test: 0.9798

<catboost.core.CatBoostClassifier at 0x2438b8bed40>

In [18]:
y_pred = model.predict(x_test)

In [19]:
cm =  confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[2408,   11],
       [  81, 2319]], dtype=int64)

In [20]:
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98      2419
           1       1.00      0.97      0.98      2400

    accuracy                           0.98      4819
   macro avg       0.98      0.98      0.98      4819
weighted avg       0.98      0.98      0.98      4819

