### `ENVIRONMENT SETUP`

In [1]:
# Core data libraries used throughout the notebook.
import pandas as pd
import numpy as np

# Raise the column display limit so wide frames are shown without truncation.
pd.set_option('display.max_columns', 1000)

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides warnings raised by the
# modelling libraries used later; consider narrowing it to specific categories.
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Setting working dir
import os
path = 'G:\\ML - Datahack\\HDFC Bank\\Files'

os.chdir(path)

%pwd

'G:\\ML - Datahack\\HDFC Bank\\Files'

In [3]:
# Load the prepared train and test tables from the working directory.
# Exact duplicate rows are removed from the training table only; the test
# table is kept as read.
train = pd.read_csv('done_train.csv', low_memory=False).drop_duplicates()
test = pd.read_csv('done_test.csv', low_memory=False)

In [4]:
# Feature-importance table; the preview below shows the columns
# feature / importance_mean / importance_std.
imp = pd.read_csv('important_features.csv', low_memory=False)

In [5]:
# Preview the first rows of the importance table.
imp.head()

Unnamed: 0,feature,importance_mean,importance_std
0,Col176,0.01365,0.011163
1,Col627,0.01231,0.008986
2,Col270,0.011915,0.010566
3,Col229,0.010954,0.003408
4,Col1258,0.010685,0.00901


In [6]:
# Keep the names of all features whose mean importance is strictly positive.
features = imp.loc[imp['importance_mean'] > 0.00, 'feature'].values

# Report how many features survived, followed by the names themselves.
print(f'{len(features)} Features selected...', features, sep='\n')

194 Features selected...
['Col176' 'Col627' 'Col270' 'Col229' 'Col1258' 'Col25' 'Col634' 'Col1002'
 'Col218' 'Col108' 'Col75' 'Col166' 'Col1140' 'Col983' 'Col72' 'Col225'
 'Col988' 'Col1050' 'Col1021' 'Col112' 'Col872' 'Col681' 'Col111' 'Col162'
 'Col168' 'Col70' 'Col64' 'Col222' 'Col938' 'Col915' 'Col1020' 'Col352'
 'Col874' 'Col215' 'Col1229' 'Col1233' 'Col291' 'Col115' 'Col240' 'Col605'
 'Col1235' 'Col621' 'Col982' 'Col1527' 'Col174' 'Col1211' 'Col54' 'Col603'
 'Col300' 'Col127' 'Col1263' 'Col987' 'Col1273' 'Col633' 'Col727' 'Col125'
 'Col434' 'Col693' 'Col1072' 'Col680' 'Col160' 'Col1069' 'Col959' 'Col658'
 'Col876' 'Col725' 'Col986' 'Col672' 'Col1161' 'Col44' 'Col3' 'Col1346'
 'Col1123' 'Col302' 'Col1010' 'Col69' 'Col239' 'Col1139' 'Col715' 'Col23'
 'Col1283' 'Col694' 'Col175' 'Col236' 'Col1185' 'Col1359' 'Col322'
 'Col1167' 'Col1141' 'Col24' 'Col628' 'Col624' 'Col186' 'Col1029'
 'Col1285' 'Col305' 'Col970' 'Col116' 'Col237' 'Col185' 'Col1228' 'Col893'
 'Col224' 'Col994' 'Col325' 

In [7]:
# Pull out the label (Col2) and the submission id (Col1) first, because both
# frames are narrowed to the selected feature columns right after.
# NOTE(review): `train` and `test` are rebound here, so this cell cannot be
# re-run without re-loading the data unless Col2 / Col1 are among `features`.
target, test_id = train['Col2'], test['Col1']
train, test = train[features], test[features]

In [8]:
# Sanity check: (rows, columns) of the training features after column selection.
train.shape

(16400, 194)

In [9]:
# Sanity check: (rows, columns) of the test features after column selection.
test.shape

(20442, 194)

### `CLASS SAMPLING`

In [10]:
# Only Counter and SMOTE are used in the visible cells; make_classification,
# SMOTENC and SVMSMOTE are imported but not referenced there.
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE, SMOTENC, SVMSMOTE

# Features and labels before any resampling.
X, y = train, target

# Class balance of the original labels.
print('Original dataset shape %s' % Counter(y))

Original dataset shape Counter({0: 14764, 1: 1636})


In [11]:
# Oversample the minority class with SMOTE on the full (unsplit) data.
# NOTE(review): the recorded output below shows 5610 minority rows against
# 14764 majority rows (ratio ~0.38), which does not match
# sampling_strategy=.35 -- the stored output may come from an earlier run.
sm = SMOTE(sampling_strategy=.35, random_state=0)
X_res, y_res = sm.fit_resample(X, y)

# Wrap the results in pandas objects whatever type fit_resample returned.
X_res, y_res = pd.DataFrame(X_res), pd.Series(y_res)

# Class balance after resampling.
print('Resampled dataset shape %s' % Counter(y_res))

Resampled dataset shape Counter({0: 14764, 1: 5610})


In [12]:
# `X` is still the pre-resampling frame here, so its column names are reused
# to label the resampled values. `train` now refers to the resampled data.
cols = X.columns.values
train = pd.DataFrame(X_res.values, columns=cols)
train.head()

Unnamed: 0,Col176,Col627,Col270,Col229,Col1258,Col25,Col634,Col1002,Col218,Col108,Col75,Col166,Col1140,Col983,Col72,Col225,Col988,Col1050,Col1021,Col112,Col872,Col681,Col111,Col162,Col168,Col70,Col64,Col222,Col938,Col915,Col1020,Col352,Col874,Col215,Col1229,Col1233,Col291,Col115,Col240,Col605,Col1235,Col621,Col982,Col1527,Col174,Col1211,Col54,Col603,Col300,Col127,Col1263,Col987,Col1273,Col633,Col727,Col125,Col434,Col693,Col1072,Col680,Col160,Col1069,Col959,Col658,Col876,Col725,Col986,Col672,Col1161,Col44,Col3,Col1346,Col1123,Col302,Col1010,Col69,Col239,Col1139,Col715,Col23,Col1283,Col694,Col175,Col236,Col1185,Col1359,Col322,Col1167,Col1141,Col24,Col628,Col624,Col186,Col1029,Col1285,Col305,Col970,Col116,Col237,Col185,Col1228,Col893,Col224,Col994,Col325,Col705,Col870,Col113,Col118,Col32,Col233,Col62,Col1360,Col881,Col329,Col76,Col1215,Col9,Col1575,Col618,Col976,Col196,Col1617,Col293,Col63,Col1118,Col629,Col121,Col13,Col626,Col1387,Col637,Col45,Col980,Col46,Col935,Col198,Col936,Col953,Col666,Col182,Col277,Col267,Col183,Col1000,Col117,Col50,Col1073,Col84,Col120,Col179,Col892,Col257,Col1265,Col221,Col1631,Col989,Col1347,Col78,Col1322,Col238,Col269,Col1231,Col864,Col94,Col910,Col134,Col67,Col244,Col616,Col1127,Col82,Col1024,Col1212,Col180,Col591,Col190,Col241,Col1388,Col913,Col963,Col878,Col208,Col26,Col187,Col1014,Col945,Col1049,Col602,Col210,Col214,Col178,Col952,Col1383
0,143.278744,0.460104,1.400044,4.169,10.28,0.137398,0.0,10.5,1.4019,9.52,10.07,2.636174,9.68,9.31,9.7,0.2747,0.0,10.27,1.0,10.91,49.666667,-0.382431,14.76,2.565795,624714.2,3.0,10.7,4.0689,1.0,0.0,2.0,-1.0,4.015786,0.4673,1.0,8.4,0.106216,11.68,0.0,100.0,1.0,0.0,2.0,0.0,6.35,10.52,11.01,0.0,0.0,17.85,10.32,10.94,9.65,0.0,-0.357693,11.75,0.009733,0.212767,10.61,-0.255842,14.0,10.74,9.87,0.0,1.160181,-0.287263,9.82,5.485022,10.15,10.86,10867.11,12.97,10.22,0.0,10.32,9.19,0.0,0.0,-0.228054,0.993441,10.18,0.313858,0.983505,9.883509,2.0,0.0,0.0,9.61,0.0,1.29705,0.0,7.7392,0.333333,10.26,8.62,0.0,9.04,7.99,0.03253,-1.0,10.57,11.44,83.4713,1.0,9.883509,0.146766,0.755869,15.44,8.49,1.0,83.7461,10.54,0.0,1.099273,0.0,10.76,9.16,12855.97,10.5,0.0,9.82,4166.666667,10.52,0.0,10.65,9.67,5.485022,10.09,9.293496,92.2608,9.32,5.485022,3.0,1.0,2.0,12.66,85475.99833,9.32,1.0,0.0,35.666667,0.94223,1.201706,0.0,2.0,11.99,9.73,0.0,9.75,9.97,1.106012,11.04,0.545651,8.97,92.0561,10.62,1.0,0.0,10.46,8.29,0.0,0.567681,9.21,116.0,0.0,11.89,11.79,13.51,9.883509,0.0,9.24,10.1,3.0,1.0,33.961214,59606.33333,0.0,0.49364,10.44,1.0,2.0,39.0,0.9346,0.115987,0.0,8.85,8.88,9.59,0.0,0.0,91.5888,42.869348,1.0,10.06
1,45.899741,0.0,0.697194,33.797,8.55,0.753892,2.05843,7.61,51.2195,9.52,6.71,9.210323,9.61,8.3,10.09,54.3661,3.0,8.93,0.0,10.91,8.105263,0.45462,6.95,7.575385,5282476.0,2.0,8.59,0.0,0.0,0.0,0.0,-1.0,2.984058,39.0244,0.0,8.4,0.252478,8.34,0.000106,33.3333,0.0,0.0,0.0,0.0,8.01,10.52,10.12,0.0,26.6929,8.77,9.02,9.19,9.8,0.0,0.456898,8.24,2.751084,-0.184805,8.96,0.444989,2.0,9.83,8.76,0.0,1.057016,0.216304,9.71,0.720789,11.01,10.45,258431.2,12.97,10.16,0.0,10.32,7.43,0.0,0.0,0.444591,0.893203,8.96,0.018535,0.893896,0.0,1.0,0.0,0.000106,9.03,0.0,0.96662,0.720789,0.0,0.0,9.07,8.62,0.0024,9.04,7.99,2.355586,-1.0,10.57,8.1,0.0,1.0,0.512768,-0.288401,2.358974,7.97,8.49,4.0,66.2006,8.14,0.0,1.288429,0.0,10.76,10.75,172653.8,10.5,53.4251,8.23,0.0,10.52,17.0732,8.17,9.67,0.0,8.01,12.462385,65.5043,9.32,0.720789,0.0,1.0,2.0,9.16,0.0,9.32,0.0,0.0,8.2,0.648074,1.00523,0.0,3.0,7.85,10.13,0.0,9.75,9.9,0.892688,6.97,1.538968,8.97,46.3415,10.62,1.0,0.0,10.46,8.29,0.0,1.773933,9.21,28.0,0.0,6.52,10.2,8.14,0.512768,12.0792,9.24,10.1,3.0,3.0,37.955201,0.0,0.0,1.464364,9.48,0.0,0.0,47.0,0.0,0.808295,0.6,9.44,12.06,8.94,37.8378,0.0,0.0,53.606218,0.0,10.06
2,32.889272,0.0,0.697194,0.6662,8.55,0.808115,0.007294,7.61,10.0529,9.42,6.71,3.219582,9.61,8.3,10.24,34.5854,4.0,8.93,1.0,9.98,18.2,0.529382,6.95,3.334681,222613600.0,2.0,10.7,0.0,0.0,1.0,3.0,-1.0,2.271018,56.0847,1.0,8.4,0.185678,8.34,8.9e-05,33.3333,0.0,0.002,0.0,0.0,8.01,8.76,10.12,0.0,38.1546,8.77,9.02,9.19,9.8,6.8e-05,0.474372,8.24,1.313229,0.319497,10.61,0.595743,0.0,10.74,8.76,0.0,0.258884,0.458131,9.82,0.457378,10.15,10.86,14593550.0,8.06,10.16,0.0,9.04,9.19,0.0,0.0,0.628157,0.683582,8.96,0.018535,0.861632,0.00204,0.0,0.0,8.9e-05,9.03,0.0,0.735339,0.454443,0.0845,0.0,9.07,11.09,0.0029,9.7,8.04,1.052945,-1.0,9.51,11.44,0.067,2.0,0.451904,-0.04198,0.326923,7.97,8.13,12.0,49.4289,10.54,3.0,1.062294,0.0,10.76,10.75,5320778.0,10.5,33.4001,10.42,0.0,10.52,64.2857,8.17,10.49,0.002936,8.01,16.49609,46.9305,9.32,0.457378,1.0,2.0,2.0,8.72,25704.0,10.47,1.0,0.0,75.6,0.323529,0.668917,0.0,4.0,7.85,10.13,1.0,9.75,9.9,0.869714,11.04,0.770023,10.55,73.0159,10.62,0.0,1.0,10.46,8.29,0.158723,0.713389,9.21,22.0,1.0,13.03,11.79,8.14,0.451904,13.4436,9.24,10.02,2.0,2.0,21.80143,42840.0,0.4,0.020281,9.48,1.0,0.0,24.0,0.0,0.77691,10.2,9.84,8.88,8.94,37.8378,0.0,3.4392,41.019374,2.0,10.06
3,68.172589,0.0,0.697194,5.4153,8.55,-0.125829,0.657086,7.61,4.7009,9.52,10.07,2.778929,9.61,8.3,10.09,85.3576,4.0,8.93,0.0,10.91,8.105263,0.567619,12.98,2.489714,28215600.0,2.0,10.7,0.0,0.0,2.0,0.0,-1.0,3.06504,25.641,0.0,11.55,0.020233,9.69,0.001089,33.3333,0.0,0.0125,0.0,1.0,8.01,8.76,10.12,0.0,89.1879,8.77,9.02,9.19,9.8,0.001353,0.561449,8.24,6.92348,-0.297574,8.96,0.607406,0.0,9.83,8.76,0.0,1.057016,0.216304,9.6,0.92132,11.01,10.86,672704.3,9.09,10.16,0.0,10.32,9.19,0.0,1.0,0.615315,0.623499,8.96,0.018535,0.976665,0.03127,0.0,0.0,0.001089,9.03,1.0,0.534531,0.857938,0.0,0.0,9.07,11.09,0.0162,9.7,8.04,5.750353,-1.0,10.57,11.44,0.4642,2.0,0.579738,-0.334335,0.162824,7.97,8.13,24.0,93.9632,8.14,0.0,3.129732,0.000834,10.76,10.75,357713.4,10.5,84.6879,8.23,0.0,9.18,33.7607,8.17,9.78,0.063382,10.09,13.419061,93.1757,9.32,0.92132,1.0,0.0,2.0,8.72,25582.304,9.32,0.0,0.0,140.4,0.648074,1.00523,0.0,4.0,10.33,10.13,0.0,9.75,12.74,0.985943,11.04,0.670615,10.55,38.3191,10.62,0.0,2.0,10.46,8.29,0.0,0.728669,10.13,28.0,0.0,13.03,11.79,8.14,0.579738,7.9038,9.24,10.1,3.0,2.0,65.592425,42637.17333,0.0,0.364815,9.48,2.0,0.0,8.0,0.0,0.38883,11.6,9.84,8.44,10.56,37.8378,0.0,4.416,82.08045,0.0,10.06
4,39.549559,0.689233,0.697194,36.9616,8.55,-0.113101,0.0,7.61,56.5517,8.82,10.07,2.579341,9.68,8.3,10.24,56.7715,4.0,8.93,0.0,8.57,5.058824,0.476344,12.33,3.449007,472699100.0,1.0,9.51,0.0,0.0,0.0,3.0,-1.0,3.598077,15.7088,0.0,11.55,0.337865,11.68,0.001948,33.3333,0.0,0.0042,0.0,1.0,8.01,8.76,11.01,0.0,52.0165,8.77,9.02,9.19,9.8,0.00098,0.195424,8.24,0.226561,-0.288723,10.61,0.615176,8.0,9.83,8.76,0.0,0.578306,0.595712,9.71,0.170649,11.01,12.05,-1218742.0,9.09,10.16,0.0007,10.32,10.07,0.0,0.0,0.606261,0.705101,8.96,0.018535,0.988593,0.182265,0.0,0.0,0.002168,9.03,1.0,1.56343,0.0,0.1767,0.0,9.07,8.62,0.7336,9.99,8.04,0.276928,0.0,9.51,6.99,0.0,4.0,0.239788,-0.311127,0.10312,14.63,8.13,68.0,56.7715,10.08,1.0,4.82651,0.0,11.05,10.75,5938358.0,10.5,46.7919,9.82,0.0,9.18,13.41,11.89,9.67,0.124813,8.01,9.284795,46.7919,18.72,0.162723,2.0,1.0,1.0,8.72,0.0,9.32,1.0,0.0,261.0,0.666729,0.94338,29.0,4.0,11.99,10.13,2.0,9.75,9.9,0.98718,6.97,0.64369,10.55,15.7088,10.62,0.0,2.0,10.5,28.49,0.0,0.712341,9.21,12.0,2.0,6.52,11.79,19.1,0.230411,0.0,9.24,10.02,3.0,1.0,16.384499,0.0,0.0,0.0,9.48,0.0,0.0,5.0,0.0,0.617486,0.0,12.13,8.44,9.59,37.8378,0.0,0.0,31.008157,2.0,10.06


### `TRAIN TEST SPLIT`

In [13]:
from sklearn.model_selection import train_test_split

# Start from the resampled features and labels.
# NOTE(review): both holdouts below are carved out of the SMOTE-resampled
# data, so they contain synthetic rows built from the same pool of minority
# samples as the training rows. Scores on them are therefore likely to be
# optimistic. Consider splitting the real data first and oversampling only
# the training portion.
X, y = train, y_res

# Step 1: hold out 30% as (x_val, y_val); X / y are rebound to the other 70%.
X, x_val, y, y_val = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=0,
)

# Step 2: split that 70% again, 80/20, into the fitting set and (x_test, y_test).
x_train, x_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=0,
)

In [14]:
# Class shares (in percent, two decimals) within the fitting labels.
(y_train.value_counts() / len(y_train) * 100).round(2)

0    72.47
1    27.53
dtype: float64

In [15]:
# Class shares (in percent, two decimals) within the test labels.
(y_test.value_counts() / len(y_test) * 100).round(2)

0    72.45
1    27.55
dtype: float64

In [16]:
# Class shares (in percent, two decimals) within `y`, the 70% portion that was
# split into the fitting and test sets.
(y.value_counts() / len(y) * 100).round(2)

0    72.46
1    27.54
dtype: float64

In [17]:
# Class shares (in percent, two decimals) within the validation labels.
(y_val.value_counts() / len(y_val) * 100).round(2)

0    72.47
1    27.53
dtype: float64

### `CLASSIFICATION MODELS`

In [18]:
from catboost import CatBoostClassifier

# Gradient-boosted classifier trained on log-loss and monitored with F1.
# The same patience of 20 is given twice: as od_wait here and as
# early_stopping_rounds in fit() below.
cat = CatBoostClassifier(
    iterations=150,
    learning_rate=.1,
    max_depth=7,
    l2_leaf_reg=10,
    loss_function='Logloss',
    eval_metric='F1',
    use_best_model=True,
    od_wait=20,
    random_state=0,
    logging_level='Silent',
)

# NOTE(review): (x_val, y_val) drives early stopping / best-model selection
# here, so the later F1 computed on x_val is not an independent estimate.
cat.fit(x_train, y_train, eval_set=(x_val, y_val), early_stopping_rounds=20)

<catboost.core.CatBoostClassifier at 0x20507804358>

In [19]:
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, confusion_matrix, f1_score

# A 20-fold shuffled splitter is built here, but neither `k` nor `cv` is used
# by the scoring below or by any later visible cell.
k = 20
cv = KFold(n_splits=k, shuffle=True, random_state=0)

# Hard class labels for the test split, scored with F1.
predictions = cat.predict(x_test)
f1_score(y_test, predictions)

0.8304093567251463

### `PERFORMANCE METRICS`

#### `CLASSIFICATION REPORT`

In [20]:
# Per-class precision / recall / F1 for the hard-label test predictions
# produced by cat.predict(x_test) in the previous cell.
report = classification_report(y_true=y_test, y_pred=predictions)
print(report)

              precision    recall  f1-score   support

           0       0.90      0.99      0.95      2067
           1       0.98      0.72      0.83       786

   micro avg       0.92      0.92      0.92      2853
   macro avg       0.94      0.86      0.89      2853
weighted avg       0.92      0.92      0.91      2853



### `PERFORMANCE ON TEST DATA`

In [21]:
# Decision threshold applied to the positive-class probability.
# It is reused by the validation and submission cells further down.
cut = .5

# Probability of class 1 for every row of the test split.
predictions = cat.predict_proba(x_test)[:, 1]

# Label a row 1 when its probability is strictly above the threshold, else 0.
# This is vectorised instead of looping with `x` / `y` as loop variables,
# because that loop overwrote the notebook-level `y` label Series created by
# the train/test split. The result is still a plain list of Python ints.
y_pred = (predictions > cut).astype(int).tolist()

f1_score(y_test, y_pred)

0.8304093567251463

### `PERFORMANCE ON VALIDATION DATA`

In [22]:
# Probability of class 1 for every row of the validation split.
predictions = cat.predict_proba(x_val)[:, 1]

# Label a row 1 when its probability is strictly above `cut` (set in the
# test-performance cell), else 0. This is vectorised instead of looping with
# `x` / `y` as loop variables, because that loop overwrote the notebook-level
# `y` label Series. The result is still a plain list of Python ints.
y_pred = (predictions > cut).astype(int).tolist()

f1_score(y_val, y_pred)

0.8381209868198717

### `MAKING PREDICTIONS`

In [23]:
# Probability of class 1 for every row of the competition test set.
predictions = cat.predict_proba(test)[:, 1]

# Label a row 1 when its probability is strictly above `cut` (set in the
# test-performance cell), else 0. This is vectorised instead of looping with
# `x` / `y` as loop variables, because that loop overwrote the notebook-level
# `y` label Series. The result is still a plain list of Python ints, which the
# submission frame consumes unchanged.
y_pred = (predictions > cut).astype(int).tolist()

### `SAVING FILE`

In [24]:
# Submission table: the test ids (Col1) next to the predicted labels (Col2).
file = pd.DataFrame({'Col1': test_id, 'Col2': y_pred})

In [25]:
# Share of each predicted class in the submission.
file['Col2'].value_counts() / len(file)

0    0.985422
1    0.014578
Name: Col2, dtype: float64

In [26]:
# Write the submission (without the index column) into the current working
# directory, which was set by os.chdir(path) at the top of the notebook.
file.to_csv('submission.csv', index=False)