In [0]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [0]:
data = pd.read_csv('https://raw.githubusercontent.com/pasha00786/AmEx/master/cate_data.csv')
data.drop(['application_key', 'Unnamed: 0'], axis = 1, inplace = True)

In [0]:
X = data.drop('default_ind', axis = 1)
y = data['default_ind']

In [5]:
enc = OneHotEncoder()
enc_X = enc.fit_transform(X)
enc_X.shape

(80000, 299)

In [0]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.neighbors import KNeighborsClassifier

In [0]:
X_train, X_test, y_train, y_test = train_test_split(enc_X, y, test_size=0.25, random_state=42)

In [0]:
#Logistic Regression

In [0]:
from sklearn.linear_model import LogisticRegression
glmMod = LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=1.0, fit_intercept=True,
                            intercept_scaling=1, class_weight=None, 
                            random_state=None, solver='liblinear', max_iter=100,
                            multi_class='ovr', verbose=2)

In [9]:
glmMod.fit(X_train, y_train)
glmMod.score(X_test, y_test)

[LibLinear]

0.80685

In [10]:
test_labels=glmMod.predict_proba(X_test)[:,1]
roc_auc_score(y_test,test_labels , average='macro', sample_weight=None)

0.8364420553410093

In [0]:
#GradientBoosting

In [0]:
gbMod = GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=200, subsample=1.0,
                                   min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
                                   max_depth=3,
                                   init=None, random_state=None, max_features=None, verbose=0)

In [13]:
gbMod.fit(X_train, y_train)
gbMod.score(X_test, y_test)

0.8065

In [14]:
test_labels=gbMod.predict_proba(X_test)[:,1]
roc_auc_score(y_test,test_labels , average='macro', sample_weight=None)

0.8373464485257626

In [0]:
#AdaBoost

In [0]:
adaMod = AdaBoostClassifier(base_estimator=None, n_estimators=200, learning_rate=1.0)

In [17]:
adaMod.fit(X_train, y_train)
adaMod.score(X_test, y_test)

0.80505

In [18]:
test_labels=adaMod.predict_proba(X_test)[:,1]
roc_auc_score(y_test,test_labels , average='macro', sample_weight=None)

0.8355581852384645

In [0]:
#RandomForest

In [0]:
rfMod = RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2,
                               min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto',
                               max_leaf_nodes=None, bootstrap=True, oob_score=False, n_jobs=1, 
                               random_state=None, verbose=0)

In [21]:
rfMod.fit(X_train, y_train)
rfMod.score(X_test, y_test)

0.78465

In [22]:
test_labels=rfMod.predict_proba(X_test)[:,1]
roc_auc_score(y_test,test_labels , average='macro', sample_weight=None)

0.7878306441408418

In [0]:
#HyperTuning

In [0]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

In [0]:
gbHyperParams = {'loss' : ['deviance', 'exponential'],
                 'n_estimators': randint(10, 500),
                 'max_depth': randint(1,10)}

In [13]:
gridSearchGB = RandomizedSearchCV(estimator=gbMod, param_distributions=gbHyperParams, n_iter=5,
                                   scoring='roc_auc', fit_params=None, cv=3, verbose=2).fit(X_train, y_train)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] loss=deviance, max_depth=7, n_estimators=113 ....................
[CV] ..... loss=deviance, max_depth=7, n_estimators=113, total= 1.3min
[CV] loss=deviance, max_depth=7, n_estimators=113 ....................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.3min remaining:    0.0s


[CV] ..... loss=deviance, max_depth=7, n_estimators=113, total= 1.3min
[CV] loss=deviance, max_depth=7, n_estimators=113 ....................
[CV] ..... loss=deviance, max_depth=7, n_estimators=113, total= 1.3min
[CV] loss=exponential, max_depth=1, n_estimators=405 .................
[CV] .. loss=exponential, max_depth=1, n_estimators=405, total=  19.1s
[CV] loss=exponential, max_depth=1, n_estimators=405 .................
[CV] .. loss=exponential, max_depth=1, n_estimators=405, total=  19.1s
[CV] loss=exponential, max_depth=1, n_estimators=405 .................
[CV] .. loss=exponential, max_depth=1, n_estimators=405, total=  19.0s
[CV] loss=exponential, max_depth=6, n_estimators=180 .................
[CV] .. loss=exponential, max_depth=6, n_estimators=180, total= 1.4min
[CV] loss=exponential, max_depth=6, n_estimators=180 .................
[CV] .. loss=exponential, max_depth=6, n_estimators=180, total= 1.4min
[CV] loss=exponential, max_depth=6, n_estimators=180 .................
[CV] .

[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed: 19.0min finished


In [14]:
gridSearchGB.best_params_, gridSearchGB.best_score_

({'loss': 'exponential', 'max_depth': 1, 'n_estimators': 405},
 0.8274550536335845)

In [0]:
bestGbModFitted = gridSearchGB.best_estimator_.fit(X_train, y_train)

In [16]:
test_labels=bestGbModFitted.predict_proba(X_test)[:,1]
roc_auc_score(y_test,test_labels , average='macro', sample_weight=None)

0.834110609128156

In [0]:
from sklearn.ensemble import VotingClassifier
votingMod = VotingClassifier(estimators=[('gb', bestGbModFitted), 
                                         ('ada', adaMod)], voting='soft',weights=[2,1])
votingMod = votingMod.fit(X_train, y_train)

In [21]:
votingMod.score(X_train, y_train)

  if diff:


0.8003166666666667

In [22]:
test_labels=votingMod.predict_proba(X_test)[:,1]
roc_auc_score(y_test,test_labels , average='macro', sample_weight=None)

0.8341365020490783

In [23]:
test_labels

array([0.16797415, 0.41578175, 0.18660271, ..., 0.2025538 , 0.19329769,
       0.34536585])