In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set_style('white')

%matplotlib inline

In [2]:
# read data
data = pd.read_csv(
    'data/credit_train_label.csv'
)

In [3]:
data.drop('X', axis=1, inplace=True)

In [4]:
X = data.iloc[:, 1:]
y = data.iloc[:, 0]

In [5]:
# Windsor
for feature in X.columns:
    X.loc[X[feature] > X[feature].quantile(0.99), feature] = X[feature].quantile(0.999)

In [6]:
# separate test and train data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=433)

In [7]:
monthly_income_median = X_train['MonthlyIncome'].median()
number_of_dependents = X_train['NumberOfDependents'].median()

In [8]:
X_train['MonthlyIncome'].fillna(monthly_income_median, inplace=True)
X_train['NumberOfDependents'].fillna(number_of_dependents, inplace=True)

X_test['MonthlyIncome'].fillna(monthly_income_median, inplace=True)
X_test['NumberOfDependents'].fillna(number_of_dependents, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [9]:
# scaling
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train) # fitted on train data only

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
from imblearn.over_sampling import SMOTE, RandomOverSampler

sm = SMOTE(random_state=433)
# ros = RandomOverSampler(random_state=433)
X_res, y_res = sm.fit_sample(X_train, y_train)
# X_res, y_res = ros.fit_sample(X_train, y_train)

In [11]:
from sklearn.linear_model import LogisticRegression

logit = LogisticRegression(C=1e5)
logit.fit(X_train, y_train)
y_hat = logit.predict(X_test)
y_probs = logit.predict_proba(X_test)

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

print(confusion_matrix(y_test, y_hat))
print()
print(classification_report(y_test, y_hat))
print()
print(roc_auc_score(y_test, y_probs[:,1]))

[[11672    13]
 [  780    35]]

             precision    recall  f1-score   support

          0       0.94      1.00      0.97     11685
          1       0.73      0.04      0.08       815

avg / total       0.92      0.94      0.91     12500


0.6885033772520482


In [24]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100)

rf.fit(X_train, y_train)
y_hat = rf.predict(X_test)
y_probs = rf.predict_proba(X_test)

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

print(confusion_matrix(y_test, y_hat))
print()
print(classification_report(y_test, y_hat))
print()
print(roc_auc_score(y_test, y_probs[:,1]))

[[11540   145]
 [  651   164]]

             precision    recall  f1-score   support

          0       0.95      0.99      0.97     11685
          1       0.53      0.20      0.29       815

avg / total       0.92      0.94      0.92     12500


0.8365989116139144


In [27]:
sorted(list(zip(rf.feature_importances_, X.columns)))

[(0.03656657308093342, 'NumberRealEstateLoansOrLines'),
 (0.04568938927934402, 'NumberOfDependents'),
 (0.05016181773118108, 'NumberOfTime60.89DaysPastDueNotWorse'),
 (0.051637683748290246, 'NumberOfTime30.59DaysPastDueNotWorse'),
 (0.09121272139235792, 'NumberOfTimes90DaysLate'),
 (0.09301331696417621, 'NumberOfOpenCreditLinesAndLoans'),
 (0.13160799616318014, 'age'),
 (0.1463735474228535, 'MonthlyIncome'),
 (0.17030302624641966, 'DebtRatio'),
 (0.18343392797126376, 'RevolvingUtilizationOfUnsecuredLines')]

In [16]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
import pprint

pprint.pprint(random_grid)

{'bootstrap': [True],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [100, 155, 211, 266, 322, 377, 433, 488, 544, 600]}


In [17]:
%%time
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 10, cv = 3, verbose=2, random_state=433, n_jobs = -1, scoring='roc_auc')

# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] n_estimators=155, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=60, bootstrap=True 
[CV] n_estimators=155, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=60, bootstrap=True 
[CV] n_estimators=155, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=60, bootstrap=True 
[CV] n_estimators=211, min_samples_split=10, min_samples_leaf=2, max_features=auto, max_depth=100, bootstrap=True 
[CV]  n_estimators=155, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=60, bootstrap=True, total=  25.4s
[CV] n_estimators=211, min_samples_split=10, min_samples_leaf=2, max_features=auto, max_depth=100, bootstrap=True 
[CV]  n_estimators=155, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=60, bootstrap=True, total=  25.6s
[CV] n_estimators=211, min_samples_split=10, min_samples_leaf=2, max_features=auto, max_depth=100, bootstrap=

[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  7.1min finished


CPU times: user 1min 2s, sys: 240 ms, total: 1min 2s
Wall time: 8min 13s


In [19]:
rf_random.best_params_

{'n_estimators': 544,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 50,
 'bootstrap': True}

In [18]:
y_hat = rf_random.predict(X_test)
y_probs = rf_random.predict_proba(X_test)

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

print(confusion_matrix(y_test, y_hat))
print()
print(classification_report(y_test, y_hat))
print()
print(roc_auc_score(y_test, y_probs[:,1]))

[[11577   108]
 [  672   143]]

             precision    recall  f1-score   support

          0       0.95      0.99      0.97     11685
          1       0.57      0.18      0.27       815

avg / total       0.92      0.94      0.92     12500


0.8551123431802611


In [21]:
from sklearn.ensemble import GradientBoostingClassifier

params = {'n_estimators': 500, 'max_depth': 2, 'min_samples_split': 2,
          'learning_rate': 0.01, 'random_state': 42}

clf = GradientBoostingClassifier(**params)

clf.fit(X_train, y_train)
y_hat = clf.predict(X_test)
y_probs = clf.predict_proba(X_test)

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

print(confusion_matrix(y_test, y_hat))
print()
print(classification_report(y_test, y_hat))
print()
print(roc_auc_score(y_test, y_probs[:,1]))

[[11585   100]
 [  665   150]]

             precision    recall  f1-score   support

          0       0.95      0.99      0.97     11685
          1       0.60      0.18      0.28       815

avg / total       0.92      0.94      0.92     12500


0.8570179376317495


In [28]:
tuned_parameters = [{'n_estimators': [10, 100, 300, 600],
                     'max_depth' : [3, 10],
                     'min_samples_split': [2, 10],
                     'learning_rate': [0.001, 0.1], 
                     'subsample': [0.5, 1]}]

In [30]:
from sklearn.model_selection import GridSearchCV

clf = GridSearchCV(GradientBoostingClassifier(), tuned_parameters, cv=5, scoring='roc_auc')
clf.fit(X_train, y_train)

KeyboardInterrupt: 

In [31]:
tuned_parameters = {'n_estimators': [10, 100, 300, 600],
                     'max_depth' : [3, 10, 15],
                     'min_samples_split': [2, 6, 10],
                     'learning_rate': [0.001, 0.01, 0.1, 1], 
                     'subsample': [0.5, 1]}

In [33]:
%%time
# Use the random grid to search for best hyperparameters
# First create the base model to tune
clf = GradientBoostingClassifier()

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
clf_random = RandomizedSearchCV(estimator = clf, param_distributions = tuned_parameters, n_iter = 20, cv = 3, verbose=2, random_state=433, n_jobs = -1, scoring='roc_auc')

# Fit the random search model
clf_random.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] subsample=1, n_estimators=100, min_samples_split=2, max_depth=15, learning_rate=0.01 
[CV] subsample=1, n_estimators=100, min_samples_split=2, max_depth=15, learning_rate=0.01 
[CV] subsample=1, n_estimators=100, min_samples_split=2, max_depth=15, learning_rate=0.01 
[CV] subsample=1, n_estimators=300, min_samples_split=10, max_depth=3, learning_rate=1 
[CV]  subsample=1, n_estimators=300, min_samples_split=10, max_depth=3, learning_rate=1, total=  38.5s
[CV] subsample=1, n_estimators=300, min_samples_split=10, max_depth=3, learning_rate=1 
[CV]  subsample=1, n_estimators=300, min_samples_split=10, max_depth=3, learning_rate=1, total=  29.5s
[CV] subsample=1, n_estimators=300, min_samples_split=10, max_depth=3, learning_rate=1 
[CV]  subsample=1, n_estimators=300, min_samples_split=10, max_depth=3, learning_rate=1, total=  30.6s
[CV] subsample=1, n_estimators=10, min_samples_split=10, max_depth=3, learning_rate=1 
[CV]  

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 42.4min


[CV]  subsample=0.5, n_estimators=600, min_samples_split=2, max_depth=15, learning_rate=0.1, total= 9.8min
[CV] subsample=1, n_estimators=300, min_samples_split=6, max_depth=3, learning_rate=1 
[CV]  subsample=0.5, n_estimators=300, min_samples_split=6, max_depth=15, learning_rate=0.1, total= 6.6min
[CV] subsample=1, n_estimators=300, min_samples_split=6, max_depth=3, learning_rate=1 
[CV]  subsample=1, n_estimators=300, min_samples_split=6, max_depth=3, learning_rate=1, total=  37.3s
[CV] subsample=1, n_estimators=300, min_samples_split=2, max_depth=3, learning_rate=0.1 
[CV]  subsample=1, n_estimators=300, min_samples_split=6, max_depth=3, learning_rate=1, total=  37.1s
[CV] subsample=1, n_estimators=300, min_samples_split=2, max_depth=3, learning_rate=0.1 
[CV]  subsample=1, n_estimators=300, min_samples_split=6, max_depth=3, learning_rate=1, total=  37.0s
[CV] subsample=1, n_estimators=300, min_samples_split=2, max_depth=3, learning_rate=0.1 
[CV]  subsample=1, n_estimators=300, mi

[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 62.6min finished


CPU times: user 22.7 s, sys: 707 ms, total: 23.4 s
Wall time: 1h 3min 16s


In [34]:
clf_random.best_params_

{'subsample': 1,
 'n_estimators': 300,
 'min_samples_split': 2,
 'max_depth': 3,
 'learning_rate': 0.1}