In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score,confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import copy


In [2]:
def standard_scalar_normalize(data_train, data_test): 
    scaler = StandardScaler()
    scaler.fit(data_train)
    normal_data_train = scaler.transform(data_train)
    normal_data_test = scaler.transform(data_test)
    return normal_data_train,normal_data_test

In [3]:
def data_processed():
    '''
    Used to read & normalize processed test and train data
    '''
    data_train=pd.read_csv("data/2022-02-07_LOANS_TRAIN.csv")
    data_test=pd.read_csv("data/2022-02-07_LOANS_TEST.csv")
    y_train=data_train['loan_status']
    data_train.drop(columns=['loan_status', 'id', 'issue_d_in_months', 'issue_d_year', 'zip_state_match', 
                               'AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI',
                               'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN',
                               'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH',
                               'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA',
                               'WI', 'WV', 'WY'], inplace=True)
    
    data_test.drop(columns=['id', 'issue_d_in_months', 'issue_d_year', 'zip_state_match', 
                           'AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI',
                           'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN',
                           'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH',
                           'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA',
                           'WI', 'WV', 'WY'], inplace=True)
    
    normal_data_train,normal_data_test=standard_scalar_normalize(data_train,data_test)
    return normal_data_train,normal_data_test,y_train

def AUC_score(y_ground_truth,y_predicted_probability):
    return roc_auc_score(y_ground_truth, y_predicted_probability)

def to_submission(ids, y_test_predicted_probability):
    y_test=pd.DataFrame(y_test_predicted_probability,columns=['loan_status'], index=ids)
    y_test.index.name = 'id'
    y_test.to_csv('data/submission.csv')
    return

In [4]:
X_train,X_test,Y_train=data_processed()
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, train_size=0.80, random_state=42)

In [5]:
parameters = {'n_estimators':  np.arange(100, 500, 100) , 'min_samples_leaf': np.arange(5, 71, 20), 'max_depth': np.arange(5, 30, 5)}
rfc = RandomForestClassifier(class_weight='balanced')
clf = GridSearchCV(rfc, parameters, scoring='roc_auc', verbose=2.1, n_jobs=1)
clf.fit(X_train, Y_train)

Fitting 5 folds for each of 80 candidates, totalling 400 fits
[CV] max_depth=5, min_samples_leaf=5, n_estimators=100 ...............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  max_depth=5, min_samples_leaf=5, n_estimators=100, score=0.679, total=  13.7s
[CV] max_depth=5, min_samples_leaf=5, n_estimators=100 ...............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   13.8s remaining:    0.0s


[CV]  max_depth=5, min_samples_leaf=5, n_estimators=100, score=0.688, total=  14.1s
[CV] max_depth=5, min_samples_leaf=5, n_estimators=100 ...............


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   27.9s remaining:    0.0s


[CV]  max_depth=5, min_samples_leaf=5, n_estimators=100, score=0.681, total=  15.1s
[CV] max_depth=5, min_samples_leaf=5, n_estimators=100 ...............
[CV]  max_depth=5, min_samples_leaf=5, n_estimators=100, score=0.679, total=  14.0s
[CV] max_depth=5, min_samples_leaf=5, n_estimators=100 ...............
[CV]  max_depth=5, min_samples_leaf=5, n_estimators=100, score=0.682, total=  15.1s
[CV] max_depth=5, min_samples_leaf=5, n_estimators=200 ...............
[CV]  max_depth=5, min_samples_leaf=5, n_estimators=200, score=0.679, total=  29.8s
[CV] max_depth=5, min_samples_leaf=5, n_estimators=200 ...............
[CV]  max_depth=5, min_samples_leaf=5, n_estimators=200, score=0.688, total=  28.2s
[CV] max_depth=5, min_samples_leaf=5, n_estimators=200 ...............
[CV]  max_depth=5, min_samples_leaf=5, n_estimators=200, score=0.680, total=  27.0s
[CV] max_depth=5, min_samples_leaf=5, n_estimators=200 ...............
[CV]  max_depth=5, min_samples_leaf=5, n_estimators=200, score=0.679, 

[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed: 250.3min finished


GridSearchCV(estimator=RandomForestClassifier(class_weight='balanced'),
             n_jobs=1,
             param_grid={'max_depth': array([ 5, 10, 15, 20, 25]),
                         'min_samples_leaf': array([ 5, 25, 45, 65]),
                         'n_estimators': array([100, 200, 300, 400])},
             scoring='roc_auc', verbose=2.1)

In [6]:
clf.cv_results_.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_max_depth', 'param_min_samples_leaf', 'param_n_estimators', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])

In [7]:
results_df = pd.DataFrame(clf.cv_results_)

In [10]:
clf.cv_results_.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_max_depth', 'param_min_samples_leaf', 'param_n_estimators', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])

In [12]:
results_sub_df = results_df.loc[:, ['param_max_depth', 'param_min_samples_leaf', 'param_n_estimators', 'mean_test_score', 'std_test_score', 'rank_test_score']]
results_sub_df.sort_values('rank_test_score').head(20)

Unnamed: 0,param_max_depth,param_min_samples_leaf,param_n_estimators,mean_test_score,std_test_score,rank_test_score
55,20,25,400,0.690195,0.003197,1
71,25,25,400,0.690187,0.002948,2
54,20,25,300,0.689924,0.003395,3
70,25,25,300,0.68988,0.003388,4
75,25,45,400,0.689726,0.003241,5
53,20,25,200,0.689721,0.003178,6
59,20,45,400,0.689698,0.003172,7
74,25,45,300,0.689593,0.003261,8
58,20,45,300,0.689557,0.003412,9
69,25,25,200,0.689551,0.003313,10


In [13]:
Y_val_pred_prob=clf.best_estimator_.predict_proba(X_val)[:,1]
AUC_score(Y_val,Y_val_pred_prob)

0.689535483443332

In [14]:
Y_train_pred_prob=clf.best_estimator_.predict_proba(X_train)[:,1]
AUC_score(Y_train,Y_train_pred_prob)

0.8672438705361283

In [15]:
Y_val_pred=clf.best_estimator_.predict(X_val)
confusion_matrix(Y_val,Y_val_pred)

array([[25769,  7579],
       [ 3229,  2873]])

In [16]:
Y_train_pred=clf.best_estimator_.predict(X_train)
confusion_matrix(Y_train,Y_train_pred)

array([[108347,  25331],
       [  6314,  17808]])

In [17]:
clf.best_estimator_.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 20,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 25,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 400,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

## Output for submissions - evaluate on test set 

In [None]:
Y_test_pred_prob=clf.predict_proba(X_test)[:,1]

In [None]:
to_submission(pd.read_csv("data/2022-02-07_LOANS_TEST.csv")['id'], Y_test_pred_prob)