In [8]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd   
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
import itertools

from math import floor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, cohen_kappa_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest

In [2]:
application = pd.read_csv("/Users/anye/Downloads/2022_DS_Data/archive/application_record.csv", encoding = 'utf-8') 
record = pd.read_csv("/Users/anye/Downloads/2022_DS_Data/archive/credit_record.csv", encoding = 'utf-8')  

In [3]:
begin_month=pd.DataFrame(record.groupby(["ID"])["MONTHS_BALANCE"].agg(min))
end_month = pd.DataFrame(record.groupby(['ID'])['MONTHS_BALANCE'].agg(max))
begin_month=begin_month.rename(columns={'MONTHS_BALANCE':'begin_month'}) 
end_month = end_month.rename(columns={'MONTHS_BALANCE':'end_month'})
new_data_stage=pd.merge(application,begin_month,how="left",on="ID") #merge to record data
new_data = pd.merge(new_data_stage,end_month,how='left',on='ID')
new_data['window'] = new_data['end_month'] - new_data['begin_month']


In [4]:
record['OverDue_60D'] = None
record['OverDue_60D'][record['STATUS'] =='2']='Yes' 
record['OverDue_60D'][record['STATUS'] =='3']='Yes' 
record['OverDue_60D'][record['STATUS'] =='4']='Yes' 
record['OverDue_60D'][record['STATUS'] =='5']='Yes' 
cpunt=record.groupby('ID').count()
cpunt['OverDue_60D'][cpunt['OverDue_60D'] > 0]='Yes' 
cpunt['OverDue_60D'][cpunt['OverDue_60D'] == 0]='No' 
cpunt = cpunt[['OverDue_60D']]
combine=pd.merge(new_data,cpunt,how='inner',on='ID')
combine['target']=combine['OverDue_60D']
combine.loc[combine['target']=='Yes','target']=1
combine.loc[combine['target']=='No','target']=0

In [5]:
combine['income_modified'] = None
combine['income_modified'][combine['AMT_INCOME_TOTAL'] <=50000]='0~50K' 
combine['income_modified'][(combine['AMT_INCOME_TOTAL'] >50000) & (combine['AMT_INCOME_TOTAL'] <=100000)] ='50K~100K'

combine['income_modified'][(combine['AMT_INCOME_TOTAL'] >100000) & (combine['AMT_INCOME_TOTAL'] <=200000)]='100K~200K'  
combine['income_modified'][(combine['AMT_INCOME_TOTAL'] >200000) & (combine['AMT_INCOME_TOTAL'] <=400000)]='200K~400K'  
combine['income_modified'][(combine['AMT_INCOME_TOTAL'] >400000) & (combine['AMT_INCOME_TOTAL'] <=600000)]='400K~600K'  
combine['income_modified'][(combine['AMT_INCOME_TOTAL'] >600000) & (combine['AMT_INCOME_TOTAL'] <=800000)]='600K~800K' 

combine['income_modified'][(combine['AMT_INCOME_TOTAL'] >800000) & (combine['AMT_INCOME_TOTAL'] <=1000000)]='800K~1000K'  
combine['income_modified'][(combine['AMT_INCOME_TOTAL'] >1000000) & (combine['AMT_INCOME_TOTAL'] <=1200000)]='1000K~1200K'  
combine['income_modified'][(combine['AMT_INCOME_TOTAL'] >1200000) & (combine['AMT_INCOME_TOTAL'] <=1400000)]='1200K~1400K'  
combine['income_modified'][(combine['AMT_INCOME_TOTAL'] >1400000) & (combine['AMT_INCOME_TOTAL'] <=1600000)]='1400K~1600K'

In [6]:
combine['Occupation'] = combine['OCCUPATION_TYPE']
combine['Occupation'][(combine['OCCUPATION_TYPE'].isna()) & (combine['income_modified']=='0~50K')] = 'Sales staff'
combine['Occupation'][(combine['OCCUPATION_TYPE'].isna()) & ((combine['income_modified']=='50K~100K') | (combine['income_modified']=='100K~200K') | (combine['income_modified']=='200K~400K'))] = 'Laborers'
combine['Occupation'][(combine['OCCUPATION_TYPE'].isna()) & ((combine['income_modified']=='400K~600K') | (combine['income_modified']=='600K~800K') | (combine['income_modified']=='800K~1000K') | (combine['income_modified']=='1000K~1200K') | (combine['income_modified']=='1200K~1400K') | (combine['income_modified']=='1400K~1600K'))] = 'Managers'

In [7]:
combine['FLAG_OWN_CAR'] = combine['FLAG_OWN_CAR'].replace(['N','Y'],[0,1])
combine['FLAG_OWN_REALTY'] = combine['FLAG_OWN_REALTY'].replace(['N','Y'],[0,1])
combine['target'] = combine['target'].astype(int)

In [18]:
combine['Age'] = -(combine['DAYS_BIRTH'])//365
combine['DAYS_EMPLOYED'] = abs(combine['DAYS_EMPLOYED'])


In [19]:
num_columns = ['AMT_INCOME_TOTAL', 'Age', 'DAYS_EMPLOYED',
              'CNT_FAM_MEMBERS', 'begin_month', 'window']
cat_columns = ['CODE_GENDER', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE',
              'Occupation', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'FLAG_WORK_PHONE', 'FLAG_PHONE', 'FLAG_EMAIL']
target = ['target']
feature_columns = num_columns + cat_columns

In [20]:
X_all = combine[feature_columns]
y_all = combine[target]

In [21]:
X_new = X_all.copy()
X_new.drop(columns=['begin_month'], inplace=True)

In [22]:
onehot = OneHotEncoder(sparse=False)
col = ['CODE_GENDER', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'Occupation']
onehot.fit(X_new[col])

OneHotEncoder(sparse=False)

In [23]:
XX_new = pd.DataFrame(onehot.transform(X_new[col]), columns=onehot.get_feature_names_out(col))
X_new.drop(columns=col, axis=1, inplace=True)
X_df = pd.concat([X_new.reset_index(drop=True), XX_new], axis=1)
X_df

Unnamed: 0,AMT_INCOME_TOTAL,Age,DAYS_EMPLOYED,CNT_FAM_MEMBERS,window,FLAG_OWN_CAR,FLAG_OWN_REALTY,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,...,Occupation_Laborers,Occupation_Low-skill Laborers,Occupation_Managers,Occupation_Medicine staff,Occupation_Private service staff,Occupation_Realty agents,Occupation_Sales staff,Occupation_Secretaries,Occupation_Security staff,Occupation_Waiters/barmen staff
0,427500.0,32,4542,2.0,15.0,1,1,1,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,427500.0,32,4542,2.0,14.0,1,1,1,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,112500.0,58,1134,2.0,29.0,1,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,270000.0,52,3051,1.0,4.0,0,1,0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,270000.0,52,3051,1.0,4.0,0,1,0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36452,315000.0,47,2420,2.0,11.0,1,1,0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
36453,157500.0,33,1325,2.0,23.0,0,1,0,1,1,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
36454,157500.0,33,1325,2.0,32.0,0,1,0,1,1,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
36455,283500.0,49,655,2.0,9.0,0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [24]:
X_train, X_test, y_train, y_test = train_test_split(X_df,
                                                    y_all,
                                                    test_size=0.2,
                                                    stratify=y_all,
                                                    random_state=11)

In [25]:
def select_features(X_train, y_train, X_test, k_value='all'):
    fs = SelectKBest(score_func=chi2, k=k_value)
    fs.fit(X_train, y_train)
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs, fs

In [26]:
X_train_fs, X_test_fs, fs = select_features(X_train, y_train, X_test)
# what are scores for the features
for i in range(len(fs.scores_)):
    print('Feature %d: %f' % (i, fs.scores_[i]))

Feature 0: 15134.433322
Feature 1: 0.089672
Feature 2: 476428.807344
Feature 3: 0.336470
Feature 4: 3593.767128
Feature 5: 0.182440
Feature 6: 5.806511
Feature 7: 0.284977
Feature 8: 0.286830
Feature 9: 1.696069
Feature 10: 2.661204
Feature 11: 5.355944
Feature 12: 0.531512
Feature 13: 6.902361
Feature 14: 0.966988
Feature 15: 0.171945
Feature 16: 0.373717
Feature 17: 0.498640
Feature 18: 0.113326
Feature 19: 4.479257
Feature 20: 3.208734
Feature 21: 0.232512
Feature 22: 0.986602
Feature 23: 0.921919
Feature 24: 0.429283
Feature 25: 2.534760
Feature 26: 10.391136
Feature 27: 0.052921
Feature 28: 0.043265
Feature 29: 4.165475
Feature 30: 0.873607
Feature 31: 0.072328
Feature 32: 0.721789
Feature 33: 0.342490
Feature 34: 1.567507
Feature 35: 0.078233
Feature 36: 2.902673
Feature 37: 5.112780
Feature 38: 1.134835
Feature 39: 2.148934
Feature 40: 4.689414
Feature 41: 0.360138
Feature 42: 8.465527
Feature 43: 1.197284
Feature 44: 3.179607
Feature 45: 2.749224
Feature 46: 1.048863
Feature 47

In [28]:
names = []
values = []
for i in range(len(fs.scores_)):
    names.append(X_train.columns[i])
    values.append(fs.scores_[i])
chi_list = zip(names, values)

df = pd.DataFrame({"Feature": names,
                   "Importance": values})

In [29]:
col_final = list(df[df['Importance']>=5].Feature)
col_final

['AMT_INCOME_TOTAL',
 'DAYS_EMPLOYED',
 'window',
 'FLAG_OWN_REALTY',
 'CODE_GENDER_M',
 'NAME_INCOME_TYPE_Pensioner',
 'NAME_FAMILY_STATUS_Widow',
 'Occupation_Drivers',
 'Occupation_Low-skill Laborers',
 'Occupation_Sales staff']

In [31]:
option = ['Age']
col_1 = col_final + option
col_1

['AMT_INCOME_TOTAL',
 'DAYS_EMPLOYED',
 'window',
 'FLAG_OWN_REALTY',
 'CODE_GENDER_M',
 'NAME_INCOME_TYPE_Pensioner',
 'NAME_FAMILY_STATUS_Widow',
 'Occupation_Drivers',
 'Occupation_Low-skill Laborers',
 'Occupation_Sales staff',
 'Age']

In [32]:
X_df_1 = X_df[col_1]
# X_df_2 = X_df[col_final]

In [33]:
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_df_1,
                                                    y_all,
                                                    test_size=0.2,
                                                    stratify=y_all,
                                                    random_state=11)

In [34]:
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_df_2,
                                                    y_all,
                                                    test_size=0.2,
                                                    stratify=y_all,
                                                    random_state=11)

In [35]:
stratified_kfold = StratifiedKFold(n_splits=5,
                                       shuffle=True,
                                       random_state=11)

In [36]:
rf_pipeline = imbpipeline(steps = [['smote', SMOTE(random_state=11)],
                                ['scaler', StandardScaler()],
                                
                                ['classifier', RandomForestClassifier()]])

In [37]:
rf_param_grid = {
              'classifier__max_depth': [9, 20], #default is None, which means infinity
              'classifier__min_samples_leaf': [10, 20],
              'classifier__min_samples_split': [5, 8],
              'classifier__n_estimators': [500, 800]
             }

In [39]:
rf_grid_search1 = GridSearchCV(estimator=rf_pipeline,
                           param_grid=rf_param_grid,
                           scoring='roc_auc',
                           cv=stratified_kfold,
                           n_jobs=-1)
rf_grid_search2 = GridSearchCV(estimator=rf_pipeline,
                           param_grid=rf_param_grid,
                           scoring='roc_auc',
                           cv=stratified_kfold,
                           n_jobs=-1)

In [40]:
rf_grid_search1.fit(X_train_1, y_train_1)

  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fi

  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fi

  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fi

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=11, shuffle=True),
             estimator=Pipeline(steps=[['smote', SMOTE(random_state=11)],
                                       ['scaler', StandardScaler()],
                                       ['classifier',
                                        RandomForestClassifier()]]),
             n_jobs=-1,
             param_grid={'classifier__max_depth': [9, 20],
                         'classifier__min_samples_leaf': [10, 20],
                         'classifier__min_samples_split': [5, 8],
                         'classifier__n_estimators': [500, 800]},
             scoring='roc_auc')

In [41]:
rf_grid_search2.fit(X_train_2, y_train_2)

  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fi

  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fi

  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  self._final_estimator.fit(Xt, yt, **fi

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=11, shuffle=True),
             estimator=Pipeline(steps=[['smote', SMOTE(random_state=11)],
                                       ['scaler', StandardScaler()],
                                       ['classifier',
                                        RandomForestClassifier()]]),
             n_jobs=-1,
             param_grid={'classifier__max_depth': [9, 20],
                         'classifier__min_samples_leaf': [10, 20],
                         'classifier__min_samples_split': [5, 8],
                         'classifier__n_estimators': [500, 800]},
             scoring='roc_auc')

In [42]:
print(f'Cross-validation score: {rf_grid_search1.best_score_}\nTest score: {rf_grid_search1.score(X_test_1, y_test_1)}')

Cross-validation score: 0.7598493749931773
Test score: 0.7748095628536144


In [47]:
print(f'Cross-validation score: {rf_grid_search2.best_score_}\nTest score: {rf_grid_search2.score(X_test_2, y_test_2)}')

Cross-validation score: 0.7438609815030518
Test score: 0.7618602905236752


In [44]:
y_test_pred_1 = rf_grid_search1.predict(X_test_1)
y_train_pred_1 = rf_grid_search1.predict(X_train_1)
y_test_pred_2 = rf_grid_search2.predict(X_test_2)
y_train_pred_2 = rf_grid_search2.predict(X_train_2)

In [45]:
print('training set classification_report: ')
print(classification_report(y_train_1, y_train_pred_1))
print('\n')
print('test set classification_report: ')
print(classification_report(y_test_1, y_test_pred_1))
print('\n')
print('Training Data confusion_matrix: \n')
print(confusion_matrix(y_train_1, y_train_pred_1))
print('Test Data confusion_matrix: \n')
print(confusion_matrix(y_test_1, y_test_pred_1))

training set classification_report: 
              precision    recall  f1-score   support

           0       0.99      0.97      0.98     28672
           1       0.28      0.60      0.38       493

    accuracy                           0.97     29165
   macro avg       0.64      0.79      0.68     29165
weighted avg       0.98      0.97      0.97     29165



test set classification_report: 
              precision    recall  f1-score   support

           0       0.99      0.96      0.98      7169
           1       0.13      0.32      0.19       123

    accuracy                           0.95      7292
   macro avg       0.56      0.64      0.58      7292
weighted avg       0.97      0.95      0.96      7292



Training Data confusion_matrix: 

[[27923   749]
 [  198   295]]
Test Data confusion_matrix: 

[[6913  256]
 [  84   39]]


In [48]:
print('training set classification_report: ')
print(classification_report(y_train_2, y_train_pred_2))
print('\n')
print('test set classification_report: ')
print(classification_report(y_test_2, y_test_pred_2))
print('\n')
print('Training Data confusion_matrix: \n')
print(confusion_matrix(y_train_2, y_train_pred_2))
print('Test Data confusion_matrix: \n')
print(confusion_matrix(y_test_2, y_test_pred_2))

training set classification_report: 
              precision    recall  f1-score   support

           0       0.99      0.97      0.98     28672
           1       0.22      0.54      0.32       493

    accuracy                           0.96     29165
   macro avg       0.61      0.76      0.65     29165
weighted avg       0.98      0.96      0.97     29165



test set classification_report: 
              precision    recall  f1-score   support

           0       0.99      0.96      0.97      7169
           1       0.09      0.25      0.14       123

    accuracy                           0.95      7292
   macro avg       0.54      0.60      0.55      7292
weighted avg       0.97      0.95      0.96      7292



Training Data confusion_matrix: 

[[27736   936]
 [  225   268]]
Test Data confusion_matrix: 

[[6864  305]
 [  92   31]]


### Logistic Regression

In [49]:
lr_pipeline = imbpipeline(steps = [['smote', SMOTE(random_state=11)],
                                ['scaler', StandardScaler()],
                                
                                ['classifier', LogisticRegression(random_state=11,
                                                                  max_iter=1000)]])

In [50]:
lr_param_grid = {'classifier__C':[0.001, 0.01, 0.1, 1, 10, 100, 1000]}
lr_grid_search = GridSearchCV(estimator=lr_pipeline,
                           param_grid=lr_param_grid,
                           scoring='roc_auc',
                           cv=stratified_kfold,
                           n_jobs=-1)

In [51]:
lr_grid_search.fit(X_train_1, y_train_1)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=11, shuffle=True),
             estimator=Pipeline(steps=[['smote', SMOTE(random_state=11)],
                                       ['scaler', StandardScaler()],
                                       ['classifier',
                                        LogisticRegression(max_iter=1000,
                                                           random_state=11)]]),
             n_jobs=-1,
             param_grid={'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]},
             scoring='roc_auc')

In [52]:
print(f'Cross-validation score: {lr_grid_search.best_score_}\nTest score: {lr_grid_search.score(X_test_1, y_test_1)}')

Cross-validation score: 0.7251441988693472
Test score: 0.7214690168941025


In [53]:
y_test_pred_1 = lr_grid_search.predict(X_test_1)
y_train_pred_1 = lr_grid_search.predict(X_train_1)

In [54]:
print('training set classification_report: ')
print(classification_report(y_train_1, y_train_pred_1))
print('\n')
print('test set classification_report: ')
print(classification_report(y_test_1, y_test_pred_1))
print('\n')
print('Training Data confusion_matrix: \n')
print(confusion_matrix(y_train_1, y_train_pred_1))
print('Test Data confusion_matrix: \n')
print(confusion_matrix(y_test_1, y_test_pred_1))

training set classification_report: 
              precision    recall  f1-score   support

           0       0.99      0.73      0.84     28672
           1       0.04      0.60      0.07       493

    accuracy                           0.73     29165
   macro avg       0.51      0.67      0.45     29165
weighted avg       0.97      0.73      0.83     29165



test set classification_report: 
              precision    recall  f1-score   support

           0       0.99      0.72      0.84      7169
           1       0.04      0.64      0.07       123

    accuracy                           0.72      7292
   macro avg       0.51      0.68      0.45      7292
weighted avg       0.98      0.72      0.82      7292



Training Data confusion_matrix: 

[[20921  7751]
 [  197   296]]
Test Data confusion_matrix: 

[[5183 1986]
 [  44   79]]


### XGBoost

In [55]:
xgb_pipeline = imbpipeline(steps = [['smote', SMOTE(random_state=11)],
                                ['scaler', StandardScaler()],
                                
                                ['classifier', XGBClassifier(random_state=11,
                                                                  max_iter=1000)]])

In [56]:
xgb_parameters = {'classifier__gamma': [0.1, 0.2], 
              'classifier__learning_rate': [0.05, 0.1], 
              'classifier__n_estimators': [300, 500], 
              'classifier__max_depth': [10, 15], 
              'classifier__lambda': [0.5, 1], 
              'classifier__min_child_weight': [5, 10]}

In [57]:
xgb_grid_search = GridSearchCV(estimator=xgb_pipeline,
                           param_grid=xgb_parameters,
                           scoring='roc_auc',
                           cv=stratified_kfold,
                           n_jobs=-1)

In [58]:
xgb_grid_search.fit(X_train_1, y_train_1)

Parameters: { "max_iter" } are not used.



GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=11, shuffle=True),
             estimator=Pipeline(steps=[['smote', SMOTE(random_state=11)],
                                       ['scaler', StandardScaler()],
                                       ['classifier',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      callbacks=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      early_stopping_rounds=None,
                                                      enable_categorical=False...
                                                      missing=nan,
                                                      m

In [59]:
print(f'Cross-validation score: {xgb_grid_search.best_score_}\nTest score: {xgb_grid_search.score(X_test_1, y_test_1)}')

Cross-validation score: 0.771050782214638
Test score: 0.7997974567554296


In [60]:
y_test_pred_1 = xgb_grid_search.predict(X_test_1)
y_train_pred_1 = xgb_grid_search.predict(X_train_1)

In [61]:
print('training set classification_report: ')
print(classification_report(y_train_1, y_train_pred_1))
print('\n')
print('test set classification_report: ')
print(classification_report(y_test_1, y_test_pred_1))
print('\n')
print('Training Data confusion_matrix: \n')
print(confusion_matrix(y_train_1, y_train_pred_1))
print('Test Data confusion_matrix: \n')
print(confusion_matrix(y_test_1, y_test_pred_1))

training set classification_report: 
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     28672
           1       0.92      0.44      0.60       493

    accuracy                           0.99     29165
   macro avg       0.96      0.72      0.80     29165
weighted avg       0.99      0.99      0.99     29165



test set classification_report: 
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      7169
           1       0.34      0.11      0.17       123

    accuracy                           0.98      7292
   macro avg       0.66      0.56      0.58      7292
weighted avg       0.97      0.98      0.98      7292



Training Data confusion_matrix: 

[[28654    18]
 [  276   217]]
Test Data confusion_matrix: 

[[7142   27]
 [ 109   14]]


In [63]:
xgb_grid_search.best_estimator_

Pipeline(steps=[('smote', SMOTE(random_state=11)), ('scaler', StandardScaler()),
                ['classifier',
                 XGBClassifier(base_score=None, booster=None, callbacks=None,
                               colsample_bylevel=None, colsample_bynode=None,
                               colsample_bytree=None,
                               early_stopping_rounds=None,
                               enable_categorical=False, eval_metric=None,
                               feature_types=None, gamma=0.1, gpu_id=None,
                               grow_policy=None, importance_type=None,
                               interaction_constraints=None, lambda=0.5,
                               learning_rate=0.1, max_bin=None,
                               max_cat_threshold=None, max_cat_to_onehot=None,
                               max_delta_step=None, max_depth=10, max_iter=1000,
                               max_leaves=None, min_child_weight=5, missing=nan,
                    

In [90]:
xgb_pipeline_bt = imbpipeline(steps = [['smote', SMOTE(random_state=11)],
                                ['scaler', StandardScaler()],
                                
                                ['classifier', XGBClassifier(max_depth=12,
                      n_estimators=500,
                      min_child_weight=5, 
                      subsample=0.8, 
                      learning_rate =0.2,  
                      gamma=0.1, 
#                       lambda=0.5,
#                       max_iter=1000,
                      seed=42)]])

In [91]:
xgb_bt = xgb_pipeline_bt.fit(X_train_1, y_train_1)

In [92]:
y_test_pred_1 = xgb_bt.predict(X_test_1)
y_train_pred_1 = xgb_bt.predict(X_train_1)

In [93]:
print('training set classification_report: ')
print(classification_report(y_train_1, y_train_pred_1))
print('\n')
print('test set classification_report: ')
print(classification_report(y_test_1, y_test_pred_1))
print('\n')
print('Training Data confusion_matrix: \n')
print(confusion_matrix(y_train_1, y_train_pred_1))
print('Test Data confusion_matrix: \n')
print(confusion_matrix(y_test_1, y_test_pred_1))
print(f'Test score: {xgb_bt.score(X_test_1, y_test_1)}')

training set classification_report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28672
           1       0.98      0.88      0.92       493

    accuracy                           1.00     29165
   macro avg       0.99      0.94      0.96     29165
weighted avg       1.00      1.00      1.00     29165



test set classification_report: 
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      7169
           1       0.47      0.26      0.34       123

    accuracy                           0.98      7292
   macro avg       0.73      0.63      0.66      7292
weighted avg       0.98      0.98      0.98      7292



Training Data confusion_matrix: 

[[28662    10]
 [   61   432]]
Test Data confusion_matrix: 

[[7133   36]
 [  91   32]]
Test score: 0.9825836533187055


In [97]:
y_test_proba = xgb_bt.predict_proba(X_test_1)
y_train_proba = xgb_bt.predict_proba(X_train_1)

In [98]:
print(roc_auc_score(y_train_1, y_train_proba[:, 1]))
print(roc_auc_score(y_test_1, y_test_proba[:, 1]))

0.9997516500538793
0.806844510068758
