In [8]:
import pandas as pd
import numpy as np

from sklearn import model_selection
from statistics import mean, stdev
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,f1_score,precision_score, recall_score,auc,roc_curve
import time

In [3]:
data = pd.read_csv('Ftest_66features.csv', header=0, index_col=0)
#data.head()
data.shape

(19028, 66)

In [4]:
data= data.dropna()
data.shape

(19028, 66)

In [5]:
data.head(5)

Unnamed: 0_level_0,portfolio_id,product_term_credit_limit,NP,cash_intent,ALL0000,ALL0216,ALL5020,ALL6120,ALL6210,ALL6220,...,RTR3348,TBCA2263,TBCA2264,TBCA2276,TBCA2277,TBCA2278,TBCA2601,TBCA2606,TBCC1203,TBCC3203
record_nb,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2019-08-01,500,0,0,13,7,49878,1,1,1,...,0.0,6.0,4.0,5.0,3.0,4.0,54.0,47.0,330.0,1448.0
2,2019-08-01,500,0,0,21,11,9146,1,1,1,...,0.0,0.0,2.0,0.0,1.0,97.0,-335.0,-60.0,-681.0,-655.0
3,2019-08-01,500,0,0,32,26,26012,1,1,1,...,0.0,2.0,5.0,2.0,0.0,1.0,215.0,406.0,1199.0,292.0
4,2019-08-01,500,0,0,39,0,999999997,0,994,400,...,0.0,97.0,97.0,97.0,97.0,97.0,999999997.0,999999997.0,999999997.0,999999997.0
5,2019-08-01,500,0,1,7,1,568,1,1,1,...,98.0,3.0,97.0,3.0,97.0,97.0,62.0,62.0,143.0,999999996.0


### Imbalanced f-test dataset

In [6]:
X = data.drop(['NP', "portfolio_id"], axis = 1)
#X = data.drop(['NP'], axis = 1)
Y = data["NP"]

In [9]:
MLA_columns = []
MLA_compare = pd.DataFrame(columns = MLA_columns)
row_index = 0
# prepare configuration for cross validation test harness
seed = 7
# prepare models
models = []
models.append(('Logistic Regression', LogisticRegression()))
models.append(('XGBoost', XGBClassifier(eval_metric='mlogloss')))

# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'

# Create StratifiedKFold object.    
tic = time.perf_counter()
for name, model in models:
    skfold = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X, Y, cv=skfold, scoring=scoring)
    f1_results = model_selection.cross_val_score(model, X, Y, cv=skfold, scoring='f1')
    recall_results = model_selection.cross_val_score(model, X, Y, cv=skfold, scoring='recall')
    precision_results = model_selection.cross_val_score(model, X, Y, cv=skfold, scoring='precision')
    roc_auc_results = model_selection.cross_val_score(model, X, Y, cv=skfold, scoring='roc_auc')
    MLA_compare.loc[row_index,'MLA used'] = name
    MLA_compare.loc[row_index, 'Test Accuracy'] = round(mean(cv_results), 4)
    MLA_compare.loc[row_index, 'Precision'] = round(mean(precision_results),4)
    MLA_compare.loc[row_index, 'Recall'] = round(mean(recall_results),4)
    MLA_compare.loc[row_index, 'f1'] = mean(f1_results)
    MLA_compare.loc[row_index, 'roc_auc'] = mean(roc_auc_results)
    
    results.append(cv_results)
    names.append(name)
    # Print the output.
    print('List of possible accuracies for {0} is: {1}'.format(name, cv_results))
    print('List of possible Precision for {0} is: {1}'.format(name, precision_results))
    print('List of possible Recall for {0} is: {1}'.format(name, recall_results))
    print('List of possible F1 score for {0} is: {1}'.format(name, f1_results))
    print('List of possible ROC_AUC for {0} is: {1}'.format(name, roc_auc_results))
    msg = "%s: %f (%f)" % (name, mean(cv_results), cv_results.std())
    print(msg)
    toc = time.perf_counter()
    secs = toc - tic
    print("---Classifier %s use %0.4f seconds ---" %(name, secs))
    row_index+=1
    
MLA_compare.sort_values(by = ['Test Accuracy'], ascending = False, inplace = True)    
MLA_compare

List of possible accuracies for Logistic Regression is: [0.74461377 0.75801366 0.75118234 0.75978975 0.75795007]
List of possible Precision for Logistic Regression is: [0.03964758 0.0443038  0.03115265 0.05047319 0.0384    ]
List of possible Recall for Logistic Regression is: [0.07826087 0.08115942 0.05797101 0.09302326 0.06976744]
List of possible F1 score for Logistic Regression is: [0.05263158 0.05731832 0.04052685 0.06543967 0.0495356 ]
List of possible ROC_AUC for Logistic Regression is: [0.4800263  0.48309528 0.46931565 0.49958046 0.47635488]
Logistic Regression: 0.754310 (0.005671)
---Classifier Logistic Regression use 1.3045 seconds ---






List of possible accuracies for XGBoost is: [0.90436153 0.90803994 0.90777719 0.9064389  0.90696452]
List of possible Precision for XGBoost is: [0.37333333 0.46753247 0.46666667 0.40322581 0.44565217]
List of possible Recall for XGBoost is: [0.08115942 0.10434783 0.12173913 0.07267442 0.11918605]
List of possible F1 score for XGBoost is: [0.13333333 0.17061611 0.19310345 0.12315271 0.18807339]
List of possible ROC_AUC for XGBoost is: [0.73709743 0.75799781 0.78123019 0.75406481 0.76509385]
XGBoost: 0.906716 (0.001309)
---Classifier XGBoost use 21.1017 seconds ---


Unnamed: 0,MLA used,Test Accuracy,Precision,Recall,f1,roc_auc
1,XGBoost,0.9067,0.4313,0.0998,0.161656,0.759097
0,Logistic Regression,0.7543,0.0408,0.076,0.05309,0.481675


## Balanced dataset -- Down Sampling the Majority Class

In [10]:
from imblearn.under_sampling import RandomUnderSampler

X = data.drop(['NP', "portfolio_id"], axis = 1)
y = data["NP"]
rus = RandomUnderSampler(replacement=False)
X_subsample, y_subsample = rus.fit_resample(X, y)
print(X.shape)
print(X_subsample.shape)

(19028, 64)
(3446, 64)


In [13]:
#X_subsample.head()

In [14]:
X = X_subsample
#X = data.drop(['NP'], axis = 1)

Y = y_subsample

In [15]:
MLA_columns = []
MLA_compare = pd.DataFrame(columns = MLA_columns)
row_index = 0
# prepare configuration for cross validation test harness
seed = 7
# prepare models
models = []
models.append(('Logistic Regression', LogisticRegression()))
models.append(('XGBoost', XGBClassifier(eval_metric='mlogloss')))

# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'

# Create StratifiedKFold object.    
tic = time.perf_counter()
for name, model in models:
    skfold = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X, Y, cv=skfold, scoring=scoring)
    f1_results = model_selection.cross_val_score(model, X, Y, cv=skfold, scoring='f1')
    recall_results = model_selection.cross_val_score(model, X, Y, cv=skfold, scoring='recall')
    precision_results = model_selection.cross_val_score(model, X, Y, cv=skfold, scoring='precision')
    roc_auc_results = model_selection.cross_val_score(model, X, Y, cv=skfold, scoring='roc_auc')
    MLA_compare.loc[row_index,'MLA used'] = name
    MLA_compare.loc[row_index, 'Test Accuracy'] = round(mean(cv_results), 4)
    MLA_compare.loc[row_index, 'Precision'] = round(mean(precision_results),4)
    MLA_compare.loc[row_index, 'Recall'] = round(mean(recall_results),4)
    MLA_compare.loc[row_index, 'f1'] = mean(f1_results)
    MLA_compare.loc[row_index, 'roc_auc'] = mean(roc_auc_results)
    
    results.append(cv_results)
    names.append(name)
    # Print the output.
    print('List of possible accuracies for {0} is: {1}'.format(name, cv_results))
    print('List of possible Precision for {0} is: {1}'.format(name, precision_results))
    print('List of possible Recall for {0} is: {1}'.format(name, recall_results))
    print('List of possible F1 score for {0} is: {1}'.format(name, f1_results))
    print('List of possible ROC_AUC for {0} is: {1}'.format(name, roc_auc_results))
    msg = "%s: %f (%f)" % (name, mean(cv_results), cv_results.std())
    print(msg)
    toc = time.perf_counter()
    secs = toc - tic
    print("---Classifier %s use %0.4f seconds ---" %(name, secs))
    row_index+=1
    
MLA_compare.sort_values(by = ['Test Accuracy'], ascending = False, inplace = True)    
MLA_compare

List of possible accuracies for Logistic Regression is: [0.61304348 0.60232221 0.6095791  0.57184325 0.55732946]
List of possible Precision for Logistic Regression is: [0.59241706 0.57918552 0.58660508 0.55208333 0.54273504]
List of possible Recall for Logistic Regression is: [0.72463768 0.74418605 0.73837209 0.76811594 0.73623188]
List of possible F1 score for Logistic Regression is: [0.65189048 0.65139949 0.65379665 0.64242424 0.62484625]
List of possible ROC_AUC for Logistic Regression is: [0.67020374 0.67546764 0.70203067 0.66702056 0.65642484]
Logistic Regression: 0.590823 (0.022180)
---Classifier Logistic Regression use 0.3331 seconds ---






List of possible accuracies for XGBoost is: [0.66666667 0.7256894  0.69085631 0.67634253 0.6777939 ]
List of possible Precision for XGBoost is: [0.67912773 0.72997033 0.69208211 0.67836257 0.68923077]
List of possible Recall for XGBoost is: [0.63188406 0.71511628 0.68604651 0.67246377 0.64927536]
List of possible F1 score for XGBoost is: [0.65465465 0.72246696 0.68905109 0.67540029 0.66865672]
List of possible ROC_AUC for XGBoost is: [0.74253308 0.77202983 0.76059151 0.73687226 0.74673913]
XGBoost: 0.687470 (0.020603)
---Classifier XGBoost use 6.4993 seconds ---


Unnamed: 0,MLA used,Test Accuracy,Precision,Recall,f1,roc_auc
1,XGBoost,0.6875,0.6938,0.671,0.682046,0.751753
0,Logistic Regression,0.5908,0.5706,0.7423,0.644871,0.674229


## Balanced dataset -- Up Sampling the Minority Class

### Logistic Regression for up sampling

In [16]:
from imblearn.over_sampling import RandomOverSampler

X = data.drop(['NP', "portfolio_id"], axis = 1)
y = data["NP"]
ros = RandomOverSampler()
X_oversample, y_oversample = ros.fit_resample(X, y)
print(X.shape)
print(X_oversample.shape)

(19028, 64)
(34610, 64)


In [17]:
X = X_oversample
#X = data.drop(['NP'], axis = 1)

Y = y_oversample

In [18]:
MLA_columns = []
MLA_compare = pd.DataFrame(columns = MLA_columns)
row_index = 0
# prepare configuration for cross validation test harness
seed = 7
# prepare models
models = []
models.append(('Logistic Regression', LogisticRegression()))
models.append(('XGBoost', XGBClassifier(eval_metric='mlogloss')))

# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'

# Create StratifiedKFold object.    
tic = time.perf_counter()
for name, model in models:
    skfold = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X, Y, cv=skfold, scoring=scoring)
    f1_results = model_selection.cross_val_score(model, X, Y, cv=skfold, scoring='f1')
    recall_results = model_selection.cross_val_score(model, X, Y, cv=skfold, scoring='recall')
    precision_results = model_selection.cross_val_score(model, X, Y, cv=skfold, scoring='precision')
    roc_auc_results = model_selection.cross_val_score(model, X, Y, cv=skfold, scoring='roc_auc')
    MLA_compare.loc[row_index,'MLA used'] = name
    MLA_compare.loc[row_index, 'Test Accuracy'] = round(mean(cv_results), 4)
    MLA_compare.loc[row_index, 'Precision'] = round(mean(precision_results),4)
    MLA_compare.loc[row_index, 'Recall'] = round(mean(recall_results),4)
    MLA_compare.loc[row_index, 'f1'] = mean(f1_results)
    MLA_compare.loc[row_index, 'roc_auc'] = mean(roc_auc_results)
    
    results.append(cv_results)
    names.append(name)
    # Print the output.
    print('List of possible accuracies for {0} is: {1}'.format(name, cv_results))
    print('List of possible Precision for {0} is: {1}'.format(name, precision_results))
    print('List of possible Recall for {0} is: {1}'.format(name, recall_results))
    print('List of possible F1 score for {0} is: {1}'.format(name, f1_results))
    print('List of possible ROC_AUC for {0} is: {1}'.format(name, roc_auc_results))
    msg = "%s: %f (%f)" % (name, mean(cv_results), cv_results.std())
    print(msg)
    toc = time.perf_counter()
    secs = toc - tic
    print("---Classifier %s use %0.4f seconds ---" %(name, secs))
    row_index+=1
    
MLA_compare.sort_values(by = ['Test Accuracy'], ascending = False, inplace = True)    
MLA_compare

List of possible accuracies for Logistic Regression is: [0.61412886 0.60849465 0.60950592 0.6148512  0.61932967]
List of possible Precision for Logistic Regression is: [0.5955029  0.59228312 0.59101825 0.59826947 0.60107685]
List of possible Recall for Logistic Regression is: [0.71164403 0.69633054 0.71106617 0.69921988 0.7096215 ]
List of possible F1 score for Logistic Regression is: [0.64841385 0.64010624 0.6455082  0.64481748 0.65085464]
List of possible ROC_AUC for Logistic Regression is: [0.68462091 0.67221826 0.68437051 0.68258631 0.68654361]
Logistic Regression: 0.613262 (0.003922)
---Classifier Logistic Regression use 2.0379 seconds ---






List of possible accuracies for XGBoost is: [0.94654724 0.9419243  0.94206877 0.94712511 0.94669171]
List of possible Precision for XGBoost is: [0.90574954 0.89924302 0.90031397 0.90820364 0.90598739]
List of possible Recall for XGBoost is: [0.99682173 0.99537706 0.99422132 0.99479919 0.99682173]
List of possible F1 score for XGBoost is: [0.94910591 0.94487109 0.94494027 0.94953116 0.94923648]
List of possible ROC_AUC for XGBoost is: [0.9851144  0.9835956  0.98562272 0.98415247 0.98346294]
XGBoost: 0.944871 (0.002355)
---Classifier XGBoost use 32.6279 seconds ---


Unnamed: 0,MLA used,Test Accuracy,Precision,Recall,f1,roc_auc
1,XGBoost,0.9449,0.9039,0.9956,0.947537,0.98439
0,Logistic Regression,0.6133,0.5956,0.7056,0.64594,0.682068


### SMOTE for imbalanced dataset

In [19]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()

In [20]:
from sklearn.datasets import fetch_openml

data = pd.read_csv('Ftest_66features.csv', header=0, index_col=0)
data = data.dropna()

In [25]:
from imblearn.pipeline import make_pipeline as make_imb_pipeline
from sklearn.model_selection import cross_validate
import numpy as np

X = data.drop(['NP', "portfolio_id"], axis = 1)
y = data["NP"]
# for imblearn we should have these classes
y = (y.astype(np.int) + 1) // 2

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y = (y.astype(np.int) + 1) // 2


In [26]:
X_synthetic_sample, y_synthetic_sample = smote.fit_resample(X, y)

X_synthetic_sample.head()

Unnamed: 0,product_term_credit_limit,cash_intent,ALL0000,ALL0216,ALL5020,ALL6120,ALL6210,ALL6220,ALL7140,ALL7334,...,RTR3348,TBCA2263,TBCA2264,TBCA2276,TBCA2277,TBCA2278,TBCA2601,TBCA2606,TBCC1203,TBCC3203
0,500,0,13,7,49878,1,1,1,76,88,...,0.0,6.0,4.0,5.0,3.0,4.0,54.0,47.0,330.0,1448.0
1,500,0,21,11,9146,1,1,1,32,100,...,0.0,0.0,2.0,0.0,1.0,97.0,-335.0,-60.0,-681.0,-655.0
2,500,0,32,26,26012,1,1,1,60,100,...,0.0,2.0,5.0,2.0,0.0,1.0,215.0,406.0,1199.0,292.0
3,500,0,39,0,999999997,0,994,400,997,0,...,0.0,97.0,97.0,97.0,97.0,97.0,999999997.0,999999997.0,999999997.0,999999997.0
4,500,1,7,1,568,1,1,1,95,100,...,98.0,3.0,97.0,3.0,97.0,97.0,62.0,62.0,143.0,999999996.0


In [27]:
X = X_synthetic_sample
#X = data.drop(['NP'], axis = 1)

Y = y_synthetic_sample

In [28]:
MLA_columns = []
MLA_compare = pd.DataFrame(columns = MLA_columns)
row_index = 0
# prepare configuration for cross validation test harness
seed = 7
# prepare models
models = []
models.append(('Logistic Regression', LogisticRegression()))
models.append(('XGBoost', XGBClassifier(eval_metric='mlogloss')))

# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'

# Create StratifiedKFold object.    
tic = time.perf_counter()
for name, model in models:
    skfold = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X, Y, cv=skfold, scoring=scoring)
    f1_results = model_selection.cross_val_score(model, X, Y, cv=skfold, scoring='f1')
    recall_results = model_selection.cross_val_score(model, X, Y, cv=skfold, scoring='recall')
    precision_results = model_selection.cross_val_score(model, X, Y, cv=skfold, scoring='precision')
    roc_auc_results = model_selection.cross_val_score(model, X, Y, cv=skfold, scoring='roc_auc')
    MLA_compare.loc[row_index,'MLA used'] = name
    MLA_compare.loc[row_index, 'Test Accuracy'] = round(mean(cv_results), 4)
    MLA_compare.loc[row_index, 'Precision'] = round(mean(precision_results),4)
    MLA_compare.loc[row_index, 'Recall'] = round(mean(recall_results),4)
    MLA_compare.loc[row_index, 'f1'] = mean(f1_results)
    MLA_compare.loc[row_index, 'roc_auc'] = mean(roc_auc_results)
    
    results.append(cv_results)
    names.append(name)
    # Print the output.
    print('List of possible accuracies for {0} is: {1}'.format(name, cv_results))
    print('List of possible Precision for {0} is: {1}'.format(name, precision_results))
    print('List of possible Recall for {0} is: {1}'.format(name, recall_results))
    print('List of possible F1 score for {0} is: {1}'.format(name, f1_results))
    print('List of possible ROC_AUC for {0} is: {1}'.format(name, roc_auc_results))
    msg = "%s: %f (%f)" % (name, mean(cv_results), cv_results.std())
    print(msg)
    toc = time.perf_counter()
    secs = toc - tic
    print("---Classifier %s use %0.4f seconds ---" %(name, secs))
    row_index+=1
    
MLA_compare.sort_values(by = ['Test Accuracy'], ascending = False, inplace = True)    
MLA_compare

List of possible accuracies for Logistic Regression is: [0.60892806 0.615718   0.59722624 0.60907252 0.60676105]
List of possible Precision for Logistic Regression is: [0.59071222 0.59666908 0.57904628 0.59416313 0.59276927]
List of possible Recall for Logistic Regression is: [0.70933256 0.71424444 0.7122219  0.68824039 0.68217278]
List of possible F1 score for Logistic Regression is: [0.64461074 0.65018411 0.63876652 0.637751   0.63433638]
List of possible ROC_AUC for Logistic Regression is: [0.68104284 0.68362613 0.67384154 0.67657906 0.67474244]
Logistic Regression: 0.607541 (0.005969)
---Classifier Logistic Regression use 2.3928 seconds ---






List of possible accuracies for XGBoost is: [0.9432245  0.93845709 0.94033516 0.93946836 0.94654724]
List of possible Precision for XGBoost is: [0.95791045 0.95420533 0.95574163 0.95538922 0.96065574]
List of possible Recall for XGBoost is: [0.92718867 0.92112106 0.92343253 0.92198786 0.93123375]
List of possible F1 score for XGBoost is: [0.94229922 0.93737136 0.93930933 0.93839141 0.94571596]
List of possible ROC_AUC for XGBoost is: [0.97928521 0.97801974 0.97838001 0.97810268 0.97992131]
XGBoost: 0.941606 (0.002937)
---Classifier XGBoost use 51.4445 seconds ---


Unnamed: 0,MLA used,Test Accuracy,Precision,Recall,f1,roc_auc
1,XGBoost,0.9416,0.9568,0.925,0.940617,0.978742
0,Logistic Regression,0.6075,0.5907,0.7012,0.64113,0.677966
