In [0]:
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [0]:
# load data
import pandas as pd
dataset = pd.read_csv('bank-additional-full.csv', sep = ';')

In [0]:
from sklearn.preprocessing import LabelEncoder
labelencoder_X = LabelEncoder()
dataset['job']      = labelencoder_X.fit_transform(dataset['job']) 
dataset['marital']  = labelencoder_X.fit_transform(dataset['marital']) 
dataset['education']= labelencoder_X.fit_transform(dataset['education']) 
dataset['default']  = labelencoder_X.fit_transform(dataset['default']) 
dataset['housing']  = labelencoder_X.fit_transform(dataset['housing']) 
dataset['loan']     = labelencoder_X.fit_transform(dataset['loan']) 

#feature engineer the categorical columns to create label encoders using one hot encoding

In [0]:
from sklearn.preprocessing import LabelEncoder
labelencoder_X = LabelEncoder()
dataset['contact']     = labelencoder_X.fit_transform(dataset['contact']) 
dataset['month']       = labelencoder_X.fit_transform(dataset['month']) 
dataset['day_of_week'] = labelencoder_X.fit_transform(dataset['day_of_week'])
dataset['poutcome'] = labelencoder_X.fit_transform(dataset['poutcome'])
dataset['y'] = labelencoder_X.fit_transform(dataset['y'])

In [53]:
dataset.corr()['y']

age               0.030399
job               0.025122
marital           0.046203
education         0.057799
default          -0.099352
housing           0.011552
loan             -0.004909
contact          -0.144773
month            -0.006065
day_of_week       0.015967
duration          0.405274
campaign         -0.066357
pdays            -0.324914
previous          0.230181
poutcome          0.129789
emp.var.rate     -0.298334
cons.price.idx   -0.136211
cons.conf.idx     0.054878
euribor3m        -0.307771
nr.employed      -0.354678
y                 1.000000
Name: y, dtype: float64

In [51]:
X = dataset.drop(['y','month'], 1)
Y = dataset['y']
X.shape


(41188, 19)

In [0]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.33, random_state = 101)

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)

In [55]:
X_train.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
19010,58,0,1,3,0,2,0,0,3,553,1,999,0,1,1.4,93.444,-36.1,4.968,5228.1
23466,53,9,1,5,1,2,2,0,4,276,2,999,0,1,1.4,93.444,-36.1,4.964,5228.1
27578,59,9,2,5,0,2,0,0,0,246,2,999,0,1,-0.1,93.2,-42.0,4.021,5195.8
32365,46,9,1,5,0,0,0,0,0,124,2,999,0,1,-1.8,92.893,-46.2,1.313,5099.1
25394,39,10,1,2,1,2,0,0,3,467,1,999,0,1,-0.1,93.2,-42.0,4.153,5195.8


In [56]:
y_train.head()

19010    1
23466    0
27578    0
32365    0
25394    0
Name: y, dtype: int64

In [0]:
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression() 
logmodel.fit(X_train,y_train)
logpred = logmodel.predict(X_test)

import warnings
warnings.filterwarnings('ignore')




In [58]:
import warnings
from sklearn.metrics import confusion_matrix, accuracy_score
warnings.filterwarnings('ignore')
print(confusion_matrix(y_test, logpred))
print(accuracy_score(y_test, logpred)*100)
LOGCV = (cross_val_score(logmodel, X_train, y_train, cv=k_fold, n_jobs=1, scoring = 'accuracy').mean())
print(LOGCV)


[[11785   305]
 [  912   591]]
91.04686235562421
0.9101279081372688


In [60]:
#Calculating “Average profit per customer” for reaching out to all the customers in training data
print("The number of customers to be reached out to are:",len(y_train))
print("Number of responders are:",y_train.value_counts()[1],"\n")
print("Cost associate for reaching out to customers: $",len(y_train)*10)
print("Revenue: $",y_train.value_counts()[1]*50)
print("Profit (revnue - cost):$",y_train.value_counts()[1]*50-len(y_train)*10)

#without any model,company would be at a loss

The number of customers to be reached out to are: 27595
Number of responders are: 3137 

Cost associate for reaching out to customers: $ 275950
Revenue: $ 156850
Profit (revnue - cost):$ -119100


# if we blindly reach out to all customers, we will be at loss. So let's create a cost function

In [0]:
from sklearn.metrics import confusion_matrix
import numpy as np
def cost_funciton(Y_Actual, Y_Predicted, cost_per_individual,
                  revenue_per_individual, low_threshold, high_threshold):
    cost_function_output = pd.DataFrame()
    for threshold in np.arange(low_threshold, high_threshold, 0.01):
        confusion_out = confusion_matrix(Y_Actual, np.where(Y_Predicted>threshold,1,0))
        target_populaion=round((confusion_out[1][1]+confusion_out[0][1])/len(Y_Actual),4)
        responders_in_target_populaiton=round(confusion_out[1][1]/(len(Y_Actual)),4)
        cost=(confusion_out[1][1]+confusion_out[0][1])*cost_per_individual
        revenue=confusion_out[1][1]*revenue_per_individual
        profit=revenue-cost
        ROI=round(profit/cost*100,0)
        cost_function_output=cost_function_output.append([[threshold, 
                                                           responders_in_target_populaiton*100, 
                                                           target_populaion*100, 
                                                           revenue, 
                                                           cost,  
                                                           profit, 
                                                           ROI]])
    return cost_function_output

#let's assume our budget to 25k SGD. and we want to target as many people as possible

In [63]:
#class wise accuracy
target_names = ['class 0', 'class 1']
print(classification_report(y_test, logpred, target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.93      0.97      0.95     12090
     class 1       0.66      0.39      0.49      1503

    accuracy                           0.91     13593
   macro avg       0.79      0.68      0.72     13593
weighted avg       0.90      0.91      0.90     13593



In [62]:

Y_pred=logreg.predict_proba(X_train)[:,1] 
cost_function_output=cost_funciton(y_train, Y_pred , 10, 50, 0.5,0.995)
my_columns = ["Probability of event happening",  "% Responders_in_target_populaiton", 
              "% Target_populaion", "Revenue", "Cost",  "Profit","ROI"]
cost_function_output.columns = my_columns
cost_function_output

Unnamed: 0,Probability of event happening,% Responders_in_target_populaiton,% Target_populaion,Revenue,Cost,Profit,ROI
0,0.5,4.63,6.91,63900,19080,44820,235.0
0,0.51,4.53,6.72,62500,18550,43950,237.0
0,0.52,4.45,6.56,61450,18110,43340,239.0
0,0.53,4.37,6.41,60350,17680,42670,241.0
0,0.54,4.21,6.15,58050,16960,41090,242.0
0,0.55,4.12,5.97,56900,16480,40420,245.0
0,0.56,4.05,5.83,55850,16080,39770,247.0
0,0.57,3.94,5.64,54400,15550,38850,250.0
0,0.58,3.82,5.46,52650,15070,37580,249.0
0,0.59,3.73,5.3,51400,14630,36770,251.0


In [43]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators = 500, criterion='entropy')#criterion = entopy,gini
rfc.fit(X_train, y_train)
rfcpred = rfc.predict_prob(X_test)

print(confusion_matrix(y_test, rfcpred ))
print(accuracy_score(y_test, rfcpred)*100)
RFCCV = (cross_val_score(rfc, X_train, y_train, cv=k_fold, n_jobs=1, scoring = 'accuracy').mean())

[[11664   426]
 [  736   767]]
91.45148238063709


In [64]:
#class wise accuracy
target_names = ['class 0', 'class 1']
print(classification_report(y_test, rfcpred, target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.94      0.96      0.95     12090
     class 1       0.64      0.51      0.57      1503

    accuracy                           0.91     13593
   macro avg       0.79      0.74      0.76     13593
weighted avg       0.91      0.91      0.91     13593



In [65]:
# rfc = RandomForestClassifier(n_estimators = 500, criterion='entropy')#criterion = entopy,gini
# rfc.fit(X_train, y_train)
Y_pred=rfc.predict_proba(X_train)[:,1] 
cost_function_output=cost_funciton(y_train, Y_pred , 10, 50, 0.5,0.995)
my_columns = ["Probability of event happening",  "% Responders_in_target_populaiton", 
              "% Target_populaion", "Revenue", "Cost",  "Profit","ROI"]
cost_function_output.columns = my_columns
cost_function_output

Unnamed: 0,Probability of event happening,% Responders_in_target_populaiton,% Target_populaion,Revenue,Cost,Profit,ROI
0,0.5,11.37,11.37,156850,31370,125480,400.0
0,0.51,11.37,11.37,156850,31370,125480,400.0
0,0.52,11.37,11.37,156850,31370,125480,400.0
0,0.53,11.37,11.37,156850,31370,125480,400.0
0,0.54,11.37,11.37,156850,31370,125480,400.0
0,0.55,11.37,11.37,156850,31370,125480,400.0
0,0.56,11.37,11.37,156850,31370,125480,400.0
0,0.57,11.37,11.37,156850,31370,125480,400.0
0,0.58,11.36,11.36,156750,31350,125400,400.0
0,0.59,11.36,11.36,156750,31350,125400,400.0


In [45]:
from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
xgbprd = xgb.predict(X_test)

print(confusion_matrix(y_test, xgbprd ))
print(accuracy_score(y_test, xgbprd)*100)
XGB = (cross_val_score(estimator = xgb, X = X_train, y = y_train, cv = 10).mean())

[[11690   400]
 [  754   749]]
91.51033620245714


In [66]:
#class wise accuracy
target_names = ['class 0', 'class 1']
print(classification_report(y_test, xgbprd, target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.94      0.97      0.95     12090
     class 1       0.65      0.50      0.56      1503

    accuracy                           0.92     13593
   macro avg       0.80      0.73      0.76     13593
weighted avg       0.91      0.92      0.91     13593



In [67]:
Y_pred=xgb.predict_proba(X_train)[:,1] 
cost_function_output=cost_funciton(y_train, Y_pred , 10, 50, 0.5,0.995)
my_columns = ["Probability of event happening",  "% Responders_in_target_populaiton", 
              "% Target_populaion", "Revenue", "Cost",  "Profit","ROI"]
cost_function_output.columns = my_columns
cost_function_output

Unnamed: 0,Probability of event happening,% Responders_in_target_populaiton,% Target_populaion,Revenue,Cost,Profit,ROI
0,0.5,6.07,8.65,83750,23880,59870,251.0
0,0.51,5.87,8.33,81050,22990,58060,253.0
0,0.52,5.7,7.98,78650,22020,56630,257.0
0,0.53,5.52,7.64,76150,21090,55060,261.0
0,0.54,5.31,7.3,73300,20140,53160,264.0
0,0.55,5.01,6.79,69150,18750,50400,269.0
0,0.56,4.79,6.4,66100,17650,48450,275.0
0,0.57,4.55,6.02,62800,16600,46200,278.0
0,0.58,4.36,5.69,60200,15690,44510,284.0
0,0.59,4.12,5.3,56850,14630,42220,289.0


In [68]:
from sklearn.ensemble import GradientBoostingClassifier
gbk = GradientBoostingClassifier()
gbk.fit(X_train, y_train)
gbkpred = gbk.predict(X_test)
print(confusion_matrix(y_test, gbkpred ))
print(accuracy_score(y_test, gbkpred)*100)
GBKCV = (cross_val_score(gbk, X_train, y_train, cv=k_fold, n_jobs=1, scoring = 'accuracy').mean())

[[11652   438]
 [  703   800]]
91.60597366291474


In [69]:
#class wise accuracy
target_names = ['class 0', 'class 1']
print(classification_report(y_test, gbkpred, target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.94      0.96      0.95     12090
     class 1       0.65      0.53      0.58      1503

    accuracy                           0.92     13593
   macro avg       0.79      0.75      0.77     13593
weighted avg       0.91      0.92      0.91     13593



In [70]:
Y_pred=gbk.predict_proba(X_train)[:,1] 
cost_function_output=cost_funciton(y_train, Y_pred , 10, 50, 0.5,0.995)
my_columns = ["Probability of event happening",  "% Responders_in_target_populaiton", 
              "% Target_populaion", "Revenue", "Cost",  "Profit","ROI"]
cost_function_output.columns = my_columns
cost_function_output

Unnamed: 0,Probability of event happening,% Responders_in_target_populaiton,% Target_populaion,Revenue,Cost,Profit,ROI
0,0.5,6.36,9.12,87700,25180,62520,248.0
0,0.51,6.22,8.85,85800,24430,61370,251.0
0,0.52,6.06,8.58,83550,23670,59880,253.0
0,0.53,5.91,8.29,81550,22870,58680,257.0
0,0.54,5.74,7.98,79150,22010,57140,260.0
0,0.55,5.58,7.67,77000,21160,55840,264.0
0,0.56,5.42,7.37,74800,20350,54450,268.0
0,0.57,5.23,7.03,72100,19390,52710,272.0
0,0.58,5.01,6.64,69100,18310,50790,277.0
0,0.59,4.77,6.21,65850,17140,48710,284.0


In [71]:
models = pd.DataFrame({
                'Models': ['Random Forest Classifier','Logistic Model', 'XGBoost', 'Gradient Boosting'],
                'Score':  [RFCCV, LOGCV, XGB, GBKCV]})

models.sort_values(by='Score', ascending=False)

Unnamed: 0,Models,Score
3,Gradient Boosting,0.916651
2,XGBoost,0.916325
0,Random Forest Classifier,0.914477
1,Logistic Model,0.910128


#Although Random forest yields highest profit, remember our budget is 25k SGD and hence GradientBoosting is the best model to deploy