In [12]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

#Models
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier

#metrics
from sklearn.metrics import roc_auc_score
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_score

# Default size (width, height) applied to every matplotlib figure in this notebook.
plt.rc('figure', figsize=(10, 7))

In [13]:
# The raw CSV is semicolon-delimited, so the separator has to be passed explicitly.
raw_csv_path = '/Users/vadimsmirnov/Desktop/Project_ML/bank-additional-full.csv'
data = pd.read_csv(raw_csv_path, sep=';')

In [20]:
# Shape of the sub-frame holding rows that exactly repeat an earlier row.
# `duplicated` must be *called* to build the boolean row mask; the bare method
# only worked because pandas invokes callables that are used as an indexer.
data[data.duplicated()].shape

(12, 21)

In [21]:
# Exclude the `duration` column from everything that follows.
data = data.drop('duration', axis=1)

In [27]:
# Collapse `pdays` into a two-level label: 'A' where the value is exactly 999,
# 'B' for every other value.
# NOTE: not idempotent - running this cell a second time compares the labels
# against 999 and therefore turns every row into 'B'.
pdays_is_999 = data['pdays'].eq(999)
data['pdays'] = np.where(pdays_is_999, 'A', 'B')
# Keep the labels as object dtype so the column is selected with the other
# categorical (object) columns later on.
data['pdays'] = data['pdays'].astype('object')

In [28]:
# Rename the label column `y` to `target` and encode it as integers:
# 'yes' -> 1, 'no' -> 0.
data = data.rename(columns={'y': 'target'})
map_values = {'yes': 1, 'no': 0}
# `Series.replace` on a column of strings relies on pandas silently downcasting
# the object result to integers, which is deprecated behaviour. Map each label
# explicitly and cast to int64 instead. Values that are not keys of
# `map_values` pass through unchanged, so re-running the cell on an already
# encoded column is harmless, while an unexpected string label makes the cast
# fail loudly instead of slipping into the models.
data['target'] = data['target'].map(lambda label: map_values.get(label, label)).astype('int64')

In [30]:
# Total number of rows in the cleaned frame.
len(data)

41188

In [31]:
# Display the frame in its current state (pandas abbreviates the middle rows of a large frame).
data

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,target
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,1,A,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,1,A,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
2,37,services,married,high.school,no,yes,no,telephone,may,mon,1,A,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,1,A,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
4,56,services,married,high.school,no,no,yes,telephone,may,mon,1,A,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,1,A,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,1
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,1,A,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,0
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,2,A,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,0
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,1,A,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,1


In [33]:
# Two-stage split with a fixed seed:
#   1. 10% of all rows are set aside as `test_data`; the remaining 90% is kept
#      in `HoldOut` (despite its name, this is the train+validation portion).
#   2. 15% of `HoldOut` becomes `validation_data`, the rest is `train_data`.
HoldOut, test_data = train_test_split(data, test_size=0.10, random_state=123)
train_data, validation_data = train_test_split(HoldOut, test_size=0.15, random_state=123)

# Give each partition a fresh 0..n-1 index so it lines up positionally with
# frames that are built later with a default index.
train_data, validation_data, test_data = (
    partition.reset_index(drop=True)
    for partition in (train_data, validation_data, test_data)
)



In [34]:
# Share of each target class in the training split.
train_data['target'].value_counts(normalize=True)


target
0    0.887172
1    0.112828
Name: proportion, dtype: float64

In [35]:
# Share of each target class in the validation split.
validation_data['target'].value_counts(normalize=True)

target
0    0.885452
1    0.114548
Name: proportion, dtype: float64

In [36]:
# Share of each target class in the test split.
test_data['target'].value_counts(normalize=True)

target
0    0.891236
1    0.108764
Name: proportion, dtype: float64

In [70]:
# Report how many rows ended up in each split.
size_reports = (
    ('Sample size for training dataset: {0}', train_data),
    ('Sample size for validation dataset: {0}', validation_data),
    ('Sample size for test dataset: {0}', test_data),
)
for message, frame in size_reports:
    print(message.format(frame.shape[0]))

Sample size for training dataset: 31508
Sample size for validation dataset: 5561
Sample size for test dataset: 4119


In [39]:
# List the column labels of the training split.
train_data.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'campaign', 'pdays', 'previous',
       'poutcome', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx',
       'euribor3m', 'nr.employed', 'target'],
      dtype='object')

In [43]:
# Partition the feature columns (everything except `target`) by dtype:
# object columns are treated as categorical, numeric columns as numeric.
feature_frame = data.drop(columns='target')
cat = feature_frame.select_dtypes(include='object').columns
num = feature_frame.select_dtypes(include='number').columns

In [44]:
# Training split, separated into its categorical and numeric feature columns.
train_data_categorical = train_data[cat]
train_data_numeric = train_data[num]

In [46]:
# Print dtypes and non-null counts of the numeric training features.
train_data_numeric.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31508 entries, 0 to 31507
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             31508 non-null  int64  
 1   campaign        31508 non-null  int64  
 2   previous        31508 non-null  int64  
 3   emp.var.rate    31508 non-null  float64
 4   cons.price.idx  31508 non-null  float64
 5   cons.conf.idx   31508 non-null  float64
 6   euribor3m       31508 non-null  float64
 7   nr.employed     31508 non-null  float64
dtypes: float64(5), int64(3)
memory usage: 1.9 MB


In [48]:
# Dense, integer-valued one-hot encoder; with handle_unknown='ignore' a category
# that was not present when fitting is encoded as all zeros instead of raising.
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# the old spelling was removed in 1.4, so use the current name first and fall
# back to the legacy one only on installations that do not know it yet.
try:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False, dtype=int)
except TypeError:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False, dtype=int)

In [49]:
# Fit the encoder on the training split's categorical columns only.
ohe.fit(X=train_data_categorical)



In [62]:
# One-hot encode the training categoricals, label the resulting columns with the
# encoder's generated feature names, and place them next to the numeric
# features. The column-wise concat aligns on the index of both frames.
ohe_train = ohe.transform(train_data_categorical)
ohe_hot_train = pd.DataFrame(data=ohe_train, columns=ohe.get_feature_names_out(cat))
train_data_encoded = pd.concat(
    [train_data_numeric, ohe_hot_train],
    axis=1,
)

In [72]:
# Validation split, separated into its categorical and numeric feature columns.
validation_data_categorical = validation_data[cat]
validation_data_numeric = validation_data[num]

In [73]:
# Encode the validation categoricals with the already-fitted encoder (transform
# only, no refit) and place them next to the numeric features.
ohe_validation = ohe.transform(validation_data_categorical)
one_hot_validation = pd.DataFrame(data=ohe_validation, columns=ohe.get_feature_names_out(cat))
validation_data_encoded = pd.concat(
    [validation_data_numeric, one_hot_validation],
    axis=1,
)

In [76]:
# Test split, separated into its categorical and numeric feature columns.
test_data_categorical = test_data[cat]
test_data_numeric = test_data[num]

In [77]:
# Encode the test categoricals with the already-fitted encoder (transform only,
# no refit) and place them next to the numeric features.
# (Stray non-code text that trailed the transform call and made the cell a
# syntax error has been removed.)
ohe_test = ohe.transform(test_data_categorical)
one_hot_test = pd.DataFrame(ohe_test, columns=ohe.get_feature_names_out(cat))
test_data_encoded = pd.concat([test_data_numeric, one_hot_test], axis=1)

In [80]:
# Persist each encoded split, with its target column appended as the last
# column, as a CSV file without the index.
# The output directory is named once so it can be changed in a single place.
output_dir = '/Users/vadimsmirnov/Desktop/Project_ML'
encoded_splits = {
    'train_data_encoded.csv': (train_data_encoded, train_data.target),
    # This file used to be written without the '.csv' suffix, unlike the other
    # two; the name now matches them. Any reader of the old name must follow.
    'validation_data_encoded.csv': (validation_data_encoded, validation_data.target),
    'test_data_encoded.csv': (test_data_encoded, test_data.target),
}
for file_name, (split_features, split_labels) in encoded_splits.items():
    pd.concat([split_features, split_labels], axis=1).to_csv(
        os.path.join(output_dir, file_name), index=False
    )


In [82]:
# Fit the scaler on the encoded training features only, then apply that same
# fitted transformation to both the training and the validation features.
scaler = StandardScaler().fit(train_data_encoded)
X_train_scaled, X_validation_scaled = (
    scaler.transform(encoded_split)
    for encoded_split in (train_data_encoded, validation_data_encoded)
)

In [83]:
# Label vectors matching the scaled training and validation feature matrices.
y_train, y_validation = train_data['target'], validation_data['target']

In [110]:
#Modelling
#Logistic Regression

In [94]:
# Baseline: logistic regression with library-default settings, fitted on the
# scaled training features.
lr = LogisticRegression()
lr.fit(X=X_train_scaled, y=y_train)

In [95]:
# Class-probability predictions of the logistic regression for both splits.
y_predict_train, y_predict_valid = (
    lr.predict_proba(features)
    for features in (X_train_scaled, X_validation_scaled)
)

In [101]:
from sklearn import metrics

In [108]:
# fpr, tpr, _ = metrics.roc_curve(y_validation,  y_predict_valid[:,1])

# #create ROC curve
# plt.plot(fpr,tpr)
# plt.ylabel('True Positive Rate')
# plt.xlabel('False Positive Rate')
# plt.show()

In [109]:
# ROC-AUC of the logistic regression, scored on the second predict_proba column.
train_score_lr = roc_auc_score(y_true=y_train, y_score=y_predict_train[:, 1])
valid_score_lr = roc_auc_score(y_true=y_validation, y_score=y_predict_valid[:, 1])

print("Training ROC-AUC score of baseline Logistic Regression model: ", train_score_lr)
print("Validation ROC-AUC score of baseline Logistic Regression model: ", valid_score_lr)

Training ROC-AUC score of baseline Logistic Regression model:  0.7951981281821109
Validation ROC-AUC score of baseline Logistic Regression model:  0.7966269398467378


In [111]:
#Random Forest

In [112]:
# Random forest with library-default settings and a fixed seed, fitted on the
# scaled training features.
rf = RandomForestClassifier(random_state=1)
rf.fit(X=X_train_scaled, y=y_train)

In [113]:
# Class-probability predictions of the random forest for both splits.
y_predict_train, y_predict_valid = (
    rf.predict_proba(features)
    for features in (X_train_scaled, X_validation_scaled)
)


In [115]:
# ROC-AUC of the random forest, scored on the second predict_proba column.
train_score_rf = roc_auc_score(y_true=y_train, y_score=y_predict_train[:, 1])
valid_score_rf = roc_auc_score(y_true=y_validation, y_score=y_predict_valid[:, 1])

print("Training ROC-AUC score of Random Forest model: ", train_score_rf)
print("Validation ROC-AUC score of Random Forest model: ", valid_score_rf)

Training ROC-AUC score of Random Forest model:  0.9998348896175582
Validation ROC-AUC score of Random Forest model:  0.7879458507142156


In [116]:
#XGBoost

In [117]:
# Gradient-boosted trees with a fixed seed and verbosity set to 0.
# Note that the variable is called `xgboost` while the package is imported as `xgb`.
xgb_settings = {'random_state': 1, 'verbosity': 0}
xgboost = xgb.XGBClassifier(**xgb_settings)

In [118]:
# Fit the boosted-tree model on the scaled training features.
xgboost.fit(X=X_train_scaled, y=y_train)

In [119]:
# Class-probability predictions of the XGBoost model for both splits.
y_predict_train, y_predict_valid = (
    xgboost.predict_proba(features)
    for features in (X_train_scaled, X_validation_scaled)
)

In [121]:
# ROC-AUC of the XGBoost model, scored on the second predict_proba column.
train_score_xgb = roc_auc_score(y_true=y_train, y_score=y_predict_train[:, 1])
valid_score_xgb = roc_auc_score(y_true=y_validation, y_score=y_predict_valid[:, 1])

print("Training ROC-AUC score of XGBoost model: ", train_score_xgb)
print("Validation ROC-AUC score of XGBoost model: ", valid_score_xgb)

Training ROC-AUC score of XGBoost model:  0.9124689811101948
Validation ROC-AUC score of XGBoost model:  0.7936861647114635


In [122]:
# Linear SVM: an SGDClassifier left at its default loss, wrapped in
# CalibratedClassifierCV so that `predict_proba` is available for ROC-AUC scoring.
lsvm = SGDClassifier(random_state=42)
clsvm = CalibratedClassifierCV(lsvm)
clsvm.fit(X=X_train_scaled, y=y_train)

In [124]:
# Calibrated class-probability predictions of the linear SVM for both splits.
y_predict_train, y_predict_valid = (
    clsvm.predict_proba(features)
    for features in (X_train_scaled, X_validation_scaled)
)

In [125]:
# ROC-AUC of the calibrated linear SVM, scored on the second predict_proba column.
train_score_lsvc = roc_auc_score(y_true=y_train, y_score=y_predict_train[:, 1])
valid_score_lsvc = roc_auc_score(y_true=y_validation, y_score=y_predict_valid[:, 1])

print("Training ROC-AUC score of Linear SVM model: ", train_score_lsvc)
print("Validation ROC-AUC score of Linear SVM model: ", valid_score_lsvc)

Training ROC-AUC score of Linear SVM model:  0.7481304840458791
Validation ROC-AUC score of Linear SVM model:  0.7654742988240726


In [126]:
#Naive Bayes

In [127]:
# Gaussian Naive Bayes with library-default settings, fitted on the scaled
# training features.
nb = GaussianNB()
nb.fit(X=X_train_scaled, y=y_train)

In [129]:
# Class-probability predictions of the Naive Bayes model for both splits.
y_predict_train, y_predict_valid = (
    nb.predict_proba(features)
    for features in (X_train_scaled, X_validation_scaled)
)

In [132]:
# ROC-AUC of the Naive Bayes model, scored on the second predict_proba column.
train_score_nb = roc_auc_score(y_true=y_train, y_score=y_predict_train[:, 1])
valid_score_nb = roc_auc_score(y_true=y_validation, y_score=y_predict_valid[:, 1])

In [133]:
# Report the Naive Bayes scores. The messages previously said "Linear SVM
# model", copied from the cell of the preceding model.
print("Training ROC-AUC score of Naive Bayes model: ", train_score_nb)
print("Validation ROC-AUC score of Naive Bayes model: ", valid_score_nb)

Training ROC-AUC score of Linear SVM model:  0.7709609454447421
Validation ROC-AUC score of Linear SVM model:  0.7647059479918943


In [135]:
#SVM with RBF kernel

In [141]:
# SVM with an RBF kernel. `probability=True` is required for `predict_proba`;
# scikit-learn produces those estimates through an internal cross-validation
# that shuffles the data, so a seed is set to make the scores reproducible,
# in line with the other seeded models in this notebook.
# NOTE(review): with C=1, gamma=1 the recorded run scored 0.988 ROC-AUC on
# training but only 0.680 on validation - these values look worth tuning.
svm_rbf = SVC(kernel='rbf', C=1, gamma=1, probability=True, random_state=42)

# Fit the model to the training data
svm_rbf.fit(X_train_scaled, y_train)

In [142]:
# Class-probability predictions of the RBF-kernel SVM for both splits.
y_predict_train, y_predict_valid = (
    svm_rbf.predict_proba(features)
    for features in (X_train_scaled, X_validation_scaled)
)

In [143]:
# ROC-AUC of the RBF-kernel SVM, scored on the second predict_proba column.
train_score_svm_rbf = roc_auc_score(y_true=y_train, y_score=y_predict_train[:, 1])
valid_score_svm_rbf = roc_auc_score(y_true=y_validation, y_score=y_predict_valid[:, 1])

In [144]:
# Report the RBF-kernel SVM scores. The messages previously said "Linear SVM
# model", copied from the cell of an earlier model.
print("Training ROC-AUC score of SVM with RBF kernel model: ", train_score_svm_rbf)
print("Validation ROC-AUC score of SVM with RBF kernel model: ", valid_score_svm_rbf)

Training ROC-AUC score of Linear SVM model:  0.9879693274570842
Validation ROC-AUC score of Linear SVM model:  0.6796528584563863


In [145]:
# Side-by-side summary of every model: one (name, train score, validation score)
# record per model, transposed into the column lists the table is built from.
model_records = [
    ('Logistic Regression', train_score_lr, valid_score_lr),
    ('Linear SVM', train_score_lsvc, valid_score_lsvc),
    ('Naive Bayes', train_score_nb, valid_score_nb),
    ('Random Forest', train_score_rf, valid_score_rf),
    ('XGBoost', train_score_xgb, valid_score_xgb),
    ('SVM RBF', train_score_svm_rbf, valid_score_svm_rbf),
]
model_names, train_scores, valid_scores = (list(column) for column in zip(*model_records))

Model_comparison = {
    'models': model_names,
    'Train ROC-AUC score': train_scores,
    'Validation ROC-AUC score': valid_scores,
}

pd.DataFrame(Model_comparison)

Unnamed: 0,models,Train ROC-AUC score,Validation ROC-AUC score
0,Logistic Regression,0.795198,0.796627
1,Linear SVM,0.74813,0.765474
2,Naive Bayes,0.770961,0.764706
3,Random Forest,0.999835,0.787946
4,XGBoost,0.912469,0.793686
5,SVM RBF,0.987969,0.679653
