In [26]:
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, average_precision_score, precision_recall_curve, auc
from sklearn.cross_validation import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

import seaborn as sns
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

import warnings; warnings.simplefilter('ignore')
%matplotlib inline

bank_data = pd.read_csv('bank-additional-full.csv')

In [27]:
#Handling categorical variables
categorical_features = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month','day_of_week', 'poutcome', 'y']
bank_data_tran = pd.get_dummies(bank_data, columns = categorical_features, drop_first=True)

bank_data_tran.drop(['duration'], axis = 1, inplace = True)

#because of the high correlation coefficient between euribor3m rate and nr. of employed, the latter is going to be dropped
bank_data_tran.drop(['nr.employed'], axis = 1, inplace = True)

#apllying the same logic, emp.var.rate is going to be dropped
bank_data_tran.drop(['emp.var.rate'], axis = 1, inplace = True)

In [28]:
#Data preprocessing and splitting and scaling
x = bank_data_tran.iloc[:,bank_data_tran.columns != 'y_yes'].values
y = bank_data_tran.iloc[:, -1].values

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size = 0.2, random_state = 0)

In [29]:
#Function for printing results
def print_results(x_input, true_output, predicted_output, classifier):
    precision, recall, thresholds = precision_recall_curve(y_val, classifier.predict_proba(x_input)[:,1])
    area = auc(recall, precision)
    print("\nArea under precission-recall is:", area)
    
    print("Validation set predictions: \n\n" + classification_report(true_output,predicted_output))
    print("\n Confusion matrix:\n " , confusion_matrix(true_output, predicted_output))
    print("\n")
    print("True negatives:", confusion_matrix(true_output, predicted_output)[0][0])
    print("False positives:", confusion_matrix(true_output, predicted_output)[0][1])
    print("False negatives:", confusion_matrix(true_output, predicted_output)[1][0])
    print("True positives: ", confusion_matrix(true_output, predicted_output)[1][1])

In [30]:
#Decision tree algorithm

td_classifier = DecisionTreeClassifier(criterion='entropy', random_state = 0, splitter = 'best')
td_classifier.fit(x_train, y_train)

predictions_DT_val = td_classifier.predict(x_val)

# Results
print_results(x_val, y_val, predictions_DT_val, td_classifier)


Area under precission-recall is: 0.35237034355014546
Validation set predictions: 

             precision    recall  f1-score   support

          0       0.91      0.91      0.91      5841
          1       0.30      0.32      0.31       749

avg / total       0.84      0.84      0.84      6590


 Confusion matrix:
  [[5287  554]
 [ 509  240]]


True negatives: 5287
False positives: 554
False negatives: 509
True positives:  240


In [31]:
#Boosting - AdaBoost
from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=20),
                         algorithm = "SAMME",
                         n_estimators=200)
                         
ada.fit(x_train, y_train)
predictions_ada = ada.predict(x_val)


print_results(x_val, y_val, predictions_ada, ada)


Area under precission-recall is: 0.3537013618945843
Validation set predictions: 

             precision    recall  f1-score   support

          0       0.92      0.96      0.94      5841
          1       0.51      0.31      0.39       749

avg / total       0.87      0.89      0.88      6590


 Confusion matrix:
  [[5611  230]
 [ 514  235]]


True negatives: 5611
False positives: 230
False negatives: 514
True positives:  235


In [32]:
#Gradient boosting
from sklearn.ensemble import GradientBoostingClassifier
gradientBoost = GradientBoostingClassifier(n_estimators=100, random_state=1)
gradientBoost.fit(x_train, y_train)

gradient_predictions = gradientBoost.predict(x_val)

#Results
print_results(x_val, y_val, gradient_predictions, gradientBoost)


Area under precission-recall is: 0.4830523120799327
Validation set predictions: 

             precision    recall  f1-score   support

          0       0.91      0.98      0.95      5841
          1       0.66      0.24      0.35       749

avg / total       0.88      0.90      0.88      6590


 Confusion matrix:
  [[5749   92]
 [ 571  178]]


True negatives: 5749
False positives: 92
False negatives: 571
True positives:  178


In [33]:
#Bagging with decission trees
from sklearn.ensemble import BaggingClassifier
bagg = BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=20), n_estimators=100, random_state=7)
bagg.fit(x_train, y_train)

predictions_bag = bagg.predict(x_val)

#Results
print_results(x_val, y_val, predictions_bag, bagg)


Area under precission-recall is: 0.44522174574666057
Validation set predictions: 

             precision    recall  f1-score   support

          0       0.92      0.97      0.94      5841
          1       0.56      0.31      0.40       749

avg / total       0.88      0.89      0.88      6590


 Confusion matrix:
  [[5656  185]
 [ 517  232]]


True negatives: 5656
False positives: 185
False negatives: 517
True positives:  232


In [34]:
#Random Forest Classification Algorithm

# Fitting Random Forest Classification to the Training set
classifier = RandomForestClassifier(n_estimators = 1000, criterion = 'entropy', random_state = 0)
classifier.fit(x_train, y_train)

# Predicting the Test set results
predictions_RF_val = classifier.predict(x_val)

#Print results
print_results(x_val, y_val, predictions_RF_val, classifier)


Area under precission-recall is: 0.44236646251681155
Validation set predictions: 

             precision    recall  f1-score   support

          0       0.91      0.97      0.94      5841
          1       0.56      0.30      0.39       749

avg / total       0.87      0.89      0.88      6590


 Confusion matrix:
  [[5670  171]
 [ 528  221]]


True negatives: 5670
False positives: 171
False negatives: 528
True positives:  221


In [35]:
# Feature Scaling
sc = StandardScaler()
x_train_res = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
x_val = sc.transform(x_val)

In [36]:
#Logistic Regression Algorithm

logdown = LogisticRegression()
logdown.fit(x_train, y_train)
predictions_LR = logdown.predict(x_val)

#Results
print_results(x_val, y_val, predictions_LR, logdown)


Area under precission-recall is: 0.2924832209805112
Validation set predictions: 

             precision    recall  f1-score   support

          0       0.94      0.56      0.70      5841
          1       0.17      0.72      0.28       749

avg / total       0.85      0.58      0.65      6590


 Confusion matrix:
  [[3278 2563]
 [ 213  536]]


True negatives: 3278
False positives: 2563
False negatives: 213
True positives:  536


In [37]:
knn_classifier = KNeighborsClassifier(n_neighbors=2, metric = 'minkowski', p = 2, leaf_size = 15)
knn_classifier.fit(x_train, y_train)
pred_i = knn_classifier.predict(x_val)

#Results
print_results(x_val, y_val, pred_i, knn_classifier)


Area under precission-recall is: 0.4361633482642426
Validation set predictions: 

             precision    recall  f1-score   support

          0       0.77      0.11      0.19      5841
          1       0.10      0.75      0.17       749

avg / total       0.70      0.18      0.19      6590


 Confusion matrix:
  [[ 647 5194]
 [ 190  559]]


True negatives: 647
False positives: 5194
False negatives: 190
True positives:  559


In [43]:
#SVM algorithm

svm_classifier = SVC(kernel = 'rbf', random_state = 0)
svm_classifier.fit(x_train, y_train)

predictions_SVC_val = svm_classifier.predict(x_val)

#print_results(x_val, y_val, predictions_SVC_val, svm_classifier)
print("Validation set predictions: \n\n" + classification_report(y_val,predictions_SVC_val))
print("\n Confusion matrix:\n " , confusion_matrix(y_val, predictions_SVC_val))
print("True negatives:", confusion_matrix(y_val, predictions_SVC_val)[0][0])
print("False positives:", confusion_matrix(y_val, predictions_SVC_val)[0][1])
print("False negatives:", confusion_matrix(y_val, predictions_SVC_val)[1][0])
print("True positives: ", confusion_matrix(y_val, predictions_SVC_val)[1][1])

#predictions_SVC_test = svm_classifier.predict(x_test)
#print("Test set predictions:\n\n" + classification_report(y_test, predictions_SVC_test))

Validation set predictions: 

             precision    recall  f1-score   support

          0       0.00      0.00      0.00      5841
          1       0.11      1.00      0.20       749

avg / total       0.01      0.11      0.02      6590


 Confusion matrix:
  [[   0 5841]
 [   0  749]]
True negatives: 0
False positives: 5841
False negatives: 0
True positives:  749


In [38]:
# Gaussian Naive Bayes
gaussian_classifier = GaussianNB()
gaussian_classifier.fit(x_train, y_train)
#print(gaussian_classifier)
# make predictions
predicted = gaussian_classifier.predict(x_val)
# summarize the fit of the model

print_results(x_val, y_val, predicted, gaussian_classifier)


Area under precission-recall is: 0.5568285280728377
Validation set predictions: 

             precision    recall  f1-score   support

          0       0.00      0.00      0.00      5841
          1       0.11      1.00      0.20       749

avg / total       0.01      0.11      0.02      6590


 Confusion matrix:
  [[   0 5841]
 [   0  749]]


True negatives: 0
False positives: 5841
False negatives: 0
True positives:  749
