In [1]:
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.metrics import confusion_matrix

import seaborn as sns
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np


%matplotlib inline

bank_data = pd.read_csv('bank-additional-full.csv')



In [2]:
#Handling categorical variables
categorical_features = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month','day_of_week', 'poutcome', 'y']
bank_data_tran = pd.get_dummies(bank_data, columns = categorical_features, drop_first=True)

bank_data_tran.drop(['duration'], axis = 1, inplace = True)

#because of the high correlation coefficient between euribor3m rate and nr. of employed, the latter is going to be dropped
bank_data_tran.drop(['nr.employed'], axis = 1, inplace = True)

#apllying the same logic, emp.var.rate is going to be dropped
bank_data_tran.drop(['emp.var.rate'], axis = 1, inplace = True)

In [3]:
#Data preprocessing and splitting and scaling
x = bank_data_tran.iloc[:,bank_data_tran.columns != 'y_yes'].values
y = bank_data_tran.iloc[:, -1].values

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size = 0.2, random_state = 0)

In [4]:
#Function for printing results
def print_results(x_input, true_output, predicted_output, classifier):
    precision, recall, thresholds = precision_recall_curve(y_val, classifier.predict_proba(x_input)[:,1])
    area = auc(recall, precision)
    print("\nArea under precission-recall is:", area)
    
    print("Validation set predictions: \n\n" + classification_report(true_output,predicted_output))
    print("\n Confusion matrix:\n " , confusion_matrix(true_output, predicted_output))
    print("\n")
    print("True negatives:", confusion_matrix(true_output, predicted_output)[0][0])
    print("False positives:", confusion_matrix(true_output, predicted_output)[0][1])
    print("False negatives:", confusion_matrix(true_output, predicted_output)[1][0])
    print("True positives: ", confusion_matrix(true_output, predicted_output)[1][1])

In [7]:
#Decision tree algorithm

td_classifier = DecisionTreeClassifier(criterion='entropy', random_state = 0, splitter = 'best')
td_classifier.fit(x_train, y_train)

predictions_DT_val = td_classifier.predict(x_val)

# Results
print_results(x_val, y_val, predictions_DT_val, td_classifier)

[ 0.17655697  0.08366491  0.02466451  0.01387023  0.01376965  0.00618376
  0.25482332  0.01789035  0.00648636  0.00830037  0.00731077  0.00702902
  0.00973201  0.01249322  0.00530349  0.01720459  0.00572544  0.00311054
  0.02251035  0.02016492  0.00090048  0.00628626  0.01398839  0.01736905
  0.          0.01070021  0.01931151  0.00600424  0.01833272  0.
  0.00301486  0.04322935  0.00266199  0.02405417  0.01137622  0.00175327
  0.00048349  0.00158515  0.00160326  0.          0.01443928  0.00115019
  0.00757747  0.00105571  0.01534252  0.01426485  0.01485609  0.01838741
  0.01118416  0.00229293]

Area under precission-recall is: 0.35237034355
Validation set predictions: 

             precision    recall  f1-score   support

          0       0.91      0.91      0.91      5841
          1       0.30      0.32      0.31       749

avg / total       0.84      0.84      0.84      6590


 Confusion matrix:
  [[5287  554]
 [ 509  240]]


True negatives: 5287
False positives: 554
False negati

In [6]:
#Random Forest Classification Algorithm

# Fitting Random Forest Classification to the Training set
classifier = RandomForestClassifier(n_estimators = 1000, criterion = 'entropy', random_state = 0)
classifier.fit(x_train, y_train)

# Predicting the Test set results
predictions_RF_val = classifier.predict(x_val)

# Making the Confusion Matrix
#from sklearn.metrics import confusion_matrix
#cm = confusion_matrix(y_val, predictions_RF)


#Print results
print_results(x_val, y_val, predictions_RF_val, classifier)

#predictions_RF_test = classifier.predict(x_test)
#print("Test predictions:\n\n " + classification_report(y_test, predictions_RF_test))

"""
##Computing false and true positive rates
fpr, tpr,_=roc_curve(y_val, predictions_RF_val, drop_intermediate=False)

import matplotlib.pyplot as plt
plt.figure()
##Adding the ROC
plt.plot(fpr, tpr, color='red',
 lw=2, label='ROC curve')
##Random FPR and TPR
plt.plot([0, 1], [0, 1], color='blue', lw=2, linestyle='--')
##Title and label
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC curve')
plt.show() 

print("Score", roc_auc_score(y_val, predictions_RF_val))"""


Area under precission-recall is: 0.442366462517
Validation set predictions: 

             precision    recall  f1-score   support

          0       0.91      0.97      0.94      5841
          1       0.56      0.30      0.39       749

avg / total       0.87      0.89      0.88      6590


 Confusion matrix:
  [[5670  171]
 [ 528  221]]


True negatives: 5670
False positives: 171
False negatives: 528
True positives:  221


'\n##Computing false and true positive rates\nfpr, tpr,_=roc_curve(y_val, predictions_RF_val, drop_intermediate=False)\n\nimport matplotlib.pyplot as plt\nplt.figure()\n##Adding the ROC\nplt.plot(fpr, tpr, color=\'red\',\n lw=2, label=\'ROC curve\')\n##Random FPR and TPR\nplt.plot([0, 1], [0, 1], color=\'blue\', lw=2, linestyle=\'--\')\n##Title and label\nplt.xlabel(\'FPR\')\nplt.ylabel(\'TPR\')\nplt.title(\'ROC curve\')\nplt.show() \n\nprint("Score", roc_auc_score(y_val, predictions_RF_val))'

In [12]:
# Feature Scaling
sc = StandardScaler()
x_train_res = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
x_val = sc.transform(x_val)

In [13]:
#Logistic Regression Algorithm

logdown = LogisticRegression()
logdown.fit(x_train, y_train)
predictions_LR = logdown.predict(x_val)

#Results
print_results(x_val, y_val, predictions_LR, logdown)


Area under precission-recall is: 0.292483220981
Validation set predictions: 

             precision    recall  f1-score   support

          0       0.94      0.56      0.70      5841
          1       0.17      0.72      0.28       749

avg / total       0.85      0.58      0.65      6590


 Confusion matrix:
  [[3278 2563]
 [ 213  536]]


True negatives: 3278
False positives: 2563
False negatives: 213
True positives:  536


"##Computing false and true positive rates\nfpr, tpr,_=roc_curve(predictions_LR,y_val,drop_intermediate=False)\n\nimport matplotlib.pyplot as plt\nplt.figure()\n##Adding the ROC\nplt.plot(fpr, tpr, color='red',\n lw=2, label='ROC curve')\n##Random FPR and TPR\nplt.plot([0, 1], [0, 1], color='blue', lw=2, linestyle='--')\n##Title and label\nplt.xlabel('FPR')\nplt.ylabel('TPR')\nplt.title('ROC curve')\nplt.show() "

In [15]:
knn_classifier = KNeighborsClassifier(n_neighbors=28, metric = 'minkowski', p = 2)
knn_classifier.fit(x_train, y_train)
pred_i = knn_classifier.predict(x_val)

#Results
print_results(x_val, y_val, pred_i, knn_classifier)
"""##Computing false and true positive rates
fpr, tpr,_=roc_curve(y_val, pred_i, drop_intermediate=False)

import matplotlib.pyplot as plt
plt.figure()
##Adding the ROC
plt.plot(fpr, tpr, color='red',
 lw=2, label='ROC curve')
##Random FPR and TPR
plt.plot([0, 1], [0, 1], color='blue', lw=2, linestyle='--')
##Title and label
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC curve')
plt.show() """


Area under precission-recall is: 0.317915895954
Validation set predictions: 

             precision    recall  f1-score   support

          0       0.00      0.00      0.00      5841
          1       0.11      1.00      0.20       749

avg / total       0.01      0.11      0.02      6590


 Confusion matrix:
  [[   0 5841]
 [   0  749]]


True negatives: 0
False positives: 5841
False negatives: 0
True positives:  749


  'precision', 'predicted', average, warn_for)


"##Computing false and true positive rates\nfpr, tpr,_=roc_curve(y_val, pred_i, drop_intermediate=False)\n\nimport matplotlib.pyplot as plt\nplt.figure()\n##Adding the ROC\nplt.plot(fpr, tpr, color='red',\n lw=2, label='ROC curve')\n##Random FPR and TPR\nplt.plot([0, 1], [0, 1], color='blue', lw=2, linestyle='--')\n##Title and label\nplt.xlabel('FPR')\nplt.ylabel('TPR')\nplt.title('ROC curve')\nplt.show() "

In [16]:
# Gaussian Naive Bayes
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
# load the iris datasets
# fit a Naive Bayes model to the data
model = GaussianNB()
model.fit(x_train, y_train)
print(model)
# make predictions
predicted = model.predict(x_val)
# summarize the fit of the model

print_results(x_val, y_val, predicted, model)

GaussianNB(priors=None)

Area under precission-recall is: 0.556828528073
Validation set predictions: 

             precision    recall  f1-score   support

          0       0.00      0.00      0.00      5841
          1       0.11      1.00      0.20       749

avg / total       0.01      0.11      0.02      6590


 Confusion matrix:
  [[   0 5841]
 [   0  749]]


True negatives: 0
False positives: 5841
False negatives: 0
True positives:  749


  'precision', 'predicted', average, warn_for)
