In [1]:
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.metrics import confusion_matrix

import seaborn as sns
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np


%matplotlib inline

bank_data = pd.read_csv('bank-additional-full.csv')



In [2]:
#Handling categorical variables
categorical_features = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month','day_of_week', 'poutcome', 'y']
bank_data_tran = pd.get_dummies(bank_data, columns = categorical_features, drop_first=True)

#because of the high correlation coefficient between euribor3m rate and nr. of employed, the latter is going to be dropped
bank_data_tran.drop(['nr.employed'], axis = 1, inplace = True)

#apllying the same logic, emp.var.rate is going to be dropped
bank_data_tran.drop(['emp.var.rate'], axis = 1, inplace = True)

In [3]:
#Data preprocessing and splitting and scaling
x = bank_data_tran.iloc[:,bank_data_tran.columns != 'y_yes'].values
y = bank_data_tran.iloc[:, -1].values

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size = 0.2, random_state = 0)

In [9]:
#Decision tree algorithm

td_classifier = DecisionTreeClassifier(criterion='entropy', random_state = 0, splitter = 'best')
td_classifier.fit(x_train, y_train)

predictions_DT_val = td_classifier.predict(x_val)
print("Validation set predictions: \n\n" + classification_report(y_val,predictions_DT_val))

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_val, predictions_DT_val)
print("Confusion matrix looks like this: \n", cm)

print("\n Confusion matrix:\n " , confusion_matrix(y_val, predictions_DT_val))

precision, recall, thresholds = precision_recall_curve(y_val, td_classifier.predict_proba(x_val)[:,1])
area = auc(recall, precision)
print("\nArea under precission-recall is:", area)

Validation set predictions: 

             precision    recall  f1-score   support

          0       0.94      0.93      0.94      5841
          1       0.50      0.54      0.52       749

avg / total       0.89      0.89      0.89      6590

[[5437  404]
 [ 343  406]] 


 Confusion matrix:
  [[5437  404]
 [ 343  406]]

Area under precission-recall is: 0.547669600545


In [8]:
#Random Forest Classification Algorithm

# Fitting Random Forest Classification to the Training set
classifier = RandomForestClassifier(n_estimators = 1000, criterion = 'entropy', random_state = 0)
classifier.fit(x_train, y_train)

# Predicting the Test set results
predictions_RF_val = classifier.predict(x_val)

# Making the Confusion Matrix
#from sklearn.metrics import confusion_matrix
#cm = confusion_matrix(y_val, predictions_RF)


# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_val, predictions_RF_val)
print("Confusion matrix looks like this: \n", cm)

from sklearn.metrics import average_precision_score
average_precision = average_precision_score(y_val, predictions_RF_val)
print('Average precision-recall score: {0:0.2f}'.format(
      average_precision))

precision, recall, thresholds = precision_recall_curve(y_val, classifier.predict_proba(x_val)[:,1])
area = auc(recall, precision)
print("\nArea under precission-recall is:", area)

print("\nValidation set predictions: \n\n" + classification_report(y_val,predictions_RF_val))

#predictions_RF_test = classifier.predict(x_test)
#print("Test predictions:\n\n " + classification_report(y_test, predictions_RF_test))

"""
##Computing false and true positive rates
fpr, tpr,_=roc_curve(y_val, predictions_RF_val, drop_intermediate=False)

import matplotlib.pyplot as plt
plt.figure()
##Adding the ROC
plt.plot(fpr, tpr, color='red',
 lw=2, label='ROC curve')
##Random FPR and TPR
plt.plot([0, 1], [0, 1], color='blue', lw=2, linestyle='--')
##Title and label
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC curve')
plt.show() 

print("Score", roc_auc_score(y_val, predictions_RF_val))"""

[[5646  195]
 [ 390  359]]
Average precision-recall score: 0.37

Area under precission-recall is: 0.657836388889

Validation set predictions: 

             precision    recall  f1-score   support

          0       0.94      0.97      0.95      5841
          1       0.65      0.48      0.55       749

avg / total       0.90      0.91      0.91      6590



'\n##Computing false and true positive rates\nfpr, tpr,_=roc_curve(y_val, predictions_RF_val, drop_intermediate=False)\n\nimport matplotlib.pyplot as plt\nplt.figure()\n##Adding the ROC\nplt.plot(fpr, tpr, color=\'red\',\n lw=2, label=\'ROC curve\')\n##Random FPR and TPR\nplt.plot([0, 1], [0, 1], color=\'blue\', lw=2, linestyle=\'--\')\n##Title and label\nplt.xlabel(\'FPR\')\nplt.ylabel(\'TPR\')\nplt.title(\'ROC curve\')\nplt.show() \n\nprint("Score", roc_auc_score(y_val, predictions_RF_val))'

In [10]:
# Feature Scaling
sc = StandardScaler()
x_train_res = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
x_val = sc.transform(x_val)

In [12]:
#Logistic Regression Algorithm

logdown = LogisticRegression()
logdown.fit(x_train, y_train)
predictions_LR = logdown.predict(x_val)

precision, recall, thresholds = precision_recall_curve(y_val, logdown.predict_proba(x_val)[:,1])
area = auc(recall, precision)
print("Area is", area)

print(classification_report(y_val,predictions_LR))

precision, recall, thresholds = precision_recall_curve(y_val, logdown.predict_proba(x_val)[:,1])
area = auc(recall, precision)
print("\nArea under precission-recall is:", area)

"""##Computing false and true positive rates
fpr, tpr,_=roc_curve(predictions_LR,y_val,drop_intermediate=False)

import matplotlib.pyplot as plt
plt.figure()
##Adding the ROC
plt.plot(fpr, tpr, color='red',
 lw=2, label='ROC curve')
##Random FPR and TPR
plt.plot([0, 1], [0, 1], color='blue', lw=2, linestyle='--')
##Title and label
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC curve')
plt.show() """

Area is 0.3039099889
             precision    recall  f1-score   support

          0       0.94      0.49      0.64      5841
          1       0.16      0.77      0.27       749

avg / total       0.85      0.52      0.60      6590


Area under precission-recall is: 0.3039099889


"##Computing false and true positive rates\nfpr, tpr,_=roc_curve(predictions_LR,y_val,drop_intermediate=False)\n\nimport matplotlib.pyplot as plt\nplt.figure()\n##Adding the ROC\nplt.plot(fpr, tpr, color='red',\n lw=2, label='ROC curve')\n##Random FPR and TPR\nplt.plot([0, 1], [0, 1], color='blue', lw=2, linestyle='--')\n##Title and label\nplt.xlabel('FPR')\nplt.ylabel('TPR')\nplt.title('ROC curve')\nplt.show() "

In [14]:
knn_classifier = KNeighborsClassifier(n_neighbors=28, metric = 'minkowski', p = 2)
knn_classifier.fit(x_train, y_train)
pred_i = knn_classifier.predict(x_val)

print("Validation set predictions: \n\n" + classification_report(y_val,pred_i))

precision, recall, thresholds = precision_recall_curve(y_val, knn_classifier.predict_proba(x_val)[:,1])
area = auc(recall, precision)
print("\nArea under precission-recall is:", area)

"""##Computing false and true positive rates
fpr, tpr,_=roc_curve(y_val, pred_i, drop_intermediate=False)

import matplotlib.pyplot as plt
plt.figure()
##Adding the ROC
plt.plot(fpr, tpr, color='red',
 lw=2, label='ROC curve')
##Random FPR and TPR
plt.plot([0, 1], [0, 1], color='blue', lw=2, linestyle='--')
##Title and label
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC curve')
plt.show() """

  'precision', 'predicted', average, warn_for)


Validation set predictions: 

             precision    recall  f1-score   support

          0       0.89      1.00      0.94      5841
          1       0.00      0.00      0.00       749

avg / total       0.79      0.89      0.83      6590


Area under precission-recall is: 0.292691997511


"##Computing false and true positive rates\nfpr, tpr,_=roc_curve(y_val, pred_i, drop_intermediate=False)\n\nimport matplotlib.pyplot as plt\nplt.figure()\n##Adding the ROC\nplt.plot(fpr, tpr, color='red',\n lw=2, label='ROC curve')\n##Random FPR and TPR\nplt.plot([0, 1], [0, 1], color='blue', lw=2, linestyle='--')\n##Title and label\nplt.xlabel('FPR')\nplt.ylabel('TPR')\nplt.title('ROC curve')\nplt.show() "

In [15]:
# Gaussian Naive Bayes
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
# load the iris datasets
# fit a Naive Bayes model to the data
model = GaussianNB()
model.fit(x_train, y_train)
print(model)
# make predictions
predicted = model.predict(x_val)
# summarize the fit of the model
print(metrics.classification_report(y_val, predicted))
print(metrics.confusion_matrix(y_val, predicted))

precision, recall, thresholds = precision_recall_curve(y_val, model.predict_proba(x_val)[:,1])
area = auc(recall, precision)
print("\nArea under precission-recall is:", area)

GaussianNB(priors=None)
             precision    recall  f1-score   support

          0       0.00      0.00      0.00      5841
          1       0.11      1.00      0.20       749

avg / total       0.01      0.11      0.02      6590

[[   0 5841]
 [   0  749]]

Area under precission-recall is: 0.556828528073


  'precision', 'predicted', average, warn_for)
