In [4]:
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.metrics import confusion_matrix

import seaborn as sns
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np


%matplotlib inline

bank_data = pd.read_csv('bank-additional-full.csv')



In [15]:
#Handling categorical variables
categorical_features = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month','day_of_week', 'poutcome', 'y']
bank_data_tran = pd.get_dummies(bank_data, columns = categorical_features, drop_first=True)

bank_data_tran.drop(['duration'], axis = 1, inplace = True)

#because of the high correlation coefficient between euribor3m rate and nr. of employed, the latter is going to be dropped
bank_data_tran.drop(['nr.employed'], axis = 1, inplace = True)

#apllying the same logic, emp.var.rate is going to be dropped
bank_data_tran.drop(['emp.var.rate'], axis = 1, inplace = True)

In [6]:
#Data preprocessing and splitting and scaling
x = bank_data_tran.iloc[:,bank_data_tran.columns != 'y_yes'].values
y = bank_data_tran.iloc[:, -1].values

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size = 0.2, random_state = 0)

In [7]:
sm = SMOTE(random_state=12, ratio = 1.0, k_neighbors = 5)
x_train_res, y_train_res = sm.fit_sample(x_train, y_train)



In [50]:
#Function for printing results
def print_results(x_input, true_output, predicted_output, classifier):
    precision, recall, thresholds = precision_recall_curve(y_val, classifier.predict_proba(x_input)[:,1])
    area = auc(recall, precision)
    print("\nArea under precission-recall is:", area)
    
    print("Validation set predictions: \n\n" + classification_report(true_output,predicted_output))
    print("\n Confusion matrix:\n " , confusion_matrix(true_output, predicted_output))
    print("\n")
    print("True negatives:", confusion_matrix(true_output, predicted_output)[0][0])
    print("False positives:", confusion_matrix(true_output, predicted_output)[0][1])
    print("False negatives:", confusion_matrix(true_output, predicted_output)[1][0])
    print("True positives: ", confusion_matrix(true_output, predicted_output)[1][1])

In [49]:
#Decision tree algorithm

td_classifier = DecisionTreeClassifier(criterion='entropy', random_state = 0, splitter = 'best')
td_classifier.fit(x_train_res, y_train_res)

predictions_DT_val = td_classifier.predict(x_val)

print_results(x_val, y_val, predictions_DT_val, td_classifier)


Area under precission-recall is: 0.54447406992
Validation set predictions: 

             precision    recall  f1-score   support

          0       0.94      0.93      0.94      5841
          1       0.51      0.53      0.52       749

avg / total       0.89      0.89      0.89      6590


 Confusion matrix:
  [[5461  380]
 [ 355  394]]
True negatives: 5461
False positives: 380
False negatives: 355
True positives:  394


In [35]:
#Boosting 
from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=20),
                         algorithm = "SAMME",
                         n_estimators=200)
                         
ada.fit(x_train_res, y_train_res)
predictions_ada = ada.predict(x_val)

print_results(x_val, y_val, predictions_ada, ada)

Validation set predictions: 

             precision    recall  f1-score   support

          0       0.95      0.95      0.95      5841
          1       0.61      0.57      0.59       749

avg / total       0.91      0.91      0.91      6590


 Confusion matrix:
  [[5562  279]
 [ 319  430]]

Area under precission-recall is: 0.614624516763


In [36]:
#Gradient boosting
from sklearn.ensemble import GradientBoostingClassifier
gradientBoost = GradientBoostingClassifier(n_estimators=100, random_state=1)
gradientBoost.fit(x_train_res, y_train_res)

gradient_predictions = gradientBoost.predict(x_val)

#Results
print_results(x_val, y_val, gradient_predictions, gradientBoost)

Validation set predictions: 

             precision    recall  f1-score   support

          0       0.95      0.94      0.95      5841
          1       0.59      0.64      0.61       749

avg / total       0.91      0.91      0.91      6590


 Confusion matrix:
  [[5504  337]
 [ 273  476]]

Area under precission-recall is: 0.628834260863


In [40]:
#Bagging with decission trees
from sklearn.ensemble import BaggingClassifier
bagg = BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=20), n_estimators=100, random_state=7)
bagg.fit(x_train_res, y_train_res)

predictions_bag = bagg.predict(x_val)

#Results
print_results(x_val, y_val, predictions_bag, bagg)


Area under precission-recall is: 0.603609082238
Validation set predictions: 

             precision    recall  f1-score   support

          0       0.95      0.95      0.95      5841
          1       0.58      0.58      0.58       749

avg / total       0.90      0.90      0.90      6590


 Confusion matrix:
  [[5524  317]
 [ 314  435]]
True negatives: 5524
False positives: 317
False negatives: 314
True positives:  435


In [41]:
#Random Forest Classification Algorithm

# Fitting Random Forest Classification to the Training set
classifier = RandomForestClassifier(n_estimators = 1000, criterion = 'entropy', random_state = 0)
classifier.fit(x_train_res, y_train_res)

# Predicting the Test set results
predictions_RF_val = classifier.predict(x_val)

# Making the Confusion Matrix
#from sklearn.metrics import confusion_matrix
#cm = confusion_matrix(y_val, predictions_RF)

#Results
print_results(x_val, y_val, predictions_RF_val, classifier)

#predictions_RF_test = classifier.predict(x_test)
#print("Test predictions:\n\n " + classification_report(y_test, predictions_RF_test))

"""
##Computing false and true positive rates
fpr, tpr,_=roc_curve(y_val, predictions_RF_val, drop_intermediate=False)

import matplotlib.pyplot as plt
plt.figure()
##Adding the ROC
plt.plot(fpr, tpr, color='red',
 lw=2, label='ROC curve')
##Random FPR and TPR
plt.plot([0, 1], [0, 1], color='blue', lw=2, linestyle='--')
##Title and label
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC curve')
plt.show() 

print("Score", roc_auc_score(y_val, predictions_RF_val))"""


Area under precission-recall is: 0.649656501921
Validation set predictions: 

             precision    recall  f1-score   support

          0       0.94      0.96      0.95      5841
          1       0.62      0.50      0.56       749

avg / total       0.90      0.91      0.90      6590


 Confusion matrix:
  [[5613  228]
 [ 372  377]]
True negatives: 5613
False positives: 228
False negatives: 372
True positives:  377


'\n##Computing false and true positive rates\nfpr, tpr,_=roc_curve(y_val, predictions_RF_val, drop_intermediate=False)\n\nimport matplotlib.pyplot as plt\nplt.figure()\n##Adding the ROC\nplt.plot(fpr, tpr, color=\'red\',\n lw=2, label=\'ROC curve\')\n##Random FPR and TPR\nplt.plot([0, 1], [0, 1], color=\'blue\', lw=2, linestyle=\'--\')\n##Title and label\nplt.xlabel(\'FPR\')\nplt.ylabel(\'TPR\')\nplt.title(\'ROC curve\')\nplt.show() \n\nprint("Score", roc_auc_score(y_val, predictions_RF_val))'

In [7]:
# Feature Scaling
sc = StandardScaler()
x_train_res = sc.fit_transform(x_train_res)
x_test = sc.transform(x_test)
x_val = sc.transform(x_val)

In [42]:
#Logistic Regression Algorithm

logdown = LogisticRegression()
logdown.fit(x_train_res, y_train_res)
predictions_LR = logdown.predict(x_val)

print_results(x_val, y_val, predictions_LR, logdown)

"""##Computing false and true positive rates
fpr, tpr,_=roc_curve(predictions_LR,y_val,drop_intermediate=False)

import matplotlib.pyplot as plt
plt.figure()
##Adding the ROC
plt.plot(fpr, tpr, color='red',
 lw=2, label='ROC curve')
##Random FPR and TPR
plt.plot([0, 1], [0, 1], color='blue', lw=2, linestyle='--')
##Title and label
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC curve')
plt.show() """


Area under precission-recall is: 0.562911681776
Validation set predictions: 

             precision    recall  f1-score   support

          0       0.98      0.86      0.91      5841
          1       0.44      0.88      0.59       749

avg / total       0.92      0.86      0.88      6590


 Confusion matrix:
  [[5002  839]
 [  91  658]]
True negatives: 5002
False positives: 839
False negatives: 91
True positives:  658


"##Computing false and true positive rates\nfpr, tpr,_=roc_curve(predictions_LR,y_val,drop_intermediate=False)\n\nimport matplotlib.pyplot as plt\nplt.figure()\n##Adding the ROC\nplt.plot(fpr, tpr, color='red',\n lw=2, label='ROC curve')\n##Random FPR and TPR\nplt.plot([0, 1], [0, 1], color='blue', lw=2, linestyle='--')\n##Title and label\nplt.xlabel('FPR')\nplt.ylabel('TPR')\nplt.title('ROC curve')\nplt.show() "

In [43]:
#y_pred_prob = logdown.predict_proba(x_val)[:, 1]

##plt.rcParams['font.size'] = 14
#plt.hist(y_pred_prob, bins = 8)
#plt.xlim(0,1)
#plt.title("Historgram of predicted probabilities")
#plt.xlabel("Predicted probability of telemarketing")
#plt.ylabel("Frequency")


#Since we can see highest number of instances have class probability around 0.25, we will set that as a limit, maybe 0.3
y_pred_prob = y_pred_prob.reshape(1,-1)
from sklearn.preprocessing import binarize
y_pred_class = binarize(y_pred_prob, 0.1)[0]

print (confusion_matrix(y_val, y_pred_class))
print(classification_report(y_val,y_pred_class))

from sklearn.metrics import average_precision_score
average_precision = average_precision_score(y_val, y_pred_class)
print('Average precision-recall score: {0:0.2f}'.format(
      average_precision))

##Computing false and true positive rates
fpr, tpr,_=roc_curve(y_val, y_pred_class,drop_intermediate=False)

import matplotlib.pyplot as plt
plt.figure()
##Adding the ROC
plt.plot(fpr, tpr, color='red',
 lw=2, label='ROC curve')
##Random FPR and TPR
plt.plot([0, 1], [0, 1], color='blue', lw=2, linestyle='--')
##Title and label
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC curve')
plt.show() 



In [11]:
#SVM algorithm

#svm_classifier = SVC(kernel = 'linear', random_state = 0)
#svm_classifier.fit(x_train, y_train)

#predictions_SVC_val = svm_classifier.predict(x_val)
#print("Validation set predictions: \n\n" + classification_report(y_val,predictions_SVC_val))

#predictions_SVC_test = svm_classifier.predict(x_test)
#print("Test set predictions:\n\n" + classification_report(y_test, predictions_SVC_test))

In [12]:
#KNN Algorithm

#knn_classifier = KNeighborsClassifier(n_neighbors=25, metric = 'minkowski', p = 2)
#knn_classifier.fit(x_train, y_train)

#predictions_KNN_val = knn_classifier.predict(x_val)
#print("Validation set predictions: \n\n" + classification_report(y_val,predictions_KNN_val))

error_rate = []

# Will take some time
for i in range(1,10):    
    knn_classifier = KNeighborsClassifier(n_neighbors=i, metric = 'minkowski', p = 2)
    knn_classifier.fit(x_train, y_train)
    pred_i = knn_classifier.predict(x_val)
    error_rate.append(np.mean(pred_i != y_val))
#predictions_KNN_test = knn_classifier.predict(x_test)
#print("Test set predictions:\n\n" + classification_report(y_test, predictions_KNN_test))


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\Milos\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-12-1a41fc8d73a3>", line 14, in <module>
    knn_classifier.fit(x_train, y_train)
  File "C:\Users\Milos\Anaconda3\lib\site-packages\sklearn\neighbors\base.py", line 790, in fit
    return self._fit(X)
  File "C:\Users\Milos\Anaconda3\lib\site-packages\sklearn\neighbors\base.py", line 248, in _fit
    **self.effective_metric_params_)
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Milos\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 1821, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most rec

KeyboardInterrupt: 

In [13]:
plt.figure(figsize=(7,5))
plt.plot(range(1,50),error_rate,color='blue', linestyle='dashed', marker='o',
         markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\Milos\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-13-85f724bcbbcb>", line 3, in <module>
    markerfacecolor='red', markersize=10)
  File "C:\Users\Milos\Anaconda3\lib\site-packages\matplotlib\pyplot.py", line 3154, in plot
    ax = gca()
  File "C:\Users\Milos\Anaconda3\lib\site-packages\matplotlib\pyplot.py", line 936, in gca
    return gcf().gca(**kwargs)
  File "C:\Users\Milos\Anaconda3\lib\site-packages\matplotlib\figure.py", line 1359, in gca
    return self.add_subplot(1, 1, 1, **kwargs)
  File "C:\Users\Milos\Anaconda3\lib\site-packages\matplotlib\figure.py", line 1005, in add_subplot
    a = subplot_class_factory(projection_class)(self, *args, **kwargs)
  File "C:\Users\Milos\Anaconda3\lib\site-packages\matplotlib\axes\_subplots.py", line 73, in __init__
    self._axes_class.__init__(self, fig, self.figbox, *

KeyboardInterrupt: 

<matplotlib.figure.Figure at 0x23fc2c6db70>

In [14]:
knn_classifier = KNeighborsClassifier(n_neighbors=28, metric = 'minkowski', p = 2)
knn_classifier.fit(x_train_res, y_train_res)
pred_i = knn_classifier.predict(x_val)


#Results
print_results(x_val, y_val, pred_i, knn_classifier)

"""##Computing false and true positive rates
fpr, tpr,_=roc_curve(y_val, pred_i, drop_intermediate=False)

import matplotlib.pyplot as plt
plt.figure()
##Adding the ROC
plt.plot(fpr, tpr, color='red',
 lw=2, label='ROC curve')
##Random FPR and TPR
plt.plot([0, 1], [0, 1], color='blue', lw=2, linestyle='--')
##Title and label
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC curve')
plt.show() """

KeyboardInterrupt: 

In [51]:
# Gaussian Naive Bayes
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
# load the iris datasets
# fit a Naive Bayes model to the data
model = GaussianNB()
model.fit(x_train_res, y_train_res)
# make predictions
predicted = model.predict(x_val)
# summarize the fit of the model

#Results
print_results(x_val, y_val, predicted, model)

GaussianNB(priors=None)

Area under precission-recall is: 0.481129774891
Validation set predictions: 

             precision    recall  f1-score   support

          0       0.94      0.90      0.92      5841
          1       0.42      0.54      0.47       749

avg / total       0.88      0.86      0.87      6590


 Confusion matrix:
  [[5284  557]
 [ 345  404]]


True negatives: 5284
False positives: 557
False negatives: 345
True positives:  404


In [3]:
#Boosting 
from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                         algorithm = "SAMME",
                         n_estimators=200)
                         
ada.fit(x_train_res, y_train_res)


NameError: name 'DecisionTreeClassifier' is not defined