In [1]:
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.metrics import confusion_matrix

import seaborn as sns
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np


%matplotlib inline

bank_data = pd.read_csv('bank-additional-full.csv')



In [2]:
#Handling categorical variables
categorical_features = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month','day_of_week', 'poutcome', 'y']
bank_data_tran = pd.get_dummies(bank_data, columns = categorical_features, drop_first=True)

bank_data_tran.drop(['duration'], axis = 1, inplace = True)

#because of the high correlation coefficient between euribor3m rate and nr. of employed, the latter is going to be dropped
bank_data_tran.drop(['nr.employed'], axis = 1, inplace = True)

#apllying the same logic, emp.var.rate is going to be dropped
bank_data_tran.drop(['emp.var.rate'], axis = 1, inplace = True)

In [3]:
#Data preprocessing and splitting and scaling
x = bank_data_tran.iloc[:,bank_data_tran.columns != 'y_yes'].values
y = bank_data_tran.iloc[:, -1].values

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size = 0.2, random_state = 0)

In [4]:
sm = SMOTE(random_state=12, ratio = 1.0, k_neighbors = 5)
x_train_res, y_train_res = sm.fit_sample(x_train, y_train)



In [5]:
#Function for printing results
def print_results(x_input, true_output, predicted_output, classifier):
    precision, recall, thresholds = precision_recall_curve(y_val, classifier.predict_proba(x_input)[:,1])
    area = auc(recall, precision)
    print("\nArea under precission-recall is:", area)
    
    print("Validation set predictions: \n\n" + classification_report(true_output,predicted_output))
    print("\n Confusion matrix:\n " , confusion_matrix(true_output, predicted_output))
    print("\n")
    print("True negatives:", confusion_matrix(true_output, predicted_output)[0][0])
    print("False positives:", confusion_matrix(true_output, predicted_output)[0][1])
    print("False negatives:", confusion_matrix(true_output, predicted_output)[1][0])
    print("True positives: ", confusion_matrix(true_output, predicted_output)[1][1])

In [6]:
#Decision tree algorithm

td_classifier = DecisionTreeClassifier(criterion='entropy', random_state = 0, splitter = 'best')
td_classifier.fit(x_train_res, y_train_res)

predictions_DT_val = td_classifier.predict(x_val)

print_results(x_val, y_val, predictions_DT_val, td_classifier)


Area under precission-recall is: 0.360806085614
Validation set predictions: 

             precision    recall  f1-score   support

          0       0.91      0.90      0.90      5841
          1       0.30      0.34      0.32       749

avg / total       0.84      0.83      0.84      6590


 Confusion matrix:
  [[5233  608]
 [ 491  258]]


True negatives: 5233
False positives: 608
False negatives: 491
True positives:  258


In [7]:
#Boosting - AdaBoost
from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=20),
                         algorithm = "SAMME",
                         n_estimators=200)
                         
ada.fit(x_train_res, y_train_res)
predictions_ada = ada.predict(x_val)



print_results(x_val, y_val, predictions_ada, ada)


Area under precission-recall is: 0.379999211899
Validation set predictions: 

             precision    recall  f1-score   support

          0       0.92      0.96      0.94      5841
          1       0.51      0.33      0.40       749

avg / total       0.87      0.89      0.88      6590


 Confusion matrix:
  [[5604  237]
 [ 501  248]]


True negatives: 5604
False positives: 237
False negatives: 501
True positives:  248


In [8]:
#Gradient boosting
from sklearn.ensemble import GradientBoostingClassifier
gradientBoost = GradientBoostingClassifier(n_estimators=100, random_state=1)
gradientBoost.fit(x_train_res, y_train_res)

gradient_predictions = gradientBoost.predict(x_val)

#Results
print_results(x_val, y_val, gradient_predictions, gradientBoost)


Area under precission-recall is: 0.468162103388
Validation set predictions: 

             precision    recall  f1-score   support

          0       0.93      0.96      0.94      5841
          1       0.54      0.40      0.46       749

avg / total       0.88      0.89      0.89      6590


 Confusion matrix:
  [[5582  259]
 [ 450  299]]


True negatives: 5582
False positives: 259
False negatives: 450
True positives:  299


In [9]:
#Bagging with decission trees
from sklearn.ensemble import BaggingClassifier
bagg = BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=20), n_estimators=100, random_state=7)
bagg.fit(x_train_res, y_train_res)

predictions_bag = bagg.predict(x_val)

#Results
print_results(x_val, y_val, predictions_bag, bagg)


Area under precission-recall is: 0.429860219451
Validation set predictions: 

             precision    recall  f1-score   support

          0       0.92      0.95      0.94      5841
          1       0.50      0.38      0.43       749

avg / total       0.87      0.89      0.88      6590


 Confusion matrix:
  [[5556  285]
 [ 468  281]]


True negatives: 5556
False positives: 285
False negatives: 468
True positives:  281


In [14]:
#Random Forest Classification Algorithm

# Fitting Random Forest Classification to the Training set
classifier = RandomForestClassifier(n_estimators = 1000, criterion = 'entropy', random_state = 0)
classifier.fit(x_train_res, y_train_res)

# Predicting the Test set results
predictions_RF_val = classifier.predict(x_val)

print(classifier.feature_importances_ )
# Making the Confusion Matrix
#from sklearn.metrics import confusion_matrix
#cm = confusion_matrix(y_val, predictions_RF)

#Results
print_results(x_val, y_val, predictions_RF_val, classifier)

#predictions_RF_test = classifier.predict(x_test)
#print("Test predictions:\n\n " + classification_report(y_test, predictions_RF_test))

[  7.30363816e-02   5.00202929e-02   2.50349005e-02   2.17032439e-02
   3.22342745e-02   3.96364869e-02   1.16257088e-01   2.71154462e-02
   4.97161396e-03   3.75555643e-03   9.38227248e-03   4.29249841e-03
   4.46755344e-03   1.24195488e-02   3.74028023e-03   2.36021210e-02
   3.70154717e-03   1.16128856e-03   2.95552783e-02   1.70495689e-02
   5.92463339e-04   6.74741636e-03   2.15809452e-02   2.70329211e-02
   6.82474482e-05   1.72573857e-02   3.41165943e-02   4.77972661e-03
   4.10433748e-02   0.00000000e+00   1.81649810e-03   6.60337212e-02
   1.87774862e-03   3.14395658e-02   3.75212691e-02   4.45385570e-03
   7.78116724e-04   4.78664941e-03   6.30596332e-03   3.39507529e-03
   1.71808864e-02   4.13896218e-03   6.44286399e-03   1.84183057e-03
   3.09913249e-02   3.01530666e-02   2.95176767e-02   2.60261832e-02
   1.87439450e-02   2.01984799e-02]

Area under precission-recall is: 0.433939719924
Validation set predictions: 

             precision    recall  f1-score   support

   

In [7]:
# Feature Scaling
sc = StandardScaler()
x_train_res = sc.fit_transform(x_train_res)
x_test = sc.transform(x_test)
x_val = sc.transform(x_val)

In [13]:
#Logistic Regression Algorithm

logdown = LogisticRegression()
logdown.fit(x_train_res, y_train_res)
predictions_LR = logdown.predict(x_val)

print("Coefficients: ",  logdown.coef_)

print_results(x_val, y_val, predictions_LR, logdown)

Coefficients:  [[ 0.00168964 -0.05494079 -0.00214708 -0.13766523  0.06176226  0.05381206
  -0.41568604 -0.19495463 -0.11010587 -0.16507294 -0.16904108  0.08520344
  -0.28733114 -0.30682795  0.26759707 -0.22627011 -0.2209257  -0.06888995
   0.02983128  0.03872416  0.12972041  0.0136768  -0.08023718  0.01568799
   0.00315044 -0.15112582  0.08693888 -0.04780459 -0.29960794  0.
  -0.33955041  0.04772519 -0.33955041 -0.19169678 -0.46884826 -0.52499315
   0.08560154  0.05249188  0.14891135  1.1000481  -0.7431984  -0.54084029
   0.62398396 -0.15852713 -0.19365159 -0.01070711 -0.09455541  0.02339637
   0.53454495  0.12333968]]

Area under precission-recall is: 0.450717257292
Validation set predictions: 

             precision    recall  f1-score   support

          0       0.95      0.83      0.88      5841
          1       0.32      0.65      0.43       749

avg / total       0.88      0.81      0.83      6590


 Confusion matrix:
  [[4829 1012]
 [ 265  484]]


True negatives: 4829
False p

In [11]:
#SVM algorithm

#svm_classifier = SVC(kernel = 'linear', random_state = 0)
#svm_classifier.fit(x_train, y_train)

#predictions_SVC_val = svm_classifier.predict(x_val)
#print("Validation set predictions: \n\n" + classification_report(y_val,predictions_SVC_val))

#predictions_SVC_test = svm_classifier.predict(x_test)
#print("Test set predictions:\n\n" + classification_report(y_test, predictions_SVC_test))

In [12]:
#KNN Algorithm

#knn_classifier = KNeighborsClassifier(n_neighbors=25, metric = 'minkowski', p = 2)
#knn_classifier.fit(x_train, y_train)

#predictions_KNN_val = knn_classifier.predict(x_val)
#print("Validation set predictions: \n\n" + classification_report(y_val,predictions_KNN_val))

error_rate = []

# Will take some time
for i in range(1,10):    
    knn_classifier = KNeighborsClassifier(n_neighbors=i, metric = 'minkowski', p = 2)
    knn_classifier.fit(x_train, y_train)
    pred_i = knn_classifier.predict(x_val)
    error_rate.append(np.mean(pred_i != y_val))
#predictions_KNN_test = knn_classifier.predict(x_test)
#print("Test set predictions:\n\n" + classification_report(y_test, predictions_KNN_test))


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\Milos\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-12-1a41fc8d73a3>", line 14, in <module>
    knn_classifier.fit(x_train, y_train)
  File "C:\Users\Milos\Anaconda3\lib\site-packages\sklearn\neighbors\base.py", line 790, in fit
    return self._fit(X)
  File "C:\Users\Milos\Anaconda3\lib\site-packages\sklearn\neighbors\base.py", line 248, in _fit
    **self.effective_metric_params_)
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Milos\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 1821, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most rec

KeyboardInterrupt: 

In [13]:
plt.figure(figsize=(7,5))
plt.plot(range(1,50),error_rate,color='blue', linestyle='dashed', marker='o',
         markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\Milos\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-13-85f724bcbbcb>", line 3, in <module>
    markerfacecolor='red', markersize=10)
  File "C:\Users\Milos\Anaconda3\lib\site-packages\matplotlib\pyplot.py", line 3154, in plot
    ax = gca()
  File "C:\Users\Milos\Anaconda3\lib\site-packages\matplotlib\pyplot.py", line 936, in gca
    return gcf().gca(**kwargs)
  File "C:\Users\Milos\Anaconda3\lib\site-packages\matplotlib\figure.py", line 1359, in gca
    return self.add_subplot(1, 1, 1, **kwargs)
  File "C:\Users\Milos\Anaconda3\lib\site-packages\matplotlib\figure.py", line 1005, in add_subplot
    a = subplot_class_factory(projection_class)(self, *args, **kwargs)
  File "C:\Users\Milos\Anaconda3\lib\site-packages\matplotlib\axes\_subplots.py", line 73, in __init__
    self._axes_class.__init__(self, fig, self.figbox, *

KeyboardInterrupt: 

<matplotlib.figure.Figure at 0x23fc2c6db70>

In [14]:
knn_classifier = KNeighborsClassifier(n_neighbors=28, metric = 'minkowski', p = 2)
knn_classifier.fit(x_train_res, y_train_res)
pred_i = knn_classifier.predict(x_val)


#Results
print_results(x_val, y_val, pred_i, knn_classifier)

"""##Computing false and true positive rates
fpr, tpr,_=roc_curve(y_val, pred_i, drop_intermediate=False)

import matplotlib.pyplot as plt
plt.figure()
##Adding the ROC
plt.plot(fpr, tpr, color='red',
 lw=2, label='ROC curve')
##Random FPR and TPR
plt.plot([0, 1], [0, 1], color='blue', lw=2, linestyle='--')
##Title and label
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC curve')
plt.show() """

KeyboardInterrupt: 

In [51]:
# Gaussian Naive Bayes
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
# load the iris datasets
# fit a Naive Bayes model to the data
model = GaussianNB()
model.fit(x_train_res, y_train_res)
# make predictions
predicted = model.predict(x_val)
# summarize the fit of the model

#Results
print_results(x_val, y_val, predicted, model)

GaussianNB(priors=None)

Area under precission-recall is: 0.481129774891
Validation set predictions: 

             precision    recall  f1-score   support

          0       0.94      0.90      0.92      5841
          1       0.42      0.54      0.47       749

avg / total       0.88      0.86      0.87      6590


 Confusion matrix:
  [[5284  557]
 [ 345  404]]


True negatives: 5284
False positives: 557
False negatives: 345
True positives:  404


In [3]:
#Boosting 
from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                         algorithm = "SAMME",
                         n_estimators=200)
                         
ada.fit(x_train_res, y_train_res)


NameError: name 'DecisionTreeClassifier' is not defined