In [19]:
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, average_precision_score, precision_recall_curve, auc
from sklearn.cross_validation import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB


import seaborn as sns
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

import warnings; warnings.simplefilter('ignore')
%matplotlib inline


bank_data = pd.read_csv('bank-additional-full.csv')

In [2]:
#Handling categorical variables
categorical_features = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month','day_of_week', 'poutcome', 'y']
bank_data_tran = pd.get_dummies(bank_data, columns = categorical_features, drop_first=True)

bank_data_tran.drop(['duration'], axis = 1, inplace = True)

#because of the high correlation coefficient between euribor3m rate and nr. of employed, the latter is going to be dropped
bank_data_tran.drop(['nr.employed'], axis = 1, inplace = True)

#apllying the same logic, emp.var.rate is going to be dropped
bank_data_tran.drop(['emp.var.rate'], axis = 1, inplace = True)

In [3]:
#Function for printing results
def print_results(x_input, true_output, predicted_output, classifier):
    precision, recall, thresholds = precision_recall_curve(y_val, classifier.predict_proba(x_input)[:,1])
    area = auc(recall, precision)
    print("\nArea under precission-recall is:", area)
    
    print("Validation set predictions: \n\n" + classification_report(true_output,predicted_output))
    print("\n Confusion matrix:\n " , confusion_matrix(true_output, predicted_output))
    print("\n")
    print("True negatives:", confusion_matrix(true_output, predicted_output)[0][0])
    print("False positives:", confusion_matrix(true_output, predicted_output)[0][1])
    print("False negatives:", confusion_matrix(true_output, predicted_output)[1][0])
    print("True positives: ", confusion_matrix(true_output, predicted_output)[1][1])

In [4]:
#Data preprocessing and splitting and scaling
msk = np.random.rand(len(bank_data_tran)) < 0.8

train = bank_data_tran[msk]
test = bank_data_tran[~msk]

msk2 = np.random.rand(len(train)) < 0.8

train2 = train[msk2]
val = train[~msk2]


x = bank_data_tran.iloc[:,bank_data_tran.columns != 'y_yes'].values
y = bank_data_tran.iloc[:, -1].values

In [5]:
#Downsampled

# Separate majority and minority classes
df_majority = train2[train2.y_yes ==0]
df_minority = train2[train2.y_yes ==1]

In [6]:
# Downsample majority class
df_majority_downsampled = resample(df_majority, 
                                 replace=False,    # sample without replacement
                                 n_samples=5000,     # to match minority class
                                 random_state=123) # reproducible results

In [7]:
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

In [8]:
df_downsampled.y_yes.value_counts()

0    5000
1    2946
Name: y_yes, dtype: int64

In [9]:
#Data preprocessing and splitting 
x_down = df_downsampled.iloc[:,df_downsampled.columns != 'y_yes'].values
y_down = df_downsampled.iloc[:, -1].values

x_train = x_down
y_train = y_down

x_val = val.iloc[:, val.columns != 'y_yes'].values
y_val = val.iloc[:, -1].values

x_test = test.iloc[:, test.columns!='y_yes'].values
y_test = test.iloc[:, -1].values

In [10]:
#Decision tree algorithm
td_classifier = DecisionTreeClassifier(criterion='entropy', random_state = 0)
td_classifier.fit(x_train, y_train)

predictions_DT_val = td_classifier.predict(x_val)

#Results 
print_results(x_val, y_val, predictions_DT_val, td_classifier)


Area under precission-recall is: 0.4262191568742837
Validation set predictions: 

             precision    recall  f1-score   support

          0       0.93      0.75      0.83      5805
          1       0.23      0.57      0.33       784

avg / total       0.85      0.73      0.77      6589


 Confusion matrix:
  [[4356 1449]
 [ 339  445]]


True negatives: 4356
False positives: 1449
False negatives: 339
True positives:  445


In [13]:
#Boosting - AdaBoost
from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=20),
                         algorithm = "SAMME",
                         n_estimators=200)
                         
ada.fit(x_train, y_train)
predictions_ada = ada.predict(x_val)



print_results(x_val, y_val, predictions_ada, ada)


Area under precission-recall is: 0.39913235270112163
Validation set predictions: 

             precision    recall  f1-score   support

          0       0.93      0.84      0.88      5805
          1       0.32      0.57      0.41       784

avg / total       0.86      0.80      0.83      6589


 Confusion matrix:
  [[4851  954]
 [ 338  446]]


True negatives: 4851
False positives: 954
False negatives: 338
True positives:  446


In [12]:
#Gradient boosting
from sklearn.ensemble import GradientBoostingClassifier
gradientBoost = GradientBoostingClassifier(n_estimators=100, random_state=1)
gradientBoost.fit(x_train, y_train)

gradient_predictions = gradientBoost.predict(x_val)

#Results
print_results(x_val, y_val, gradient_predictions, gradientBoost)


Area under precission-recall is: 0.4671765064134424
Validation set predictions: 

             precision    recall  f1-score   support

          0       0.94      0.90      0.92      5805
          1       0.44      0.58      0.50       784

avg / total       0.88      0.86      0.87      6589


 Confusion matrix:
  [[5234  571]
 [ 328  456]]


True negatives: 5234
False positives: 571
False negatives: 328
True positives:  456


In [14]:
#Bagging with decission trees
from sklearn.ensemble import BaggingClassifier
bagg = BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=20), n_estimators=100, random_state=7)
bagg.fit(x_train_res, y_train_res)

predictions_bag = bagg.predict(x_val)

#Results
print_results(x_val, y_val, predictions_bag, bagg)

NameError: name 'x_train_res' is not defined

In [12]:
#Random Forest Classification Algorithm

# Fitting Random Forest Classification to the Training set
classifier = RandomForestClassifier(n_estimators = 1000, criterion = 'entropy', random_state = 0)
classifier.fit(x_train, y_train)

# Predicting the Test set results
predictions_RF_val = classifier.predict(x_val)

# Making the Confusion Matrix
#from sklearn.metrics import confusion_matrix
#cm = confusion_matrix(y_val, predictions_RF)

#Results
print_results(x_val, y_val, predictions_RF_val, classifier)


Area under precission-recall is: 0.45565005571591666
Validation set predictions: 

             precision    recall  f1-score   support

          0       0.94      0.89      0.91      5813
          1       0.41      0.57      0.48       814

avg / total       0.87      0.85      0.86      6627


 Confusion matrix:
  [[5148  665]
 [ 346  468]]


True negatives: 5148
False positives: 665
False negatives: 346
True positives:  468


In [16]:
# Feature Scaling, because it isn't needed for Random Forest and Decision treefa
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
x_val = sc.transform(x_val)

In [17]:
#Logistic Regression Algorithm
logdown = LogisticRegression(class_weight = 'balanced')
logdown.fit(x_train, y_train)
predictions_LR = logdown.predict(x_val)

#Results
print_results(x_val, y_val, predictions_LR, logdown)


Area under precission-recall is: 0.45719880242425104
Validation set predictions: 

             precision    recall  f1-score   support

          0       0.94      0.83      0.89      5805
          1       0.34      0.63      0.44       784

avg / total       0.87      0.81      0.83      6589


 Confusion matrix:
  [[4842  963]
 [ 293  491]]


True negatives: 4842
False positives: 963
False negatives: 293
True positives:  491


In [18]:
#SVM algorithm

svm_classifier = SVC(kernel = 'linear', random_state = 0, probability = True)
svm_classifier.fit(x_train, y_train)
predictions_SVC_val = svm_classifier.predict(x_val)

#Results
print_results(x_val, y_val, predictions_SVC_val, svm_classifier)


Area under precission-recall is: 0.4334487567606329
Validation set predictions: 

             precision    recall  f1-score   support

          0       0.94      0.92      0.93      5805
          1       0.47      0.54      0.50       784

avg / total       0.88      0.87      0.88      6589


 Confusion matrix:
  [[5324  481]
 [ 362  422]]


True negatives: 5324
False positives: 481
False negatives: 362
True positives:  422


In [19]:
#KNN Algorithm

knn_classifier = KNeighborsClassifier(n_neighbors=4, metric = 'minkowski', p = 2)
knn_classifier.fit(x_train, y_train)

predictions_KNN_val = knn_classifier.predict(x_val)

#Results
print_results(x_val, y_val, predictions_KNN_val, knn_classifier)

#predictions_KNN_test = knn_classifier.predict(x_test)
#print("Test set predictions:\n\n" + classification_report(y_test, predictions_KNN_test))


Area under precission-recall is: 0.3928402495515153
Validation set predictions: 

             precision    recall  f1-score   support

          0       0.92      0.91      0.91      5813
          1       0.40      0.43      0.41       814

avg / total       0.86      0.85      0.85      6627


 Confusion matrix:
  [[5281  532]
 [ 462  352]]


True negatives: 5281
False positives: 532
False negatives: 462
True positives:  352


In [None]:
# Gaussian Naive Bayes
gaussian_classifier = GaussianNB()
gaussian_classifier.fit(x_train, y_train)
#print(gaussian_classifier)
# make predictions
predicted = gaussian_classifier.predict(x_val)
# summarize the fit of the model

print_results(x_val, y_val, predicted, gaussian_classifier)