In [17]:
import pandas as pd
import seaborn as sns

from imblearn.over_sampling import SMOTE

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample

# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` now lives in `sklearn.model_selection`. Fall back to
# the old location only on installations that predate the move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split


# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Load the raw dataset. The path is relative, so the CSV must sit in the
# notebook's working directory.
# NOTE(review): the file is read with pandas' default ',' delimiter -- confirm
# this copy is comma-separated (pass sep=';' if it is a semicolon-delimited
# distribution of the file).
bank_data = pd.read_csv('bank-additional-full.csv')

In [2]:
# Handling categorical variables

# Columns of `bank_data` that are one-hot encoded below.
categorical_columns = ['job', 'marital', 'education', 'default', 'housing',
                       'loan', 'contact', 'month', 'day_of_week', 'poutcome']

# One-hot encode every categorical column on its own. `drop_first=True` drops
# the first level of each column so the dummies are not perfectly collinear.
# The source column name is used as a prefix: without it, category labels that
# occur in more than one column would produce several identically named
# dummy columns after concatenation, and the features could no longer be told
# apart by name.
# NOTE(review): presumably labels such as 'unknown' / 'yes' are shared between
# several of these columns -- confirm against the data.
(job, marital, education, default, housing,
 loan, contact, month, day_of_week, poutcome) = (
    pd.get_dummies(bank_data[column], prefix=column, drop_first=True)
    for column in categorical_columns
)

# All dummy columns side by side; row index is the same as `bank_data`, so the
# frame can be joined back onto the original data.
bank_data_red = pd.concat(
    [job, marital, education, default, housing,
     loan, contact, month, day_of_week, poutcome],
    axis=1,
)

In [3]:
# Transformed data set: the dummy columns first, followed by every original
# column of `bank_data`, aligned row by row on the shared index.
bank_data_tran = bank_data_red.join(bank_data, how='left')

In [23]:
# Columns removed before modelling.
columns_to_drop = [
    # original categorical columns -- already represented by their dummies
    'job', 'marital', 'education', 'default', 'housing', 'loan',
    'contact', 'month', 'day_of_week', 'poutcome',
    # NOTE(review): the notebook does not record why 'duration' is removed;
    # presumably because it is only known once a call has ended -- confirm.
    'duration',
    # because of the high correlation coefficient between the euribor3m rate
    # and nr. of employed, the latter is dropped
    'nr.employed',
]

# A single, non-inplace drop. `errors='ignore'` keeps the cell re-runnable:
# the previous `inplace=True` drops raised KeyError when the cell was executed
# a second time, because the columns were already gone. The trade-off is that
# a misspelt name in `columns_to_drop` is skipped silently.
bank_data_tran = bank_data_tran.drop(labels=columns_to_drop, axis=1, errors='ignore')

In [24]:
# Build the feature matrix and target vector for the full (unbalanced) data.
# No splitting or scaling happens in this cell.

# Features: every column except the target 'y', as a NumPy array.
x = bank_data_tran.loc[:, bank_data_tran.columns != 'y'].values

# Target: selected by name, consistent with how 'y' is excluded from `x`.
# The previous positional lookup (`iloc[:, -1]`) only worked while 'y'
# happened to be the last column of the frame.
y = bank_data_tran['y'].values

In [25]:
# Downsampling, step 1: split the rows by target label.

# Rows labelled 'no' (the larger class) and rows labelled 'yes' (the smaller
# class) are kept in two separate frames so the former can be subsampled.
df_majority = bank_data_tran.loc[bank_data_tran['y'] == 'no']
df_minority = bank_data_tran.loc[bank_data_tran['y'] == 'yes']

In [26]:
# Downsampling, step 2: keep a random subset of 5000 majority-class rows.
# NOTE(review): 5000 only approximately balances the classes -- the class
# counts printed further down report 4640 'yes' rows, not 5000.
df_majority_downsampled = resample(
    df_majority,
    n_samples=5000,      # number of 'no' rows to keep
    replace=False,       # draw without replacement: no row is duplicated
    random_state=123,    # fixed seed so the same rows are drawn on every run
)

In [27]:
# Downsampling, step 3: stack the reduced majority class on top of the full
# minority class. Rows keep their original index labels.
df_downsampled = pd.concat(objs=(df_majority_downsampled, df_minority), axis=0)

In [28]:
# Class balance after downsampling (rows per target label).
df_downsampled['y'].value_counts()

no     5000
yes    4640
Name: y, dtype: int64

In [29]:
# Feature matrix / target vector for the downsampled data, then the
# train / validation / test split.

# Features: every column except the target 'y'.
x_down = df_downsampled.loc[:, df_downsampled.columns != 'y'].values
# Target: selected by name rather than by position, so the cell no longer
# depends on 'y' being the last column of the frame.
y_down = df_downsampled['y'].values

# Hold out 20% of the rows as the test set.
x_train, x_test, y_train, y_test = train_test_split(x_down, y_down, test_size = 0.2, random_state = 0)

# Split the remaining 80% again 80/20, giving roughly 64% train, 16%
# validation and 20% test of the downsampled data.
# NOTE(review): neither split is stratified, and the test set is drawn from
# the downsampled data, so it does not reflect the original class ratio.
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size = 0.2, random_state = 0)

In [30]:
# Feature scaling: standardise every feature to zero mean / unit variance.
# The scaler statistics are learned from the training split only and then
# applied unchanged to all three splits, so no information from the
# validation or test rows leaks into the scaling.
sc = StandardScaler().fit(x_train)
x_train, x_val, x_test = (sc.transform(split) for split in (x_train, x_val, x_test))

In [31]:
# Logistic regression (library default settings)

# `fit` returns the estimator itself, so construction and training are chained.
logdown = LogisticRegression().fit(x_train, y_train)

# Score the model on the validation split only.
predictions_LR = logdown.predict(x_val)
print(classification_report(y_true=y_val, y_pred=predictions_LR))

             precision    recall  f1-score   support

         no       0.69      0.87      0.77       771
        yes       0.83      0.61      0.70       772

avg / total       0.76      0.74      0.74      1543



In [32]:
# Random forest classifier

# 40 trees, Gini impurity as split criterion, fixed seed for reproducibility.
# `fit` returns the estimator, so it is trained on the training split in the
# same statement.
classifier = RandomForestClassifier(
    n_estimators=40,
    criterion='gini',
    random_state=0,
).fit(x_train, y_train)

# Predictions and report for the validation split.
predictions_RF_val = classifier.predict(x_val)
print("Validation set predictions: \n\n", classification_report(y_val, predictions_RF_val), sep="")

# Predictions and report for the held-out test split.
predictions_RF_test = classifier.predict(x_test)
print("Test predictions:\n\n ", classification_report(y_test, predictions_RF_test), sep="")

Validation set predictions: 

             precision    recall  f1-score   support

         no       0.69      0.82      0.75       771
        yes       0.78      0.62      0.69       772

avg / total       0.73      0.72      0.72      1543

Test predictions:

              precision    recall  f1-score   support

         no       0.72      0.81      0.76       981
        yes       0.77      0.67      0.72       947

avg / total       0.74      0.74      0.74      1928



In [14]:
# Support vector machine with a linear kernel

# Train on the (scaled) training split; the seed is fixed for reproducibility.
svm_classifier = SVC(kernel='linear', random_state=0).fit(x_train, y_train)

# Report on the validation split.
predictions_SVC_val = svm_classifier.predict(x_val)
print("Validation set predictions: \n\n", classification_report(y_val, predictions_SVC_val), sep="")

# Report on the held-out test split.
predictions_SVC_test = svm_classifier.predict(x_test)
print("Test set predictions:\n\n", classification_report(y_test, predictions_SVC_test), sep="")

Validation set predictions: 

             precision    recall  f1-score   support

         no       0.68      0.88      0.77       771
        yes       0.83      0.59      0.69       772

avg / total       0.75      0.73      0.73      1543

Test set predictions:

             precision    recall  f1-score   support

         no       0.70      0.86      0.77       981
        yes       0.81      0.62      0.70       947

avg / total       0.75      0.74      0.74      1928



In [21]:
# k-nearest neighbours

# 10 neighbours; the Minkowski metric with p=2 is the Euclidean distance.
knn_classifier = KNeighborsClassifier(
    n_neighbors=10,
    metric='minkowski',
    p=2,
).fit(x_train, y_train)

# Only the validation split is scored here; no test-set report is produced
# for this model.
predictions_KNN_val = knn_classifier.predict(x_val)
print("Validation set predictions: \n\n", classification_report(y_val, predictions_KNN_val), sep="")

Validation set predictions: 

             precision    recall  f1-score   support

         no       0.66      0.84      0.74       771
        yes       0.78      0.58      0.66       772

avg / total       0.72      0.71      0.70      1543



In [18]:
# Decision tree

# Single tree using entropy (information gain) as the split criterion, with a
# fixed seed for reproducibility.
td_classifier = DecisionTreeClassifier(criterion='entropy', random_state=0).fit(x_train, y_train)

# Only the validation split is scored for this model.
predictions_DT_val = td_classifier.predict(x_val)
print("Validation set predictions: \n\n", classification_report(y_val, predictions_DT_val), sep="")

Validation set predictions: 

             precision    recall  f1-score   support

         no       0.66      0.69      0.67       771
        yes       0.67      0.64      0.66       772

avg / total       0.66      0.66      0.66      1543

