## I'm still working on this notebook ...

##  Data Exploration

In [11]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt 
from sklearn.linear_model import LogisticRegression
# train_test_split is imported from sklearn.model_selection: the older
# sklearn.cross_validation module no longer exists in current scikit-learn.
from sklearn.model_selection import train_test_split
import seaborn as sns

# Plot configuration.
# NOTE(review): sns.set() below may reset font sizes, in which case this
# plt.rc call has no visible effect — confirm whether it should come after.
plt.rc("font", size=14)
# One theme call is enough: the former sns.set(style="white") was immediately
# replaced by this "whitegrid" call.
sns.set(style="whitegrid", color_codes=True)

In [None]:
# Load the bank marketing data; the first row of the CSV supplies the column names.
data = pd.read_csv("./bank.csv", header=0)

In [None]:
#Other ways to discard missing data, kept for reference only (nothing in this cell is executed):

#Drop the columns where at least one element is missing.
#data.dropna(axis='columns')

#Drop the rows where all elements are missing.
#data.dropna(how='all')

#Keep only the rows with at least 2 non-NA values.
#data.dropna(thresh=2)

In [None]:
#Discard every row that contains at least one missing value,
#then report the remaining shape and the column names.
data = data.dropna(axis=0, how="any")
print(data.shape)
print(data.columns.tolist())

In [None]:
#The education column has many categories; merging the three "basic.*" levels
#into one reduces the number of categories for modelling.
#Show the categories present before grouping. The value has to be printed:
#a bare expression in the middle of a cell is evaluated but never displayed.
print(data['education'].unique())

#Group "basic.4y", "basic.6y" and "basic.9y" together and call them "Basic".
#A single replace() handles all three levels instead of one np.where pass per level.
basic_levels = ['basic.4y', 'basic.6y', 'basic.9y']
data['education'] = data['education'].replace(basic_levels, 'Basic')


In [None]:
#Target variable (what we want to predict)
#y - has the client subscribed a term deposit? (binary: "1" means "Yes", "0" means "No")

#Bar chart of the number of rows in each class of the dependent variable
target_fig, target_ax = plt.subplots()
sns.countplot(data=data, x='y', palette='hls', ax=target_ax)
plt.show()


In [None]:
#Count the missing values in each column
data.isna().sum()

In [None]:
#Distribution of customers across job categories (horizontal bars, one per job)
job_fig, job_ax = plt.subplots()
sns.countplot(data=data, y="job", ax=job_ax)
plt.show()

In [None]:
#Distribution of customers across marital status values
marital_fig, marital_ax = plt.subplots()
sns.countplot(data=data, x="marital", ax=marital_ax)
plt.show()

In [None]:
#Number of customers per value of the "default" (credit in default) column
default_fig, default_ax = plt.subplots()
sns.countplot(data=data, x="default", ax=default_ax)
plt.show()

In [None]:
#Number of customers per value of the "housing" (housing loan) column
housing_fig, housing_ax = plt.subplots()
sns.countplot(data=data, x="housing", ax=housing_ax)
plt.show()

In [None]:
#Number of customers per value of the "loan" (personal loan) column
loan_fig, loan_ax = plt.subplots()
sns.countplot(data=data, x="loan", ax=loan_ax)
plt.show()

In [None]:
#Number of customers per outcome of the previous marketing campaign ("poutcome")
poutcome_fig, poutcome_ax = plt.subplots()
sns.countplot(data=data, x="poutcome", ax=poutcome_ax)
plt.show()

In [None]:
#Remove the columns found at these positions; the remaining columns are the ones
#passed on to the dummy-variable step.
#NOTE(review): the positions depend on the exact column order of the CSV - confirm
#they still match the file. Re-running this cell on the already reduced frame
#would not select the same columns.
dropped_positions = [0, 3, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19]
data = data.drop(columns=data.columns[dropped_positions])

## Create dummy variables

In [None]:
#Data Preprocessing
#Create dummy variables: each listed categorical column is replaced by one
#indicator column per category it contains.
#With the independent variables encoded as dummies, the logistic regression
#coefficients are easy to interpret as odds ratios.

categorical_columns = ['job', 'marital', 'default', 'housing', 'loan', 'poutcome']
data2 = pd.get_dummies(data, columns=categorical_columns)

In [None]:
#Drop the "unknown" indicator columns.
#They are picked by their "<column>_unknown" name (the default naming used by
#pd.get_dummies) instead of hard-coded positions, so the right columns are removed
#even if the number of categories changes, and re-running the cell is a no-op.
unknown_columns = [col for col in data2.columns if str(col).endswith('_unknown')]
data2 = data2.drop(columns=unknown_columns)
data2.columns

In [None]:
#Separate the target from the predictors, then split into training and test sets.
#The first column of data2 is used as the target; every other column is a feature.

y = data2.iloc[:, 0]
X = data2.iloc[:, 1:]
split_parts = train_test_split(X, y, random_state=0)
X_train, X_test, y_train, y_test = split_parts

## Imputing missing values before building an estimator

Imputing the missing values can give better results than discarding the samples containing any missing value, which is what the `dropna()` call earlier in this notebook does. Imputing does not always improve the predictions, so please check via cross-validation. Sometimes dropping rows or using marker values is more effective.

Missing values can be replaced by the mean, the median or the most frequent value, selected through the imputer's `strategy` hyper-parameter. The median is a more robust estimator than the mean for data containing a few high-magnitude values that could otherwise dominate the result (otherwise known as a 'long tail').



## Hyperparameter search

## Logistic Regression Model

In [None]:
#Train a logistic regression classifier on the training split.
#Ending the cell with the fitted estimator displays it as the cell output.

classifier = LogisticRegression(random_state=0).fit(X_train, y_train)
classifier


In [None]:
#Predict the test set labels and summarise the results in a confusion matrix.
#The confusion_matrix() function calculates the matrix and returns it as an array.

y_pred = classifier.predict(X_test)
from sklearn.metrics import confusion_matrix
#Keep the result under its own name: assigning it to `confusion_matrix` would
#rebind that name to the array and shadow the imported function.
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

In [None]:
#Report the classifier's accuracy on the held-out test split
test_accuracy = classifier.score(X_test, y_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(test_accuracy))

# Cross Validation

Cross-validation helps detect overfitting while still producing a prediction for every observation in the dataset. We use 10-fold cross-validation to evaluate our logistic regression model.

In [None]:
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
#10 consecutive folds over the training split.
#random_state is deliberately not passed: it only has an effect when shuffle=True,
#and recent scikit-learn releases raise a ValueError when it is set while
#shuffle is False. Without shuffling the folds are deterministic anyway.
kfold = model_selection.KFold(n_splits=10)
modelCV = LogisticRegression()
scoring = 'accuracy'
results = cross_val_score(modelCV, X_train, y_train, cv=kfold, scoring=scoring)
print("10-fold cross validation average accuracy: %.3f" % (results.mean()))

In [None]:
from sklearn.metrics import classification_report

#Build the text report from the test labels and the predictions, then print it
report_text = classification_report(y_test, y_pred)
print(report_text)

## Classifier visualization playground

The purpose of this section is to visualize logistic regression classifiers' decision boundaries. To better visualize the decision boundaries, we'll perform Principal Component Analysis (PCA) on the data to reduce it to 2 dimensions.


In [None]:

from sklearn.decomposition import PCA
X = data2.iloc[:,1:]
y = data2.iloc[:,0]
#`pca` holds the 2-component projection of X (the transformed array), not the PCA estimator.
pca = PCA(n_components=2).fit_transform(X)
#NOTE: this rebinds X_train / X_test / y_train / y_test to splits of the projected data.
X_train, X_test, y_train, y_test = train_test_split(pca, y, random_state=0)

plt.figure(dpi=120)
#y == 1 means the client subscribed ("YES") and y == 0 means they did not ("NO").
#The legend labels were previously attached to the wrong classes; colours are unchanged.
class_styles = [(0, 'NO', 'navy'), (1, 'YES', 'darkorange')]
for class_value, class_label, class_color in class_styles:
    in_class = y.values == class_value
    plt.scatter(pca[in_class, 0], pca[in_class, 1], alpha=0.5, label=class_label, s=2, color=class_color)
plt.legend()
plt.title('Bank Marketing Data Set\nFirst Two Principal Components')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.gca().set_aspect('equal')
plt.show()