In [None]:
from IPython.display import display, HTML
import pandas as pd
# Raise pandas' display limits (rows, columns, line width) used when rendering tables.
for _option_name, _option_value in (
    ('display.max_rows', 200),
    ('display.max_columns', 100),
    ('display.width', 1000),
):
    pd.set_option(_option_name, _option_value)

import numpy as np
import warnings


from datetime import datetime
import glob
import io, os , sys, types
import tabulate
import copy

import random

import matplotlib.pyplot as plt
%matplotlib inline
import itertools

from sklearn.model_selection import train_test_split
from sklearn import linear_model
#from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB 
from sklearn import datasets
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn import tree
# sklearn.externals.six is no longer shipped with recent scikit-learn releases.
# Try the original location first (older installs) and fall back to the
# standard-library class, which is what six.StringIO resolves to on Python 3.
try:
    from sklearn.externals.six import StringIO
except ImportError:
    from io import StringIO
from sklearn.ensemble import RandomForestClassifier
import pydotplus

from scipy import stats
from scipy.stats import pearsonr
from scipy.stats import chi2_contingency

from helper_functions import *

import seaborn as sns
# Apply seaborn's default styling to all subsequent matplotlib figures;
# color_codes=True asks seaborn to remap matplotlib's shorthand colour codes to its palette.
sns.set(color_codes=True)

# Base font size for figures; the correlation-matrix cell uses fontsz + 2 for tick labels.
fontsz = 12

# ROC Curve and Cutoff Analysis:
# https://ncss-wpengine.netdna-ssl.com/wp-content/themes/ncss/pdf/Procedures/NCSS/One_ROC_Curve_and_Cutoff_Analysis.pdf

## Loading the data

In [None]:
# Location of the German Credit CSV, relative to the notebook's working directory.
fname_germancredit = r'dataset/German.Credit.csv'

# Read the whole file into a DataFrame; later cells derive the column lists from it.
data_raw = pd.read_csv(filepath_or_buffer=fname_germancredit)

In [None]:
# Name of the response column.
col_target = 'class'

# describe() with default arguments summarises only the numeric columns of a
# mixed-type frame, so its column index is used as the list of numeric columns.
# NOTE(review): a numeric target column would also end up in this list.
cols_numeric = list(data_raw.describe().columns.values)

# Every remaining non-target column, kept in the DataFrame's own column order.
# (Building this list from set arithmetic made its order depend on string
# hashing, which can differ between kernel sessions and therefore reorder any
# features derived from it.)
cols_categoric = [
    col for col in data_raw.columns.values
    if col not in cols_numeric and col != col_target
]

### Exploratory Data Analysis

In [None]:
# data describe


In [None]:
# display categoric columns


In [None]:
# Contingency table (crosstab)


In [None]:
# Contingency table, ratios. Rows add up to 100%


In [None]:
# Contingency table, ratios. Columns add up to 100%


In [None]:
# calculate and print p-value from contingency table


In [None]:
# Do so for all predictors


### Descriptive Statistics for Numerical Predictors

### Correlation Matrix
Correlation between numeric variables. Plot the correlation matrix

In [None]:
# Pearson correlation between every pair of numeric columns, drawn as a heatmap.
data_numeric = data_raw[cols_numeric].copy(deep=True)
corr_mat = data_numeric.corr(method='pearson')

# Colorbar ticks every 0.2 across the full correlation range [-1, 1].
cbar_ticks = np.linspace(-1, 1, 11)
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw on an explicit Axes so the styling below targets the heatmap itself.
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(corr_mat, cmap=cmap, vmin=-1, vmax=1, square=True, linewidths=.5,
            cbar_kws={"shrink": .5}, ax=ax)

# Set the tick-label size after the heatmap has created its ticks so that it
# applies to all of them (setting it beforehand only touches the labels that
# exist on the still-empty axes).
ax.tick_params(axis='both', labelsize=fontsz + 2)

cbar = ax.collections[0].colorbar
cbar.set_ticks(cbar_ticks)
# Format the labels explicitly: linspace produces values such as
# -0.3999999999999999, which would otherwise be printed verbatim.
cbar.set_ticklabels(['{:.1f}'.format(tick) for tick in cbar_ticks])
plt.show()

In [None]:
# print the correlation matrix (just the numbers, not a figure)


### Histograms 

### Boxplot

### Preprocessing
Creating dummy-variables

In [None]:
# Replace categorical variables with dummy-variables


In [None]:
# Replace ‘bad’ and ‘good’ class labels with 0 and 1, before continuing with the exercise


### Model Evaluation

In [None]:
# Random seed
# The same value is also passed as random_state to train_test_split and the
# decision tree further down.
seed = 1017
random.seed(seed)
# Seed NumPy's global generator as well: Python's `random` module and NumPy
# keep separate state, so seeding only one leaves NumPy-based randomness
# unreproducible between runs.
np.random.seed(seed)

In [None]:
# Split the data using the function, train_test_split()
# 80% of the rows go to the training set, the remainder is held out for testing.
frac_train = 0.8
X_train, X_test, y_train, y_test = train_test_split(
    data_raw[cols_features],
    data_raw[col_target],
    test_size=(1-frac_train),
    random_state=seed,
)

# Per-class row counts in each partition (label 0 is reported as "bad", 1 as "good").
train_b, train_g = sum(y_train == 0), sum(y_train == 1)
test_b, test_g = sum(y_test == 0), sum(y_test == 1)

# Report the class balance of both partitions as percentages.
print ("Class ratios between each set:")
for _set_label, _n_good, _n_bad, _n_rows in (
    ("Trainset", train_g, train_b, len(y_train)),
    ("Testset", test_g, test_b, len(y_test)),
):
    print (_set_label)
    print ("\t\tNormal class (good):", 100*_n_good/_n_rows, "%\t", "Target class (bad):", 100*_n_bad/_n_rows,"%")

In [None]:
# Set Misclassification loss weights


### Logistic Regression Model
More Logistic Regression examples in Python can be found at the link below.<br>
Note that they use a different (and more informative) logistic-regression package.<br>
http://blog.yhat.com/posts/logistic-regression-python-rodeo.html

In [None]:
# 1. Train the model


In [None]:
# Mean squared error of the fitted model on the training set.
# The residuals must be squared *before* averaging; squaring the mean residual
# lets positive and negative errors cancel and reports a value close to zero.
residuals = y_train - model.predict(X_train)
mse = np.mean(residuals ** 2)
print ("Mean Square Error: ", mse)

In [None]:
# 2. Display the obtained model along with most relevant statistics


In [None]:
# 3. Test the model


In [None]:
# 4. Draw ROC Curve and calculate AUC


In [None]:
# 5. Calculate the total misclassification loss


In [None]:
# 6. Build the confusion matrix for the test data for both the default and optimal thresholds


In [None]:
# 7. Plot the confusion matrices


In [None]:
# [Optional]: Plot the misclassification-loss vs threshold


## Naive Bayes Model

In [None]:
# 1. Train the model and apply predictions


In [None]:
# 2. Display the obtained model along with most relevant statistics


In [None]:
# 3. Draw ROC Curve and calculate AUC


In [None]:
# 4. Calculate the total misclassification loss


In [None]:
# 5. Build the confusion matrix for the test data for both the default and optimal thresholds


In [None]:
# 6. Plot the confusion matrices


## Decision Trees
Decision Trees (DTs) are a recursive-partitioning technique: the data is split recursively in order to create nodes that are<br>
purer. A pure node is a node that contains samples of only one of the classes present in the data.<br>
In our context, a pure node would consist of either all-"bad" or all-"good" samples.<br>
The advantage of DTs is that they produce rules that are human-readable and easy to follow, in contrast to "black-box" algorithms such as Random Forest.<br>
DTs, however, are prone to overfitting, which is why we need to use some parameters to avoid such behavior.<br>
As with __Logistic Regression__, __DT__s also require categorical features to be dummified.<br>
1. Based on what we discussed, can you offer an intuition about why DTs tend to overfit?
2. [Advanced] Can you offer some ways to avoid overfitting?

In [None]:
# 1. Train the model and apply predictions
# Hyper-parameters handed to the tree constructor below.
md = 18                    # maximum tree depth
mf = len(cols_features)    # maximum number of features to consider
min_leaf = 10              # passed as min_samples_leaf
criterion = 'entropy'      # split-quality criterion

tree_kwargs = dict(
    max_depth=md,
    max_features=mf,
    criterion=criterion,
    min_samples_leaf=min_leaf,
    random_state=seed,
)
model = tree.DecisionTreeClassifier(**tree_kwargs)

# Fit on the training partition; the value returned by fit() is kept as `clf`.
clf = model.fit(X_train, y_train)

# Hard class predictions for the held-out rows.
predicted = model.predict(X_test)
# Second column of predict_proba for each test row
# (presumably the probability of label 1 -- confirm against model.classes_).
predicted_prob = model.predict_proba(X_test)[:, 1]

In [None]:
# 2. Display the obtained model along with most relevant statistics


In [None]:
# Visualize the tree
##write_Tree('dataset', clf, cols_features) # can be used only for small trees (<=4)

In [None]:
# 3. Draw ROC Curve and calculate AUC


In [None]:
# 4. Calculate the total misclassification loss


In [None]:
# 5. Build the confusion matrix for the test data for both the default and optimal thresholds


In [None]:
# 6. Display the confusion matrices


## Random Forest
Random Forest is an ensemble learning classification method, which utilizes multiple decision-trees,<br>
and a voting mechanism in order to classify each sample.

In [None]:
# 1. Train the model and apply predictions


In [None]:
# 2. Display the obtained model along with most relevant statistics


In [None]:
# 3. Draw ROC Curve and calculate AUC


In [None]:
# 4. Calculate the total misclassification loss


In [None]:
# 5. Build the confusion matrix for the test data for both the default and optimal thresholds


In [None]:
# 6. Display the confusion matrices


In [None]:
# Closing message printed by the final cell of the notebook.
closing_message = "You're done! Give yourself a pat on the back! You've earned a break. Take 5."
print(closing_message)