In [None]:
from IPython.display import display, HTML
import pandas as pd
import numpy as np
import warnings


from datetime import datetime
import glob
import io, os , sys, types
import tabulate
import copy

import random

import matplotlib.pyplot as plt
%matplotlib inline
import itertools

from sklearn.model_selection import train_test_split
from sklearn import linear_model
#from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB 
from sklearn import datasets
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn import tree
from sklearn.externals.six import StringIO
from sklearn.ensemble import RandomForestClassifier
import pydotplus

from scipy import stats
from scipy.stats import pearsonr

from helper_functions import *

import seaborn as sns
sns.set(color_codes=True)

fontsz = 12

# ROC Curve and Cutoff Analysis:
# https://ncss-wpengine.netdna-ssl.com/wp-content/themes/ncss/pdf/Procedures/NCSS/One_ROC_Curve_and_Cutoff_Analysis.pdf

In [None]:
# load the dataset
# The path is relative, so the notebook must be started from the folder that contains `dataset/`.
fname_germancredit = r'dataset/German.Credit.csv'
data_raw = pd.read_csv(fname_germancredit)

In [None]:
# REMOVE COLUMNS HERE:
# Flip the guard to True to drop the listed columns before any modelling is done.
if False:
    cols_to_remove = ['residence_since', 'num_dependents', 'existing_credits']
    # Use the `columns=` keyword: the positional axis argument (`drop(i, 1)`)
    # is not accepted by pandas >= 2.0. One call also replaces the per-column loop.
    data_raw = data_raw.drop(columns=cols_to_remove)

In [None]:
# Name of the label column
col_target = 'class'
# describe() summarises only the numeric columns here, so its columns are the numeric features
cols_numeric = list(data_raw.describe().columns.values)
# Everything that is neither numeric nor the target is categorical.
# Built in DataFrame column order (instead of set arithmetic) so the ordering - and
# therefore every downstream feature matrix - is identical across kernel restarts.
cols_categoric = [c for c in data_raw.columns.values if c not in cols_numeric and c != col_target]

#### Exploratory data analysis
It is important that before going overboard with algorithms, we do some exploration of the data, and understand what we see before us.<br>
This can save us a lot of time in the long run. Exercise: Can you think of examples of why exploration of the data is important?

In [None]:
# Summary statistics of the numeric columns (last expression -> rendered as the cell output)
data_raw.describe()

In [None]:
# Contingency table: counts of each class per account_balance level;
# margins=True adds the row/column totals ("All").
pd.crosstab(data_raw['class'], data_raw['account_balance'],  margins=True)

In [None]:
# One histogram per numeric column.
# Small on purpose, but feel free to play around with it at home
data_raw.hist(figsize=(10,5))
plt.show()

In [None]:
# Pearson correlation between the numeric columns, drawn as a heatmap
data_numeric = data_raw[cols_numeric].copy(deep=True)
corr_mat = data_numeric.corr(method='pearson')

cbar_ticks = np.linspace(-1, 1, 11)  # colour-bar ticks from -1 to 1 in steps of 0.2
cmap = sns.diverging_palette(220, 10, as_cmap=True)
heatmap_opts = dict(
    cmap=cmap,
    vmin=-1,
    vmax=1,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
)

plt.figure(figsize=[8,8])
plt.xticks(fontsize=fontsz+2)
plt.yticks(fontsize=fontsz+2)
ax = sns.heatmap(corr_mat, **heatmap_opts)

# The colour bar hangs off the first collection drawn by the heatmap
cbar = ax.collections[0].colorbar
cbar.set_ticks(cbar_ticks)
cbar.set_ticklabels(cbar_ticks)
plt.show()

In [None]:
# Histogram of the loan duration.
# 11 evenly spaced edges between 0 and 80 give ten bins of 8 months each.
brksCredits = np.linspace(0,80,11) # Bins for a nice looking histogram
plt.figure(figsize=(10,5))
plt.hist(data_raw['duration'], bins=brksCredits)
plt.title('duration', fontsize=fontsz+4)
plt.xlabel('Loan Period [Months]', fontsize=fontsz+2)
plt.ylabel('Count', fontsize=fontsz+2)
plt.xticks(fontsize=fontsz)
plt.yticks(fontsize=fontsz)
plt.show()

In [None]:
# Box plot of the same 'duration' column, as a compact view of its distribution
plt.figure(figsize=(5,5))
plt.boxplot(data_raw['duration']) 
plt.title('duration boxplot', fontsize=fontsz+4)
plt.xlabel('Credit Month', fontsize=fontsz+2) 
plt.ylabel('duration', fontsize=fontsz+2)
plt.xticks(fontsize=fontsz)
plt.yticks(fontsize=fontsz)
plt.show()

#### Preprocessing
1. What is the purpose of preprocessing?
2. Give 1-2 examples of common preprocessing tasks
3. Certain features don't always have data. Some of them may be null, or simply have 'no_data' entry.<br>Discuss: What do __YOU__ think we should do in such cases? Do null, or 'no_data' entries require any special preprocessing? <br>If so, can you suggest some? If not, why?

For convenience, we'll replace the class-labels with 0 and 1. This makes it easier for the classification models to process<br>
<font color='red'>__'bad'__</font> --> 0
<font color='green'>__'good'__</font> --> 1<br>

In [None]:
# Encode the target: 'bad' -> 0, 'good' -> 1.
# The result is assigned back to the column instead of calling
# `data_raw['class'].replace(..., inplace=True)`: an in-place method on a column
# selection is a chained assignment, which does not update the frame under pandas
# Copy-on-Write. Values other than 'bad'/'good' pass through untouched, so re-running
# the cell is harmless and unexpected labels still fail loudly in to_numeric.
class_codes = {'bad': 0, 'good': 1}
data_raw['class'] = pd.to_numeric(data_raw['class'].map(lambda label: class_codes.get(label, label)))

#### Dummy Variables
1. What are dummy variables?<br>
2. Why do we use dummy variables?

In [None]:
print ("Number of columns before dummy-variables:\t", len(data_raw.columns.values))
# Replace every categorical column by its one-hot ("dummy") columns, prefixed with the column name
for i in cols_categoric:
    dummy_ranks = pd.get_dummies(data_raw[i], prefix=i)
    data_raw = data_raw.join(dummy_ranks)
    # dropping the original categoric column (not needed - it was replaced by dummy columns).
    # `columns=` keyword: the positional axis form `drop(i, 1)` is rejected by pandas >= 2.0.
    data_raw = data_raw.drop(columns=i)

# all features, numeric and categoric (now dummified), in DataFrame column order.
# A list comprehension (not set arithmetic) keeps the order stable between kernel sessions,
# which the seeded models below need in order to be reproducible.
cols_features = [c for c in data_raw.columns.values if c != col_target]
print ("Number of columns after dummy-variables:\t", len(data_raw.columns.values))

#### Random Seed
1. What is a random seed?<br>
2. What would be the result of using the same seed for the experiment?<br>
3. Why is it important to use random seeds in an experiment?

#### Train and test sets
Python (scikit-learn) has a convenient function to split the dataset into train and test sets<br>
1. Look at the following example and discuss: is the below data-split optimal? Why?<br>
2. What is a balanced dataset, and when is it important to use such datasets?

In [None]:
# Fixed seed so the split (and the seeded models further down) can be repeated
seed = 1017
# Fraction of rows that goes to the training set; the remainder is the test set
frac_train = 0.8
# No `stratify=` argument is passed, so the class proportions of the two sets may differ
X_train, X_test, y_train, y_test = train_test_split(data_raw[cols_features], data_raw[col_target], test_size=(1-frac_train), random_state=seed)

In [None]:
# Per-set class counts: label 0 is 'bad', label 1 is 'good'
train_b = int((y_train == 0).sum())
train_g = int((y_train == 1).sum())
test_b = int((y_test == 0).sum())
test_g = int((y_test == 1).sum())

# Report each class as a percentage of its own set
print ("Class ratios between each set:")
for set_name, n_good, n_bad, n_rows in (
    ("Trainset", train_g, train_b, len(y_train)),
    ("Testset", test_g, test_b, len(y_test)),
):
    print (set_name)
    print ("\t\tNormal class (good):", 100*n_good/n_rows, "%\t", "Target class (bad):", 100*n_bad/n_rows,"%")

### Logistic Regression Model
More about Logistic Regression examples in python can be found here:<br>
http://blog.yhat.com/posts/logistic-regression-python-rodeo.html

In [None]:
# Logistic regression with scikit-learn's default hyper-parameters, fitted on the training set.
# NOTE: the name `model` is re-bound by the Naive-Bayes and Decision-Tree cells later on.
model = linear_model.LogisticRegression()
model.fit(X_train, y_train)

#### Model coefficients
The coefficients show what logistic regression learned to be the strongest features. The larger the abs(coefficient), the stronger the feature

In [None]:
PRINT_COEFFS = False
# One coefficient per feature column (binary problem -> single row in coef_)
model_coefficients = model.coef_[0]

# calculating p_values: one simple linear regression of the target on each feature column.
# NOTE(review): these are univariate linregress p-values, not p-values of the fitted
# logistic model - confirm that is what the exercise intends.
train_target = np.array(y_train)
p_values = [
    stats.linregress(np.array(X_train.iloc[:, col_idx]), train_target).pvalue
    for col_idx in range(len(X_train.columns.values))
]

if PRINT_COEFFS:
    # Print the model as one long "prediction = +(coef) * <<feature>> ..." sum
    print ("prediction = ",end="")
    for coeff, feature in zip(model_coefficients, cols_features):
        print ("+("+str(round(coeff,5))+") *\t<<"+str(feature)+">>")
        print ("\t\t",end="")

In [None]:
# One row per feature with its logistic-regression coefficient and univariate p-value.
# Built from a dict of columns so the numeric columns keep a float dtype; building from a
# list of rows and transposing would leave every column as `object`.
df_lgm_coeffs = pd.DataFrame({
    'feature': list(cols_features),
    'LGM_coeff': list(model_coefficients),
    'p_value': list(p_values),
})

# sort by absolute coefficient value
df_lgm_coeffs = df_lgm_coeffs.reindex(df_lgm_coeffs['LGM_coeff'].abs().sort_values(ascending=False).index)
# create figure
plt.figure(figsize=[15,5])
x = range(0, len(df_lgm_coeffs))
horiz_line_data = np.zeros(len(df_lgm_coeffs), dtype=int)
plt.plot(x, horiz_line_data, 'b--') # adding horizontal line at 0
plt.plot(x, df_lgm_coeffs['LGM_coeff'], 'ro')
plt.xticks(x, df_lgm_coeffs['feature'], rotation='vertical')
plt.xticks(fontsize=fontsz-2)
plt.yticks(fontsize=fontsz)
plt.title('LGM Coefficients (sorted by absolute values)', fontsize=fontsz+4)
plt.show()

1. What is the difference between `predict` and `predict_proba`?
2. When would we prefer one over the other?

In [None]:
# sort by p_value value
numeric_pvalues = df_lgm_coeffs[df_lgm_coeffs['feature'].isin(cols_numeric)].copy(deep=True)
numeric_pvalues.sort_values(['p_value'], inplace=True)
plt.figure(figsize=[25,5])
horiz_line_data = np.array([0 for i in range(0, len(numeric_pvalues))])
plt.plot(range(0,len(numeric_pvalues)), horiz_line_data, 'b--') # adding horizontal line at 0
x=range(0,len(numeric_pvalues))
plt.plot(x,numeric_pvalues['p_value'], 'ro')  
plt.xticks(x, numeric_pvalues['feature'], rotation='vertical')
plt.xticks(fontsize=fontsz+4)
plt.yticks(fontsize=fontsz+4)
plt.title('p-values (sorted ascending)', fontsize=fontsz+4)
plt.show()

In [None]:
# Set HINT to True to reveal the worked answer for the lab question below
HINT = False
if HINT:
    # Order the table by p-value, largest first (p-values are non-negative, so no abs() needed)
    df_lgm_coeffs = df_lgm_coeffs.reindex(df_lgm_coeffs['p_value'].sort_values(ascending=False).index)
    # A bare expression inside an `if` block is never rendered by Jupyter,
    # so the tables are shown explicitly with display().
    display(df_lgm_coeffs[df_lgm_coeffs['p_value'] < 0.1])

    ### TO DO
    # lab: use the p-value to determine the features that seem to be most relevant (use p_value <= 0.10) for all *numerical* features
    # remove 'residence_since', 'num_dependents', 'existing_credits'
    x = df_lgm_coeffs[df_lgm_coeffs['feature'].isin(cols_numeric)]
    display(x[x['p_value'] > 0.10])
    


In [None]:
# Hard 0/1 class labels for the test set
predicted = model.predict(X_test)
# Second predict_proba column = estimated probability of class 1 ('good')
predicted_prob = model.predict_proba(X_test)[:,1]

#### ROC/AUC: Receiver Operating Characteristic, Area-Under-Curve

In [None]:
# ROC curve of the test-set probabilities; the thresholds array is not needed here
fpr, tpr, _ = metrics.roc_curve(np.array(y_test), predicted_prob)
# NOTE: this float re-binds `auc`, shadowing the `auc` function imported from
# sklearn.metrics at the top of the notebook; later cells read this value.
auc = metrics.auc(fpr,tpr)
print ("Area-Under-Curve:", round(auc,4))
# plot_ROC() is defined in helper_functions.py
plot_ROC(fpr,tpr, fontsz, 'Receiver operating characteristic for Logistic Regression Model') 

1. What does the ROC curve tell us?
2. When would we want to use ROC to communicate our results?<br> Hint: try to think of cases where ROC-curve would be misleading :)

### About thresholds (aka "Calculating Optimal Threshold")
We will compare models using "Optimal Threshold". The optimal threshold (OT) is the threshold we select where the confusion matrix<br>
gives optimal results, with respect to some loss function (in our case, the loss function depends on false classifications: false positives and false negatives).<br>
We will compare models by using the results from their optimal thresholds<br>
<br>
Our __decision threshold__ is prediction: for x >= prediction, we __classify__ that the sample is of the positive class (1)<br>
for x < prediction, we classify the sample as the negative class (0)<br><br>
For convenience of presenting the result we show it by using (1-prediction-threshold).<br>
It helps to think of that as __the top probable__ classifications. For example, the __top__ 0.1 most probable, is anything where the probability >=0.9<br><br>
loss matrix can be found in helper_functions.py. It helps to calculate the optimal threshold of our model<br>
for more reading, please refer to the document __"One_ROC_Curve_and_Cutoff_Analysis.pdf"__ in your course reading material<br>
we use p. #7, but the other pages are interesting too!! :)<br><br>
### Loss Function
The loss function helps us to select the best threshold, per problem requirements (domain knowledge)<br><br>
$Loss$ = $C_0$ + $C_{TP}$ $\cdot$ $P$($TP$) + $C_{TN}$ $\cdot$ $P$($TN$) + $C_{FP}$ $\cdot$ $P$($FP$) + $C_{FN}$ $\cdot$ $P$($FN$) 
<br><br>
In our example we'll use a simplified version that accounts only for misclassifications: FN and FP

In [None]:
# default confusion matrix, default threshold = 0.5
def_cfm = metrics.confusion_matrix(y_test, predicted)

# Set Misclassification loss weights
c_tn, c_tp = 0, 0  # true-negative / true-positive: correct decisions carry no cost
c_fn, c_fp = 1, 5  # false-negative / false-positive: a false positive weighs 5x a false negative

# finding the optimal values using the TRAIN-SET
train_predicted_prob = model.predict_proba(X_train)[:,1]
loss_matrix = calculate_loss(train_predicted_prob, y_train, c_fn, c_fp, c_tp, c_tn)

# finding optimal threshold: first row whose loss equals the minimum
loss = loss_matrix['loss'].min()
opt_thr = list(loss_matrix.loc[loss_matrix['loss'] == loss, 'prediction'])[0]

# Binarise the TEST-SET probabilities: strictly above the threshold -> 1, otherwise 0
predicted_prob_opt = np.where(predicted_prob > opt_thr, 1.0, 0.0)
opt_cfm = metrics.confusion_matrix(y_test, predicted_prob_opt)

# For the first example only, we'll plot the loss function vs threshold
plt.figure(figsize=(10,5), facecolor='white')
plt.plot(loss_matrix['prediction'], loss_matrix['loss'], 'o-')
plt.title('Loss function for various thresholds and confusion matrices', fontsize=fontsz+4)
for set_axis_label, axis_text in ((plt.xlabel, 'Threshold'), (plt.ylabel, 'Loss')):
    set_axis_label(axis_text, fontsize=fontsz+4)
plt.show()

In [None]:
# Print a reminder, then call the helper from helper_functions.py that presents the layout
print("This may be a little hard to remember at first. Note the ordering of the confusion matrix:\n")
cfm_convention()

In [None]:
# Default-threshold vs loss-optimised confusion matrices for logistic regression.
# plot_confusion_matrix() comes from helper_functions.py.
# NOTE(review): the last argument (0 / 1) presumably selects the figure or panel - confirm in the helper.
plot_confusion_matrix(def_cfm,['bad', 'good'], "Default Confusion Matrix", 0)
plot_confusion_matrix(opt_cfm,['bad', 'good'], "Loss-Optimized Confusion Matrix", 1)
plt.show()
print("Optimal threshold:\t",round(opt_thr,4))

In [None]:
# Results table: one row per classifier with its test AUC and its minimal train-set loss
columns=['Classifier', 'AUC', 'LOSS']
model_perf = pd.DataFrame( columns=columns, index=[0], data=[['Logistic Regression',auc,loss]])
# Start the table from this first row (re-running the cell resets it).
# `DataFrame.append` no longer exists in pandas >= 2.0, so it is not used here.
compare_models = model_perf.copy()

## Naive Bayes Model

In [None]:
# Bernoulli Naive Bayes with default settings; fit() returns the estimator, so `model` is `gnb`.
# NOTE(review): BernoulliNB models binary features - presumably the non-dummy numeric
# columns get thresholded by its default `binarize` setting; confirm that is intended.
gnb = BernoulliNB()
model = gnb.fit(X_train, y_train)
# Hard labels and class-1 ('good') probabilities for the test set
predicted = model.predict(X_test)
predicted_prob = model.predict_proba(X_test)[:, 1]

In [None]:
# ROC curve and AUC of the Naive-Bayes test-set probabilities
fpr, tpr, _ = metrics.roc_curve(np.array(y_test), predicted_prob)
# Overwrites the `auc` value stored for the previous model
auc = metrics.auc(fpr,tpr)
print ("Area-Under-Curve:", round(auc,4))
plot_ROC(fpr,tpr, fontsz, 'Receiver operating characteristic for Naive-Bayes Model') 

In [None]:
# Confusion matrix of the labels returned by model.predict()
def_cfm = metrics.confusion_matrix(y_test, predicted)

# finding the optimal values using the TRAIN-SET
train_predicted_prob = model.predict_proba(X_train)[:, 1]
loss_matrix = calculate_loss(train_predicted_prob, y_train, c_fn, c_fp, c_tp, c_tn)

# finding optimal threshold: first row whose loss equals the minimum
loss = loss_matrix['loss'].min()
opt_thr = list(loss_matrix.loc[loss_matrix['loss'] == loss, 'prediction'])[0]

# Binarise the TEST-SET probabilities: strictly above the threshold -> 1, otherwise 0
predicted_prob_opt = np.where(predicted_prob > opt_thr, 1.0, 0.0)
opt_cfm = metrics.confusion_matrix(y_test, predicted_prob_opt)

class_names = ['bad', 'good']
plot_confusion_matrix(def_cfm, class_names, "Default Confusion Matrix", 0)
plot_confusion_matrix(opt_cfm, class_names, "Loss-Optimized Confusion Matrix", 1)
plt.show()
print("Optimal threshold:\t",round(opt_thr,4))

In [None]:
# Add the Naive-Bayes row to the results table.
# pd.concat replaces `DataFrame.append`, which no longer exists in pandas >= 2.0.
model_perf = pd.DataFrame( columns=columns, index=[0], data=[['Naive-Bayes',auc,loss]])
compare_models = pd.concat([compare_models, model_perf], ignore_index=True)

### Linear Discriminant Analysis

Python Linear Discriminant Analysis (LDA) requires all variables to be float-type

In [None]:
##X_train_org = X_train.copy(deep=True)
##X_test_org = X_test.copy(deep=True)
# Cast every feature column to float in one call per frame.
# Re-binding the names (instead of assigning column by column into the frames returned by
# train_test_split) avoids SettingWithCopyWarning and the per-column Python loop.
X_train = X_train.astype(float)
X_test = X_test.astype(float)

In [None]:
# Linear Discriminant Analysis with the least-squares ('lsqr') solver, no shrinkage,
# and class priors left unspecified (priors=None).
clf1 = LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None, solver='lsqr', store_covariance=False, tol=0.0001)
clf1.fit(X_train, y_train)
# Hard labels and class-1 ('good') probabilities for the test set
predicted = clf1.predict(X_test)
predicted_prob = clf1.predict_proba(X_test)[:, 1]

In [None]:
# ROC curve and AUC of the LDA test-set probabilities
fpr, tpr, _ = metrics.roc_curve(np.array(y_test), predicted_prob)
# Overwrites the `auc` value stored for the previous model
auc = metrics.auc(fpr,tpr)
print ("Area-Under-Curve:", round(auc,4))

plot_ROC(fpr,tpr, fontsz, 'Receiver operating characteristic for LDA Model') 

In [None]:
# Confusion matrix of the labels returned by clf1.predict()
def_cfm = metrics.confusion_matrix(y_test, predicted)

# finding the optimal values using the TRAIN-SET
train_predicted_prob = clf1.predict_proba(X_train)[:,1]
loss_matrix = calculate_loss(train_predicted_prob, y_train, c_fn, c_fp, c_tp, c_tn)

# finding optimal threshold: first row whose loss equals the minimum
loss = loss_matrix['loss'].min()
opt_thr = list(loss_matrix.loc[loss_matrix['loss'] == loss, 'prediction'])[0]

# Binarise the TEST-SET probabilities: strictly above the threshold -> 1, otherwise 0
predicted_prob_opt = np.where(predicted_prob > opt_thr, 1.0, 0.0)
opt_cfm = metrics.confusion_matrix(y_test, predicted_prob_opt)

class_names = ['bad', 'good']
plot_confusion_matrix(def_cfm, class_names, "Default Confusion Matrix", 0)
plot_confusion_matrix(opt_cfm, class_names, "Loss-Optimized Confusion Matrix", 1)
plt.show()
print("Optimal threshold:\t",round(opt_thr,4))

In [None]:
# Add the LDA row to the results table.
# pd.concat replaces `DataFrame.append`, which no longer exists in pandas >= 2.0.
model_perf = pd.DataFrame( columns=columns, index=[0], data=[['Linear Discriminant Analysis',auc,loss]])
compare_models = pd.concat([compare_models, model_perf], ignore_index=True)

In [None]:
# cleanup
##X_train = X_train_org.copy(deep=True)
##X_test = X_test_org.copy(deep=True)

## SVM - Support Vector Machine

SVM is sensitive to the size of the dataset, and might not scale well<br>
This is since each new sample requires additional __n__ calculations (where __n__ is the size of the input)<br>
This means that SVM ~ O(__n__^2). There are methods to drive this number down, but as a rule of thumb, it<br>
doesn't scale very well 'out-of-the-box'<br>
Linear-SVM is a relatively cheap (faster) version of SVM, which is not that different from Logistic Regression<br>
In this example, we'll run __Linear SVM__<br>
The LSVM itself would have gone faster, but the probability calculation we are forcing is also computationally<br>
expensive. So, after you execute this part of the program, go grab a cup of tea. You earned it!

In [None]:
# Guard for the SVM cell below; it is skipped unless this is set to True.
RUN_SVM = False # simply takes too much time. As an exercise, try it at home 

In [None]:
if RUN_SVM:
    # Imported here because they are only needed on this optional, slow path
    from sklearn import svm
    import time

    print ("Executing SVM. This is going to take a while, so go grab a cup of tea. No, not coffee. Tea! Preferably, Earl-Grey. Have lunch.")
    start_time = time.time()

    # Setting probability to False will speed things up, but we'll lose the threshold calculation
    clf = svm.SVC(probability=True, C=1.0, cache_size=8000, kernel='linear')
    print (clf.fit(X_train, y_train))
    print (time.time() - start_time)  # elapsed fitting time in seconds

    print ("Running prediction ")
    predicted = clf.predict(X_test)
    predicted_prob = clf.predict_proba(X_test)[:,1]

    fpr, tpr, _ = metrics.roc_curve(np.array(y_test), predicted_prob)
    auc = metrics.auc(fpr,tpr)
    print ("Area-Under-Curve:", round(auc,4))

    plot_ROC(fpr,tpr, fontsz, 'Receiver operating characteristic for SVM Model')

    def_cfm = metrics.confusion_matrix(y_test, predicted)

    # finding the optimal values using the TRAIN-SET
    train_predicted_prob = clf.predict_proba(X_train)[:,1]
    loss_matrix = calculate_loss(train_predicted_prob, y_train, c_fn, c_fp, c_tp, c_tn)
    loss = loss_matrix['loss'].min()

    # finding optimal threshold: first row whose loss equals the minimum
    opt_thr = list(loss_matrix[loss_matrix['loss'] == loss]['prediction'])[0]
    # Binarise the TEST-SET probabilities: strictly above the threshold -> 1, otherwise 0
    predicted_prob_opt = np.where(predicted_prob > opt_thr, 1.0, 0.0)
    opt_cfm = metrics.confusion_matrix(y_test, predicted_prob_opt)

    plot_confusion_matrix(def_cfm,['bad', 'good'], "Default Confusion Matrix", 0)
    plot_confusion_matrix(opt_cfm,['bad', 'good'], "Loss-Optimized Confusion Matrix", 1)
    plt.show()
    print("Optimal threshold:\t",round(opt_thr,4))
    model_perf = pd.DataFrame( columns=columns, index=[0], data=[['Support Vector Machine',auc,loss]])
    # pd.concat replaces `DataFrame.append`, which no longer exists in pandas >= 2.0
    compare_models = pd.concat([compare_models, model_perf], ignore_index=True)
    

## Decision Trees

A Decision Tree is a recursive-partitioning technique, which recursively splits the data in order to create nodes that are<br>
purer. A pure node is a node that consists of only 1-class of those existing in the data.<br>
In our context, a pure node would be composed of either all-"bad" or all-"good" classes.<br>
The advantages of DT is that it produces rules that are easy to follow, and human-readable, in contrast to other "black-box" algorithms, such as Random-Forest<br>
DTs however, are prone to overfitting, which is why we need to use some parameters to avoid such behavior.<br>
As with __Logistic Regression__, __DT__s also require categorical features to be dummified.<br>
1. Based on what we discussed, can you offer an intuition about why DTs tend to overfit?
2. [Advanced] Can you offer some ways to avoid overfitting?

In [None]:
# Tree hyper-parameters; the Random-Forest cell further down reuses the same four values.
#md = len(cols_features)    # maximum tree depth
md = 3                     # maximum tree depth
mf = len(cols_features)    # maximum number of features to consider
min_leaf = 20              # minimum number of samples required in a leaf
criterion = 'entropy'      # split-quality measure
model = tree.DecisionTreeClassifier(max_depth=md, max_features=mf, criterion=criterion, 
                                    min_samples_leaf=min_leaf, random_state=seed)
# Both `model` and `clf` are used for the tree in the cells below
clf = model.fit(X_train, y_train)

##### Writing rules to disk, as "if-then-else" statements

In [None]:
# Write rules to file
# REMEMBER: Rules refer ONLY to the TRAINING data!
# os.path.join keeps the path valid on every OS (a literal backslash is only a
# separator on Windows).
fname_DT_rules = os.path.join('dataset', 'DT_rules_output.txt')

# get_code() (from helper_functions.py) prints the rules, so stdout is captured
# temporarily. The try/finally guarantees stdout is restored even if get_code() raises -
# otherwise the notebook would silently stop printing.
old_stdout = sys.stdout
sys.stdout = mystdout = io.StringIO()
try:
    get_code(clf, cols_features) # get_code() is a function from helper_functions.py
finally:
    sys.stdout = old_stdout
# end capture
to_text = mystdout.getvalue()
# write file; the context manager closes the handle even when the write fails
with open(fname_DT_rules, "w") as text_file:
    text_file.write(to_text)

# Using pydotplus and graphviz (both must be installed on the computer for this bit to work) in order to visualize the decision tree
PYPLOT_INST = False
GRAPHVIZ_INST = False

if PYPLOT_INST and GRAPHVIZ_INST: # ERROR!! there's some bug with deep trees (max_depth > 5)
    outfile = os.path.join('dataset', 'tree_01.dot')
    pngfile = os.path.join('dataset', 'tree_01.png')
    # Export the fitted tree as a .dot file, then render that file to PNG
    tree.export_graphviz(clf, out_file=outfile,
                         feature_names=cols_features,
                         class_names=['Bad', 'Good'],
                         filled=True, rounded=True,
                         special_characters=True)

    graph = pydotplus.graph_from_dot_file(outfile)
    graph.write_png(pngfile)

else:
    print ("Skipping on visualization")


In [None]:
# Test-set labels, class-1 ('good') probabilities, ROC curve and AUC for the decision tree
predicted = model.predict(X_test)
predicted_prob = model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(np.array(y_test), predicted_prob)
# Overwrites the `auc` value stored for the previous model
auc = metrics.auc(fpr,tpr)
print ("Area-Under-Curve:", round(auc,4))
plot_ROC(fpr,tpr, fontsz, "Receiver operating characteristic for Decision Tree Model") 

In [None]:
# Confusion matrix of the labels returned by model.predict()
def_cfm = metrics.confusion_matrix(y_test, predicted)

# finding the optimal values using the TRAIN-SET
train_predicted_prob = clf.predict_proba(X_train)[:,1]
loss_matrix = calculate_loss(train_predicted_prob, y_train, c_fn, c_fp, c_tp, c_tn)

# finding optimal threshold: first row whose loss equals the minimum
loss = loss_matrix['loss'].min()
opt_thr = list(loss_matrix.loc[loss_matrix['loss'] == loss, 'prediction'])[0]

# Binarise the TEST-SET probabilities: strictly above the threshold -> 1, otherwise 0
predicted_prob_opt = np.where(predicted_prob > opt_thr, 1.0, 0.0)
opt_cfm = metrics.confusion_matrix(y_test, predicted_prob_opt)

class_names = ['bad', 'good']
plot_confusion_matrix(def_cfm, class_names, "Default Confusion Matrix", 0)
plot_confusion_matrix(opt_cfm, class_names, "Loss-Optimized Confusion Matrix", 1)
plt.show()
print("Optimal threshold:\t",round(opt_thr,4))

In [None]:
# Add the Decision-Tree row to the results table.
# pd.concat replaces `DataFrame.append`, which no longer exists in pandas >= 2.0.
model_perf = pd.DataFrame( columns=columns, index=[0], data=[['Decision Tree',auc,loss]])
compare_models = pd.concat([compare_models, model_perf], ignore_index=True)

## Random Forest
Random Forest is an ensemble learning classification method, which utilizes multiple decision-trees,<br>
and a voting mechanism in order to classify each sample.

In [None]:
# Random forest using the same depth / feature / leaf / criterion settings as the single
# decision tree above, and the same seed. Re-binds `clf` (previously the decision tree).
clf = RandomForestClassifier(max_depth=md, max_features=mf, criterion=criterion, min_samples_leaf = min_leaf, random_state=seed)
clf = clf.fit(X_train, y_train)

In [None]:
# Hard labels and class-1 ('good') probabilities for the test set
predicted = clf.predict(X_test)
predicted_prob = clf.predict_proba(X_test)[:, 1]

# ROC curve and AUC; `thresholds` is returned by roc_curve but not used in this cell
fpr, tpr, thresholds = metrics.roc_curve(np.array(y_test), predicted_prob)
# Overwrites the `auc` value stored for the previous model
auc = metrics.auc(fpr,tpr)
print ("Area-Under-Curve:", round(auc,4))

plot_ROC(fpr,tpr, fontsz, "Receiver operating characteristic for Random Forest Model") # function is defined in helper_functions.py

In [None]:
# Confusion matrix of the labels returned by clf.predict()
def_cfm = metrics.confusion_matrix(y_test, predicted)

# finding the optimal values using the TRAIN-SET
train_predicted_prob = clf.predict_proba(X_train)[:,1]
loss_matrix = calculate_loss(train_predicted_prob, y_train, c_fn, c_fp, c_tp, c_tn)

# finding optimal threshold: first row whose loss equals the minimum
loss = loss_matrix['loss'].min()
opt_thr = list(loss_matrix.loc[loss_matrix['loss'] == loss, 'prediction'])[0]

# Binarise the TEST-SET probabilities: strictly above the threshold -> 1, otherwise 0
predicted_prob_opt = np.where(predicted_prob > opt_thr, 1.0, 0.0)
opt_cfm = metrics.confusion_matrix(y_test, predicted_prob_opt)

class_names = ['bad', 'good']
plot_confusion_matrix(def_cfm, class_names, "Default Confusion Matrix", 0)
plot_confusion_matrix(opt_cfm, class_names, "Loss-Optimized Confusion Matrix", 1)
plt.show()
print("Optimal threshold:\t",round(opt_thr,4))

In [None]:
# Add the Random-Forest row to the results table.
# pd.concat replaces `DataFrame.append`, which no longer exists in pandas >= 2.0.
model_perf = pd.DataFrame( columns=columns, index=[0], data=[['Random Forest',auc,loss]])
compare_models = pd.concat([compare_models, model_perf], ignore_index=True)

In [None]:
# Final comparison of all models as a plain-text table (column names as headers)
print (tabulate.tabulate(compare_models, headers='keys', tablefmt='psql'))

In [None]:
#thr = opt_thr
#pptest = copy.deepcopy(predicted_prob)
#pptest[pptest >  thr] = 1
#pptest[pptest <= thr] = 0

#print (sum((y_test ==0) & (pptest == 0) ))  # TN
#print (sum((y_test ==0) & (pptest == 1) ))  # FP
#print (sum((y_test ==1) & (pptest == 0) ))  # FN
#print (sum((y_test ==1) & (pptest == 1) ))  # TP

In [None]:
# Marie Curie, Erwin Schrodinger, Enrico Fermi, Robert Oppenheimer, Albert Einstein
# [18671107, 18870812, 19010929, 19040422, 18790314]