# Exercise 19: A Decision Tree in Scikit-Learn

In [None]:
import numpy as np #numerical computation
import pandas as pd #data wrangling
import matplotlib.pyplot as plt #plotting package
#Next line helps with rendering plots
%matplotlib inline
import matplotlib as mpl #add'l plotting functionality
mpl.rcParams['figure.dpi'] = 400 #high res figures
import graphviz #to visualize decision trees

In [None]:
# Load the cleaned data set; the path is resolved relative to the working directory.
df = pd.read_csv(filepath_or_buffer='../Data/Chapter_1_cleaned_data.csv')

In [None]:
# Start from every column name in the frame; unwanted ones are filtered out below.
features_response = list(df.columns)

In [None]:
# Column names that are dropped from the feature/response list in the next cell.
items_to_remove = (
    ['ID', 'SEX']
    + ['PAY_' + str(month) for month in range(2, 7)]
    + ['EDUCATION_CAT']
    + ['graduate school', 'high school', 'none', 'others', 'university']
)

In [None]:
# Drop the excluded column names while keeping the remaining ones in their original order.
features_response = list(
    filter(lambda column: column not in items_to_remove, features_response)
)
features_response

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import tree

In [None]:
# Hold out 20% of the rows for testing. Every entry of features_response except
# the last is used as a feature; the response column is selected by name.
X_train, X_test, y_train, y_test = train_test_split(
    df[features_response[:-1]].values,
    df['default payment next month'].values,
    test_size=0.2,
    random_state=24,
)

In [None]:
# Shallow tree (two levels of splits) so the fitted model is easy to visualize.
# random_state is fixed, as for the other estimators in this notebook, because
# scikit-learn permutes the features at each split and ties between equally good
# splits would otherwise be broken differently from run to run.
dt = tree.DecisionTreeClassifier(max_depth=2, random_state=4)

In [None]:
# Fit the decision tree (max_depth=2 at this point) to the training split.
dt.fit(X_train, y_train)

Note: the cells below require graphviz to be installed. If `import graphviz` fails after installation, see https://stackoverflow.com/questions/33433274/anaconda-graphviz-cant-import-after-installation

In [None]:
# This saves a .dot file to disk that could be turned into an image file
# tree.export_graphviz(dt, out_file='../../../Drafts/Chapter 5/Graphics/Ex_1_3.dot',
#                      filled=True, rounded=True,
#                      feature_names=features_response[:-1],
#                      proportion=True, class_names=['Not defaulted', 'Defaulted']) 

In [None]:
# Export the fitted tree as DOT source. With out_file=None nothing is written to
# disk and the DOT text is returned, so it can be handed to graphviz below.
dot_data = tree.export_graphviz(
    dt,
    out_file=None,
    feature_names=features_response[:-1],
    class_names=['Not defaulted', 'Defaulted'],
    proportion=True,
    filled=True,
    rounded=True,
)

In [None]:
# dot_data

In [None]:
# Wrap the DOT source in a graphviz Source object; leaving it as the cell's
# last expression displays the tree diagram.
graph = graphviz.Source(dot_data) 
graph

In [None]:
# Position of PAY_1 within the feature list (the final entry of
# features_response is sliced off, as it is wherever the features are selected).
features_response[:-1].index('PAY_1')

In [None]:
# Dimensions of the training feature array: (rows, columns).
X_train.shape

In [None]:
# Fraction of training samples whose PAY_1 value is <= 1.5
# (presumably the first split shown in the tree diagram -- confirm against the graph).
# The column is looked up by name instead of hard-coding its position, and
# np.mean of the boolean mask replaces the slow built-in sum()/len division;
# both give the same proportion.
pay_1_col = features_response[:-1].index('PAY_1')
np.mean(X_train[:, pay_1_col] <= 1.5)

In [None]:
# Mean of the training labels; for a 0/1 response this is the positive-class fraction.
y_train.mean()

In [None]:
# Lift the depth cap on the existing estimator (in scikit-learn, max_depth=None
# means no explicit limit); this takes effect at the next call to fit.
dt.max_depth = None

In [None]:
# Refit the same estimator on the training split, now with max_depth=None.
dt.fit(X_train, y_train)

In [None]:
# This saves a .dot file to disk that could be turned into an image file
# tree.export_graphviz(dt, out_file='../../../Drafts/Chapter 5/Graphics/Ex_1_8.dot',
#                      filled=True, rounded=True,
#                      feature_names=features_response[:-1],
#                      proportion=True, class_names=['Not defaulted', 'Defaulted']) 

# Training Decision Trees: Node Impurity

In [None]:
# Grid of probabilities for class 0 (0.01 to 0.99 in 99 evenly spaced points)
# and the complementary probabilities for class 1.
pm0 = np.linspace(start=0.01, stop=0.99, num=99)
pm1 = 1 - pm0

In [None]:
# Inspect the grid of class-0 probabilities.
pm0

In [None]:
# Inspect the complementary class-1 probabilities (1 - pm0).
pm1

In [None]:
# Misclassification rate at each grid point: the smaller of the two class probabilities.
misclassification_rate = np.where(pm0 <= pm1, pm0, pm1)

In [None]:
# Inspect the element-wise minimum of pm0 and pm1.
misclassification_rate

In [None]:
# Plot the misclassification rate against the class-0 probability.
mpl.rcParams['figure.dpi'] = 400
impurity_ax = plt.gca()
impurity_ax.plot(pm0, misclassification_rate, label='Misclassification rate')
impurity_ax.set_xlabel('$p_{m0}$')
impurity_ax.legend()

In [None]:
# Gini impurity for two classes: the sum of p * (1 - p) over both class probabilities.
gini = sum(p * (1 - p) for p in (pm0, pm1))

In [None]:
# Overlay Gini impurity on the misclassification rate, both against the class-0 probability.
mpl.rcParams['figure.dpi'] = 400
for curve, curve_label in [(misclassification_rate, 'Misclassification rate'),
                           (gini, 'Gini impurity')]:
    plt.plot(pm0, curve, label=curve_label)
plt.xlabel('$p_{m0}$')
plt.legend()

In [None]:
# Cross entropy for two classes: the negated sum of p * log(p) over both class probabilities.
cross_ent = -(pm0 * np.log(pm0) + pm1 * np.log(pm1))

In [None]:
# Compare all three impurity measures on one set of axes.
mpl.rcParams['figure.dpi'] = 400
for curve, curve_label in [(misclassification_rate, 'Misclassification rate'),
                           (gini, 'Gini impurity'),
                           (cross_ent, 'Cross entropy')]:
    plt.plot(pm0, curve, label=curve_label)
plt.xlabel('$p_{m0}$')
plt.legend()

# Using Decision Trees: Advantages and Predicted Probabilities
Based on https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html

In [None]:
from sklearn.datasets import make_circles

In [None]:
# Synthetic two-class data set: 300 noisy points generated by make_circles,
# with a fixed seed so the same points are produced on every run.
X_circ, y_circ = make_circles(
    n_samples=300,
    shuffle=True,
    noise=0.1,
    factor=0.4,
    random_state=1,
)

In [None]:
from matplotlib.colors import ListedColormap

In [None]:
# cm: red-blue colormap, used below for the filled probability contours.
cm = plt.cm.RdBu
# cm_bright: two-colour map (pure red, pure blue), used below to colour the
# scatter points by class label.
cm_bright = ListedColormap(['#FF0000', '#0000FF'])

In [None]:
# Scatter plot of the synthetic data, coloured by class label, with equal axis
# scaling and no tick marks.
ax = plt.axes()
ax.set_title('Nonlinear data for classification')
ax.set_aspect('equal')
ax.scatter(X_circ[:, 0], X_circ[:, 1], c=y_circ, cmap=cm_bright)
ax.set_xticks([])
ax.set_yticks([])

In [None]:
# Build a regular grid (step h) covering the data range padded by 0.5 on every
# side; the classifiers are evaluated on this grid further down.
h = 0.02
x_min = X_circ[:, 0].min() - .5
x_max = X_circ[:, 0].max() + .5
y_min = X_circ[:, 1].min() - .5
y_max = X_circ[:, 1].max() + .5
xx, yy = np.meshgrid(
    np.arange(x_min, x_max, h),
    np.arange(y_min, y_max, h),
)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Models to compare, with matching panel titles (same order in both lists).
# The tree is limited to depth 4 with a fixed seed; logistic regression uses
# the library defaults.
titles = ['Decision tree', 'Logistic regression']
classifiers = [
    tree.DecisionTreeClassifier(max_depth=4, random_state=4),
    LogisticRegression(),
]

In [None]:
# One panel per classifier: fit on the synthetic data, evaluate the predicted
# probability of the second class (column 1 of predict_proba) on the grid, and
# draw it as filled contours underneath the data points.
# enumerate/zip pairs each model with its title directly, replacing the manual
# counter that had to be incremented and used to index `titles`.
for panel, (classif, title) in enumerate(zip(classifiers, titles), start=1):
    ax = plt.subplot(1, len(classifiers), panel)
    classif.fit(X_circ, y_circ)
    # Flatten the grid to an (n_points, 2) array for prediction, then restore its shape.
    Z = classif.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
    Z = Z.reshape(xx.shape)
    ax.contourf(xx, yy, Z, cmap=cm, alpha=0.8)
    ax.scatter(X_circ[:, 0], X_circ[:, 1], c=y_circ, cmap=cm_bright)
    ax.set_aspect('equal')
    ax.set_title(title)
    ax.set_xticks([])
    ax.set_yticks([])

# Exercise 20: Finding Optimal Hyperparameters for a Decision Tree

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Reminder of the training set dimensions before running the grid search.
X_train.shape

In [None]:
# Candidate tree depths for the grid search.
# A denser alternative would be every depth from 1 to 12: list(range(1, 13)).
params = dict(max_depth=[1, 2, 4, 6, 8, 10, 12])

In [None]:
# 4-fold cross-validated grid search over the tree depths, scored by ROC AUC.
# Training scores are kept so they can be plotted next to the validation scores,
# and the best model is refit at the end.
cv = GridSearchCV(
    dt,
    param_grid=params,
    scoring='roc_auc',
    cv=4,
    refit=True,
    return_train_score=True,
    error_score=np.nan,
    n_jobs=None,
    pre_dispatch=None,
    verbose=1,
)

In [None]:
# Run the cross-validated grid search over max_depth on the training split.
cv.fit(X_train, y_train)

In [None]:
# cv.cv_results_

In [None]:
# Tabulate the grid search results, one row per max_depth candidate.
cv_results_df = pd.DataFrame(data=cv.cv_results_)

In [None]:
# Display the full table of grid search results.
cv_results_df

In [None]:
# List the available result columns (the plot below uses the mean/std score columns).
cv_results_df.columns

In [None]:
# Training vs. cross-validation ROC AUC as a function of tree depth.
# Error bars show +/- one standard deviation of the score.
# The labels are raw strings so that the LaTeX command \pm is not parsed as an
# (invalid) Python escape sequence; the rendered text is unchanged.
ax = plt.axes()
ax.errorbar(cv_results_df['param_max_depth'],
            cv_results_df['mean_train_score'],
            yerr=cv_results_df['std_train_score'],
            label=r'Mean $\pm$ 1 SD training scores')
ax.errorbar(cv_results_df['param_max_depth'],
            cv_results_df['mean_test_score'],
            yerr=cv_results_df['std_test_score'],
            label=r'Mean $\pm$ 1 SD testing scores')
ax.legend()
plt.xlabel('max_depth')
plt.ylabel('ROC AUC')

In [None]:
# Column-wise maxima of the numeric result columns (e.g. the best mean test score).
# numeric_only=True skips the `params` column of dicts, which cannot be ordered
# and makes a plain .max() raise TypeError on current pandas versions.
cv_results_df.max(numeric_only=True)

# Exercise 21: Fitting a Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Baseline random forest: 10 trees of depth 3 with a fixed seed. The remaining
# arguments spell out the settings used for this exercise explicitly.
# max_features='sqrt' is what 'auto' meant for classifiers; 'auto' was deprecated
# in scikit-learn 1.1 and removed in 1.3, so the explicit value keeps the model
# identical while still running on current releases.
rf = RandomForestClassifier(
    n_estimators=10, criterion='gini', max_depth=3,
    min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
    max_features='sqrt', max_leaf_nodes=None, min_impurity_decrease=0.0,
    bootstrap=True, oob_score=False, n_jobs=None,
    random_state=4, verbose=0, warm_start=False, class_weight=None)

In [None]:
# Forest sizes to try: 10, 20, ..., 100 trees.
rf_params_ex = dict(n_estimators=[10 * step for step in range(1, 11)])

In [None]:
# 4-fold cross-validated grid search over the number of trees, scored by ROC AUC.
# The `fit_params` and `iid` arguments were dropped: both have been removed from
# GridSearchCV's constructor in current scikit-learn (passing them raises
# TypeError), and iid=False is the behaviour that remains, so results are
# unaffected. This also matches the decision tree grid search defined earlier.
cv_rf_ex = GridSearchCV(rf, param_grid=rf_params_ex, scoring='roc_auc',
                        n_jobs=None, refit=True, cv=4, verbose=1,
                        pre_dispatch=None, error_score=np.nan, return_train_score=True)

In [None]:
# Run the grid search over n_estimators on the training split.
cv_rf_ex.fit(X_train, y_train)

In [None]:
# Tabulate the random forest grid search results, one row per n_estimators candidate.
cv_rf_ex_results_df = pd.DataFrame(data=cv_rf_ex.cv_results_)

In [None]:
# Display the full table of random forest grid search results.
cv_rf_ex_results_df

In [None]:
# Column-wise maxima of the numeric result columns.
# numeric_only=True skips the `params` column of dicts, which cannot be ordered
# and makes a plain .max() raise TypeError on current pandas versions.
cv_rf_ex_results_df.max(numeric_only=True)

In [None]:
# Left panel: mean fit time vs. number of trees.
# Right panel: mean cross-validation ROC AUC vs. number of trees, with error
# bars of one standard deviation.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(6, 3))
axs[0].plot(cv_rf_ex_results_df['param_n_estimators'],
            cv_rf_ex_results_df['mean_fit_time'],
            '-o')
axs[0].set_xlabel('Number of trees')
axs[0].set_ylabel('Mean fit time (seconds)')
axs[1].errorbar(cv_rf_ex_results_df['param_n_estimators'],
                cv_rf_ex_results_df['mean_test_score'],
                yerr=cv_rf_ex_results_df['std_test_score'])
axs[1].set_xlabel('Number of trees')
# Raw string so the LaTeX command \pm is not parsed as an (invalid) Python
# escape sequence; the rendered label is unchanged.
axs[1].set_ylabel(r'Mean testing ROC AUC $\pm$ 1 SD ')
plt.tight_layout()

In [None]:
# Hyperparameter setting selected by the grid search.
cv_rf_ex.best_params_

In [None]:
# Pair each feature name with its importance in the refit best forest.
best_rf_ex = cv_rf_ex.best_estimator_
feat_imp_df = pd.DataFrame({
    'Feature name': features_response[:-1],
    'Importance': best_rf_ex.feature_importances_,
})

In [None]:
# Show the features ranked from most to least important.
feat_imp_df.sort_values(by='Importance', ascending=False)

# Checkerboard Graph

In [None]:
# 5 x 5 coordinate grids for the example: xx varies along columns, yy along rows.
example_axis = range(5)
xx_example, yy_example = np.meshgrid(example_axis, example_axis)
print(xx_example, yy_example, sep='\n')

In [None]:
# Values 1..16 arranged as a 4 x 4 array, one value per grid square.
z_example = np.arange(start=1, stop=17).reshape((4, 4))
z_example

In [None]:
# Checkerboard example: the 5 x 5 grids give the corners of the squares and the
# 4 x 4 array supplies the colour value of each square.
ax = plt.axes()
ax.set_xlabel('X coordinate')
pcolor_ex = ax.pcolormesh(
    xx_example,
    yy_example,
    z_example,
    cmap=plt.cm.jet,
)
plt.colorbar(pcolor_ex, label='Color scale')
ax.set_ylabel('Y coordinate')

# Activity 5: Cross-Validation Grid Search with Random Forest

In [None]:
# 4 x 4 hyperparameter grid for the activity: tree depth by forest size.
rf_params = dict(
    max_depth=[3, 6, 9, 12],
    n_estimators=[10, 50, 100, 200],
)

In [None]:
# 4-fold cross-validated grid search over depth and forest size, scored by
# ROC AUC and run on all available cores (n_jobs=-1).
# The `fit_params` and `iid` arguments were dropped: both have been removed from
# GridSearchCV's constructor in current scikit-learn (passing them raises
# TypeError), and iid=False is the behaviour that remains, so results are
# unaffected.
cv_rf = GridSearchCV(rf, param_grid=rf_params, scoring='roc_auc',
                     n_jobs=-1, refit=True, cv=4, verbose=2,
                     error_score=np.nan, return_train_score=True)

In [None]:
# Run the grid search over max_depth and n_estimators on the training split.
cv_rf.fit(X_train, y_train)

In [None]:
# Tabulate the grid search results, one row per (max_depth, n_estimators) combination.
cv_rf_results_df = pd.DataFrame(data=cv_rf.cv_results_)

In [None]:
# Display the full table of grid search results.
cv_rf_results_df

In [None]:
# Column-wise maxima of the numeric result columns.
# numeric_only=True skips the `params` column of dicts, which cannot be ordered
# and makes a plain .max() raise TypeError on current pandas versions.
cv_rf_results_df.max(numeric_only=True)

Of the max_depth values tried (3, 6, 9, 12), 9 looks best. For the number of trees, more is better.

In [None]:
# 5 x 5 corner coordinates for the 4 x 4 grid of hyperparameter combinations.
rf_grid_axis = range(5)
xx_rf, yy_rf = np.meshgrid(rf_grid_axis, rf_grid_axis)

In [None]:
# Colormap used for the grid search heat map below.
cm_rf = plt.cm.jet

In [None]:
# Heat map of the mean cross-validation ROC AUC for every hyperparameter
# combination: number of trees along x, maximum depth along y. Ticks sit at the
# centre of each square and are labelled with the actual parameter values.
ax_rf = plt.axes()
mean_auc_grid = cv_rf_results_df['mean_test_score'].values.reshape((4, 4))
pcolor_graph = ax_rf.pcolormesh(xx_rf, yy_rf, mean_auc_grid, cmap=cm_rf)
plt.colorbar(pcolor_graph, label='Average testing ROC AUC')
ax_rf.set_aspect('equal')
square_centres = [corner + 0.5 for corner in range(4)]
ax_rf.set_xticks(square_centres)
ax_rf.set_yticks(square_centres)
ax_rf.set_xticklabels(list(map(str, rf_params['n_estimators'])))
ax_rf.set_yticklabels(list(map(str, rf_params['max_depth'])))
ax_rf.set_xlabel('Number of trees')
ax_rf.set_ylabel('Maximum depth')

In [None]:
# Hyperparameter combination selected by the grid search.
cv_rf.best_params_

In [None]:
# Pair each feature name with its importance in the refit best forest.
best_rf_act = cv_rf.best_estimator_
feat_imp_df_act = pd.DataFrame({
    'Feature name': features_response[:-1],
    'Importance': best_rf_act.feature_importances_,
})

In [None]:
# Show the features ranked from most to least important.
feat_imp_df_act.sort_values(by='Importance', ascending=False)