<img src="http://imgur.com/1ZcRyrc.png" style="float: left; margin: 20px; height: 55px">

#  Decision Trees & Random Forests


First install Pydot: `$ conda install pydot`

In [None]:
import csv
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Step One: Load in the data!

In [None]:
# Read the Titanic CSV into a header array plus parallel lists of rows/targets.
# newline='' is what the csv module documentation asks for, so that newlines
# embedded inside quoted fields are handled by the reader itself.
with open('../datasets/titanic.csv', 'r', newline='') as csvfile:
    titanic_reader = csv.reader(csvfile, delimiter=',', quotechar='"')

    # header contains feature names
    row = next(titanic_reader)
    feature_names = np.array(row)

    # load dataset, and target classes
    # NOTE: every column (including the target in column 0) is kept in
    #   titanic_X here; `row` is left bound to the last record after the loop
    titanic_X, titanic_y = [], []
    for row in titanic_reader:
        titanic_X.append(row)
        titanic_y.append(row[0])  # The target value is "survived"

# changing to arrays (values are still strings, exactly as read from the file)
titanic_X = np.array(titanic_X)
titanic_y = np.array(titanic_y)

In [None]:
# inspect last row, header, features and target
# `row` is the loading loop's variable, so it still holds the last record read;
#   index 0 of titanic_X / titanic_y is the first data record and its target
print(f'Last Row: {row}')
print(f'Header: {feature_names}')
print(f'First Row: {titanic_X[0]}')
print(f'Target: {titanic_y[0]}')

In [None]:
# Narrow the data down to three predictors, in this order: 'class', 'age'
#   and 'sex' -- column indices 1, 4 and 3 (the 2nd, 5th and 4th features).
# The header array is reduced with the same indices so it stays aligned.
titanic_X = np.take(titanic_X, [1, 4, 3], axis=1)
feature_names = np.take(feature_names, [1, 4, 3])
feature_names

## Step Two: Preprocess Data

In [None]:
# peek at the 'age' column (index 1), leaving out entries that are empty strings
ages = titanic_X[:, 1]
ages[ages != '']

In [None]:
# 'age' contains missing values (empty strings), so we're going to assign the
#   mean of the ages that are present to every missing entry
ages = titanic_X[:, 1]
missing_age = ages == ''

# convert the known ages from strings to floats, then average
# (the builtin `float` is used because the `np.float` alias was deprecated in
#   NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError)
mean_age = np.mean(ages[~missing_age].astype(float))

# update 'age' column; the array still holds strings at this point, so the
#   mean is stored as text and becomes a number when the array is cast later
titanic_X[missing_age, 1] = mean_age

In [None]:
## encode 'sex' as a categorical variable
# normalize our class variables by giving them easily interpreted labels

from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder()

# create categorical classes for 'sex' (the 3rd variable, column index 2)
# only the categories are learned here; the column itself is not modified yet
label_encoder = enc.fit(titanic_X[:, 2])

# show the distinct categories the encoder found
print("Categorical classes:", label_encoder.classes_)

In [None]:
# create numerical classes for 'sex' by encoding the learned categories themselves
integer_classes = label_encoder.transform(label_encoder.classes_)
print("Integer classes:", integer_classes)

# update 'sex' column in place with the encoded values
# (the whole array is cast to float in a later step)
titanic_X[:, 2] = label_encoder.transform(titanic_X[:, 2])

In [None]:
# inspect one row of X and y (index 5), printed under the feature names
print(feature_names, 'target')
print(titanic_X[5], titanic_y[5])

In [None]:
# now encode 'class', which has more than 2 possible values
from sklearn.preprocessing import OneHotEncoder

# create categorical classes for 'class' (the 1st variable)
# NOTE: `enc` and `label_encoder` are rebound here, replacing the encoder
#   that was fitted on 'sex'
enc = LabelEncoder()
label_encoder = enc.fit(titanic_X[:, 0])

print("Categorical classes:", label_encoder.classes_)

# create numerical classes for 'class'
integer_classes = label_encoder.transform(label_encoder.classes_)
print("Integer classes:", integer_classes)

# fit a one-hot encoder on the integer codes, reshaped into a single column
# NOTE: `enc` is rebound once more, this time to the OneHotEncoder
enc = OneHotEncoder(categories='auto')
one_hot_encoder = enc.fit(integer_classes.reshape(-1, 1))

In [None]:
# first, convert classes to integers using label_encoder
# (the 'class' values are read from column index 0 of titanic_X)
t = label_encoder.transform(titanic_X[:, 0])
# preview the first ten encoded values
t[:10]

In [None]:
# second, create a sparse matrix with three columns, 
#   each one indicating if the instance belongs to the class
# (the integer codes are reshaped into a single column before encoding)
new_features = one_hot_encoder.transform(t.reshape(-1, 1))
new_features

In [None]:
# widen titanic_X with the one-hot indicator columns (densified first),
#   placing them to the right of the existing columns
titanic_X = np.hstack((titanic_X, new_features.toarray()))

# the original 'class' column (index 0) is now redundant, so drop it
titanic_X = np.delete(titanic_X, 0, axis=1)

In [None]:
# update feature names to match the new column layout:
#   age, sex, then the one-hot 'class' indicator columns appended above
feature_names = ['age', 'sex', 'first_class', 'second_class', 'third_class']

# convert to numerical values (both arrays have held strings up to this point)
titanic_X = titanic_X.astype(float)
titanic_y = titanic_y.astype(float)

# inspect the first record and its target after conversion
print(feature_names)
print(titanic_X[0], titanic_y[0])

## Step Three: Create Split!
<img src="https://media.giphy.com/media/LWQVG2QJwUgla/giphy.gif" style="display: block; margin: 0 auto; height: 150px">

In [None]:
# hold out a quarter of the records for testing; the fixed seed keeps the
#   split reproducible between runs
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    titanic_X,
    titanic_y,
    test_size=0.25,
    random_state=33,
)

## Step Four: Decision Trees!

In [None]:
from sklearn import tree

# build a tree that uses entropy to measure information gain, limited to
#   depth 3 with at least 5 samples per leaf, and fit it on the training
#   split in the same expression
clf = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_depth=3,
    min_samples_leaf=5,
).fit(X_train, y_train)

In [None]:
# visualize our trained tree, using pydot
import pydot
import os
from io import StringIO
# in-memory text buffer that will receive the Graphviz DOT source
dot_data = StringIO() 

# export the fitted tree in DOT format into the buffer (no file is written yet)
# NOTE(review): these labels are spelled differently from the `feature_names`
#   list defined earlier in the notebook -- confirm that is intentional
tree.export_graphviz(clf, out_file=dot_data, feature_names=['age','sex','1st_class','2nd_class','3rd_class']) 
# parse the DOT source; the result is indexed below, i.e. used as a sequence of graphs
graph = pydot.graph_from_dot_data(dot_data.getvalue()) 
# write the first graph to 'titanic.pdf' (a relative path, so the working directory)
graph[0].write_pdf('titanic.pdf')

# NOTE: the file produced above is a PDF, despite the wording of the message
print('Image created!')
print('Check it out here:', os.getcwd())

Notice that we just **created a PDF** (`titanic.pdf`) in our working directory.
- Navigate to it in your file browser (e.g. Finder on macOS) and *check it out*!

## Step Five: Measure Performance

> **_Recall_**,
> - **Precision** = true positives/(true positives + false positives)  
> i.e. The ability of the classifier to not label a negative sample as positive
> - **Recall** = true positives/(true positives + false negatives)  
> i.e. The ability of the classifier to find all positive samples
> - **f1**  = 2 * (precision * recall) / (precision + recall)

In [None]:
# create function to measure model performance
from sklearn import metrics

def measure_performance(X, y, clf, show_accuracy=True, show_classification_report=True,
                        show_confusion_matrix=True):
    """Print evaluation metrics for an already-fitted classifier.

    Predictions are computed once with ``clf.predict(X)`` and compared with
    ``y``. Each ``show_*`` flag switches one printed section on or off:
    the accuracy score, the classification report, and the confusion matrix.
    Nothing is returned.
    """
    predicted = clf.predict(X)

    if show_accuracy:
        accuracy = metrics.accuracy_score(y, predicted)
        print("Accuracy: {0:.3f}".format(accuracy), "\n")

    if show_classification_report:
        report = metrics.classification_report(y, predicted)
        print("Classification report:")
        print(report, "\n")

    if show_confusion_matrix:
        matrix = metrics.confusion_matrix(y, predicted)
        print("Confusion matrix:")
        print(matrix, "\n")

# measure accuracy, precision, recall, f1 in the training set
# (the two keyword arguments repeat the function's defaults, so all three
#   sections -- accuracy, classification report, confusion matrix -- are printed)
measure_performance(X_train, y_train, clf, show_classification_report=True, 
                    show_confusion_matrix=True)

In [None]:
# perform leave-one-out cross validation to better measure performance, reducing variance
from sklearn.model_selection import cross_val_score, LeaveOneOut
from scipy.stats import sem

# inspect documentation for LeaveOneOut
# help(LeaveOneOut)

In [None]:
# create function to perform Leave-One-Out cross validation
def loo_cv(X_train, y_train, clf):
    """Print the leave-one-out cross-validated accuracy of a classifier.

    Every sample is held out exactly once: the model is fitted on all other
    samples and scored on the held-out one. The mean of those per-sample
    scores and its standard error are printed as
    ``Mean score: <mean> (+/-<sem>)``. Nothing is returned.

    Parameters
    ----------
    X_train : numpy array of samples (first axis indexes samples).
    y_train : numpy array of targets aligned with ``X_train``; targets and
        predictions are cast to ``int`` before scoring.
    clf : scikit-learn estimator. It is cloned, so the instance passed in is
        NOT refitted and keeps whatever state it had before the call.
    """
    # imported locally so the function does not depend on another cell
    from sklearn.base import clone

    # refitting `clf` itself would leave the caller's model trained on all but
    # the last sample; an unfitted copy with the same parameters avoids that
    model = clone(clf)

    n_samples = X_train.shape[0]
    scores = np.zeros(n_samples)

    for train_index, test_index in LeaveOneOut().split(X_train):
        model.fit(X_train[train_index], y_train[train_index])
        y_pred = model.predict(X_train[test_index])
        # test_index holds the single held-out position, so each pass fills
        # exactly one slot of `scores`
        scores[test_index] = metrics.accuracy_score(
            y_train[test_index].astype(int), y_pred.astype(int)
        )

    print("Mean score: {0:.3f} (+/-{1:.3f})".format(np.mean(scores), sem(scores)))

# run leave-one-out cross validation on the training split with the current classifier
loo_cv(X_train, y_train, clf)

## Step Six: Improve the Model

In [None]:
# first, try for better results using Random Forests
from sklearn.ensemble import RandomForestClassifier

# a forest of 10 trees with a fixed seed
# NOTE: `clf` is rebound here, replacing the decision tree it referred to before
clf = RandomForestClassifier(n_estimators=10, random_state=33)
# evaluate the forest with the same leave-one-out procedure on the training split
loo_cv(X_train, y_train, clf)

In [None]:
## now, go back to the decision tree model and try tuning hyperparameters
# attempt 1: entropy criterion, depth capped at 3, at least 5 samples per leaf
clf_dt = tree.DecisionTreeClassifier(criterion='entropy', 
                                     max_depth=3,
                                     min_samples_leaf=5)
# fit on the training split ...
clf_dt.fit(X_train, y_train)

# ... and report accuracy, classification report and confusion matrix on the test split
measure_performance(X_test, y_test, clf_dt)

# Inspect documentation for DecisionTreeClassifier
# help(tree.DecisionTreeClassifier)

In [None]:
# attempt 2: switch the split criterion to gini and require at least
#   10 samples per leaf (depth is still capped at 3)
clf_dt = tree.DecisionTreeClassifier(criterion='gini', max_depth=3, min_samples_leaf=10)
clf_dt.fit(X_train, y_train)

# score the refitted tree on the held-out test split
measure_performance(X_test, y_test, clf_dt)

### A New Measure: the ROC and Area Under a Curve (AUC)

One way we can score a binary classifier is by plotting the *receiver operating characteristic (ROC)* curve and determining the value of the *area under the curve (AUC)*. 
- Like above, our goal is to see an _**AUC** as close to 1 as possible_.

In [None]:
# first, score every training sample with the fitted tree: column 1 of
#   predict_proba is taken as the score for the positive class
#   (presumably label 1 -- see the pos_label note below)
# syntax: roc_curve(actual, prediction, [pos_label if it's not 1])
# NOTE(review): the curve is built from the training split, not the test split
predictions = list(clf_dt.predict_proba(X_train)[:, 1])
fpr, tpr, thresholds = metrics.roc_curve(y_train, predictions)

# next, integrate the curve to get the auc
roc_auc = metrics.auc(fpr, tpr)

In [None]:
# draw the ROC curve computed above on a fresh figure
plt.figure()
lw = 2  # line width shared by both lines

# the classifier's curve, with the AUC shown in the legend label
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
# dashed diagonal from (0, 0) to (1, 1) as a visual reference
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
# y-axis upper limit is 1.05, slightly above 1.0
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

In [None]:
# print the raw arrays returned by roc_curve
print("False-positive rate:", fpr)
print("True-positive rate: ", tpr)
print("Thresholds:         ", thresholds)

# compute the AUC directly from the labels and scores; as the last expression
#   of the cell, its value is what the notebook displays
metrics.roc_auc_score(y_train, predictions)

## Extra Practice!

+ Change some of the assumptions we've made throughout the lab (imputation, tree depth, samples per leaf, etc.) to see how that changes the accuracy.

+ Try to find the most accurate model you can; talk about what you did and how it relates to the bias-variance tradeoff.