In [180]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


In [103]:
import graphviz
from graphviz import Graph

In [104]:
def split(df, stratify_by=None):
    """
    Crude train, validate, test split
    To stratify, send in a column name for the stratify_by argument
    """

    if stratify_by == None:
        train, test = train_test_split(df, test_size=.2, random_state=123)
        train, validate = train_test_split(train, test_size=.3, random_state=123)
    else:
        train, test = train_test_split(df, test_size=.2, random_state=123, stratify=df[stratify_by])
        train, validate = train_test_split(train, test_size=.3, random_state=123, stratify=train[stratify_by])

    return train, validate, test

In [105]:
df = pd.read_csv('titanic_df.csv')

In [106]:
df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [107]:
dummy1_df = pd.get_dummies(df[['embark_town']],drop_first=True)
dummy2_df = pd.get_dummies(df[['class']],drop_first=True)
df["is_female"] = (df.sex == "female")

In [108]:
df = pd.concat([df,dummy1_df,dummy2_df], axis=1)

df = df.drop(columns=['Unnamed: 0','passenger_id','age','sex', 'pclass','class', 'embarked','embark_town','deck','sibsp','parch'])


In [109]:
df.dropna(inplace=True)

In [110]:
df.head()

Unnamed: 0,survived,fare,alone,is_female,embark_town_Queenstown,embark_town_Southampton,class_Second,class_Third
0,0,7.25,0,False,0,1,0,1
1,1,71.2833,0,True,0,0,0,0
2,1,7.925,1,True,0,1,0,1
3,1,53.1,0,True,0,1,0,0
4,0,8.05,1,False,0,1,0,1


In [111]:
train, validate, test = split(df, stratify_by="survived")

In [112]:
train.shape, validate.shape, test.shape

((498, 8), (214, 8), (179, 8))

In [113]:
train.columns, validate.columns

(Index(['survived', 'fare', 'alone', 'is_female', 'embark_town_Queenstown',
        'embark_town_Southampton', 'class_Second', 'class_Third'],
       dtype='object'),
 Index(['survived', 'fare', 'alone', 'is_female', 'embark_town_Queenstown',
        'embark_town_Southampton', 'class_Second', 'class_Third'],
       dtype='object'))

In [114]:
# Setup our X inputs and y target variable for each split
X_train = train.drop(columns=['survived'])
y_train = train.survived # labeled data == supervise algorithm

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [115]:
df.survived.value_counts()

0    549
1    342
Name: survived, dtype: int64

## Exercise 1. 
### What is your baseline prediction? What is your baseline accuracy? 


In [116]:
train["most_frequent"] = 0
baseline_accuracy = (train.survived == train.most_frequent).mean()
print(f'My baseline prediction is survived = 0')
print(f'My baseline accuracy is: {baseline_accuracy:.2%}')

My baseline prediction is survived = 0
My baseline accuracy is: 61.65%


## Exercise 2.
### Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [166]:
clf = DecisionTreeClassifier(max_depth=2, random_state=123)

In [167]:
# fitting == training the model
clf = clf.fit(X_train, y_train)
clf

DecisionTreeClassifier(max_depth=2, random_state=123)

In [168]:
y_pred = clf.predict(X_train)
y_pred[0:10]

array([0, 0, 0, 1, 1, 0, 0, 0, 0, 1])

In [169]:
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[0:5]

array([[0.68644068, 0.31355932],
       [0.68644068, 0.31355932],
       [0.68644068, 0.31355932],
       [0.04255319, 0.95744681],
       [0.04255319, 0.95744681]])

## Exercise 3 & 4.
### Evaluate your in-sample results using the model score, confusion matrix, and classification report.
### Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [170]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.80


In [171]:
print(confusion_matrix(y_train, y_pred))

[[265  42]
 [ 58 133]]


In [172]:
labels = sorted(y_train.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,265,42
1,58,133


In [173]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.86      0.84       307
           1       0.76      0.70      0.73       191

    accuracy                           0.80       498
   macro avg       0.79      0.78      0.78       498
weighted avg       0.80      0.80      0.80       498



In [174]:
[tn,fp],[fn, tp] = confusion_matrix(y_train, y_pred)

In [175]:
tp, tn, fp, fn

(133, 265, 42, 58)

In [177]:
tpr = (tp / (tp+fn))
fnr = (fn / (fn+tp))
tnr = (tn / (tn+fp))
fpr = (fp / (tn+fp))

print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))
print(f'The True Positive Rate is: {tpr:.2%}')
print(f'The False Positive Rate is: {fpr:.2%}')
print(f'The True Negative Rate is: {tnr:.2%}')
print(f'The False Negative Rate is: {fnr:.2%}')

Accuracy of Decision Tree classifier on training set: 0.80
The True Positive Rate is: 69.63%
The False Positive Rate is: 13.68%
The True Negative Rate is: 86.32%
The False Negative Rate is: 30.37%


In [128]:
dot_data = export_graphviz(clf, feature_names= X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('titanic_decision_tree_2', view=True, format='pdf')

'titanic_decision_tree_2.pdf'

## Exercise 5.
### Run through steps 2-4 using a different max_depth value

In [155]:
clf = DecisionTreeClassifier(max_depth=4, random_state=123)

In [156]:
# fitting == training the model
clf = clf.fit(X_train, y_train)
clf

DecisionTreeClassifier(max_depth=4, random_state=123)

In [157]:
y_pred = clf.predict(X_train)
y_pred[0:10]

array([0, 0, 0, 1, 1, 0, 0, 0, 0, 0])

In [158]:
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[0:5]

array([[0.65909091, 0.34090909],
       [0.65909091, 0.34090909],
       [0.65909091, 0.34090909],
       [0.04878049, 0.95121951],
       [0.        , 1.        ]])

In [159]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.83


In [160]:
print(confusion_matrix(y_train, y_pred))

[[277  30]
 [ 57 134]]


In [161]:
labels = sorted(y_train.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,277,30
1,57,134


In [162]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.90      0.86       307
           1       0.82      0.70      0.75       191

    accuracy                           0.83       498
   macro avg       0.82      0.80      0.81       498
weighted avg       0.82      0.83      0.82       498



In [163]:
[tn,fp],[fn, tp] = confusion_matrix(y_train, y_pred)

In [164]:
tp, tn, fp, fn

(134, 277, 30, 57)

In [165]:
tpr = (tp / (tp+fn))
fnr = (fn / (fn+tp))
tnr = (tn / (tn+fp))
fpr = (fp / (tn+fp))


print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))
print(f'The True Positive Rate is: {tpr:.2%}')
print(f'The False Positive Rate is: {fpr:.2%}')
print(f'The True Negative Rate is: {tnr:.2%}')
print(f'The False Negative Rate is: {fnr:.2%}')
                        

Accuracy of Decision Tree classifier on training set: 0.83
The True Positive Rate is: 70.16%
The False Positive Rate is: 9.77%
The True Negative Rate is: 90.23%
The False Negative Rate is: 29.84%


In [140]:

dot_data = export_graphviz(clf, feature_names= X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('titanic_decision_tree_4', view=True, format='pdf')

'titanic_decision_tree_4.pdf'

## Exercise 6.
### Which model performs better on your in-sample data?

- **In this example max_depth of 4 appears to be a better fit than a max_depth of 2**

## Exercise 7.
### Which model performs best on your out-of-sample data, the validate set?

In [141]:
clf = DecisionTreeClassifier(max_depth=2, random_state=123)
clf = clf.fit(X_train, y_train)

In [142]:
clf.score(X_validate, y_validate)

0.7616822429906542

In [143]:
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.76


In [144]:
y_pred = clf.predict(X_validate)
y_pred[0:3]

array([1, 0, 0])

In [145]:
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.83      0.81       132
           1       0.70      0.66      0.68        82

    accuracy                           0.76       214
   macro avg       0.75      0.74      0.74       214
weighted avg       0.76      0.76      0.76       214



- Max depth changed to 4

In [146]:
clf = DecisionTreeClassifier(max_depth=4, random_state=123)
clf = clf.fit(X_train, y_train)

In [147]:
clf.score(X_validate, y_validate)

0.7850467289719626

In [148]:
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.79


In [149]:
y_pred = clf.predict(X_validate)
y_pred[0:3]

array([0, 0, 0])

In [150]:
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.87      0.83       132
           1       0.76      0.65      0.70        82

    accuracy                           0.79       214
   macro avg       0.78      0.76      0.77       214
weighted avg       0.78      0.79      0.78       214



- **The model with max_depth of 4 still appears like a better model than one with max_depth of 2**

In [179]:
validate.head()

Unnamed: 0,survived,fare,alone,is_female,embark_town_Queenstown,embark_town_Southampton,class_Second,class_Third
610,0,31.275,0,True,0,1,0,1
424,0,20.2125,0,False,0,1,0,1
568,0,7.2292,1,False,0,0,0,1
334,1,133.65,0,True,0,1,0,0
101,0,7.8958,1,False,0,1,0,1


# Random Forests / Bootstrapping

## Exercise 1. 
### Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10

In [236]:
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=1,
                            n_estimators=100,
                            max_depth=100, 
                            random_state=123)

In [237]:
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=100, random_state=123)

In [238]:
print(rf.feature_importances_)

[0.48805621 0.03647653 0.33229262 0.01358585 0.0293986  0.02195869
 0.0782315 ]


In [239]:
y_pred = rf.predict(X_train)

In [240]:
y_pred_proba = rf.predict_proba(X_train)

In [241]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.94
