In [20]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


In [35]:
import graphviz
from graphviz import Graph

In [22]:
def split(df, stratify_by=None):
    """
    Crude train, validate, test split
    To stratify, send in a column name for the stratify_by argument
    """

    if stratify_by == None:
        train, test = train_test_split(df, test_size=.2, random_state=123)
        train, validate = train_test_split(train, test_size=.3, random_state=123)
    else:
        train, test = train_test_split(df, test_size=.2, random_state=123, stratify=df[stratify_by])
        train, validate = train_test_split(train, test_size=.3, random_state=123, stratify=train[stratify_by])

    return train, validate, test

In [23]:
df = pd.read_csv('titanic_df.csv')

In [24]:
df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [25]:
dummy1_df = pd.get_dummies(df[['embark_town']],drop_first=True)
dummy2_df = pd.get_dummies(df[['class']],drop_first=True)
df["is_female"] = df.sex == "Female"

In [26]:
df = pd.concat([df,dummy1_df,dummy2_df], axis=1)

df = df.drop(columns=['Unnamed: 0','sex', 'pclass','class', 'embarked','embark_town','deck'])


In [27]:
df.dropna(inplace=True)

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   passenger_id             714 non-null    int64  
 1   survived                 714 non-null    int64  
 2   age                      714 non-null    float64
 3   sibsp                    714 non-null    int64  
 4   parch                    714 non-null    int64  
 5   fare                     714 non-null    float64
 6   alone                    714 non-null    int64  
 7   is_female                714 non-null    bool   
 8   embark_town_Queenstown   714 non-null    uint8  
 9   embark_town_Southampton  714 non-null    uint8  
 10  class_Second             714 non-null    uint8  
 11  class_Third              714 non-null    uint8  
dtypes: bool(1), float64(2), int64(5), uint8(4)
memory usage: 48.1 KB


In [29]:
train, validate, test = split(df, stratify_by="survived")

In [30]:
train.shape, validate.shape, test.shape

((399, 12), (172, 12), (143, 12))

In [31]:
# Setup our X inputs and y target variable for each split
X_train = train.drop(columns=['survived'])
y_train = train.survived # labeled data == supervise algorithm

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [32]:
df.survived.value_counts()

0    424
1    290
Name: survived, dtype: int64

## Exercise 1. 
### What is your baseline prediction? What is your baseline accuracy? 


In [33]:
train["most_frequent"] = 0
baseline_accuracy = (train.survived == train.most_frequent).mean()
print(f'My baseline prediction is survived = 0')
print(f'My baseline accuracy is: {baseline_accuracy:.2%}')

My baseline prediction is survived = 0
My baseline accuracy is: 59.40%


## Exercise 2.
### Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [86]:
clf = DecisionTreeClassifier(max_depth=2, random_state=123)

In [87]:
# fitting == training the model
clf = clf.fit(X_train, y_train)
clf

DecisionTreeClassifier(max_depth=2, random_state=123)

In [88]:
y_pred = clf.predict(X_train)
y_pred[0:10]

array([0, 0, 1, 1, 1, 1, 1, 0, 1, 0])

In [89]:
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[0:5]

array([[0.78756477, 0.21243523],
       [0.78756477, 0.21243523],
       [0.45212766, 0.54787234],
       [0.45212766, 0.54787234],
       [0.45212766, 0.54787234]])

## Exercise 3 & 4.
### Evaluate your in-sample results using the model score, confusion matrix, and classification report.
### Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [90]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.68


In [91]:
print(confusion_matrix(y_train, y_pred))

[[152  85]
 [ 41 121]]


In [92]:
labels = sorted(y_train.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,152,85
1,41,121


In [93]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.64      0.71       237
           1       0.59      0.75      0.66       162

    accuracy                           0.68       399
   macro avg       0.69      0.69      0.68       399
weighted avg       0.71      0.68      0.69       399



In [94]:
[tn,fp],[fn, tp] = confusion_matrix(y_train, y_pred)

In [95]:
tp, tn, fp, fn

(121, 152, 85, 41)

## Exercise 5.
### Run through steps 2-4 using a different max_depth value

In [60]:
clf = DecisionTreeClassifier(max_depth=4, random_state=123)

In [100]:
# fitting == training the model
clf = clf.fit(X_train, y_train)
clf

DecisionTreeClassifier(max_depth=2, random_state=123)

In [None]:
import graphviz
from graphviz import Graph

dot_data = export_graphviz(clf, feature_names= X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('titanic_decision_tree', view=True, format='pdf')

In [63]:
y_pred = clf.predict(X_train)
y_pred[0:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [83]:
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[0:5]

array([[0.78756477, 0.21243523],
       [0.78756477, 0.21243523],
       [0.45212766, 0.54787234],
       [0.45212766, 0.54787234],
       [0.45212766, 0.54787234]])

## Exercise 3 & 4.
### Evaluate your in-sample results using the model score, confusion matrix, and classification report.
### Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [65]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.74


In [66]:
print(confusion_matrix(y_train, y_pred))

[[219  18]
 [ 87  75]]


In [67]:
labels = sorted(y_train.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,219,18
1,87,75


In [68]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.92      0.81       237
           1       0.81      0.46      0.59       162

    accuracy                           0.74       399
   macro avg       0.76      0.69      0.70       399
weighted avg       0.75      0.74      0.72       399



In [69]:
[tn,fp],[fn, tp] = confusion_matrix(y_train, y_pred)

In [70]:
tp, tn, fp, fn

(75, 219, 18, 87)

## Exercise 6.
### Which model performs better on your in-sample data?

- **In this example max_depth of 4 appears to be a better fit than a max_depth of 2**

## Exercise 7.
### Which model performs best on your out-of-sample data, the validate set?

In [111]:
clf = DecisionTreeClassifier(max_depth=2, random_state=123)
clf = clf.fit(X_train, y_train)

In [112]:
clf.score(X_validate, y_validate)

0.6395348837209303

In [102]:
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.64


In [103]:
y_pred = clf.predict(X_validate)
y_pred[0:3]

array([0, 0, 1])

In [104]:
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.64      0.68       102
           1       0.55      0.64      0.59        70

    accuracy                           0.64       172
   macro avg       0.64      0.64      0.63       172
weighted avg       0.65      0.64      0.64       172



- Max depth changed to 4

In [106]:
clf = DecisionTreeClassifier(max_depth=4, random_state=123)
clf = clf.fit(X_train, y_train)

In [107]:
clf.score(X_validate, y_validate)

0.7151162790697675

In [108]:
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.72


In [109]:
y_pred = clf.predict(X_validate)
y_pred[0:3]

array([0, 0, 0])

In [110]:
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.89      0.79       102
           1       0.74      0.46      0.57        70

    accuracy                           0.72       172
   macro avg       0.72      0.67      0.68       172
weighted avg       0.72      0.72      0.70       172



- **The model with max_depth of 4 still appears like a better model than one with max_depth of 2**