In [None]:
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report
import acquire
import prepare

import warnings
warnings.filterwarnings('ignore')

# Decision Tree

## Acquire Data

In [None]:
train, validate, test = prepare.prep_titanic(acquire.get_titanic_data())
train.shape, validate.shape, test.shape

In [None]:
X_train, y_train = train.drop(columns='survived'), train.survived
X_validate, y_validate = validate.drop(columns='survived'), validate.survived
X_test, y_test = test.drop(columns='survived'), test.survived

## Clean Data

In [None]:
X_train.head()

In [None]:
X_train.info()

In [None]:
def clean_data(df):
    '''
    This function will drop any duplicate observations, 
    drop ['deck', 'embarked', 'class', 'age'], fill missing embark_town with 'Southampton'
    and create dummy vars from sex and embark_town. 
    '''
    df = df.drop_duplicates()
    df = df.drop(columns=['embarked', 'class', 'age'])
    df['embark_town'] = df.embark_town.fillna(value='Southampton')
    dummy_df = pd.get_dummies(df[['sex', 'embark_town']], drop_first=True)
    df = pd.concat([df, dummy_df], axis=1)
    return df

In [None]:
X_train = clean_data(X_train)

In [None]:
X_train.head()

In [None]:
X_train = X_train.drop(['sex', 'embark_town'], axis=1)

In [None]:
y_train.head()

### What is your baseline prediction?

In [None]:
train.survived.value_counts()

As we can see above, the majority of passengers died, so our baseline assumption is that all passengers died.

### What is your baseline accuracy

Create the object.

In [None]:
model = DummyClassifier(strategy='constant', constant=0)

Fit the object.

In [None]:
model.fit(X_train, y_train)

In [None]:
accuracy = round(model.score(X_train, y_train), 2)
print(f'Training accuracy: {accuracy}')

### Fit the decision tree classifier to your training sample and transform

In [195]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import graphviz
from graphviz import Graph
from sklearn import tree
import numpy
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [None]:
clf1 = DecisionTreeClassifier(max_depth=3, random_state=123)

In [None]:
clf1 = clf1.fit(X_train, y_train)

In [None]:
clf1

In [None]:
dot_data = export_graphviz(clf1, feature_names= X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('titanic_tree', view=True)

In [None]:
y_pred = clf1.predict(X_train)

In [None]:
y_pred_proba = clf1.predict_proba(X_train)

In [None]:
y_probability = pd.DataFrame(y_pred_proba)
y_probability.head()

### Evaluate your in-sample results using the model score, confusion matrix, and classification report.

**Accuracy Score**

In [None]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf1.score(X_train, y_train)))

**Confusion Matrix**

In [None]:
confusion_matrix(y_train, y_pred)

In [None]:
y_train.value_counts()

In [110]:
labels1 = ['died_actual', 'survived_actual']
labels2 = ['died_predict', 'survived_predict']

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels1, columns=labels2)

Unnamed: 0,died_predict,survived_predict
died_actual,307,0
survived_actual,16,174


**Classification report**

In [None]:
report_train_depth_three = classification_report(y_train, y_pred, output_dict = True)

In [None]:
df_report_train_depth_three = pd.DataFrame(report_train_depth_three).T

In [None]:
df_report_train_depth_three

### Now we'll take a look at the validate model.

**Clean the data**

In [None]:
X_validate = clean_data(X_validate)

In [None]:
X_validate = X_validate.drop(['sex', 'embark_town'], axis=1)

**Accuracy Score**

In [49]:
y_pred_validate_depth_three = clf1.predict(X_validate)

In [51]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf1.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on training set: 0.79


**Confusion Matrix**

In [52]:
labels1 = ['died_actual', 'survived_actual']
labels2 = ['died_predict', 'survived_predict']

pd.DataFrame(confusion_matrix(y_validate, y_pred_validate_depth_three), index=labels1, columns=labels2)

Unnamed: 0,died_predict,survived_predict
died_actual,117,15
survived_actual,30,52


**Create a classification table**

In [53]:
report_validate_depth_three = classification_report(y_validate, y_pred_validate_depth_three, output_dict = True)

In [54]:
df_report_validate_depth_three = pd.DataFrame(report_validate_depth_three).T

In [55]:
df_report_validate_depth_three

Unnamed: 0,precision,recall,f1-score,support
0,0.795918,0.886364,0.83871,132.0
1,0.776119,0.634146,0.697987,82.0
accuracy,0.78972,0.78972,0.78972,0.78972
macro avg,0.786019,0.760255,0.768348,214.0
weighted avg,0.788332,0.78972,0.784788,214.0


### Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

**Accuracy**

In [56]:
model_accuracy = accuracy_score(y_train, y_pred)
model_accuracy

0.8108651911468813

**True Positive Rate**

In [57]:
true_positive = 128 
false_negative = 62
true_negative = 279
false_positive = 28

In [58]:
true_positive_rate = (true_positive)/(true_positive + false_negative)
true_positive_rate

0.6736842105263158

**False Positive Rate**

In [59]:
false_positive_rate = (false_positive)/(false_positive+true_negative)
false_positive_rate

0.09120521172638436

**True Negative Rate**

In [60]:
true_negative_rate = (true_negative)/(true_negative + false_positive)
true_negative_rate

0.9087947882736156

**False Negative Rate**

In [61]:
false_negative_rate = (false_negative)/(false_negative + true_positive)
false_negative_rate

0.3263157894736842

**Precision**

In [62]:
model_precision = precision_score(y_train, y_pred, pos_label= 1)
model_precision

0.8157894736842105

**Recall**

In [63]:
model_recall = recall_score(y_train, y_pred, pos_label = 1)
model_recall

0.6526315789473685

**f-1 score**

In [64]:
model_f1_score = f1_score(y_train, y_pred, average=None)
model_f1_score

array([0.85582822, 0.7251462 ])

**Support**

In [65]:
y_train.value_counts()

0    307
1    190
Name: survived, dtype: int64

In [66]:
y_pred_df = pd.DataFrame(y_pred)

In [67]:
y_pred_df.head()

Unnamed: 0,0
0,0
1,1
2,0
3,1
4,1


In [68]:
y0 = y_pred_df[y_pred_df[0] == 0]
y0.size

345

In [69]:
y1 = y_pred_df[y_pred_df[0] == 1]
y1.size

152

## Run through steps 2-4 using a different max_depth value.

### Training data

In [70]:
clf2 = DecisionTreeClassifier(max_depth=4, random_state=123)

In [71]:
clf2 = clf2.fit(X_train, y_train)

In [72]:
y_pred_train_depth_4 = clf2.predict(X_train)

In [73]:
y_pred_proba = clf2.predict_proba(X_train)

In [74]:
y_probability = pd.DataFrame(y_pred_proba)

**Training data accuracy score**

In [75]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf2.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.85


**Traininig data confusion matrix**

In [76]:
confusion_matrix(y_train, y_pred_train_depth_4)

array([[283,  24],
       [ 53, 137]])

In [77]:
labels1 = ['died_actual', 'survived_actual']
labels2 = ['died_predict', 'survived_predict']

pd.DataFrame(confusion_matrix(y_train, y_pred_train_depth_4), index=labels1, columns=labels2)

Unnamed: 0,died_predict,survived_predict
died_actual,283,24
survived_actual,53,137


**Training Data Classification Report**

In [78]:
classification_report_train_depth_four = classification_report(y_train, y_pred_train_depth_4, output_dict = True)

In [79]:
df_classification_report_train_depth_four = pd.DataFrame(classification_report_train_depth_four).T

In [80]:
df_classification_report_train_depth_four

Unnamed: 0,precision,recall,f1-score,support
0,0.842262,0.921824,0.880249,307.0
1,0.850932,0.721053,0.780627,190.0
accuracy,0.84507,0.84507,0.84507,0.84507
macro avg,0.846597,0.821438,0.830438,497.0
weighted avg,0.845576,0.84507,0.842164,497.0


### Validate Data

In [81]:
clf2 = clf2.fit(X_validate, y_validate)

In [82]:
y_pred_validate_depth_4 = clf2.predict(X_validate)

**Validate accuracy**

In [83]:
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
      .format(clf2.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.86


**Validate confusion matrix**

In [84]:
labels1 = ['died_actual', 'survived_actual']
labels2 = ['died_predict', 'survived_predict']

pd.DataFrame(confusion_matrix(y_validate, y_pred_validate_depth_4), index=labels1, columns=labels2)

Unnamed: 0,died_predict,survived_predict
died_actual,125,7
survived_actual,24,58


**Training data classification report**

In [85]:
classification_report_validate_depth_4 = classification_report(y_validate, y_pred_validate_depth_4, output_dict = True)
df_classification_report_validate_depth_4 = pd.DataFrame(classification_report_validate_depth_4).T

In [86]:
df_classification_report_validate_depth_4

Unnamed: 0,precision,recall,f1-score,support
0,0.838926,0.94697,0.88968,132.0
1,0.892308,0.707317,0.789116,82.0
accuracy,0.85514,0.85514,0.85514,0.85514
macro avg,0.865617,0.827143,0.839398,214.0
weighted avg,0.859381,0.85514,0.851146,214.0


### Which model performs better on your in-sample data?

To answer this question, let's look at the training data classificatino reports side by side.

**Max depth three**

In [87]:
df_report_train_depth_three

Unnamed: 0,precision,recall,f1-score,support
0,0.808696,0.908795,0.855828,307.0
1,0.815789,0.652632,0.725146,190.0
accuracy,0.810865,0.810865,0.810865,0.810865
macro avg,0.812243,0.780713,0.790487,497.0
weighted avg,0.811408,0.810865,0.805869,497.0


**Max depth four**

In [88]:
df_classification_report_train_depth_four

Unnamed: 0,precision,recall,f1-score,support
0,0.842262,0.921824,0.880249,307.0
1,0.850932,0.721053,0.780627,190.0
accuracy,0.84507,0.84507,0.84507,0.84507
macro avg,0.846597,0.821438,0.830438,497.0
weighted avg,0.845576,0.84507,0.842164,497.0


Max depth three out performs Max depth four on every observable measure.

### Which model performs best on your out-of-sample data, the validate set?

**Max depth three**

In [89]:
df_report_validate_depth_three

Unnamed: 0,precision,recall,f1-score,support
0,0.795918,0.886364,0.83871,132.0
1,0.776119,0.634146,0.697987,82.0
accuracy,0.78972,0.78972,0.78972,0.78972
macro avg,0.786019,0.760255,0.768348,214.0
weighted avg,0.788332,0.78972,0.784788,214.0


**Max depth four**

In [90]:
df_classification_report_validate_depth_4

Unnamed: 0,precision,recall,f1-score,support
0,0.838926,0.94697,0.88968,132.0
1,0.892308,0.707317,0.789116,82.0
accuracy,0.85514,0.85514,0.85514,0.85514
macro avg,0.865617,0.827143,0.839398,214.0
weighted avg,0.859381,0.85514,0.851146,214.0


Max depth three outperforms Max depth three for the validate data sets.

# Random Forest

### Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [92]:
from sklearn.ensemble import RandomForestClassifier

In [95]:
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=1,
                            n_estimators=100,
                            max_depth=10, 
                            random_state=123)

**Fit the model**

In [96]:
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=10, random_state=123)

**Feature importance**

In [100]:
pd.DataFrame(rf.feature_importances_)

Unnamed: 0,0
0,0.158897
1,0.162192
2,0.099323
3,0.04761
4,0.031066
5,0.195731
6,0.029676
7,0.007023
8,0.01326
9,0.233667


**Make predictions**

In [101]:
y_pred = rf.predict(X_train)

In [103]:
pd.DataFrame(y_pred)

Unnamed: 0,0
0,0
1,1
2,0
3,1
4,1
...,...
492,0
493,0
494,0
495,0


**Estimate probability**

In [104]:
y_pred_proba = rf.predict_proba(X_train)

In [105]:
pd.DataFrame(y_pred_proba)

Unnamed: 0,0,1
0,0.845507,0.154493
1,0.021714,0.978286
2,0.977273,0.022727
3,0.010714,0.989286
4,0.100000,0.900000
...,...,...
492,0.942332,0.057668
493,0.906272,0.093728
494,0.799098,0.200902
495,0.921866,0.078134


### Evaluate your results using the model score, confusion matrix, and classification report.

**Compute Accuracy**

In [106]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.97


**Create a confusion matrix**

In [111]:
labels1 = ['died_actual', 'survived_actual']
labels2 = ['died_predicted', 'survived_predicted']
df_confusion_matrix = pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels1, columns=labels2)

In [112]:
df_confusion_matrix

Unnamed: 0,died_predicted,survived_predicted
died_actual,307,0
survived_actual,16,174


**Create a classification report**

In [113]:
classification_report = classification_report(y_train, y_pred, output_dict = True)

In [116]:
df_classification_report = pd.DataFrame(classification_report).T

In [117]:
df_classification_report

Unnamed: 0,precision,recall,f1-score,support
0,0.950464,1.0,0.974603,307.0
1,1.0,0.915789,0.956044,190.0
accuracy,0.967807,0.967807,0.967807,0.967807
macro avg,0.975232,0.957895,0.965324,497.0
weighted avg,0.969402,0.967807,0.967508,497.0


### Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [128]:
true_positives = 174 
false_positives = 0 
true_negatives = 307
false_negatives = 16

**Accuracy**

In [118]:
model_accuracy = accuracy_score(y_train, y_pred)
model_accuracy

0.9678068410462777

**True positive rate**

In [136]:
true_positive_rate = (true_positives)/(true_positives + false_negatives)

In [137]:
true_positive_rate

0.9157894736842105

**False negative rate**

In [138]:
false_negative_rate = (false_negatives)/(false_negatives + true_positives)

In [139]:
false_negative_rate

0.08421052631578947

**True negative rate**

In [140]:
true_negative_rate = (true_negatives)/(true_negatives + false_positives)

In [141]:
true_negative_rate

1.0

**False Posititve  rate**

In [143]:
false_positive_rate = (false_positives)/(false_positives + true_negatives)

In [144]:
false_positive_rate

0.0

**Precision rate**

In [150]:
precision = precision_score(y_train, y_pred)

In [151]:
precision

1.0

**Recall**

In [152]:
recall = recall_score(y_train, y_pred)

In [153]:
recall

0.9157894736842105

**F-1 score**

In [154]:
f1 = f1_score(y_train, y_pred)

In [155]:
f1

0.956043956043956

**Support**

In [159]:
support_1 = true_positives + false_negatives 
support_0 = false_positives + true_negatives
support_total = true_positives + false_negatives + false_positives + true_negatives

In [165]:
print(f'Support 1: {support_1}')
print(f'Support 0: {support_0}')
print(f'Support total: {support_total}')

Support 1: 190
Support 0: 307
Support total: 497


### Run through steps increasing your min_samples_leaf and decreasing your max_depth.

This time, let's set min_samples_leaf to 3, and max_depth to 6.

In [168]:
rf2 = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=6, 
                            random_state=123)

**Fit the model**

In [186]:
rf2.fit(X_train, y_train)

RandomForestClassifier(max_depth=6, min_samples_leaf=3, random_state=123)

**Feature importance**

In [187]:
pd.DataFrame(rf2.feature_importances_)

Unnamed: 0,0
0,0.095989
1,0.096242
2,0.133684
3,0.047151
4,0.026138
5,0.169599
6,0.040284
7,0.00478
8,0.013097
9,0.355629


**Make predictions**

In [193]:
y_pred = rf2.predict(X_train)
pd.DataFrame(y_pred)

Unnamed: 0,0
0,0
1,1
2,0
3,1
4,1
...,...
492,0
493,0
494,0
495,0


**Estimate probability**

In [175]:
y_pred_proba = rf2.predict_proba(X_train)
pd.DataFrame(y_pred_proba)

Unnamed: 0,0,1
0,0.691080,0.308920
1,0.046536,0.953464
2,0.914865,0.085135
3,0.098877,0.901123
4,0.227153,0.772847
...,...,...
492,0.892887,0.107113
493,0.898224,0.101776
494,0.845211,0.154789
495,0.722290,0.277710


**Compute accuracy**

In [176]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf2.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.87


**Create a confusion matrix**

In [177]:
labels1 = ['died_actual', 'survived_actual']
labels2 = ['died_predicted', 'survived_predicted']
df_confusion_matrix = pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels1, columns=labels2)

In [178]:
df_confusion_matrix

Unnamed: 0,died_predicted,survived_predicted
died_actual,295,12
survived_actual,51,139


**Create a classification report**

In [197]:
classification_report_2 = classification_report(y_train, y_pred, output_dict = True)

In [200]:
df_classification_report_2 = pd.DataFrame(classification_report_2).T

In [201]:
df_classification_report_2

Unnamed: 0,precision,recall,f1-score,support
0,0.852601,0.960912,0.903522,307.0
1,0.92053,0.731579,0.815249,190.0
accuracy,0.873239,0.873239,0.873239,0.873239
macro avg,0.886565,0.846245,0.859386,497.0
weighted avg,0.87857,0.873239,0.869776,497.0


### What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

Let's compare the two classification reports for the evaluation metrics.

In [202]:
df_classification_report

Unnamed: 0,precision,recall,f1-score,support
0,0.950464,1.0,0.974603,307.0
1,1.0,0.915789,0.956044,190.0
accuracy,0.967807,0.967807,0.967807,0.967807
macro avg,0.975232,0.957895,0.965324,497.0
weighted avg,0.969402,0.967807,0.967508,497.0


In [203]:
df_classification_report_2

Unnamed: 0,precision,recall,f1-score,support
0,0.852601,0.960912,0.903522,307.0
1,0.92053,0.731579,0.815249,190.0
accuracy,0.873239,0.873239,0.873239,0.873239
macro avg,0.886565,0.846245,0.859386,497.0
weighted avg,0.87857,0.873239,0.869776,497.0


The first sample with min_samples_leaf = 1 and max_depth = 10 performs better than the second sample. This is likely because we are using a higher max depth.