In [1]:
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report
import acquire
import prepare

import warnings
warnings.filterwarnings('ignore')

## Acquire Data

In [2]:
train, validate, test = prepare.prep_titanic(acquire.get_titanic_data())
train.shape, validate.shape, test.shape

((497, 15), (214, 15), (178, 15))

In [3]:
X_train, y_train = train.drop(columns='survived'), train.survived
X_validate, y_validate = validate.drop(columns='survived'), validate.survived
X_test, y_test = test.drop(columns='survived'), test.survived

## Clean Data

In [4]:
X_train.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,Q,S
583,583,583,1,male,36.0,0,0,40.125,C,First,Cherbourg,1,0,0
337,337,337,1,female,41.0,0,0,134.5,C,First,Cherbourg,1,0,0
50,50,50,3,male,7.0,4,1,39.6875,S,Third,Southampton,0,0,1
218,218,218,1,female,32.0,0,0,76.2917,C,First,Cherbourg,1,0,0
31,31,31,1,female,29.916875,1,0,146.5208,C,First,Cherbourg,0,0,0


In [5]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 497 entries, 583 to 553
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    497 non-null    int64  
 1   passenger_id  497 non-null    int64  
 2   pclass        497 non-null    int64  
 3   sex           497 non-null    object 
 4   age           497 non-null    float64
 5   sibsp         497 non-null    int64  
 6   parch         497 non-null    int64  
 7   fare          497 non-null    float64
 8   embarked      497 non-null    object 
 9   class         497 non-null    object 
 10  embark_town   497 non-null    object 
 11  alone         497 non-null    int64  
 12  Q             497 non-null    uint8  
 13  S             497 non-null    uint8  
dtypes: float64(2), int64(6), object(4), uint8(2)
memory usage: 51.4+ KB


In [6]:
def clean_data(df):
    '''
    This function will drop any duplicate observations, 
    drop ['deck', 'embarked', 'class', 'age'], fill missing embark_town with 'Southampton'
    and create dummy vars from sex and embark_town. 
    '''
    df = df.drop_duplicates()
    df = df.drop(columns=['embarked', 'class', 'age'])
    df['embark_town'] = df.embark_town.fillna(value='Southampton')
    dummy_df = pd.get_dummies(df[['sex', 'embark_town']], drop_first=True)
    df = pd.concat([df, dummy_df], axis=1)
    return df

In [7]:
X_train = clean_data(X_train)

In [8]:
X_train.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,pclass,sex,sibsp,parch,fare,embark_town,alone,Q,S,sex_male,embark_town_Queenstown,embark_town_Southampton
583,583,583,1,male,0,0,40.125,Cherbourg,1,0,0,1,0,0
337,337,337,1,female,0,0,134.5,Cherbourg,1,0,0,0,0,0
50,50,50,3,male,4,1,39.6875,Southampton,0,0,1,1,0,1
218,218,218,1,female,0,0,76.2917,Cherbourg,1,0,0,0,0,0
31,31,31,1,female,1,0,146.5208,Cherbourg,0,0,0,0,0,0


In [9]:
X_train = X_train.drop(['sex', 'embark_town'], axis=1)

In [10]:
y_train.head()

583    0
337    1
50     0
218    1
31     1
Name: survived, dtype: int64

### What is your baseline prediction?

In [11]:
train.survived.value_counts()

0    307
1    190
Name: survived, dtype: int64

As we can see above, the majority of passengers died, so our baseline assumption is that all passengers died.

### What is your baseline accuracy

Create the object.

In [12]:
model = DummyClassifier(strategy='constant', constant=0)

Fit the object.

In [13]:
model.fit(X_train, y_train)

DummyClassifier(constant=0, strategy='constant')

In [14]:
accuracy = round(model.score(X_train, y_train), 2)
print(f'Training accuracy: {accuracy}')

Training accuracy: 0.62


### Fit the decision tree classifier to your training sample and transform

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import graphviz
from graphviz import Graph
from sklearn import tree
import numpy
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [16]:
clf1 = DecisionTreeClassifier(max_depth=3, random_state=123)

In [17]:
clf1 = clf1.fit(X_train, y_train)

In [18]:
clf1

DecisionTreeClassifier(max_depth=3, random_state=123)

In [19]:
dot_data = export_graphviz(clf, feature_names= X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('titanic_tree', view=True)

'titanic_tree.pdf'

In [20]:
y_pred = clf1.predict(X_train)

In [22]:
y_pred_proba = clf1.predict_proba(X_train)

In [23]:
y_probability = pd.DataFrame(y_pred_proba)
y_probability.head()

Unnamed: 0,0,1
0,0.525424,0.474576
1,0.012821,0.987179
2,0.722222,0.277778
3,0.012821,0.987179
4,0.1875,0.8125


### Evaluate your in-sample results using the model score, confusion matrix, and classification report.

**Accuracy Score**

In [24]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf1.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.81


**Confusion Matrix**

In [25]:
confusion_matrix(y_train, y_pred)

array([[279,  28],
       [ 66, 124]])

In [26]:
y_train.value_counts()

0    307
1    190
Name: survived, dtype: int64

In [27]:
labels1 = ['died_actual', 'survived_actual']
labels2 = ['died_predict', 'survived_predict']

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels1, columns=labels2)

Unnamed: 0,died_predict,survived_predict
died_actual,279,28
survived_actual,66,124


**Classification report**

In [90]:
report_train_depth_three = classification_report(y_train, y_pred, output_dict = True)

In [91]:
df_report_train_depth_three = pd.DataFrame(report_train_depth_three).T

In [92]:
df_report_train_depth_three

Unnamed: 0,precision,recall,f1-score,support
0,0.808696,0.908795,0.855828,307.0
1,0.815789,0.652632,0.725146,190.0
accuracy,0.810865,0.810865,0.810865,0.810865
macro avg,0.812243,0.780713,0.790487,497.0
weighted avg,0.811408,0.810865,0.805869,497.0


### Now we'll take a look at the validate model.

**Clean the data**

In [31]:
X_validate = clean_data(X_validate)

In [32]:
X_validate = X_validate.drop(['sex', 'embark_town'], axis=1)

**Accuracy Score**

In [82]:
y_pred_validate_depth_three = clf.predict(X_validate)

In [41]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on training set: 0.79


**Confusion Matrix**

In [42]:
labels1 = ['died_actual', 'survived_actual']
labels2 = ['died_predict', 'survived_predict']

pd.DataFrame(confusion_matrix(y_validate, y_pred_validate_depth_three), index=labels1, columns=labels2)

Unnamed: 0,died_predict,survived_predict
died_actual,117,15
survived_actual,30,52


**Create a classification table**

In [83]:
report_validate_depth_three = classification_report(y_validate, y_pred_validate_depth_three, output_dict = True)

In [87]:
df_report_validate_depth_three = pd.DataFrame(report_validate_depth_three).T

In [88]:
df_report_validate_depth_three

Unnamed: 0,precision,recall,f1-score,support
0,0.826389,0.901515,0.862319,132.0
1,0.814286,0.695122,0.75,82.0
accuracy,0.82243,0.82243,0.82243,0.82243
macro avg,0.820337,0.798319,0.806159,214.0
weighted avg,0.821751,0.82243,0.819281,214.0


### Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

**Accuracy**

In [79]:
model_accuracy = accuracy_score(y_train, y_pred)
model_accuracy

0.8189134808853119

**True Positive Rate**

In [85]:
true_positive = 128 
false_negative = 62
true_negative = 279
false_positive = 28

In [86]:
true_positive_rate = (true_positive)/(true_positive + false_negative)
true_positive_rate

0.6736842105263158

**False Positive Rate**

In [88]:
false_positive_rate = (false_positive)/(false_positive+true_negative)
false_positive_rate

0.09120521172638436

**True Negative Rate**

In [90]:
true_negative_rate = (true_negative)/(true_negative + false_positive)
true_negative_rate

0.9087947882736156

**False Negative Rate**

In [93]:
false_negative_rate = (false_negative)/(false_negative + true_positive)
false_negative_rate

0.3263157894736842

**Precision**

In [98]:
model_precision = precision_score(y_train, y_pred, pos_label= 1)
model_precision

0.8205128205128205

**Recall**

In [100]:
model_recall = recall_score(y_train, y_pred, pos_label = 1)
model_recall

0.6736842105263158

**f-1 score**

In [103]:
model_f1_score = f1_score(y_train, y_pred, average=None)
model_f1_score

array([0.86111111, 0.73988439])

**Support**

In [129]:
y_train.value_counts()

0    307
1    190
Name: survived, dtype: int64

In [131]:
y_pred_df = pd.DataFrame(y_pred)

In [132]:
y_pred_df.head()

Unnamed: 0,0
0,1
1,1
2,0
3,1
4,1


In [139]:
y0 = y_pred_df[y_pred_df[0] == 0]
y0.size

314

In [141]:
y1 = y_pred_df[y_pred_df[0] == 1]
y1.size

183

## Run through steps 2-4 using a different max_depth value.

### Training data

In [53]:
clf2 = DecisionTreeClassifier(max_depth=4, random_state=123)

In [54]:
clf2 = clf2.fit(X_train, y_train)

In [94]:
y_pred_train_depth_4 = clf2.predict(X_train)

In [95]:
y_pred_proba = clf2.predict_proba(X_train)

In [96]:
y_probability = pd.DataFrame(y_pred_proba)

**Training data accuracy score**

In [97]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf2.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.77


**Traininig data confusion matrix**

In [98]:
confusion_matrix(y_train, y_pred_train_depth_4)

array([[268,  39],
       [ 76, 114]])

In [99]:
labels1 = ['died_actual', 'survived_actual']
labels2 = ['died_predict', 'survived_predict']

pd.DataFrame(confusion_matrix(y_train, y_pred_train_depth_4), index=labels1, columns=labels2)

Unnamed: 0,died_predict,survived_predict
died_actual,268,39
survived_actual,76,114


**Training Data Classification Report**

In [102]:
classification_report_train_depth_four = classification_report(y_train, y_pred_train_depth_4, output_dict = True)

In [103]:
df_classification_report_train_depth_four = pd.DataFrame(classification_report_train_depth_four).T

In [104]:
df_classification_report_train_depth_four

Unnamed: 0,precision,recall,f1-score,support
0,0.77907,0.872964,0.823349,307.0
1,0.745098,0.6,0.664723,190.0
accuracy,0.768612,0.768612,0.768612,0.768612
macro avg,0.762084,0.736482,0.744036,497.0
weighted avg,0.766083,0.768612,0.762707,497.0


### Validate Data

In [105]:
clf2 = clf2.fit(X_validate, y_validate)

In [106]:
y_pred_validate_depth_4 = clf2.predict(X_validate)

**Validate accuracy**

In [107]:
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
      .format(clf2.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.86


**Validate confusion matrix**

In [108]:
labels1 = ['died_actual', 'survived_actual']
labels2 = ['died_predict', 'survived_predict']

pd.DataFrame(confusion_matrix(y_validate, y_pred_validate_depth_4), index=labels1, columns=labels2)

Unnamed: 0,died_predict,survived_predict
died_actual,125,7
survived_actual,24,58


**Training data classification report**

In [110]:
classification_report_validate_depth_4 = classification_report(y_validate, y_pred_validate_depth_4, output_dict = True)
df_classification_report_validate_depth_4 = pd.DataFrame(classification_report_validate_depth_4).T

In [111]:
df_classification_report_validate_depth_4

Unnamed: 0,precision,recall,f1-score,support
0,0.838926,0.94697,0.88968,132.0
1,0.892308,0.707317,0.789116,82.0
accuracy,0.85514,0.85514,0.85514,0.85514
macro avg,0.865617,0.827143,0.839398,214.0
weighted avg,0.859381,0.85514,0.851146,214.0


### Which model performs better on your in-sample data?

To answer this question, let's look at the training data classificatino reports side by side.

**Max depth three**

In [113]:
df_report_train_depth_three

Unnamed: 0,precision,recall,f1-score,support
0,0.808696,0.908795,0.855828,307.0
1,0.815789,0.652632,0.725146,190.0
accuracy,0.810865,0.810865,0.810865,0.810865
macro avg,0.812243,0.780713,0.790487,497.0
weighted avg,0.811408,0.810865,0.805869,497.0


**Max depth four**

In [115]:
df_classification_report_train_depth_four

Unnamed: 0,precision,recall,f1-score,support
0,0.77907,0.872964,0.823349,307.0
1,0.745098,0.6,0.664723,190.0
accuracy,0.768612,0.768612,0.768612,0.768612
macro avg,0.762084,0.736482,0.744036,497.0
weighted avg,0.766083,0.768612,0.762707,497.0


Max depth three out performs Max depth four on every observable measure.

### Which model performs best on your out-of-sample data, the validate set?

**Max depth three**

In [118]:
df_report_validate_depth_three

Unnamed: 0,precision,recall,f1-score,support
0,0.826389,0.901515,0.862319,132.0
1,0.814286,0.695122,0.75,82.0
accuracy,0.82243,0.82243,0.82243,0.82243
macro avg,0.820337,0.798319,0.806159,214.0
weighted avg,0.821751,0.82243,0.819281,214.0


**Max depth four**

In [119]:
df_classification_report_validate_depth_4

Unnamed: 0,precision,recall,f1-score,support
0,0.838926,0.94697,0.88968,132.0
1,0.892308,0.707317,0.789116,82.0
accuracy,0.85514,0.85514,0.85514,0.85514
macro avg,0.865617,0.827143,0.839398,214.0
weighted avg,0.859381,0.85514,0.851146,214.0


Max depth three outperforms Max depth three for the validate data sets.