In [2]:
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report
import acquire
import prepare

import warnings
warnings.filterwarnings('ignore')

# Decision Tree

## Acquire Data

In [3]:
train, validate, test = prepare.prep_titanic(acquire.get_titanic_data())
train.shape, validate.shape, test.shape

((497, 15), (214, 15), (178, 15))

In [4]:
X_train, y_train = train.drop(columns='survived'), train.survived
X_validate, y_validate = validate.drop(columns='survived'), validate.survived
X_test, y_test = test.drop(columns='survived'), test.survived

## Clean Data

In [5]:
X_train.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,Q,S
583,583,583,1,male,36.0,0,0,40.125,C,First,Cherbourg,1,0,0
337,337,337,1,female,41.0,0,0,134.5,C,First,Cherbourg,1,0,0
50,50,50,3,male,7.0,4,1,39.6875,S,Third,Southampton,0,0,1
218,218,218,1,female,32.0,0,0,76.2917,C,First,Cherbourg,1,0,0
31,31,31,1,female,29.916875,1,0,146.5208,C,First,Cherbourg,0,0,0


In [6]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 497 entries, 583 to 553
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    497 non-null    int64  
 1   passenger_id  497 non-null    int64  
 2   pclass        497 non-null    int64  
 3   sex           497 non-null    object 
 4   age           497 non-null    float64
 5   sibsp         497 non-null    int64  
 6   parch         497 non-null    int64  
 7   fare          497 non-null    float64
 8   embarked      497 non-null    object 
 9   class         497 non-null    object 
 10  embark_town   497 non-null    object 
 11  alone         497 non-null    int64  
 12  Q             497 non-null    uint8  
 13  S             497 non-null    uint8  
dtypes: float64(2), int64(6), object(4), uint8(2)
memory usage: 51.4+ KB


In [7]:
def clean_data(df):
    '''
    This function will drop any duplicate observations, 
    drop ['deck', 'embarked', 'class', 'age'], fill missing embark_town with 'Southampton'
    and create dummy vars from sex and embark_town. 
    '''
    df = df.drop_duplicates()
    df = df.drop(columns=['embarked', 'class', 'age'])
    df['embark_town'] = df.embark_town.fillna(value='Southampton')
    dummy_df = pd.get_dummies(df[['sex', 'embark_town']], drop_first=True)
    df = pd.concat([df, dummy_df], axis=1)
    return df

In [8]:
X_train = clean_data(X_train)

In [9]:
X_train.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,pclass,sex,sibsp,parch,fare,embark_town,alone,Q,S,sex_male,embark_town_Queenstown,embark_town_Southampton
583,583,583,1,male,0,0,40.125,Cherbourg,1,0,0,1,0,0
337,337,337,1,female,0,0,134.5,Cherbourg,1,0,0,0,0,0
50,50,50,3,male,4,1,39.6875,Southampton,0,0,1,1,0,1
218,218,218,1,female,0,0,76.2917,Cherbourg,1,0,0,0,0,0
31,31,31,1,female,1,0,146.5208,Cherbourg,0,0,0,0,0,0


In [10]:
X_train = X_train.drop(['sex', 'embark_town'], axis=1)

In [11]:
y_train.head()

583    0
337    1
50     0
218    1
31     1
Name: survived, dtype: int64

### What is your baseline prediction?

In [12]:
train.survived.value_counts()

0    307
1    190
Name: survived, dtype: int64

As we can see above, the majority of passengers died, so our baseline assumption is that all passengers died.

### What is your baseline accuracy

Create the object.

In [13]:
model = DummyClassifier(strategy='constant', constant=0)

Fit the object.

In [14]:
model.fit(X_train, y_train)

DummyClassifier(constant=0, strategy='constant')

In [15]:
accuracy = round(model.score(X_train, y_train), 2)
print(f'Training accuracy: {accuracy}')

Training accuracy: 0.62


### Fit the decision tree classifier to your training sample and transform

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import graphviz
from graphviz import Graph
from sklearn import tree
import numpy
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [17]:
clf1 = DecisionTreeClassifier(max_depth=3, random_state=123)

In [18]:
clf1 = clf1.fit(X_train, y_train)

In [19]:
clf1

DecisionTreeClassifier(max_depth=3, random_state=123)

In [20]:
dot_data = export_graphviz(clf1, feature_names= X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('titanic_tree', view=True)

'titanic_tree.pdf'

In [21]:
y_pred = clf1.predict(X_train)

In [22]:
y_pred_proba = clf1.predict_proba(X_train)

In [23]:
y_probability = pd.DataFrame(y_pred_proba)
y_probability.head()

Unnamed: 0,0,1
0,0.525424,0.474576
1,0.012821,0.987179
2,0.722222,0.277778
3,0.012821,0.987179
4,0.1875,0.8125


### Evaluate your in-sample results using the model score, confusion matrix, and classification report.

**Accuracy Score**

In [24]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf1.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.81


**Confusion Matrix**

In [25]:
confusion_matrix(y_train, y_pred)

array([[279,  28],
       [ 66, 124]])

In [26]:
y_train.value_counts()

0    307
1    190
Name: survived, dtype: int64

In [27]:
labels1 = ['died_actual', 'survived_actual']
labels2 = ['died_predict', 'survived_predict']

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels1, columns=labels2)

Unnamed: 0,died_predict,survived_predict
died_actual,279,28
survived_actual,66,124


**Classification report**

In [28]:
report_train_depth_three = classification_report(y_train, y_pred, output_dict = True)

In [29]:
df_report_train_depth_three = pd.DataFrame(report_train_depth_three).T

In [30]:
df_report_train_depth_three

Unnamed: 0,precision,recall,f1-score,support
0,0.808696,0.908795,0.855828,307.0
1,0.815789,0.652632,0.725146,190.0
accuracy,0.810865,0.810865,0.810865,0.810865
macro avg,0.812243,0.780713,0.790487,497.0
weighted avg,0.811408,0.810865,0.805869,497.0


### Now we'll take a look at the validate model.

**Clean the data**

In [31]:
X_validate = clean_data(X_validate)

In [32]:
X_validate = X_validate.drop(['sex', 'embark_town'], axis=1)

**Accuracy Score**

In [33]:
y_pred_validate_depth_three = clf1.predict(X_validate)

In [34]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf1.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on training set: 0.79


**Confusion Matrix**

In [35]:
labels1 = ['died_actual', 'survived_actual']
labels2 = ['died_predict', 'survived_predict']

pd.DataFrame(confusion_matrix(y_validate, y_pred_validate_depth_three), index=labels1, columns=labels2)

Unnamed: 0,died_predict,survived_predict
died_actual,117,15
survived_actual,30,52


**Create a classification table**

In [36]:
report_validate_depth_three = classification_report(y_validate, y_pred_validate_depth_three, output_dict = True)

In [37]:
df_report_validate_depth_three = pd.DataFrame(report_validate_depth_three).T

In [38]:
df_report_validate_depth_three

Unnamed: 0,precision,recall,f1-score,support
0,0.795918,0.886364,0.83871,132.0
1,0.776119,0.634146,0.697987,82.0
accuracy,0.78972,0.78972,0.78972,0.78972
macro avg,0.786019,0.760255,0.768348,214.0
weighted avg,0.788332,0.78972,0.784788,214.0


### Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

**Accuracy**

In [39]:
model_accuracy = accuracy_score(y_train, y_pred)
model_accuracy

0.8108651911468813

**True Positive Rate**

In [40]:
true_positive = 128 
false_negative = 62
true_negative = 279
false_positive = 28

In [41]:
true_positive_rate = (true_positive)/(true_positive + false_negative)
true_positive_rate

0.6736842105263158

**False Positive Rate**

In [42]:
false_positive_rate = (false_positive)/(false_positive+true_negative)
false_positive_rate

0.09120521172638436

**True Negative Rate**

In [43]:
true_negative_rate = (true_negative)/(true_negative + false_positive)
true_negative_rate

0.9087947882736156

**False Negative Rate**

In [44]:
false_negative_rate = (false_negative)/(false_negative + true_positive)
false_negative_rate

0.3263157894736842

**Precision**

In [45]:
model_precision = precision_score(y_train, y_pred, pos_label= 1)
model_precision

0.8157894736842105

**Recall**

In [46]:
model_recall = recall_score(y_train, y_pred, pos_label = 1)
model_recall

0.6526315789473685

**f-1 score**

In [47]:
model_f1_score = f1_score(y_train, y_pred, average=None)
model_f1_score

array([0.85582822, 0.7251462 ])

**Support**

In [48]:
y_train.value_counts()

0    307
1    190
Name: survived, dtype: int64

In [49]:
y_pred_df = pd.DataFrame(y_pred)

In [50]:
y_pred_df.head()

Unnamed: 0,0
0,0
1,1
2,0
3,1
4,1


In [51]:
y0 = y_pred_df[y_pred_df[0] == 0]
y0.size

345

In [52]:
y1 = y_pred_df[y_pred_df[0] == 1]
y1.size

152

## Run through steps 2-4 using a different max_depth value.

### Training data

In [53]:
clf2 = DecisionTreeClassifier(max_depth=4, random_state=123)

In [54]:
clf2 = clf2.fit(X_train, y_train)

In [55]:
y_pred_train_depth_4 = clf2.predict(X_train)

In [56]:
y_pred_proba = clf2.predict_proba(X_train)

In [57]:
y_probability = pd.DataFrame(y_pred_proba)

**Training data accuracy score**

In [58]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf2.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.85


**Traininig data confusion matrix**

In [59]:
confusion_matrix(y_train, y_pred_train_depth_4)

array([[283,  24],
       [ 53, 137]])

In [60]:
labels1 = ['died_actual', 'survived_actual']
labels2 = ['died_predict', 'survived_predict']

pd.DataFrame(confusion_matrix(y_train, y_pred_train_depth_4), index=labels1, columns=labels2)

Unnamed: 0,died_predict,survived_predict
died_actual,283,24
survived_actual,53,137


**Training Data Classification Report**

In [61]:
classification_report_train_depth_four = classification_report(y_train, y_pred_train_depth_4, output_dict = True)

In [62]:
df_classification_report_train_depth_four = pd.DataFrame(classification_report_train_depth_four).T

In [63]:
df_classification_report_train_depth_four

Unnamed: 0,precision,recall,f1-score,support
0,0.842262,0.921824,0.880249,307.0
1,0.850932,0.721053,0.780627,190.0
accuracy,0.84507,0.84507,0.84507,0.84507
macro avg,0.846597,0.821438,0.830438,497.0
weighted avg,0.845576,0.84507,0.842164,497.0


### Validate Data

In [64]:
clf2 = clf2.fit(X_validate, y_validate)

In [65]:
y_pred_validate_depth_4 = clf2.predict(X_validate)

**Validate accuracy**

In [66]:
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
      .format(clf2.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.86


**Validate confusion matrix**

In [67]:
labels1 = ['died_actual', 'survived_actual']
labels2 = ['died_predict', 'survived_predict']

pd.DataFrame(confusion_matrix(y_validate, y_pred_validate_depth_4), index=labels1, columns=labels2)

Unnamed: 0,died_predict,survived_predict
died_actual,125,7
survived_actual,24,58


**Training data classification report**

In [68]:
classification_report_validate_depth_4 = classification_report(y_validate, y_pred_validate_depth_4, output_dict = True)
df_classification_report_validate_depth_4 = pd.DataFrame(classification_report_validate_depth_4).T

In [69]:
df_classification_report_validate_depth_4

Unnamed: 0,precision,recall,f1-score,support
0,0.838926,0.94697,0.88968,132.0
1,0.892308,0.707317,0.789116,82.0
accuracy,0.85514,0.85514,0.85514,0.85514
macro avg,0.865617,0.827143,0.839398,214.0
weighted avg,0.859381,0.85514,0.851146,214.0


### Which model performs better on your in-sample data?

To answer this question, let's look at the training data classificatino reports side by side.

**Max depth three**

In [70]:
df_report_train_depth_three

Unnamed: 0,precision,recall,f1-score,support
0,0.808696,0.908795,0.855828,307.0
1,0.815789,0.652632,0.725146,190.0
accuracy,0.810865,0.810865,0.810865,0.810865
macro avg,0.812243,0.780713,0.790487,497.0
weighted avg,0.811408,0.810865,0.805869,497.0


**Max depth four**

In [71]:
df_classification_report_train_depth_four

Unnamed: 0,precision,recall,f1-score,support
0,0.842262,0.921824,0.880249,307.0
1,0.850932,0.721053,0.780627,190.0
accuracy,0.84507,0.84507,0.84507,0.84507
macro avg,0.846597,0.821438,0.830438,497.0
weighted avg,0.845576,0.84507,0.842164,497.0


Max depth three out performs Max depth four on every observable measure.

### Which model performs best on your out-of-sample data, the validate set?

**Max depth three**

In [72]:
df_report_validate_depth_three

Unnamed: 0,precision,recall,f1-score,support
0,0.795918,0.886364,0.83871,132.0
1,0.776119,0.634146,0.697987,82.0
accuracy,0.78972,0.78972,0.78972,0.78972
macro avg,0.786019,0.760255,0.768348,214.0
weighted avg,0.788332,0.78972,0.784788,214.0


**Max depth four**

In [73]:
df_classification_report_validate_depth_4

Unnamed: 0,precision,recall,f1-score,support
0,0.838926,0.94697,0.88968,132.0
1,0.892308,0.707317,0.789116,82.0
accuracy,0.85514,0.85514,0.85514,0.85514
macro avg,0.865617,0.827143,0.839398,214.0
weighted avg,0.859381,0.85514,0.851146,214.0


Max depth three outperforms Max depth three for the validate data sets.

# Random Forest

### Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [74]:
from sklearn.ensemble import RandomForestClassifier

In [75]:
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=1,
                            n_estimators=100,
                            max_depth=10, 
                            random_state=123)

**Fit the model**

In [76]:
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=10, random_state=123)

**Feature importance**

In [77]:
pd.DataFrame(rf.feature_importances_)

Unnamed: 0,0
0,0.158897
1,0.162192
2,0.099323
3,0.04761
4,0.031066
5,0.195731
6,0.029676
7,0.007023
8,0.01326
9,0.233667


**Make predictions**

In [78]:
y_pred = rf.predict(X_train)

In [79]:
pd.DataFrame(y_pred)

Unnamed: 0,0
0,0
1,1
2,0
3,1
4,1
...,...
492,0
493,0
494,0
495,0


**Estimate probability**

In [80]:
y_pred_proba = rf.predict_proba(X_train)

In [81]:
pd.DataFrame(y_pred_proba)

Unnamed: 0,0,1
0,0.845507,0.154493
1,0.021714,0.978286
2,0.977273,0.022727
3,0.010714,0.989286
4,0.100000,0.900000
...,...,...
492,0.942332,0.057668
493,0.906272,0.093728
494,0.799098,0.200902
495,0.921866,0.078134


### Evaluate your results using the model score, confusion matrix, and classification report.

**Compute Accuracy**

In [82]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.97


**Create a confusion matrix**

In [83]:
labels1 = ['died_actual', 'survived_actual']
labels2 = ['died_predicted', 'survived_predicted']
df_confusion_matrix = pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels1, columns=labels2)

In [84]:
df_confusion_matrix

Unnamed: 0,died_predicted,survived_predicted
died_actual,307,0
survived_actual,16,174


**Create a classification report**

In [85]:
classification_report = classification_report(y_train, y_pred, output_dict = True)

In [86]:
df_classification_report = pd.DataFrame(classification_report).T

In [87]:
df_classification_report

Unnamed: 0,precision,recall,f1-score,support
0,0.950464,1.0,0.974603,307.0
1,1.0,0.915789,0.956044,190.0
accuracy,0.967807,0.967807,0.967807,0.967807
macro avg,0.975232,0.957895,0.965324,497.0
weighted avg,0.969402,0.967807,0.967508,497.0


### Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [88]:
true_positives = 174 
false_positives = 0 
true_negatives = 307
false_negatives = 16

**Accuracy**

In [89]:
model_accuracy = accuracy_score(y_train, y_pred)
model_accuracy

0.9678068410462777

**True positive rate**

In [90]:
true_positive_rate = (true_positives)/(true_positives + false_negatives)

In [91]:
true_positive_rate

0.9157894736842105

**False negative rate**

In [92]:
false_negative_rate = (false_negatives)/(false_negatives + true_positives)

In [93]:
false_negative_rate

0.08421052631578947

**True negative rate**

In [94]:
true_negative_rate = (true_negatives)/(true_negatives + false_positives)

In [95]:
true_negative_rate

1.0

**False Posititve  rate**

In [96]:
false_positive_rate = (false_positives)/(false_positives + true_negatives)

In [97]:
false_positive_rate

0.0

**Precision rate**

In [98]:
precision = precision_score(y_train, y_pred)

In [99]:
precision

1.0

**Recall**

In [100]:
recall = recall_score(y_train, y_pred)

In [101]:
recall

0.9157894736842105

**F-1 score**

In [102]:
f1 = f1_score(y_train, y_pred)

In [103]:
f1

0.956043956043956

**Support**

In [104]:
support_1 = true_positives + false_negatives 
support_0 = false_positives + true_negatives
support_total = true_positives + false_negatives + false_positives + true_negatives

In [105]:
print(f'Support 1: {support_1}')
print(f'Support 0: {support_0}')
print(f'Support total: {support_total}')

Support 1: 190
Support 0: 307
Support total: 497


### Run through steps increasing your min_samples_leaf and decreasing your max_depth.

This time, let's set min_samples_leaf to 3, and max_depth to 6.

In [106]:
rf2 = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=6, 
                            random_state=123)

**Fit the model**

In [107]:
rf2.fit(X_train, y_train)

RandomForestClassifier(max_depth=6, min_samples_leaf=3, random_state=123)

**Feature importance**

In [108]:
pd.DataFrame(rf2.feature_importances_)

Unnamed: 0,0
0,0.095989
1,0.096242
2,0.133684
3,0.047151
4,0.026138
5,0.169599
6,0.040284
7,0.00478
8,0.013097
9,0.355629


**Make predictions**

In [109]:
y_pred = rf2.predict(X_train)
pd.DataFrame(y_pred)

Unnamed: 0,0
0,0
1,1
2,0
3,1
4,1
...,...
492,0
493,0
494,0
495,0


**Estimate probability**

In [110]:
y_pred_proba = rf2.predict_proba(X_train)
pd.DataFrame(y_pred_proba)

Unnamed: 0,0,1
0,0.691080,0.308920
1,0.046536,0.953464
2,0.914865,0.085135
3,0.098877,0.901123
4,0.227153,0.772847
...,...,...
492,0.892887,0.107113
493,0.898224,0.101776
494,0.845211,0.154789
495,0.722290,0.277710


**Compute accuracy**

In [111]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf2.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.87


**Create a confusion matrix**

In [112]:
labels1 = ['died_actual', 'survived_actual']
labels2 = ['died_predicted', 'survived_predicted']
df_confusion_matrix = pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels1, columns=labels2)

In [113]:
df_confusion_matrix

Unnamed: 0,died_predicted,survived_predicted
died_actual,295,12
survived_actual,51,139


**Create a classification report**

In [114]:
classification_report_2 = classification_report(y_train, y_pred, output_dict = True)

TypeError: 'dict' object is not callable

In [None]:
df_classification_report_2 = pd.DataFrame(classification_report_2).T

In [None]:
df_classification_report_2

### What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

Let's compare the two classification reports for the evaluation metrics.

In [None]:
df_classification_report

In [None]:
df_classification_report_2

The first sample with min_samples_leaf = 1 and max_depth = 10 performs better than the second sample. This is likely because we are using a higher max depth.

# K-Nearest Neighbor

In [141]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from pydataset import data

## Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample).

#### Create a KNN Object

In [None]:
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')

#### Fit the model to the training data

In [None]:
knn.fit(X_train, y_train)

#### Make predictions

In [None]:
y_pred = knn.predict(X_train)

In [None]:
y_pred2 = knn.predict(X_validate)

#### Estimate probability

In [None]:
y_pred_proba = knn.predict_proba(X_train)

In [None]:
df_probability = pd.DataFrame(y_pred_proba)

In [None]:
df_probability

In [None]:
y_pred_proba2 = knn.predict_proba(X_validate)

In [None]:
df_probability2 = pd.DataFrame(y_pred_proba2)

In [None]:
df_probability2

## Evaluate your results using the model score, confusion matrix, and classification report.

#### Model score

In [None]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))

In [None]:
print('Accuracy of KNN classifier on validate set: {:.2f}'
     .format(knn.score(X_validate, y_validate)))

#### Confusion matrix

In [None]:
labels1 = ['died_actual', 'survived_actual']
labels2 = ['died_predict', 'survived_predict']
df_knn_confusion_matrix = pd.DataFrame(confusion_matrix(y_train, y_pred), index = labels1, columns= labels2)

In [None]:
df_knn_confusion_matrix

In [None]:
df_knn_confusion_matrix_validate = pd.DataFrame(confusion_matrix(y_validate, y_pred2), index = labels1, columns= labels2)

In [None]:
df_knn_confusion_matrix_validate

#### Classification report

In [None]:
df_knn_classification_report = pd.DataFrame(classification_report(y_train, y_pred, output_dict = True)).T

In [None]:
df_knn_classification_report

In [None]:
df_knn_classification_report_validate = pd.DataFrame(classification_report(y_validate, y_pred2, output_dict = True)).T

In [None]:
df_knn_classification_report_validate

## Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [140]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [None]:
true_positives = 101 
false_positives = 36 
true_negatives = 271
false_negatives = 89

#### Accuracy

In [None]:
model_accuracy = accuracy_score(y_train, y_pred)
model_accuracy

#### True positive rate

In [None]:
true_positive_rate = (true_positives)/(true_positives + false_negatives)
true_positive_rate

#### False positive rate

In [None]:
false_positive_rate = (false_positives)/(false_positives + true_negatives)
false_positive_rate

#### True negative rate

In [None]:
true_negative_rate = (true_negatives)/(false_positives + true_negatives)
true_negative_rate

#### False negative rate

In [None]:
false_negative_rate = (false_negatives)/(true_positives + false_negatives)
false_negative_rate

#### Precision score

In [None]:
precision = precision_score(y_train, y_pred)
precision

#### Recall score

In [None]:
recall = recall_score(y_train, y_pred)
recall

#### f1 score

In [None]:
f1 = f1_score(y_train, y_pred)
f1

#### Support

In [None]:
support_1 = true_positives + false_negatives 
support_0 = false_positives + true_negatives
support_total = true_positives + false_negatives + false_positives + true_negatives

In [None]:
print(f'Support 1: {support_1}')
print(f'Support 0: {support_0}')
print(f'Support total: {support_total}')

## Run through steps 2-4 setting k to 10

#### Create a KNN object

In [None]:
knn_10 = KNeighborsClassifier(n_neighbors=10, weights='uniform')

#### Fit the model to the training  data

In [None]:
knn_10.fit(X_train, y_train)

#### Make predictions

In [None]:
y_pred_10_train = knn_10.predict(X_train)

In [None]:
y_pred_10_validate = knn_10.predict(X_validate)

#### Estimate probability 

In [None]:
y_pred_proba_10_train = knn_10.predict_proba(X_train)
df_prob_10_train = pd.DataFrame(y_pred_proba_10_train)
df_prob_10_train.head()

In [None]:
y_pred_proba_10_validate = knn_10.predict_proba(X_validate)
df_prob_10_validate = pd.DataFrame(y_pred_proba_10_validate)
df_prob_10_validate.head()

#### Model score

In [None]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn_10.score(X_train, y_train)))

In [None]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn_10.score(X_validate, y_validate)))

#### Confusion matrix

In [None]:
labels1 = ['died_actual', 'survived_actual']
labels2 = ['died_predict', 'survived_predict']
df_knn_10_confusion_matrix = pd.DataFrame(confusion_matrix(y_train, y_pred_10_train), index = labels1, columns= labels2)

In [None]:
df_knn_10_confusion_matrix

In [None]:
df_knn_10_confusion_matrix_validate = pd.DataFrame(confusion_matrix(y_validate, y_pred_10_validate), index = labels1, columns= labels2)

In [None]:
df_knn_10_confusion_matrix_validate

#### Classification report

In [None]:
df_knn_10_train_classification_report = pd.DataFrame(classification_report(y_train, y_pred_10_train, output_dict = True)).T

In [None]:
df_knn_10_train_classification_report

In [None]:
df_knn_10_validate_classification_report = pd.DataFrame(classification_report(y_validate, y_pred_10_validate, output_dict = True)).T

In [None]:
df_knn_10_validate_classification_report

## Run through setps 2-4 setting k to 20

#### Create a KNN object

In [None]:
knn_20 = KNeighborsClassifier(n_neighbors=20, weights='uniform')

#### Fit the model to the training data

In [None]:
knn_20.fit(X_train, y_train)

#### Make predictions

In [None]:
y_pred_20_train = knn_20.predict(X_train)

In [None]:
y_pred_20_validate = knn_20.predict(X_validate)

#### Estimate probability

In [None]:
y_pred_proba_20_train = knn_20.predict_proba(X_train)
df_prob_20_train = pd.DataFrame(y_pred_proba_20_train)
df_prob_20_train.head()

In [None]:
y_pred_proba_20_validate = knn_20.predict_proba(X_validate)
df_prob_20_validate = pd.DataFrame(y_pred_proba_20_validate)
df_prob_20_validate.head()

#### Model score

In [None]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn_20.score(X_train, y_train)))

In [None]:
print('Accuracy of KNN classifier on validate set: {:.2f}'
     .format(knn_20.score(X_validate, y_validate)))

#### Confusion matrix

In [None]:
labels1 = ['died_actual', 'survived_actual']
labels2 = ['died_predict', 'survived_predict']
df_knn_20_train_confusion_matrix = pd.DataFrame(confusion_matrix(y_train, y_pred_20_train), index = labels1, columns= labels2)

In [None]:
df_knn_20_train_confusion_matrix

In [None]:
df_knn_20_validate_confusion_matrix = pd.DataFrame(confusion_matrix(y_validate, y_pred_20_validate), index = labels1, columns= labels2)

In [None]:
df_knn_20_validate_confusion_matrix

#### Classification report

In [None]:
df_knn_20_train_classification_report = pd.DataFrame(classification_report(y_train, y_pred_20_train, output_dict = True)).T

In [None]:
df_knn_20_train_classification_report

In [None]:
df_knn_20_validate_classification_report = pd.DataFrame(classification_report(y_validate, y_pred_20_validate, output_dict = True)).T

In [None]:
df_knn_20_validate_classification_report

## What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

In [None]:
df_knn_classification_report

In [None]:
df_knn_10_train_classification_report

In [None]:
df_knn_20_train_classification_report

On the in-training data, k=5 performs the best. K=10 performs second best, with K=20 performing the worst. 

The curse of dimensionality may be to blame for the difference in performance.

## Which model performs best on our out-of-sample data from validate?

In [None]:
df_knn_classification_report_validate

In [None]:
df_knn_10_validate_classification_report

In [None]:
df_knn_20_validate_classification_report

For the validate dataset, K=20 is the most accurate. K=10 is the second most accurate. K=5 is the least accurate.

# Logistic Regression

In [333]:
from sklearn.linear_model import LogisticRegression

First, we will obtain and clean data for logistic regression.

In [334]:
train, validate, test = prepare.prep_titanic(acquire.get_titanic_data())
train.shape, validate.shape, test.shape

((497, 15), (214, 15), (178, 15))

In [335]:
X_train, y_train = train.drop(columns='survived'), train.survived
X_validate, y_validate = validate.drop(columns='survived'), validate.survived
X_test, y_test = test.drop(columns='survived'), test.survived

In [336]:
def clean_data(df):
    '''
    This function will drop any duplicate observations, 
    drop ['deck', 'embarked', 'class', 'age'], fill missing embark_town with 'Southampton'
    and create dummy vars from sex and embark_town. 
    '''
    df = df.drop_duplicates()
    df = df.drop(columns=['embarked', 'class'])
    df['embark_town'] = df.embark_town.fillna(value='Southampton')
    dummy_df = pd.get_dummies(df[['sex', 'embark_town']], drop_first=True)
    df = pd.concat([df, dummy_df], axis=1)
    return df

In [337]:
X_train = clean_data(X_train)

In [338]:
X_validate = clean_data(X_validate)

In [339]:
new_columns = ['age', 'fare', 'pclass']
X_train_1 = X_train[new_columns]
X_validate_1 = X_validate[new_columns]

In [340]:
X_train_1.head()

Unnamed: 0,age,fare,pclass
583,36.0,40.125,1
337,41.0,134.5,1
50,7.0,39.6875,3
218,32.0,76.2917,1
31,29.916875,146.5208,1


In [341]:
X_validate_1.head()

Unnamed: 0,age,fare,pclass
610,39.0,31.275,3
424,18.0,20.2125,3
568,29.916875,7.2292,3
701,35.0,26.2875,1
101,29.916875,7.8958,3


### Create a model that includes age in addition to fare and pclass. 

#### Create the object

In [342]:
logit = LogisticRegression(C=1, random_state=123)

#### Fit the model

In [343]:
logit.fit(X_train_1, y_train)

LogisticRegression(C=1, random_state=123)

#### Feature importance

In [344]:
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[-0.03051881  0.00266519 -0.97983178]]
Intercept: 
 [2.52970125]


#### Make predictions

In [345]:
y_pred_1_train = logit.predict(X_train_1)

In [346]:
y_pred_1_validate = logit.predict(X_validate_1)

#### Estimate probability

In [347]:
y_pred_proba_train_1 = logit.predict_proba(X_train_1)
df_y_pred_proba_train_1 = pd.DataFrame(y_pred_proba_train_1, columns = ['survived', 'died'])
df_y_pred_proba_train_1.head()

Unnamed: 0,survived,died
0,0.36398,0.63602
1,0.341399,0.658601
2,0.626598,0.373402
3,0.315053,0.684947
4,0.263599,0.736401


In [348]:
y_pred_proba_validate_1 = logit.predict_proba(X_validate_1)
df_y_pred_proba_validate_1 = pd.DataFrame(y_pred_proba_validate_1, columns = ['survived', 'died'])
df_y_pred_proba_validate_1.head()

Unnamed: 0,survived,died
0,0.820048,0.179952
1,0.71203,0.28797
2,0.786433,0.213567
3,0.365453,0.634547
4,0.786134,0.213866


#### Classification report

In [349]:
df_classification_report_train_1 = pd.DataFrame(classification_report(y_train, y_pred_1_train, output_dict = True)).T

In [350]:
df_classification_report_train_1

Unnamed: 0,precision,recall,f1-score,support
0,0.728022,0.863192,0.789866,307.0
1,0.684211,0.478947,0.563467,190.0
accuracy,0.716298,0.716298,0.716298,0.716298
macro avg,0.706116,0.67107,0.676667,497.0
weighted avg,0.711273,0.716298,0.703315,497.0


In [351]:
df_classification_report_validate_1 = pd.DataFrame(classification_report(y_validate, y_pred_1_validate, output_dict = True)).T

In [352]:
df_classification_report_validate_1

Unnamed: 0,precision,recall,f1-score,support
0,0.734177,0.878788,0.8,132.0
1,0.714286,0.487805,0.57971,82.0
accuracy,0.728972,0.728972,0.728972,0.728972
macro avg,0.724231,0.683296,0.689855,214.0
weighted avg,0.726555,0.728972,0.71559,214.0


Does this model perform better than your baseline?

In [353]:
accuracy = round(model.score(X_train, y_train), 2)
print(f'Training accuracy: {accuracy}')

Training accuracy: 0.62


The training accuracy of this model is higher than the baseline training accuracy. So, yes, this model performs better than the baseline.

### Include sex in your model as well.

In [354]:
X_train.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,pclass,sex,age,sibsp,parch,fare,embark_town,alone,Q,S,sex_male,embark_town_Queenstown,embark_town_Southampton
583,583,583,1,male,36.0,0,0,40.125,Cherbourg,1,0,0,1,0,0
337,337,337,1,female,41.0,0,0,134.5,Cherbourg,1,0,0,0,0,0
50,50,50,3,male,7.0,4,1,39.6875,Southampton,0,0,1,1,0,1
218,218,218,1,female,32.0,0,0,76.2917,Cherbourg,1,0,0,0,0,0
31,31,31,1,female,29.916875,1,0,146.5208,Cherbourg,0,0,0,0,0,0


In [355]:
X_train_2 = X_train.drop(columns = ['Unnamed: 0', 'passenger_id', 'sex', 'sibsp', 'parch', 'embark_town', 'alone', 'Q', 'S', 'embark_town_Queenstown', 'embark_town_Southampton'])

In [356]:
X_train_2.head()

Unnamed: 0,pclass,age,fare,sex_male
583,1,36.0,40.125,1
337,1,41.0,134.5,0
50,3,7.0,39.6875,1
218,1,32.0,76.2917,0
31,1,29.916875,146.5208,0


In [357]:
X_validate_2 = X_validate.drop(columns = ['Unnamed: 0', 'passenger_id', 'sex', 'sibsp', 'parch', 'embark_town', 'alone', 'Q', 'S'])

In [358]:
X_validate_2 = X_validate_2.drop(columns = ['embark_town_Queenstown', 'embark_town_Southampton'])

In [359]:
X_validate_2.head()

Unnamed: 0,pclass,age,fare,sex_male
610,3,39.0,31.275,0
424,3,18.0,20.2125,1
568,3,29.916875,7.2292,1
701,1,35.0,26.2875,1
101,3,29.916875,7.8958,1


#### Create the object

In [360]:
logit2 = LogisticRegression(C=1, random_state=123)

#### Fit the model

In [361]:
logit2.fit(X_train_2, y_train)

LogisticRegression(C=1, random_state=123)

#### Feature importance

In [362]:
print('Coefficient: \n', logit2.coef_)
print('Intercept: \n', logit2.intercept_)

Coefficient: 
 [[-1.11402368e+00 -2.66594879e-02  9.02716903e-04 -2.45878213e+00]]
Intercept: 
 [4.30664987]


#### Make predictions

In [363]:
y_pred_train_2 = logit2.predict(X_train_2)

In [364]:
y_pred_validate_2 = logit2.predict(X_validate_2)

#### Estimate probability

In [365]:
y_pred_proba_train_2 = logit2.predict_proba(X_train_2)
df_y_pred_proba_train_2 = pd.DataFrame(y_pred_proba_train_2, columns = ['survived', 'died'])

In [366]:
df_y_pred_proba_train_2.head()

Unnamed: 0,survived,died
0,0.547277,0.452723
1,0.09788,0.90212
2,0.838213,0.161787
3,0.082535,0.917465
4,0.073965,0.926035


In [367]:
y_pred_proba_validate_2 = logit2.predict_proba(X_validate_2)
df_y_pred_proba_validate_2 = pd.DataFrame(y_pred_proba_validate_2, columns = ['survived', 'died'])

In [368]:
df_y_pred_proba_validate_2.head()

Unnamed: 0,survived,died
0,0.511725,0.488275
1,0.876081,0.123919
2,0.907648,0.092352
3,0.543765,0.456235
4,0.907597,0.092403


#### Classification report

In [369]:
df_classification_report_train_2 = pd.DataFrame(classification_report(y_train, y_pred_train_2, output_dict =True)).T

In [370]:
df_classification_report_train_2

Unnamed: 0,precision,recall,f1-score,support
0,0.824451,0.856678,0.840256,307.0
1,0.752809,0.705263,0.728261,190.0
accuracy,0.798793,0.798793,0.798793,0.798793
macro avg,0.78863,0.78097,0.784258,497.0
weighted avg,0.797063,0.798793,0.797441,497.0


In [371]:
df_classification_report_validate_2 = pd.DataFrame(classification_report(y_validate, y_pred_validate_2, output_dict =True)).T

In [372]:
df_classification_report_validate_2

Unnamed: 0,precision,recall,f1-score,support
0,0.816176,0.840909,0.828358,132.0
1,0.730769,0.695122,0.7125,82.0
accuracy,0.785047,0.785047,0.785047,0.785047
macro avg,0.773473,0.768016,0.770429,214.0
weighted avg,0.78345,0.785047,0.783964,214.0


Including sex increases the accuracy versus omitting it. 

### Try out other combinations of features and models.

In [373]:
X_train_3 = X_train.drop(columns = ['Unnamed: 0', 'passenger_id', 'sex', 'sibsp', 'parch', 'embark_town', 'Q', 'S', 'embark_town_Queenstown', 'embark_town_Southampton'])

In [374]:
X_validate_3 = X_validate.drop(columns = ['Unnamed: 0', 'passenger_id', 'sex', 'sibsp', 'parch', 'embark_town', 'Q', 'S', 'embark_town_Queenstown', 'embark_town_Southampton'])

In [375]:
logit3 = LogisticRegression(C=1, random_state=123)

In [376]:
logit3.fit(X_train_3, y_train)

LogisticRegression(C=1, random_state=123)

In [396]:
print('Coefficient: \n', logit3.coef_)
print('Intercept: \n', logit3.intercept_)

Coefficient: 
 [[-1.10859199e+00 -2.55633270e-02  5.53491876e-04 -1.58214588e-01
  -2.41546062e+00]]
Intercept: 
 [4.33705386]


In [378]:
y_pred_train_3 = logit3.predict(X_train_3)

In [379]:
y_pred_validate_3 = logit3.predict(X_validate_3)

In [380]:
y_pred_proba_train_3 = logit3.predict_proba(X_train_3)

In [381]:
y_pred_proba_validate_3 = logit3.predict_proba(X_validate_3)

In [382]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit3.score(X_train_3, y_train)))

Accuracy of Logistic Regression classifier on training set: 0.80


In [383]:
labels1 = ['died_actual', 'survived_actual']
labels2 = ['died_predict', 'survived_predict']
df_confusion_matrix_3 = pd.DataFrame(confusion_matrix(y_train, y_pred_train_3), index=labels1, columns = labels2)

In [384]:
df_confusion_matrix_3

Unnamed: 0,died_predict,survived_predict
died_actual,263,44
survived_actual,57,133


In [385]:
df_classification_report_train_3 = pd.DataFrame(classification_report(y_train, y_pred_train_3, output_dict = True)).T

In [386]:
df_classification_report_train_3

Unnamed: 0,precision,recall,f1-score,support
0,0.821875,0.856678,0.838915,307.0
1,0.751412,0.7,0.724796,190.0
accuracy,0.796781,0.796781,0.796781,0.796781
macro avg,0.786644,0.778339,0.781856,497.0
weighted avg,0.794938,0.796781,0.795288,497.0


In [387]:
df_classification_report_validate_3 = pd.DataFrame(classification_report(y_validate, y_pred_validate_3, output_dict = True)).T

In [388]:
df_classification_report_validate_3

Unnamed: 0,precision,recall,f1-score,support
0,0.820896,0.833333,0.827068,132.0
1,0.725,0.707317,0.716049,82.0
accuracy,0.785047,0.785047,0.785047,0.785047
macro avg,0.772948,0.770325,0.771559,214.0
weighted avg,0.784151,0.785047,0.784528,214.0


### Use you best 3 models to predict and evaluate on your validate sample.

In [389]:
df_classification_report_validate_1

Unnamed: 0,precision,recall,f1-score,support
0,0.734177,0.878788,0.8,132.0
1,0.714286,0.487805,0.57971,82.0
accuracy,0.728972,0.728972,0.728972,0.728972
macro avg,0.724231,0.683296,0.689855,214.0
weighted avg,0.726555,0.728972,0.71559,214.0


In [390]:
df_classification_report_validate_2

Unnamed: 0,precision,recall,f1-score,support
0,0.816176,0.840909,0.828358,132.0
1,0.730769,0.695122,0.7125,82.0
accuracy,0.785047,0.785047,0.785047,0.785047
macro avg,0.773473,0.768016,0.770429,214.0
weighted avg,0.78345,0.785047,0.783964,214.0


In [391]:
df_classification_report_validate_3

Unnamed: 0,precision,recall,f1-score,support
0,0.820896,0.833333,0.827068,132.0
1,0.725,0.707317,0.716049,82.0
accuracy,0.785047,0.785047,0.785047,0.785047
macro avg,0.772948,0.770325,0.771559,214.0
weighted avg,0.784151,0.785047,0.784528,214.0


The third model is the most accurate model on the validate dataset. 

### Choose you best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?

In [393]:
X_test_3 = clean_data(X_test)

In [394]:
X_test_3 = X_test_3.drop(columns = ['Unnamed: 0', 'passenger_id', 'sex', 'sibsp', 'parch', 'embark_town', 'Q', 'S', 'embark_town_Queenstown', 'embark_town_Southampton'])

In [395]:
X_test_3.head()

Unnamed: 0,pclass,age,fare,alone,sex_male
561,3,40.0,7.8958,1,1
328,3,31.0,20.525,0,0
643,3,29.916875,56.4958,1,1
498,1,25.0,151.55,0,0
875,3,15.0,7.225,1,0


In [398]:
y_pred_proba_test_3 = logit3.predict_proba(X_test_3)

In [399]:
df_classification_report_test_3 = pd.DataFrame(classification_report(y_test, y_pred_test_3, output_dict = True)).T

In [400]:
df_classification_report_test_3

Unnamed: 0,precision,recall,f1-score,support
0,0.851852,0.836364,0.844037,110.0
1,0.742857,0.764706,0.753623,68.0
accuracy,0.808989,0.808989,0.808989,0.808989
macro avg,0.797354,0.800535,0.79883,178.0
weighted avg,0.810213,0.808989,0.809497,178.0


This model performs even better on test than on train and validate.