In [124]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import prepare

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score

import warnings
warnings.filterwarnings('ignore')

titanic = pd.read_csv('titanic.csv')

In [125]:
def prep_titanic(titanic):
        
    titanic = titanic.drop_duplicates()
    
    drop_columns = ['passenger_id','class','embarked','deck','age']
    titanic = titanic.drop(columns=drop_columns)

    titanic['embark_town'] = titanic.embark_town.fillna(value='Southampton')
    
    dummy_name = pd.get_dummies(titanic[['sex','embark_town']],drop_first=[True,True])
    titanic = pd.concat([titanic,dummy_name],axis=1)
    
    titanic = titanic.drop(columns=['sex','embark_town'])
    
    return titanic

In [126]:
titanic = prep_titanic(titanic)
titanic.head()

Unnamed: 0,survived,pclass,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
0,0,3,1,0,7.25,0,1,0,1
1,1,1,1,0,71.2833,0,0,0,0
2,1,3,0,0,7.925,1,0,0,1
3,1,1,1,0,53.1,0,0,0,1
4,0,3,0,0,8.05,1,1,0,1


In [127]:
train_validate, test = train_test_split(titanic, 
                                         test_size=0.2, 
                                        random_state=123, 
                                        stratify=titanic.survived)

train, validate = train_test_split(train_validate, 
                                 test_size=0.3, 
                                   random_state=123, 
                                   stratify=train_validate.survived)

x_train = train.drop(columns=['survived'])
y_train = train.survived

x_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

x_test = test.drop(columns=['survived'])
y_test = test.survived

**Decision Tree Exercises**

- What is your baseline prediction?

In [128]:
train.survived.value_counts()
## Baseline prediction would be that they did not survive
train['baseline_death'] = 0

- What is your baseline accuracy?

In [129]:
(train.baseline_death == train.survived).mean()

0.6164658634538153

Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [130]:
clf = DecisionTreeClassifier()

In [131]:
clf = clf.fit(x_train, y_train)

In [132]:
y_pred = clf.predict(x_train)
y_pred

array([0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,

Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [133]:
score = clf.score(x_train,y_train)
score

0.9457831325301205

In [134]:
conf_mat = confusion_matrix(y_train,y_pred)
conf_mat

array([[305,   2],
       [ 25, 166]])

In [135]:
class_rep = pd.DataFrame(classification_report(y_train,y_pred,output_dict=True)).T
class_rep

Unnamed: 0,precision,recall,f1-score,support
0,0.924242,0.993485,0.957614,307.0
1,0.988095,0.86911,0.924791,191.0
accuracy,0.945783,0.945783,0.945783,0.945783
macro avg,0.956169,0.931298,0.941202,498.0
weighted avg,0.948732,0.945783,0.945025,498.0


Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [136]:
true_pos = conf_mat[0][0]
true_pos

305

In [137]:
false_pos = conf_mat[0][1]
false_pos

2

In [138]:
true_neg = conf_mat[1][1]
true_neg

166

In [139]:
false_neg = conf_mat[1][0]
false_neg

25

In [140]:
class_rep

Unnamed: 0,precision,recall,f1-score,support
0,0.924242,0.993485,0.957614,307.0
1,0.988095,0.86911,0.924791,191.0
accuracy,0.945783,0.945783,0.945783,0.945783
macro avg,0.956169,0.931298,0.941202,498.0
weighted avg,0.948732,0.945783,0.945025,498.0


Run through steps 2-4 using a different max_depth value.

In [141]:
clf2 = DecisionTreeClassifier(max_depth=3)
clf2 = clf2.fit(x_train,y_train)
y_pred2 = clf2.predict(x_train)

In [142]:
conf_mat2 = confusion_matrix(y_train,y_pred2)
conf_mat2

array([[276,  31],
       [ 57, 134]])

In [143]:
class_rep2 = pd.DataFrame(classification_report(y_train,y_pred2,output_dict=True)).T
class_rep2

Unnamed: 0,precision,recall,f1-score,support
0,0.828829,0.899023,0.8625,307.0
1,0.812121,0.701571,0.752809,191.0
accuracy,0.823293,0.823293,0.823293,0.823293
macro avg,0.820475,0.800297,0.807654,498.0
weighted avg,0.822421,0.823293,0.82043,498.0


Which model performs better on your in-sample data?

- Model 1 performed better on the in-sample data.

Which model performs best on your out-of-sample data, the validate set?

In [144]:
y_pred_val = clf.predict(x_validate)
y_pred2_val = clf2.predict(x_validate)

In [145]:
class_rep_val = pd.DataFrame(classification_report(y_validate,y_pred_val,output_dict=True)).T
class_rep_val

Unnamed: 0,precision,recall,f1-score,support
0,0.784173,0.825758,0.804428,132.0
1,0.693333,0.634146,0.66242,82.0
accuracy,0.752336,0.752336,0.752336,0.752336
macro avg,0.738753,0.729952,0.733424,214.0
weighted avg,0.749365,0.752336,0.750014,214.0


In [146]:
conf_mat_val = confusion_matrix(y_validate,y_pred_val)
conf_mat_val

array([[109,  23],
       [ 30,  52]])

In [147]:
class_rep2_val = pd.DataFrame(classification_report(y_validate,y_pred2_val,output_dict=True)).T
class_rep2_val

Unnamed: 0,precision,recall,f1-score,support
0,0.798611,0.871212,0.833333,132.0
1,0.757143,0.646341,0.697368,82.0
accuracy,0.785047,0.785047,0.785047,0.785047
macro avg,0.777877,0.758777,0.765351,214.0
weighted avg,0.782721,0.785047,0.781235,214.0


In [148]:
conf_mat2_val = confusion_matrix(y_validate,y_pred2_val)
conf_mat2_val

array([[115,  17],
       [ 29,  53]])

- Model 2 seemd to perform slightly better on the out-of-sample data.

Work through these same exercises using the Telco dataset.
Experiment with this model on other datasets with a higher number of output classes.

**Random Forest Exercises (Titanic)**

Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [149]:
rf = RandomForestClassifier(min_samples_leaf=1,max_depth=10,random_state=123)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_train)

Evaluate your results using the model score, confusion matrix, and classification report.

In [150]:
print(rf.feature_importances_)
print(x_train.columns)

[0.10002143 0.06879504 0.05292745 0.3838233  0.02319394 0.32803056
 0.01522288 0.02798539]
Index(['pclass', 'sibsp', 'parch', 'fare', 'alone', 'sex_male',
       'embark_town_Queenstown', 'embark_town_Southampton'],
      dtype='object')


In [151]:
rf.score(x_train,y_train)

0.9437751004016064

In [152]:
print(confusion_matrix(y_train,y_pred))

[[301   6]
 [ 22 169]]


In [153]:
pd.DataFrame(classification_report(y_train,y_pred, output_dict= True)).T

Unnamed: 0,precision,recall,f1-score,support
0,0.931889,0.980456,0.955556,307.0
1,0.965714,0.884817,0.923497,191.0
accuracy,0.943775,0.943775,0.943775,0.943775
macro avg,0.948801,0.932636,0.939526,498.0
weighted avg,0.944862,0.943775,0.94326,498.0


Print and clearly label the following: 
- Accuracy 
- true positive rate 
- false positive rate
- true negative rate
- false negative rate
- precision, recall
- f1-score
- support.

In [154]:
print(f'\
Accuracy: 0.94\
\ntrue positive rate: 301/307 = 0.98\
\nfalse positive rate: 6/307 = 0.02\
\ntrue negative rate: 169/191 = 0.88\
\nfalse negative rate: 22/191 = 0.12\
\nprecision: 0.95\
\nrecall: 0.93\
\nf1-score: 0.94\
\nsupport: 0: 307, 1:191\
')

Accuracy: 0.94
true positive rate: 301/307 = 0.98
false positive rate: 6/307 = 0.02
true negative rate: 169/191 = 0.88
false negative rate: 22/191 = 0.12
precision: 0.95
recall: 0.93
f1-score: 0.94
support: 0: 307, 1:191


Run through steps increasing your min_samples_leaf and decreasing your max_depth.

In [155]:
rf = RandomForestClassifier(min_samples_leaf=3,max_depth=5,random_state=123)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_train)

In [156]:
conf = confusion_matrix(y_train,y_pred)
print(f'\
Accuracy: {rf.score(x_train, y_train):.3}\
\ntrue positive rate: {conf[1][1] / conf[1].sum():.3}\
\nfalse positive rate: {conf[0][1] / conf[0].sum():.3}\
\ntrue negative rate: {conf[0][0] / conf[0].sum():.3}\
\nfalse negative rate: {conf[1][0] / conf[1].sum():.3}\
')
print(pd.DataFrame(classification_report(y_train,y_pred, output_dict= True)).T,f'\n')

Accuracy: 0.837
true positive rate: 0.665
false positive rate: 0.0554
true negative rate: 0.945
false negative rate: 0.335
              precision    recall  f1-score     support
0              0.819209  0.944625  0.877458  307.000000
1              0.881944  0.664921  0.758209  191.000000
accuracy       0.837349  0.837349  0.837349    0.837349
macro avg      0.850577  0.804773  0.817834  498.000000
weighted avg   0.843270  0.837349  0.831722  498.000000 



- What are the differences in the evaluation metrics? 
- Which performs better on your in-sample data?
- Why?

After making a few models, which one has the best performance (or closest metrics) on both train and validate?

**KNN Exercises (Titanic)**

Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

In [157]:
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')
knn.fit(x_train,y_train)
y_pred = knn.predict(x_train)
y_pred_proba = knn.predict_proba(x_train)

Evaluate your results using the model score, confusion matrix, and classification report.

In [158]:
knn.score(x_train, y_train)

0.8072289156626506

In [159]:
confusion_matrix(y_train, y_pred)

array([[265,  42],
       [ 54, 137]])

In [160]:
pd.DataFrame(classification_report(y_train, y_pred, output_dict=True)).T

Unnamed: 0,precision,recall,f1-score,support
0,0.830721,0.863192,0.846645,307.0
1,0.765363,0.717277,0.740541,191.0
accuracy,0.807229,0.807229,0.807229,0.807229
macro avg,0.798042,0.790235,0.793593,498.0
weighted avg,0.805654,0.807229,0.805951,498.0


Print and clearly label the following: 
- Accuracy
- true positive rate
- false positive rate
- true negative rate
- false negative rate
- precision, recall
- f1-score
- support

In [161]:
conf = confusion_matrix(y_train,y_pred)
print(f'\
   Accuracy: {knn.score(x_train, y_train):.3}\
\n true positive rate: {conf[1][1] / conf[1].sum():.3}\
\n false positive rate: {conf[0][1] / conf[0].sum():.3}\
\n true negative rate: {conf[0][0] / conf[0].sum():.3}\
\n false negative rate: {conf[1][0] / conf[1].sum():.3}')
print(pd.DataFrame(classification_report(y_train,y_pred, output_dict= True)).T,f'\n')

   Accuracy: 0.807
 true positive rate: 0.717
 false positive rate: 0.137
 true negative rate: 0.863
 false negative rate: 0.283
              precision    recall  f1-score     support
0              0.830721  0.863192  0.846645  307.000000
1              0.765363  0.717277  0.740541  191.000000
accuracy       0.807229  0.807229  0.807229    0.807229
macro avg      0.798042  0.790235  0.793593  498.000000
weighted avg   0.805654  0.807229  0.805951  498.000000 



Run through steps 2-4 setting k to 10

In [162]:
knn = KNeighborsClassifier(n_neighbors=10, weights='uniform')
knn.fit(x_train,y_train)
y_pred = knn.predict(x_train)
y_pred_proba = knn.predict_proba(x_train)

In [163]:
conf = confusion_matrix(y_train,y_pred)
print(f'\
Accuracy: {knn.score(x_train, y_train):.3}\
\ntrue positive rate: {conf[1][1] / conf[1].sum():.3}\
\nfalse positive rate: {conf[0][1] / conf[0].sum():.3}\
\ntrue negative rate: {conf[0][0] / conf[0].sum():.3}\
\nfalse negative rate: {conf[1][0] / conf[1].sum():.3}\
')
print(pd.DataFrame(classification_report(y_train,y_pred, output_dict= True)).T,f'\n')

Accuracy: 0.783
true positive rate: 0.644
false positive rate: 0.13
true negative rate: 0.87
false negative rate: 0.356
              precision    recall  f1-score     support
0              0.797015  0.869707  0.831776  307.000000
1              0.754601  0.643979  0.694915  191.000000
accuracy       0.783133  0.783133  0.783133    0.783133
macro avg      0.775808  0.756843  0.763345  498.000000
weighted avg   0.780748  0.783133  0.779285  498.000000 



Run through setps 2-4 setting k to 20

In [164]:
knn = KNeighborsClassifier(n_neighbors=20, weights='uniform')
knn.fit(x_train,y_train)
y_pred = knn.predict(x_train)
y_pred_proba = knn.predict_proba(x_train)

In [165]:
conf = confusion_matrix(y_train,y_pred)
print(f'\
Accuracy: {knn.score(x_train, y_train):.3}\
\ntrue positive rate: {conf[1][1] / conf[1].sum():.3}\
\nfalse positive rate: {conf[0][1] / conf[0].sum():.3}\
\ntrue negative rate: {conf[0][0] / conf[0].sum():.3}\
\nfalse negative rate: {conf[1][0] / conf[1].sum():.3}\
')
print(pd.DataFrame(classification_report(y_train,y_pred, output_dict= True)).T,f'\n')

Accuracy: 0.737
true positive rate: 0.545
false positive rate: 0.143
true negative rate: 0.857
false negative rate: 0.455
              precision    recall  f1-score     support
0              0.751429  0.856678  0.800609  307.000000
1              0.702703  0.544503  0.613569  191.000000
accuracy       0.736948  0.736948  0.736948    0.736948
macro avg      0.727066  0.700590  0.707089  498.000000
weighted avg   0.732741  0.736948  0.728873  498.000000 



- What are the differences in the evaluation metrics? 
- Which performs better on your in-sample data? 
- Why?

Which model performs best on our out-of-sample data from validate?

**Logistic Regression Exercises (Titanic)**

Throughout this exercise, be sure you are training, evaluation, and comparing models on the train and validate datasets. The test dataset should only be used for your final model.

For all of the models you create, choose a threshold that optimizes for accuracy.

Create a model that includes age in addition to fare and pclass. Does this model perform better than your baseline?

In [190]:
titanic = pd.read_csv('titanic.csv')

avg_age = titanic.age.mean()
titanic.age = titanic.age.fillna(avg_age)

titanic["is_female"] = (titanic.sex == "female").astype('int')

dummy_df = pd.get_dummies(titanic[['embark_town']], dummy_na=False, drop_first=True)
titanic = pd.concat([titanic, dummy_df], axis=1)

titanic = titanic.drop(columns=["passenger_id", "deck", "class", "embarked", "sex", "embark_town"])

train_validate, test = train_test_split(titanic, 
                                         test_size=0.2, 
                                        random_state=123, 
                                        stratify=titanic.survived)

train, validate = train_test_split(train_validate, 
                                 test_size=0.3, 
                                   random_state=123, 
                                   stratify=train_validate.survived)

x_train = train.drop(columns=['survived'])
y_train = train.survived

x_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

x_test = test.drop(columns=['survived'])
y_test = test.survived

x_train.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,alone,is_female,embark_town_Queenstown,embark_town_Southampton
583,1,36.0,0,0,40.125,1,0,0,0
165,3,9.0,0,2,20.525,0,0,0,1
50,3,7.0,4,1,39.6875,0,0,0,1
259,2,50.0,0,1,26.0,0,1,0,1
306,1,29.699118,0,0,110.8833,1,1,0,0


In [191]:
logit = LogisticRegression(C=1, random_state=123)

features = ['age','fare','pclass']

logit.fit(x_train[features],y_train)

y_pred = logit.predict(x_train[features])

In [192]:
train.survived.value_counts()

0    307
1    191
Name: survived, dtype: int64

In [193]:
baseline = (train.survived == 0).mean()

In [194]:
print(f'Accuracy\n--------\n\
baseline = {baseline}\n\
model = {logit.score(x_train[features], y_train)}')

Accuracy
--------
baseline = 0.6164658634538153
model = 0.7028112449799196


Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [210]:
features = ['age','fare','pclass','is_female']

def logReg(x, y, features, c):
    
    logit = LogisticRegression(C=c, random_state=123)

    logit.fit(x[features],y)
    y_pred = logit.predict(x[features])

    report = print(f'Accuracy\n--------\n\
             baseline = {baseline}\n\
             model = {logit.score(x[features], y)}')

    return report

logReg(x_train, y_train, features, 1)

Accuracy
--------
             baseline = 0.6164658634538153
             model = 0.8132530120481928


Try out other combinations of features and models.

In [212]:
features = ['age','fare','pclass','is_female']

logReg(x_train, y_train, features, 10)

Accuracy
--------
             baseline = 0.6164658634538153
             model = 0.8092369477911646


In [213]:
features = x_train.columns

logReg(x_train, y_train, features, 10)


Accuracy
--------
             baseline = 0.6164658634538153
             model = 0.8152610441767069


In [214]:
features = x_train.columns

logReg(x_train, y_train, features, 1)


Accuracy
--------
             baseline = 0.6164658634538153
             model = 0.8152610441767069


In [215]:
features = ['age','fare','pclass','is_female']

logReg(x_train, y_train, features, 0.1)


Accuracy
--------
             baseline = 0.6164658634538153
             model = 0.7831325301204819


In [216]:
features = x_train.columns

logReg(x_train, y_train, features, 0.1)


Accuracy
--------
             baseline = 0.6164658634538153
             model = 0.8253012048192772


Use you best 3 models to predict and evaluate on your validate sample.

In [217]:
features = x_validate.columns

logReg(x_validate, y_validate, features, 0.1)


Accuracy
--------
             baseline = 0.6164658634538153
             model = 0.7757009345794392


In [218]:
features = x_validate.columns

logReg(x_validate, y_validate, features, 1)

Accuracy
--------
             baseline = 0.6164658634538153
             model = 0.7990654205607477


In [219]:
features = ['age','fare','pclass','is_female']

logReg(x_validate, y_validate, features, 1)


Accuracy
--------
             baseline = 0.6164658634538153
             model = 0.7850467289719626


Choose you best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?

In [222]:
features = x_test.columns

logReg(x_test, y_test, features, 1)

Accuracy
--------
             baseline = 0.6164658634538153
             model = 0.8100558659217877


Bonus1 How do different strategies for handling the missing values in the age column affect model performance?

Bonus2: How do different strategies for encoding sex affect model performance?

Bonus3: scikit-learn's LogisticRegression classifier is actually applying a regularization penalty to the coefficients by default. This penalty causes the magnitude of the coefficients in the resulting model to be smaller than they otherwise would be. This value can be modified with the C hyper parameter. Small values of C correspond to a larger penalty, and large values of C correspond to a smaller penalty.

Try out the following values for C and note how the coefficients and the model's performance on both the dataset it was trained on and on the validate split are affected.

- C = 0.01
- C = 0.1
- C = 1
- C = 10
- C = 100
- C = 1000

Bonus Bonus: how does scaling the data interact with your choice of C?