In [2]:
import aquire
import prepare
from prepare import train_validate_test_split_1
import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [3]:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Exercises

# In these exercises, we'll continue working with the titanic dataset and building logistic 
# regression models. Throughout this exercise, be sure you are training, evaluation, 
# and comparing models on the train and validate datasets. The test dataset should only be
# used for your final model.

# For all of the models you create, choose a threshold that optimizes for accuracy.

# Do your work for these exercises in either a notebook or a python script named model within 
# your classification-exercises repository. Add, commit, and push your work.

df = aquire.get_titanic_data()
df = df.drop(columns='deck')
df = df[~ df.age.isna()]
df = df[~ df.embarked.isna()]
df = df.drop(columns=['embarked', 'class', 'passenger_id'])
df["is_female"] = df.sex == 1
dummy_df = pd.get_dummies(df[["embark_town"]], drop_first=True)
df = pd.concat([df, dummy_df], axis=1)
df = df.drop(columns=['sex', 'embark_town'])
avg_age = df.age.mean()
df.age = df.age.fillna(avg_age)
df.head() 

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,is_female,embark_town_Queenstown,embark_town_Southampton
0,0,3,22.0,1,0,7.25,0,False,0,1
1,1,1,38.0,1,0,71.2833,0,False,0,0
2,1,3,26.0,0,0,7.925,1,False,0,1
3,1,1,35.0,1,0,53.1,0,False,0,1
4,0,3,35.0,0,0,8.05,1,False,0,1


In [4]:
train, validate, test = train_validate_test_split_1(df,
                                                  target = 'survived',
                                                  seed=123)

In [36]:
# Make new dataframes
X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [37]:
df.survived.value_counts().head() # (find max to set to 0)

0    424
1    288
Name: survived, dtype: int64

In [38]:
baseline_accuracy = (train.survived == 0).mean()
round(baseline_accuracy, 3)

0.595

In [39]:
# 1) Create a model that includes age in addition to fare and pclass. Does this model perform 
# better than your baseline?

In [49]:
logit = LogisticRegression(random_state = 123)

features = ["age", "pclass", "fare"]

logit.fit(X_train[features], y_train)

y_pred = logit.predict(X_train[features])

accuracy = logit.score(X_train[features], y_train)
accuracy

0.7211055276381909

In [50]:
# 2) Include sex in your model as well. Note that you'll need to encode or create a dummy variable
# of this feature before including it in a model.

In [51]:
# Create
logit1 = LogisticRegression(random_state = 123)
# Specify
features = ["age", "fare", "is_female"]
# Fit
logit1.fit(X_train[features], y_train)
# Predict
y_pred = logit1.predict(X_train[features])

accuracy1 = logit1.score(X_train[features], y_train)
accuracy1

0.6582914572864321

In [52]:
# 3) Try out other combinations of features and models.

In [53]:
# Create (Rinse and repeate this cell to make new models)
logit2 = LogisticRegression(random_state = 123)
# Specify
features = ["age", "pclass", "fare", "is_female", "alone"]
# Fit
logit2.fit(X_train[features], y_train)
# Predict
y_pred = logit2.predict(X_train[features])

accuracy2 = logit2.score(X_train[features], y_train)
accuracy2

0.7261306532663316

In [55]:
# 4) Use you best 3 models to predict and evaluate on your validate sample.

In [54]:
logit
logit1
logit2

LogisticRegression(random_state=123)

In [66]:
features = ["age", "pclass", "fare"]
y_pred_validate = logit.predict(X_validate[features])

print(classification_report(y_validate, y_pred_validate))

              precision    recall  f1-score   support

           0       0.70      0.85      0.77       102
           1       0.67      0.45      0.54        69

    accuracy                           0.69       171
   macro avg       0.68      0.65      0.65       171
weighted avg       0.69      0.69      0.67       171



In [65]:
features = ["age", "fare", "is_female"]
y_pred_validate = logit1.predict(X_validate[features])

print(classification_report(y_validate, y_pred_validate))

              precision    recall  f1-score   support

           0       0.66      0.91      0.77       102
           1       0.71      0.32      0.44        69

    accuracy                           0.67       171
   macro avg       0.69      0.62      0.60       171
weighted avg       0.68      0.67      0.64       171



In [64]:
features = ["age", "pclass", "fare", "is_female", "alone"]
y_pred_validate = logit2.predict(X_validate[features])

print(classification_report(y_validate, y_pred_validate))

              precision    recall  f1-score   support

           0       0.69      0.82      0.75       102
           1       0.64      0.46      0.54        69

    accuracy                           0.68       171
   macro avg       0.67      0.64      0.65       171
weighted avg       0.67      0.68      0.67       171



In [74]:
# Check with base-line accuracy to see which models are overfit 
round(baseline_accuracy, 3)

0.595

In [None]:
# 5) Choose you best model from the validation performation, and evaluate it on the test dataset.
# How do the performance metrics compare to validate? to train?


In [67]:
logit

LogisticRegression(random_state=123)

In [76]:
# Make prediction on X_test using model 1
y_pred_test = logit.predict(X_test[features])

In [77]:
# print classification report (a little overfitting but not bad)
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.72      0.86      0.78        85
           1       0.71      0.50      0.59        58

    accuracy                           0.71       143
   macro avg       0.71      0.68      0.68       143
weighted avg       0.71      0.71      0.70       143



In [None]:
# This model is a little betetr than baseline and validate model. 