## In these exercises, we'll continue working with the Titanic dataset and building logistic regression models. Throughout this exercise, be sure you are training, evaluating, and comparing models on the train and validate datasets. The test dataset should only be used for your final model.

In [1]:
import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import acquire
import prepare

In [2]:
# Load the Titanic data through the project-local `acquire` module
# (its implementation is not shown in this notebook).
df = acquire.get_titanic_data()

In [3]:
# Run the project-local preparation step. The preview below shows the encoded
# dummy columns (sex_male, class_*, embark_town_*) next to the original ones.
# NOTE(review): this rebinds `df`, so re-running the cell feeds already-prepared
# data back into prep_titanic — re-run the acquire cell first.
df = prepare.prep_titanic(df)
df.head()

Unnamed: 0,passenger_id,survived,sex,age,sibsp,parch,fare,class,deck,embark_town,alone,sex_male,class_Second,class_Third,embark_town_Queenstown,embark_town_Southampton
0,0,0,male,22.0,1,0,7.25,Third,,Southampton,0,1,0,1,0,1
1,1,1,female,38.0,1,0,71.2833,First,C,Cherbourg,0,0,0,0,0,0
2,2,1,female,26.0,0,0,7.925,Third,,Southampton,1,0,0,1,0,1
3,3,1,female,35.0,1,0,53.1,First,C,Southampton,0,0,0,0,0,1
4,4,0,male,35.0,0,0,8.05,Third,,Southampton,1,1,0,1,0,1


In [9]:
# Train, validate, test split, stratified on the target column
train, validate, test = prepare.split(df, stratify_by='survived')

# Handling nulls in age.
# The fill value is learned from the train split ONLY and then reused for
# validate and test. Filling each held-out split with its own mean would leak
# information from data the model is supposed to be evaluated on, and would
# make the three splits be preprocessed inconsistently.
age_fill_value = train.age.mean()

# `.assign` returns new frames, which avoids attribute-style assignment on
# frames that may be slices of `df` (SettingWithCopy pitfalls).
train = train.assign(age=train.age.fillna(age_fill_value))
validate = validate.assign(age=validate.age.fillna(age_fill_value))
test = test.assign(age=test.age.fillna(age_fill_value))

# Create X and y objects for model 1 (age + fare + passenger-class dummies)
model1_features = ['age', 'fare', 'class_Second', 'class_Third']

X_train = train[model1_features]
y_train = train.survived

X_validate = validate[model1_features]
y_validate = validate.survived

X_test = test[model1_features]
y_test = test.survived

In [10]:
# Sanity check: only the four model-1 feature columns should be present
X_train.head()

Unnamed: 0,age,fare,class_Second,class_Third
583,36.0,40.125,0,0
165,9.0,20.525,0,1
50,7.0,39.6875,0,1
259,50.0,26.0,1,0
306,29.678105,110.8833,0,0


## 1. Create a model that includes age in addition to fare and pclass. Does this model perform better than your baseline?

In [19]:
# Define the logistic regression model for model 1.
# C is scikit-learn's inverse regularization strength; random_state is pinned
# so repeated runs are reproducible.
logit = LogisticRegression(C=1, random_state=123)

In [20]:
# Fit model 1 on the train split only (validate/test stay unseen).
# The bare expression also displays the fitted estimator's repr.
logit.fit(X_train, y_train)

LogisticRegression(C=1, random_state=123)

In [22]:
# Predict on the same rows the model was fit on (in-sample predictions),
# so the metrics derived from y_pred describe train performance.
y_pred = logit.predict(X_train)

In [23]:
# Model 1: per-class precision / recall / F1 on the train split
report_train1 = classification_report(y_train, y_pred)
print(report_train1)

              precision    recall  f1-score   support

           0       0.72      0.81      0.76       307
           1       0.62      0.49      0.54       191

    accuracy                           0.69       498
   macro avg       0.67      0.65      0.65       498
weighted avg       0.68      0.69      0.68       498



## 2. Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [51]:
# Model 2 inputs: the model-1 columns plus the encoded sex dummy.
# The column list is declared once so all three splits share it.
features2 = ['age', 'fare', 'class_Second', 'class_Third', 'sex_male']

X_train2, y_train2 = train[features2], train['survived']
X_validate2, y_validate2 = validate[features2], validate['survived']
X_test2, y_test2 = test[features2], test['survived']

In [49]:
# Define model 2 with the same hyperparameters as model 1, so any change in
# performance comes from the added feature rather than from tuning.
logit2 = LogisticRegression(C=1, random_state=123)

In [52]:
# Fit model 2 on the train split only.
# The bare expression also displays the fitted estimator's repr.
logit2.fit(X_train2, y_train2)

LogisticRegression(C=1, random_state=123)

In [53]:
# In-sample predictions for model 2 (same rows it was fit on)
y_pred2 = logit2.predict(X_train2)

In [55]:
# Model 2: per-class precision / recall / F1 on the train split
report_train2 = classification_report(y_train2, y_pred2)
print(report_train2)

              precision    recall  f1-score   support

           0       0.82      0.87      0.85       307
           1       0.77      0.69      0.73       191

    accuracy                           0.80       498
   macro avg       0.80      0.78      0.79       498
weighted avg       0.80      0.80      0.80       498



## 3. Try out other combinations of features and models.

#### Model 3

In [37]:
# Look at every available column before choosing extra features for model 3
train.head()

Unnamed: 0,passenger_id,survived,sex,age,sibsp,parch,fare,class,deck,embark_town,alone,sex_male,class_Second,class_Third,embark_town_Queenstown,embark_town_Southampton
583,583,0,male,36.0,0,0,40.125,First,A,Cherbourg,1,1,0,0,0,0
165,165,1,male,9.0,0,2,20.525,Third,,Southampton,0,1,0,1,0,1
50,50,0,male,7.0,4,1,39.6875,Third,,Southampton,0,1,0,1,0,1
259,259,1,female,50.0,0,1,26.0,Second,,Southampton,0,0,1,0,0,1
306,306,1,female,29.678105,0,0,110.8833,First,,Cherbourg,1,0,0,0,0,0


In [56]:
# Model 3 inputs: model-2 columns plus embarkation dummies and the `alone` flag.
# The column list is declared once so all three splits share it.
features3 = [
    'age',
    'fare',
    'class_Second',
    'class_Third',
    'sex_male',
    'embark_town_Queenstown',
    'embark_town_Southampton',
    'alone',
]

X_train3, y_train3 = train[features3], train['survived']
X_validate3, y_validate3 = validate[features3], validate['survived']
X_test3, y_test3 = test[features3], test['survived']

In [57]:
# Define model 3 with the same hyperparameters as the earlier models, so the
# comparison isolates the effect of the feature set.
logit3 = LogisticRegression(C=1, random_state=123)

In [58]:
# Fit model 3 on the train split only.
# The bare expression also displays the fitted estimator's repr.
logit3.fit(X_train3, y_train3)

LogisticRegression(C=1, random_state=123)

In [59]:
# In-sample predictions for model 3 (same rows it was fit on)
y_pred3 = logit3.predict(X_train3)

In [60]:
# Model 3: per-class precision / recall / F1 on the train split
report_train3 = classification_report(y_train3, y_pred3)
print(report_train3)

              precision    recall  f1-score   support

           0       0.82      0.88      0.85       307
           1       0.79      0.69      0.74       191

    accuracy                           0.81       498
   macro avg       0.80      0.79      0.79       498
weighted avg       0.81      0.81      0.81       498



--------------------------------------------------------------------------------------

#### Model 4

In [61]:
# Model 4 inputs: model-3 columns plus the family-size counts sibsp and parch.
# The column list is declared once so all three splits share it.
features4 = [
    'age',
    'fare',
    'class_Second',
    'class_Third',
    'sex_male',
    'embark_town_Queenstown',
    'embark_town_Southampton',
    'alone',
    'sibsp',
    'parch',
]

X_train4, y_train4 = train[features4], train['survived']
X_validate4, y_validate4 = validate[features4], validate['survived']
X_test4, y_test4 = test[features4], test['survived']

In [62]:
# Define model 4 with the same hyperparameters as the earlier models, so the
# comparison isolates the effect of the feature set.
logit4 = LogisticRegression(C=1, random_state=123)

In [63]:
# Fit model 4 on the train split only.
# The bare expression also displays the fitted estimator's repr.
logit4.fit(X_train4, y_train4)

LogisticRegression(C=1, random_state=123)

In [64]:
# In-sample predictions for model 4 (same rows it was fit on)
y_pred4 = logit4.predict(X_train4)

In [65]:
# Model 4: per-class precision / recall / F1 on the train split
report_train4 = classification_report(y_train4, y_pred4)
print(report_train4)

              precision    recall  f1-score   support

           0       0.83      0.89      0.86       307
           1       0.80      0.71      0.75       191

    accuracy                           0.82       498
   macro avg       0.81      0.80      0.80       498
weighted avg       0.82      0.82      0.82       498



## 4. Use your best 3 models to predict and evaluate on your validate sample.

In [68]:
# Out-of-sample check: score models 2, 3 and 4 on the validate split.

# Predictions are computed up front and kept under their own names.
y_pred_validate2 = logit2.predict(X_validate2)
y_pred_validate3 = logit3.predict(X_validate3)
y_pred_validate4 = logit4.predict(X_validate4)

# One row per model: (heading, fitted estimator, X, y, predictions)
validate_runs = [
    ("Model 2: c = 1", logit2, X_validate2, y_validate2, y_pred_validate2),
    ("Model 3: c = 1", logit3, X_validate3, y_validate3, y_pred_validate3),
    ("Model 4: c = 1", logit4, X_validate4, y_validate4, y_pred_validate4),
]

# Same three-line report for every model, in the original order
for run_heading, run_model, run_X, run_y, run_pred in validate_runs:
    print(run_heading)
    print('Accuracy: {:.2f}'.format(run_model.score(run_X, run_y)))
    print(classification_report(run_y, run_pred))

Model 2: c = 1
Accuracy: 0.78
              precision    recall  f1-score   support

           0       0.80      0.84      0.82       132
           1       0.72      0.67      0.70        82

    accuracy                           0.78       214
   macro avg       0.76      0.76      0.76       214
weighted avg       0.77      0.78      0.77       214

Model 3: c = 1
Accuracy: 0.77
              precision    recall  f1-score   support

           0       0.79      0.85      0.82       132
           1       0.73      0.65      0.68        82

    accuracy                           0.77       214
   macro avg       0.76      0.75      0.75       214
weighted avg       0.77      0.77      0.77       214

Model 4: c = 1
Accuracy: 0.79
              precision    recall  f1-score   support

           0       0.80      0.87      0.83       132
           1       0.76      0.65      0.70        82

    accuracy                           0.79       214
   macro avg       0.78      0.76     

## 5. Choose your best model from the validation performance, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?

In [69]:
# Final step: model 4 had the highest validate accuracy of the three models
# compared above, so it is the one run on the held-out test split.
y_pred_test4 = logit4.predict(X_test4)

In [70]:
# Test-split accuracy and per-class metrics for the selected model
test_accuracy4 = logit4.score(X_test4, y_test4)
test_report4 = classification_report(y_test4, y_pred_test4)

print("Model 4: c = 1")
print('Accuracy: {:.2f}'.format(test_accuracy4))
print(test_report4)

Model 4: c = 1
Accuracy: 0.82
              precision    recall  f1-score   support

           0       0.83      0.88      0.85       110
           1       0.79      0.71      0.75        69

    accuracy                           0.82       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.82      0.81       179

