In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

import prepare
import acquire
import preprocess

In these exercises, we'll continue working with the titanic dataset and building logistic regression models. 

1. Create a model that includes only age, fare, and pclass. Does this model perform better than your baseline?

2. Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

3. Try out other combinations of features and models.

4. Use your best 3 models to predict and evaluate on your validate sample.

5. Choose your best model based on validation performance, and evaluate it on the test dataset. How do the performance metrics compare to validate? To train?

In [2]:
# Load the Titanic data through the project's acquire module, then preview the first three rows
titanic_db = acquire.get_titanic_db()
titanic_db.head(3)

this file exists, reading from csv file


Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1


In [3]:
# Count the missing (NaN) values in the age column
titanic_db['age'].isna().sum()

177

In [4]:
# Most frequent age; mode() returns a Series because several values can tie
titanic_db['age'].mode()

0    24.0
Name: age, dtype: float64

In [6]:
# Impute the missing ages with the most frequent age, computed from the column
# itself rather than hard-coded, so the cell stays correct if the data changes.
# mode() returns a Series (ties are possible); [0] takes its first entry.
# NOTE(review): the mode is computed on the full dataset before the
# train/validate/test split, so the imputed value also reflects validate/test rows.
age_mode = titanic_db['age'].mode()[0]
titanic_db['age'] = titanic_db['age'].fillna(age_mode)

# Confirm that no null values are left in the age column
titanic_db.age.isnull().sum()

0

In [9]:
# Run prepare.prep_titanic on the data, then split the result into train, validate, and test sets.
# NOTE(review): 'survived' is presumably the target/stratification column — confirm in prepare.splitting_data.
train, validate, test = prepare.splitting_data(prepare.prep_titanic(titanic_db), 'survived')

In [10]:
# Preview the first three rows of the training split
train.head(3)

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone
776,776,0,3,male,24.0,0,0,7.75,Queenstown,1
829,829,1,1,female,62.0,0,0,80.0,Southampton,1
215,215,1,1,female,31.0,1,0,113.275,Cherbourg,0


In [11]:
# Encode all three splits for modeling; the preview below shows dummy columns
# for embark_town and sex in the encoded output.
train_encoded, validate_encoded, test_encoded = preprocess.preprocess_titanic(train, validate, test)

In [12]:
# Preview the first three rows of the encoded training split
train_encoded.head(3)

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,embark_town_Queenstown,embark_town_Southampton,sex_male
776,0,3,24.0,0,0,7.75,1,1,0,1
829,1,1,62.0,0,0,80.0,1,0,1,0
215,1,1,31.0,1,0,113.275,0,0,0,0


In [13]:
# Split each encoded frame into its feature matrix (X_*) and target vector (y_*);
# 'survived' is the target, so it is dropped from the features.
X_train, y_train = train_encoded.drop(columns='survived'), train_encoded.survived
X_validate, y_validate = validate_encoded.drop(columns='survived'), validate_encoded.survived
X_test, y_test = test_encoded.drop(columns='survived'), test_encoded.survived

In [14]:
# Baseline: find the most common class in the training target
y_train.mode()

0    0
Name: survived, dtype: int64

In [15]:
# Baseline accuracy: the share of training rows that belong to the most common
# class, i.e. the accuracy of always predicting that class. The class is taken
# from y_train.mode() instead of a hard-coded 0.
(y_train == y_train.mode()[0]).mean()

0.6161048689138576

> My baseline accuracy is 62%

1. Create a model that includes only age, fare, and pclass. Does this model perform better than your baseline?

In [16]:
# Preview the feature matrix to see which columns are available
X_train.head(3)

Unnamed: 0,pclass,age,sibsp,parch,fare,alone,embark_town_Queenstown,embark_town_Southampton,sex_male
776,3,24.0,0,0,7.75,1,1,0,1
829,1,62.0,0,0,80.0,1,0,1,0
215,1,31.0,1,0,113.275,0,0,0,0


> #### 1. Create a model that includes only age, fare, and pclass. Does this model perform better than your baseline?

In [47]:
# Feature subset for exercise 1: age, fare, and pclass only
features = ['fare', 'pclass', 'age']

In [48]:
# Create a logistic regression classifier with scikit-learn's default settings
lr = LogisticRegression()
# Display the estimator
lr

In [49]:
# Fit the model on the training split using only the columns in `features`
lr.fit(X_train[features], y_train)

In [50]:
# Training accuracy with features = ['fare', 'pclass', 'age']
lr.score(X_train[features], y_train)

0.702247191011236

In [51]:
# Validation accuracy with features = ['fare', 'pclass', 'age']
lr.score(X_validate[features], y_validate)

0.6910112359550562

> Using Logistic Regression with only age, fare, and pclass performed better than the 62% baseline
>> Train accuracy: 70% and validation accuracy: 69%

> #### 2. Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [26]:
# Re-check the available columns before adding sex to the model
X_train.head(3)

Unnamed: 0,pclass,age,sibsp,parch,fare,alone,embark_town_Queenstown,embark_town_Southampton,sex_male
776,3,24.0,0,0,7.75,1,1,0,1
829,1,62.0,0,0,80.0,1,0,1,0
215,1,31.0,1,0,113.275,0,0,0,0


In [52]:
# Exercise 2: add the encoded sex column (sex_male) to the previous feature set
features1 = ['fare', 'pclass', 'age', 'sex_male']

In [53]:
# Refit the same `lr` estimator on features1; this replaces the fit from the previous feature set
lr.fit(X_train[features1], y_train)

In [54]:
# Training accuracy with features1 = ['fare', 'pclass', 'age', 'sex_male']
lr.score(X_train[features1], y_train)

0.799625468164794

In [55]:
# Validation accuracy with features1 = ['fare', 'pclass', 'age', 'sex_male']
lr.score(X_validate[features1], y_validate)

0.7640449438202247

> features1 = ['fare', 'pclass', 'age', 'sex_male'] performed better compared to the baseline and features with only age, fare, and pclass

>> Accuracy score of 80% and Validation score: 76%

> #### 3. Try out other combinations of features and models.

In [40]:
# Re-check the available columns before trying another feature combination
X_train.head(3)

Unnamed: 0,pclass,age,sibsp,parch,fare,alone,embark_town_Queenstown,embark_town_Southampton,sex_male
776,3,24.0,0,0,7.75,1,1,0,1
829,1,62.0,0,0,80.0,1,0,1,0
215,1,31.0,1,0,113.275,0,0,0,0


In [56]:
# Exercise 3: add the `alone` column to the features1 set
features2 = ['fare', 'pclass', 'age', 'sex_male', 'alone']

In [57]:
# Refit the same `lr` estimator on features2 = ['fare', 'pclass', 'age', 'sex_male', 'alone'];
# this replaces the fit from features1
lr.fit(X_train[features2], y_train)

In [58]:
# Training accuracy with features2 = ['fare', 'pclass', 'age', 'sex_male', 'alone']
lr.score(X_train[features2], y_train)

0.799625468164794

In [59]:
# Validation accuracy with features2 = ['fare', 'pclass', 'age', 'sex_male', 'alone']
lr.score(X_validate[features2], y_validate)

0.7584269662921348

> The accuracy score did not improve with features2 = ['fare', 'pclass', 'age', 'sex_male', 'alone']

> The training score was the same as with features1 (80%); the validation score was slightly lower (75.8% vs. 76.4%)

>> Accuracy score: 80% and Validation score: 76%

> #### 4. Use your best 3 models to predict and evaluate on your validate sample.

> features = ['fare', 'pclass', 'age']
>> Accuracy score: 70% and Validation score: 69%

> features1 = ['fare', 'pclass', 'age', 'sex_male']
>> Accuracy score of 80% and Validation score: 76%

> features2 = ['fare', 'pclass', 'age', 'sex_male', 'alone']
>> Accuracy score: 80% and Validation score: 76%

> #### 5. Choose your best model based on validation performance, and evaluate it on the test dataset. How do the performance metrics compare to validate? To train?

> ##### The best model is the one using features1 (four features)

> features1 = ['fare', 'pclass', 'age', 'sex_male']
>> Accuracy score of 80% and Validation score: 76%