In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import acquire
import prepare

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
titanic = acquire.get_titanic_data()
#titanic = prepare.prep_titantic(titanic)

titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   passenger_id  891 non-null    int64  
 1   survived      891 non-null    int64  
 2   pclass        891 non-null    int64  
 3   sex           891 non-null    object 
 4   age           714 non-null    float64
 5   sibsp         891 non-null    int64  
 6   parch         891 non-null    int64  
 7   fare          891 non-null    float64
 8   embarked      889 non-null    object 
 9   class         891 non-null    object 
 10  deck          203 non-null    object 
 11  embark_town   889 non-null    object 
 12  alone         891 non-null    int64  
dtypes: float64(2), int64(6), object(5)
memory usage: 90.6+ KB


In [3]:
titanic_df = titanic.drop(columns = ['passenger_id', 'sibsp', 'parch', 'embarked', 'class', 'deck', 'embark_town', 'alone' ])
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  891 non-null    int64  
 1   pclass    891 non-null    int64  
 2   sex       891 non-null    object 
 3   age       714 non-null    float64
 4   fare      891 non-null    float64
dtypes: float64(2), int64(2), object(1)
memory usage: 34.9+ KB


In [4]:
titanic_df = titanic_df.dropna()

In [5]:
titanic_df['age'].isna().value_counts()

False    714
Name: age, dtype: int64

In [6]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  714 non-null    int64  
 1   pclass    714 non-null    int64  
 2   sex       714 non-null    object 
 3   age       714 non-null    float64
 4   fare      714 non-null    float64
dtypes: float64(2), int64(2), object(1)
memory usage: 33.5+ KB


In [7]:
col = 'survived'
train, validate, test = prepare.train_val_test(titanic_df, col)

In [8]:
X_train = train.drop(columns=['survived', 'sex'])
y_train = train.survived

X_validate = validate.drop(columns=['survived', 'sex'])
y_validate = validate.survived

X_test = test.drop(columns=['survived', 'sex'])
y_test = test.survived

In [9]:
X_train.shape, X_validate.shape, X_test.shape

((499, 3), (249, 3), (250, 3))

In [10]:
X_train

Unnamed: 0,pclass,age,fare
371,3,18.0,6.4958
145,2,19.0,36.7500
313,3,28.0,7.8958
116,3,70.5,7.7500
238,2,19.0,10.5000
...,...,...,...
735,3,28.5,16.1000
728,2,25.0,26.0000
516,2,34.0,10.5000
862,1,48.0,25.9292


In [11]:
train.survived.value_counts(normalize=True)

0    0.593186
1    0.406814
Name: survived, dtype: float64

In [12]:
baseline_accuracy = (train.survived == 0).mean()
print(f'{baseline_accuracy:.2%}')

59.32%


## 1. Create a model that includes only age, fare, and pclass. Does this model perform better than your baseline?

In [13]:
seed = 42

logit = LogisticRegression(random_state = seed)

In [14]:
features = ["age", "pclass", "fare"]

logit.fit(X_train[features], y_train)

In [15]:
y_pred = logit.predict(X_train[features])

print(f'Baseline is: {baseline_accuracy:.2%}')
print("Logistic Regression: using age, pclass, and fare features")
print(f'Accuracy of Logistic Regression classifier on training set: {logit.score(X_train[features], y_train):.2%}')

Baseline is: 59.32%
Logistic Regression: using age, pclass, and fare features
Accuracy of Logistic Regression classifier on training set: 71.54%


In [16]:
# Yes this model performs better than my baseline

## 2. Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [17]:
dummies = pd.get_dummies(titanic['sex'], drop_first = True)

In [18]:
titanic = pd.concat([titanic, dummies], axis = 1)
titanic

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone,male
0,0,0,3,male,22.0,1,0,7.2500,S,Third,,Southampton,0,1
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0,0
2,2,1,3,female,26.0,0,0,7.9250,S,Third,,Southampton,1,0
3,3,1,1,female,35.0,1,0,53.1000,S,First,C,Southampton,0,0
4,4,0,3,male,35.0,0,0,8.0500,S,Third,,Southampton,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,886,0,2,male,27.0,0,0,13.0000,S,Second,,Southampton,1,1
887,887,1,1,female,19.0,0,0,30.0000,S,First,B,Southampton,1,0
888,888,0,3,female,,1,2,23.4500,S,Third,,Southampton,0,0
889,889,1,1,male,26.0,0,0,30.0000,C,First,C,Cherbourg,1,1


In [19]:
titanic_df_age = titanic.drop(columns = ['passenger_id', 'sibsp', 'parch', 'embarked', 'class', 'deck', 'embark_town', 'alone', 'sex' ])

titanic_df_age

Unnamed: 0,survived,pclass,age,fare,male
0,0,3,22.0,7.2500,1
1,1,1,38.0,71.2833,0
2,1,3,26.0,7.9250,0
3,1,1,35.0,53.1000,0
4,0,3,35.0,8.0500,1
...,...,...,...,...,...
886,0,2,27.0,13.0000,1
887,1,1,19.0,30.0000,0
888,0,3,,23.4500,0
889,1,1,26.0,30.0000,1


In [20]:
titanic_df_age = titanic_df_age.dropna()

In [21]:
col = 'survived'
train, validate, test = prepare.train_val_test(titanic_df_age, col)

In [22]:
X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [23]:
X_train

Unnamed: 0,pclass,age,fare,male
371,3,18.0,6.4958,1
145,2,19.0,36.7500,1
313,3,28.0,7.8958,1
116,3,70.5,7.7500,1
238,2,19.0,10.5000,1
...,...,...,...,...
735,3,28.5,16.1000,1
728,2,25.0,26.0000,1
516,2,34.0,10.5000,0
862,1,48.0,25.9292,0


In [24]:
logit1 = LogisticRegression(random_state = seed)

In [25]:
features = ["age", "pclass", "fare", "male"]

logit1.fit(X_train[features], y_train)

In [26]:
y_pred = logit1.predict(X_train[features])

In [27]:
print("Logistic Regression using age, pclass, fare, and sex features")
print(f'Accuracy of Logistic Regression classifier on training set: {logit1.score(X_train[features], y_train):.2%}')

Logistic Regression using age, pclass, fare, and sex features
Accuracy of Logistic Regression classifier on training set: 80.56%


## 3. Try out other combinations of features and models.

In [28]:
# class_weight balanced model with all features

logit2 = LogisticRegression(random_state = seed, class_weight = 'balanced' )

logit2.fit(X_train, y_train)

y_pred = logit2.predict(X_train)

accuracy = logit2.score(X_train, y_train)

print("All Features and we're setting the class_weight hyperparameter")
print(f'Accuracy of Logistic Regression classifier on training set: {accuracy:.2%}')

All Features and we're setting the class_weight hyperparameter
Accuracy of Logistic Regression classifier on training set: 80.96%


In [29]:
# Only Pclass model

features = ["pclass"]

logit3 = LogisticRegression(random_state = seed)

logit3.fit(X_train[features], y_train)

y_pred = logit3.predict(X_train[features])

accuracy = logit3.score(X_train[features], y_train)

print('Logical Regression using only pclass feature')
print(f'Accuracy of Logistic Regression classifier on training set: {accuracy:.2%}')

Logical Regression using only pclass feature
Accuracy of Logistic Regression classifier on training set: 67.74%


In [30]:
# Only age model

features = ['age']

logit4 = LogisticRegression(random_state = seed)

logit4.fit(X_train[features], y_train)

y_pred = logit4.predict(X_train[features])

accuracy = logit4.score(X_train[features], y_train)

print('Logical Regression using only pclass feature')
print(f'Accuracy of Logistic Regression classifier on training set: {accuracy:.2%}')

Logical Regression using only pclass feature
Accuracy of Logistic Regression classifier on training set: 59.32%


In [34]:
# All but sex feature model with l1 penalty, solver: liblinear, and max_iter: 200

features = ["age", "pclass", "fare"]

logit5 = LogisticRegression(penalty = 'l1', random_state = seed,
                            solver = 'liblinear', max_iter = 200)

logit5.fit(X_train[features], y_train)

y_pred = logit5.predict(X_train[features])

accuracy = logit5.score(X_train[features], y_train)

print('Logical Regression using only pclass feature')
print(f'Accuracy of Logistic Regression classifier on training set: {accuracy:.2%}')

Logical Regression using only pclass feature
Accuracy of Logistic Regression classifier on training set: 71.54%


## 4. Use you best 3 models to predict and evaluate on your validate sample.

In [45]:
# class_weight balanced model with all features on validate dataset

y_pred = logit2.predict(X_validate)

val_acc = logit2.score(X_validate, y_validate)

print('Logit2 model using age, pclass, fare, and sex as the features')
print(f'Accuracy of Logistic Regression classifier on validate set: {val_acc:.2%}')
print()
print(classification_report(y_validate, y_pred))

Logit2 model using age, pclass, fare, and sex as the features
Accuracy of Logistic Regression classifier on validate set: 79.92%

              precision    recall  f1-score   support

           0       0.84      0.82      0.83       148
           1       0.74      0.77      0.76       101

    accuracy                           0.80       249
   macro avg       0.79      0.79      0.79       249
weighted avg       0.80      0.80      0.80       249



In [48]:
# All feature without being balanced on validate dataset

y_pred = logit1.predict(X_validate)

val_acc = logit1.score(X_validate, y_validate)

print('Logit1 model using age, pclass, fare, and sex as the features')
print(f'Accuracy of Logistic Regression classifier on validate set: {val_acc:.2%}')

print()
print(classification_report(y_validate, y_pred))

Logit1 model using age, pclass, fare, and sex as the features
Accuracy of Logistic Regression classifier on validate set: 59.84%

              precision    recall  f1-score   support

           0       0.60      0.97      0.74       148
           1       0.56      0.05      0.09       101

    accuracy                           0.60       249
   macro avg       0.58      0.51      0.42       249
weighted avg       0.58      0.60      0.48       249



In [47]:
# All but sex feature model with l1 penalty, solver: liblinear, and max_iter: 200 on validate dataset

features = ["age", "pclass", "fare"]

y_pred = logit5.predict(X_validate[features])

val_acc = logit5.score(X_validate[features], y_validate)

print('Logit5 model using age, pclass, and fare as the features')
print(f'Accuracy of Logistic Regression classifier on validate set: {val_acc:.2%}')
print()
print(classification_report(y_validate, y_pred))

Logit5 model using age, pclass, and fare as the features
Accuracy of Logistic Regression classifier on validate set: 72.29%

              precision    recall  f1-score   support

           0       0.73      0.85      0.79       148
           1       0.71      0.53      0.61       101

    accuracy                           0.72       249
   macro avg       0.72      0.69      0.70       249
weighted avg       0.72      0.72      0.71       249



## 5. Choose you best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?

In [49]:
# class_weight balanced model with all features on test dataset

y_pred = logit2.predict(X_test)

test_acc = logit2.score(X_test, y_test)

print('Logit2 model using age, pclass, fare, and sex as the features')
print(f'Accuracy of Logistic Regression classifier on validate set: {test_acc:.2%}')
print()
print(classification_report(y_test, y_pred))

Logit2 model using age, pclass, fare, and sex as the features
Accuracy of Logistic Regression classifier on validate set: 82.00%

              precision    recall  f1-score   support

           0       0.87      0.82      0.84       148
           1       0.76      0.82      0.79       102

    accuracy                           0.82       250
   macro avg       0.81      0.82      0.82       250
weighted avg       0.82      0.82      0.82       250



In [51]:
#test dataset returned with 82% accuracy returning higher than both train and validate dataset.