In [132]:
import aquire
import prepare
import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [133]:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Exercises

# In these exercises, we'll continue working with the titanic dataset and building logistic 
# regression models. Throughout this exercise, be sure you are training, evaluation, 
# and comparing models on the train and validate datasets. The test dataset should only be
# used for your final model.

# For all of the models you create, choose a threshold that optimizes for accuracy.

# Do your work for these exercises in either a notebook or a python script named model within 
# your classification-exercises repository. Add, commit, and push your work.

df = aquire.get_titanic_data()
df = df.drop(columns='deck')
df = df[~ df.age.isna()]
df = df[~ df.embarked.isna()]
df = df.drop(columns=['embarked', 'class', 'passenger_id'])
df["is_female"] = df.sex == 1
dummy_df = pd.get_dummies(df[["embark_town"]], drop_first=True)
df = pd.concat([df, dummy_df], axis=1)
df = df.drop(columns=['sex', 'embark_town'])
df.head() 

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,is_female,embark_town_Queenstown,embark_town_Southampton
0,0,3,22.0,1,0,7.25,0,False,0,1
1,1,1,38.0,1,0,71.2833,0,False,0,0
2,1,3,26.0,0,0,7.925,1,False,0,1
3,1,1,35.0,1,0,53.1,0,False,0,1
4,0,3,35.0,0,0,8.05,1,False,0,1


In [134]:
def train_validate_test_split(df, target, seed=123):
    '''
    This function takes in a dataframe, the name of the target variable
    (for stratification purposes), and an integer for a setting a seed
    and splits the data into train, validate and test. 
    Test is 20% of the original dataset, validate is .30*.80= 24% of the 
    original dataset, and train is .70*.80= 56% of the original dataset. 
    The function returns, in this order, train, validate and test dataframes. 
    '''
    train_validate, test = train_test_split(df, test_size=0.2, 
                                            random_state=seed, 
                                            stratify=df[target])
    train, validate = train_test_split(train_validate, test_size=0.3, 
                                       random_state=seed,
                                       stratify=train_validate[target])
    return train, validate, test

In [181]:
# 1) Create a model that includes age in addition to fare and pclass. Does this model perform 
# better than your baseline?
train, validate, test = train_validate_test_split(df,
                                                  target = 'survived',
                                                  seed=123)

In [182]:
train.survived.value_counts().head()

0    237
1    161
Name: survived, dtype: int64

In [183]:
# Make new dataframes
X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [184]:
X_train.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,alone,is_female,embark_town_Queenstown,embark_town_Southampton
450,2,36.0,1,2,27.75,0,False,0,1
543,2,32.0,1,0,26.0,0,False,0,1
157,3,30.0,0,0,8.05,1,False,0,1
462,1,47.0,0,0,38.5,1,False,0,1
397,2,46.0,0,0,26.0,1,False,0,1


In [185]:
X_train.shape, y_train.shape

((398, 9), (398,))

In [186]:
# Model 1
logit = LogisticRegression(C=1, class_weight={0:1, 1:99},
                           random_state=123)

In [187]:
#  fit the model on train data
logit.fit(X_train, y_train)

LogisticRegression(C=1, class_weight={0: 1, 1: 99}, random_state=123)

In [188]:
# now use the model to make predictions
y_pred = logit.predict(X_train)

In [189]:
# View raw probabilities (output from the model) (gives proabilities for each observation)

y_pred_proba = logit.predict_proba(X_train)
y_pred_proba = pd.DataFrame(y_pred_proba, columns = ['did_not_survive', 'survived'])
y_pred_proba

Unnamed: 0,did_not_survive,survived
0,0.008924,0.991076
1,0.012855,0.987145
2,0.055187,0.944813
3,0.007714,0.992286
4,0.026630,0.973370
...,...,...
393,0.005033,0.994967
394,0.016502,0.983498
395,0.002667,0.997333
396,0.010842,0.989158


In [180]:
print(classification_report(y_train, y_pred ))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       237
           1       0.40      1.00      0.58       161

    accuracy                           0.40       398
   macro avg       0.20      0.50      0.29       398
weighted avg       0.16      0.40      0.23       398



In [169]:
y_pred_validate = logit.predict(X_validate)

In [170]:
print('Accuracy: {:.2f}'.format(logit.score(X_validate, y_validate)))

print(confusion_matrix(y_validate, y_pred_validate))

Accuracy: 0.40
[[  0 102]
 [  0  69]]


In [190]:
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)


Coefficient: 
 [[-1.27406593 -0.0338432  -0.49683216  0.25091005  0.00144343 -0.76534239
   0.         -0.12725163 -0.22433192]]
Intercept: 
 [8.6558017]


In [191]:
logit.coef_[0]

array([-1.27406593, -0.0338432 , -0.49683216,  0.25091005,  0.00144343,
       -0.76534239,  0.        , -0.12725163, -0.22433192])

In [192]:
log_coeffs = pd.DataFrame(logit.coef_[0], index = X_train.columns, columns = ['coeffs']).sort_values(by = 'coeffs', ascending = True)
log_coeffs

Unnamed: 0,coeffs
pclass,-1.274066
alone,-0.765342
sibsp,-0.496832
embark_town_Southampton,-0.224332
embark_town_Queenstown,-0.127252
age,-0.033843
is_female,0.0
fare,0.001443
parch,0.25091


In [193]:
odds = np.exp(log_coeffs)
odds

Unnamed: 0,coeffs
pclass,0.279692
alone,0.465175
sibsp,0.608455
embark_town_Southampton,0.79905
embark_town_Queenstown,0.880512
age,0.966723
is_female,1.0
fare,1.001444
parch,1.285194


In [171]:
# 2) Include sex in your model as well. Note that you'll need to encode or create a dummy variable
# of this feature before including it in a model.


In [None]:
# 3) Try out other combinations of features and models.


In [None]:
# 4) Use you best 3 models to predict and evaluate on your validate sample.


In [2]:
# 5) Choose you best model from the validation performation, and evaluate it on the test dataset.
# How do the performance metrics compare to validate? to train?


In [None]:
# 6)

In [None]:
# 7)