In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

import acquire
import prepare

# Exercises



#### For all of the models you create, choose a threshold that optimizes for accuracy.

In [2]:
df = acquire.get_titanic_data()

Found CSV


In [3]:
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [4]:
df = prepare.prep_titanic(df)

In [5]:
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,alone,sex_male,embarked_Q,embarked_S
0,0,0,3,male,22.0,1,0,7.25,S,0,1,0,1
1,1,1,1,female,38.0,1,0,71.2833,C,0,0,0,0
2,2,1,3,female,26.0,0,0,7.925,S,1,0,0,1
3,3,1,1,female,35.0,1,0,53.1,S,0,0,0,1
4,4,0,3,male,35.0,0,0,8.05,S,1,1,0,1


In [6]:
df.shape

(891, 13)

In [7]:
df.age.isnull().value_counts()

False    714
True     177
Name: age, dtype: int64

In [8]:
# Drop every row that has a missing value. Rebinding the name (instead of
# inplace=True) keeps the cell safe to re-run and avoids mutating the frame
# object that earlier cells displayed.
df = df.dropna()

In [9]:
df.shape

(714, 13)

In [10]:
train, validate, test = prepare.split_function(df, 'survived')

In [11]:
print(f'Train: {train.shape}')
print(f'Validate: {validate.shape}')
print(f'Test: {test.shape}')

Train: (428, 13)
Validate: (143, 13)
Test: (143, 13)


In [12]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,alone,sex_male,embarked_Q,embarked_S
548,548,0,3,male,33.0,1,1,20.525,S,0,1,0,1
133,133,1,2,female,29.0,1,0,26.0,S,0,0,0,1
540,540,1,1,female,36.0,0,2,71.0,S,0,0,0,1
2,2,1,3,female,26.0,0,0,7.925,S,1,0,0,1
649,649,1,3,female,23.0,0,0,7.55,S,1,0,0,1


In [13]:
train.age.isnull().value_counts()

False    428
Name: age, dtype: int64

In [14]:
def xy_train_val(train, validate, test, target_variable, drop_cols=None):
    """
    Build feature matrices and target series from train/validate/test frames.

    Prints the baseline accuracy (share of the most common target value in
    ``train``) and then, for each of the three frames, removes the target
    column, removes ``drop_cols``, and finally removes every remaining
    'object' dtype column.

    Parameters
    ----------
    train, validate, test : pandas.DataFrame
        The three samples, e.g. as returned by the split function.
    target_variable : str
        Name of the target column.
    drop_cols : str, list of str, or None, default None
        Additional column(s) to leave out of the feature matrices. A single
        column may be passed as a plain string; multiple columns as
        ['col1', 'col2', 'etc']. Object columns are dropped automatically,
        but listing one here is also accepted.

    Returns
    -------
    X_train, X_validate, X_test, y_train, y_validate, y_test

    Raises
    ------
    KeyError
        If ``target_variable`` or any name in ``drop_cols`` is not a column
        of the frames.
    """
    # Normalize drop_cols so a bare string, a list, or None all work.
    if drop_cols is None:
        extra_cols = []
    elif isinstance(drop_cols, str):
        extra_cols = [drop_cols]
    else:
        extra_cols = list(drop_cols)

    baseline_accuracy = train[target_variable].value_counts(normalize=True).max()
    print(f'Baseline Accuracy: {baseline_accuracy:.2%}')

    def _features(frame):
        # Drop by name BEFORE filtering dtypes: this way an object-dtype
        # target or an object column listed in drop_cols is still present
        # when we ask pandas to drop it (otherwise .drop raises KeyError).
        return (
            frame
            .drop(columns=[target_variable])
            .drop(columns=extra_cols)
            .select_dtypes(exclude=['object'])
        )

    X_train = _features(train)
    X_validate = _features(validate)
    X_test = _features(test)

    y_train = train[target_variable]
    y_validate = validate[target_variable]
    y_test = test[target_variable]

    return X_train, X_validate, X_test, y_train, y_validate, y_test

In [15]:
X_train, X_validate, X_test, y_train, y_validate, y_test = xy_train_val(train, validate, test, 'survived', 'passenger_id')

Baseline Accuracy: 59.35%


In [16]:
X_train.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,alone,sex_male,embarked_Q,embarked_S
548,3,33.0,1,1,20.525,0,1,0,1
133,2,29.0,1,0,26.0,0,0,0,1
540,1,36.0,0,2,71.0,0,0,0,1
2,3,26.0,0,0,7.925,1,0,0,1
649,3,23.0,0,0,7.55,1,0,0,1


In [17]:
y_train[:5]

548    0
133    1
540    1
2      1
649    1
Name: survived, dtype: int64

In [18]:
logit1 = LogisticRegression()
logit1

In [19]:
logit1.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [20]:
logit1.score(X_train, y_train)

0.8060747663551402

In [22]:
y_pred = logit1.predict(X_train)

In [23]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.88      0.84       254
           1       0.80      0.70      0.75       174

    accuracy                           0.81       428
   macro avg       0.80      0.79      0.79       428
weighted avg       0.81      0.81      0.80       428



In [24]:
logit1.coef_

array([[-1.06382834e+00, -3.57903643e-02, -6.54270243e-01,
        -7.10783167e-02,  8.41398501e-04, -4.24365002e-01,
        -2.20880152e+00, -9.92952897e-02, -1.43275929e-01]])

In [25]:
X_train.columns

Index(['pclass', 'age', 'sibsp', 'parch', 'fare', 'alone', 'sex_male',
       'embarked_Q', 'embarked_S'],
      dtype='object')

#### 1. Create a model that includes only age, fare, and pclass. Does this model perform better than your baseline?



In [None]:
features1 = ['age', 'fare', 'pclass']

In [26]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,alone,sex_male,embarked_Q,embarked_S
548,548,0,3,male,33.0,1,1,20.525,S,0,1,0,1
133,133,1,2,female,29.0,1,0,26.0,S,0,0,0,1
540,540,1,1,female,36.0,0,2,71.0,S,0,0,0,1
2,2,1,3,female,26.0,0,0,7.925,S,1,0,0,1
649,649,1,3,female,23.0,0,0,7.55,S,1,0,0,1


In [59]:
X_train2, X_validate2, X_test2, y_train2, y_validate2, y_test2 = xy_train_val(train, validate, test, 'survived', ['passenger_id', 'sibsp', 'parch', 'alone',
                                                                                                            'sex_male', 'embarked_Q', 'embarked_S'])

Baseline Accuracy: 59.35%


In [60]:
X_train2.head()

Unnamed: 0,pclass,age,fare
548,3,33.0,20.525
133,2,29.0,26.0
540,1,36.0,71.0
2,3,26.0,7.925
649,3,23.0,7.55


In [29]:
logit2 = LogisticRegression()
logit2

In [61]:
logit2.fit(X_train2, y_train2)

In [62]:
logit2.score(X_train2, y_train2)

0.6985981308411215

In [63]:
y_pred2 = logit2.predict(X_train2)

In [64]:
# Report on model 2's own predictions (y_pred2), not model 1's y_pred.
print(classification_report(y_train2, y_pred2))

              precision    recall  f1-score   support

           0       0.71      0.82      0.76       254
           1       0.66      0.52      0.59       174

    accuracy                           0.70       428
   macro avg       0.69      0.67      0.67       428
weighted avg       0.69      0.70      0.69       428



In [78]:
logit2 = logit_run(X_train2, y_train2, X_validate2, y_validate2)

Model train score: 69.86%
Confusion Matrix:
[[208  46]
 [ 83  91]]
Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.82      0.76       254
           1       0.66      0.52      0.59       174

    accuracy                           0.70       428
   macro avg       0.69      0.67      0.67       428
weighted avg       0.69      0.70      0.69       428

Model validate score: 69.23%


#### 2. Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.



In [34]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,alone,sex_male,embarked_Q,embarked_S
548,548,0,3,male,33.0,1,1,20.525,S,0,1,0,1
133,133,1,2,female,29.0,1,0,26.0,S,0,0,0,1
540,540,1,1,female,36.0,0,2,71.0,S,0,0,0,1
2,2,1,3,female,26.0,0,0,7.925,S,1,0,0,1
649,649,1,3,female,23.0,0,0,7.55,S,1,0,0,1


In [79]:
X_train3, X_validate3, X_test3, y_train3, y_validate3, y_test3 = xy_train_val(train, validate, test, 'survived', ['passenger_id', 'sibsp', 'parch', 'alone',
                                                                                                            'embarked_Q', 'embarked_S'])

Baseline Accuracy: 59.35%


In [80]:
X_train3.head()

Unnamed: 0,pclass,age,fare,sex_male
548,3,33.0,20.525,1
133,2,29.0,26.0,0
540,1,36.0,71.0,0
2,3,26.0,7.925,0
649,3,23.0,7.55,0


In [81]:
logit3 = LogisticRegression()
logit3

In [82]:
logit3.fit(X_train3, y_train3)

In [83]:
logit3.score(X_train3, y_train3)

0.7873831775700935

In [84]:
logit3 = logit_run(X_train3, y_train3, X_validate3, y_validate3)

Model train score: 78.74%
Confusion Matrix:
[[215  39]
 [ 52 122]]
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.85      0.83       254
           1       0.76      0.70      0.73       174

    accuracy                           0.79       428
   macro avg       0.78      0.77      0.78       428
weighted avg       0.79      0.79      0.79       428

Model validate score: 76.92%


#### 3. Try out other combinations of features and models.



In [41]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,alone,sex_male,embarked_Q,embarked_S
548,548,0,3,male,33.0,1,1,20.525,S,0,1,0,1
133,133,1,2,female,29.0,1,0,26.0,S,0,0,0,1
540,540,1,1,female,36.0,0,2,71.0,S,0,0,0,1
2,2,1,3,female,26.0,0,0,7.925,S,1,0,0,1
649,649,1,3,female,23.0,0,0,7.55,S,1,0,0,1


In [85]:
X_train4, X_validate4, X_test4, y_train4, y_validate4, y_test4 = xy_train_val(train, validate, test, 'survived', ['passenger_id',
                                                                                                            'sibsp',
                                                                                                            'parch',
                                                                                                            'embarked_Q',
                                                                                                            'embarked_S'])

Baseline Accuracy: 59.35%


In [86]:
X_train4.head()

Unnamed: 0,pclass,age,fare,alone,sex_male
548,3,33.0,20.525,0,1
133,2,29.0,26.0,0,0
540,1,36.0,71.0,0,0
2,3,26.0,7.925,1,0
649,3,23.0,7.55,1,0


In [87]:
logit4 = LogisticRegression()
logit4

In [88]:
logit4.fit(X_train4, y_train4)

In [89]:
logit4.score(X_train4, y_train4)

0.7920560747663551

In [127]:
logit4 = logit_run(X_train4, y_train4, X_validate4, y_validate4)

Model train score: 79.21%
Confusion Matrix:
[[217  37]
 [ 52 122]]
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.85      0.83       254
           1       0.77      0.70      0.73       174

    accuracy                           0.79       428
   macro avg       0.79      0.78      0.78       428
weighted avg       0.79      0.79      0.79       428

Model validate score: 79.02%


In [91]:
X_train5, X_validate5, X_test5, y_train5, y_validate5, y_test5 = xy_train_val(train, validate, test, 'survived', ['passenger_id',
                                                                                                            'sibsp',
                                                                                                            'alone',
                                                                                                            'embarked_Q',
                                                                                                            'embarked_S'])

Baseline Accuracy: 59.35%


In [92]:
X_train5.head()

Unnamed: 0,pclass,age,parch,fare,sex_male
548,3,33.0,1,20.525,1
133,2,29.0,0,26.0,0
540,1,36.0,2,71.0,0
2,3,26.0,0,7.925,0
649,3,23.0,0,7.55,0


In [93]:
logit5 = LogisticRegression()
logit5

In [94]:
logit5.fit(X_train5, y_train5)

In [95]:
logit5.score(X_train5, y_train5)

0.7850467289719626

In [96]:
logit5.coef_

array([[-1.09612469e+00, -2.89332868e-02, -1.31195055e-01,
         6.29387681e-05, -2.20417572e+00]])

In [98]:
logit5 = logit_run(X_train5, y_train5, X_validate5, y_validate5)

Model train score: 78.50%
Confusion Matrix:
[[215  39]
 [ 53 121]]
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.85      0.82       254
           1       0.76      0.70      0.72       174

    accuracy                           0.79       428
   macro avg       0.78      0.77      0.77       428
weighted avg       0.78      0.79      0.78       428

Model validate score: 77.62%


#### 4. Use your best 3 models to predict and evaluate on your validate sample.



In [None]:
# See above: each logit_run call prints the model's validate score.

#### 5. Choose your best model from the validation performance, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?



In [128]:
logit4.score(X_test4, y_test4)

0.8251748251748252

#### Bonus1: How do different strategies for handling the missing values in the age column affect model performance?

In [131]:
df2 = acquire.get_titanic_data()

Found CSV


In [132]:
df2.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [133]:
df2 = prepare.prep_titanic(df2)

In [134]:
df2.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,alone,sex_male,embarked_Q,embarked_S
0,0,0,3,male,22.0,1,0,7.25,S,0,1,0,1
1,1,1,1,female,38.0,1,0,71.2833,C,0,0,0,0
2,2,1,3,female,26.0,0,0,7.925,S,1,0,0,1
3,3,1,1,female,35.0,1,0,53.1,S,0,0,0,1
4,4,0,3,male,35.0,0,0,8.05,S,1,1,0,1


In [135]:
train2, validate2, test2 = prepare.split_function(df2, 'survived')

In [137]:
train2.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,alone,sex_male,embarked_Q,embarked_S
455,455,1,3,male,29.0,0,0,7.8958,C,1,1,0,0
380,380,1,1,female,42.0,0,0,227.525,C,1,0,0,0
492,492,0,1,male,55.0,0,0,30.5,S,1,1,0,1
55,55,1,1,male,,0,0,35.5,S,1,1,0,1
243,243,0,3,male,22.0,0,0,7.125,S,1,1,0,1


In [138]:
train2.age.isnull().value_counts()

False    429
True     105
Name: age, dtype: int64

In [157]:
train2[train2.survived==1].age.mean()

28.315172413793103

In [160]:
train2[train2.survived==1].age.isnull().value_counts()

False    174
True      31
Name: age, dtype: int64

In [164]:
round(31/174,2)

0.18

In [158]:
train2[train2.survived==0].age.mean()

30.129411764705882

In [161]:
train2[train2.survived==0].age.isnull().value_counts()

False    255
True      74
Name: age, dtype: int64

In [165]:
round(74/255,2)

0.29

In [177]:
X_train6, X_validate6, X_test6, y_train6, y_validate6, y_test6 = xy_train_val(train2, validate2, test2, 'survived', ['passenger_id',
                                                                                                            'sibsp',
                                                                                                            'age',
                                                                                                            'parch',
                                                                                                            'embarked_Q',
                                                                                                            'embarked_S'])

Baseline Accuracy: 61.61%


In [178]:
X_train6.head()

Unnamed: 0,pclass,fare,alone,sex_male
455,3,7.8958,1,1
380,1,227.525,1,0
492,1,30.5,1,1
55,1,35.5,1,1
243,3,7.125,1,1


In [179]:
logit6 = logit_run(X_train6, y_train6, X_validate6, y_validate6)

Model train score: 79.21%
Confusion Matrix:
[[283  46]
 [ 65 140]]
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.86      0.84       329
           1       0.75      0.68      0.72       205

    accuracy                           0.79       534
   macro avg       0.78      0.77      0.78       534
weighted avg       0.79      0.79      0.79       534

Model validate score: 77.53%


- If we take age out completely, validate accuracy drops for this model (77.53% vs. 79.02% with age included).

In [174]:
df2.age.mean()

29.69911764705882

In [182]:
# Fill only the age column with the mean age; a bare fillna(value) would
# write the mean age into NaNs of every column.
# NOTE(review): the mean is computed on the full dataset before splitting,
# so validate/test rows influence the imputed value — consider computing it
# from the train split only.
df2 = df2.fillna({'age': df2.age.mean()})

In [183]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   passenger_id  891 non-null    int64  
 1   survived      891 non-null    int64  
 2   pclass        891 non-null    int64  
 3   sex           891 non-null    object 
 4   age           891 non-null    float64
 5   sibsp         891 non-null    int64  
 6   parch         891 non-null    int64  
 7   fare          891 non-null    float64
 8   embarked      891 non-null    object 
 9   alone         891 non-null    int64  
 10  sex_male      891 non-null    uint8  
 11  embarked_Q    891 non-null    uint8  
 12  embarked_S    891 non-null    uint8  
dtypes: float64(2), int64(6), object(2), uint8(3)
memory usage: 79.2+ KB


In [184]:
train3, validate3, test3 = prepare.split_function(df2, 'survived')

In [185]:
X_train7, X_validate7, X_test7, y_train7, y_validate7, y_test7 = xy_train_val(train3, validate3, test3, 'survived', ['passenger_id',
                                                                                                            'sibsp',
                                                                                                            'parch',
                                                                                                            'embarked_Q',
                                                                                                            'embarked_S'])

Baseline Accuracy: 61.61%


In [187]:
# Name follows the logit1..logit6 convention used for the other models.
logit7 = logit_run(X_train7, y_train7, X_validate7, y_validate7)

Model train score: 80.90%
Confusion Matrix:
[[283  46]
 [ 56 149]]
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.86      0.85       329
           1       0.76      0.73      0.74       205

    accuracy                           0.81       534
   macro avg       0.80      0.79      0.80       534
weighted avg       0.81      0.81      0.81       534

Model validate score: 78.09%


- If we fill the null ages with the mean, train accuracy rises (80.90%) but validate accuracy drops slightly (78.09% vs. 79.02% when rows with missing age are dropped).

#### Bonus2: How do different strategies for encoding sex affect model performance?

In [100]:
def logit_run(X_train, y_train, X_validate, y_validate, x=1):
    '''
    Fit a LogisticRegression on the train sample and print its evaluation.

    Prints, in order: the train accuracy, the train confusion matrix, the
    train classification report, and the validate accuracy.

    Input X_train, y_train, X_validate, y_validate
    x (default=1) is passed to LogisticRegression as its C parameter.
    Return is the fitted logit model, save to variable.

    NOTE: in this notebook the function is called by cells that appear
    before this definition; on a fresh kernel this cell has to be run first
    (or moved up next to the other helper) for those calls to work.
    '''
    logit = LogisticRegression(C=x)
    logit.fit(X_train, y_train)

    # Each score is computed once and reused for printing.
    train_score = logit.score(X_train, y_train)
    print(f'Model train score: {train_score:.02%}')

    y_pred = logit.predict(X_train)
    print('Confusion Matrix:')
    print(confusion_matrix(y_train, y_pred))
    print('Classification Report:')
    print(classification_report(y_train, y_pred))

    validate_score = logit.score(X_validate, y_validate)
    print(f'Model validate score: {validate_score:.02%}')
    return logit