In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

from acquire import get_titanic_data
from prepare import prep_titanic

In this exercise, we'll continue working with the titanic dataset and building logistic regression models. Throughout this exercise, be sure you are training, evaluation, and comparing models on the train and validate datasets. The test dataset should only be used for your final model.

For all of the models you create, choose a threshold that optimizes for accuracy.

Do your work for these exercises in either a notebook or a python script named model within your classification directory.



9. Bonus Bonus: how does scaling the data interact with your choice of C?

In [2]:
df = get_titanic_data()
scaler, encoder, train, test = prep_titanic(df)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [3]:
train.shape, test.shape

((712, 14), (179, 14))

In [4]:
train, validate = train_test_split(train, train_size=.75, random_state=123)

In [5]:
train.shape, test.shape, validate.shape

((534, 14), (179, 14), (178, 14))

In [6]:
train.head(1)

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,alone,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton,age_not_scaled,fare_not_scaled
444,444,1,3,male,0.369602,0,0,0.015835,1,0.0,0.0,1.0,29.832908,8.1125


1. Create another model that includes age in addition to fare and pclass. Does this model perform better than your previous one?


In [7]:
X_train = train[['age_not_scaled', 'fare_not_scaled', 'pclass']]
y_train = train.survived

In [8]:
pred = pd.DataFrame({'actual': y_train})
pred.head()

Unnamed: 0,actual
444,1
30,0
137,0
692,1
190,1


In [9]:
X_train = train[['fare_not_scaled', 'pclass']]
y_train = train.survived
model = LogisticRegression(random_state=123)
model.fit(X_train, y_train)
pred['pred_mini'] = model.predict(X_train)
pred.head()



Unnamed: 0,actual,pred_mini
444,1,0
30,0,1
137,0,1
692,1,0
190,1,0


In [10]:
X_train = train[['age_not_scaled', 'fare_not_scaled', 'pclass']]
y_train = train.survived
model = LogisticRegression(random_state=123)
model.fit(X_train, y_train)
pred['pred_1'] = model.predict(X_train)
pred.head()



Unnamed: 0,actual,pred_mini,pred_1
444,1,0,0
30,0,1,1
137,0,1,1
692,1,0,0
190,1,0,0


In [11]:
accuracy_score(pred.actual, pred.pred_1), precision_score(pred.actual, pred.pred_1), recall_score(pred.actual, pred.pred_1)

(0.6985018726591761, 0.6917293233082706, 0.4339622641509434)

In [12]:
accuracy_score(pred.actual, pred.pred_mini), precision_score(pred.actual, pred.pred_mini), recall_score(pred.actual, pred.pred_mini)

(0.6797752808988764, 0.6496350364963503, 0.419811320754717)

2. Include sex in your model as well. Note that you'll need to encode this feature before including it in a model.

In [13]:
train['encode_sex'] = LabelEncoder().fit_transform(train.sex)
validate['encode_sex'] = LabelEncoder().fit_transform(validate.sex)
test['encode_sex'] = LabelEncoder().fit_transform(test.sex)


In [14]:
X_train = train[['age_not_scaled', 'fare_not_scaled', 'pclass', 'encode_sex']]
y_train = train.survived
model = LogisticRegression(random_state=123)
model.fit(X_train, y_train)
pred['pred_2'] = model.predict(X_train)
pred.head()



Unnamed: 0,actual,pred_mini,pred_1,pred_2
444,1,0,0,0
30,0,1,1,0
137,0,1,1,0
692,1,0,0,0
190,1,0,0,1


In [15]:
print('Age, Sex, pclass, Fare')
print(f'accuracy: {accuracy_score(pred.actual, pred.pred_2)}')
print(f'precision: {precision_score(pred.actual, pred.pred_2)}')
print(f'recall: {recall_score(pred.actual, pred.pred_2)}')

Age, Sex, pclass, Fare
accuracy: 0.7696629213483146
precision: 0.7236180904522613
recall: 0.6792452830188679


3. Try out other combinations of features and models.

In [16]:
X_train = train[['age_not_scaled', 'fare', 'encode_sex', 'pclass']]
y_train = train.survived
model = LogisticRegression(random_state=123)
model.fit(X_train, y_train)
pred['pred_3'] = model.predict(X_train)
pred.head()



Unnamed: 0,actual,pred_mini,pred_1,pred_2,pred_3
444,1,0,0,0,0
30,0,1,1,0,0
137,0,1,1,0,0
692,1,0,0,0,0
190,1,0,0,1,1


In [17]:
print('Age, Sex, pclass, Fare scaled')
print(f'accuracy: {accuracy_score(pred.actual, pred.pred_3)}')
print(f'precision: {precision_score(pred.actual, pred.pred_3)}')
print(f'recall: {recall_score(pred.actual, pred.pred_3)}')

Age, Sex, pclass, Fare scaled
accuracy: 0.7771535580524345
precision: 0.7409326424870466
recall: 0.6745283018867925


In [18]:
X_train = train[['age_not_scaled', 'alone', 'encode_sex', 'pclass']]
y_train = train.survived
model = LogisticRegression(random_state=123)
model.fit(X_train, y_train)
pred['pred_4'] = model.predict(X_train)
pred.head()



Unnamed: 0,actual,pred_mini,pred_1,pred_2,pred_3,pred_4
444,1,0,0,0,0,0
30,0,1,1,0,0,0
137,0,1,1,0,0,0
692,1,0,0,0,0,0
190,1,0,0,1,1,1


In [19]:
print('Age, Sex, pclass, Is alone')
print(f'accuracy: {accuracy_score(pred.actual, pred.pred_4)}')
print(f'precision: {precision_score(pred.actual, pred.pred_4)}')
print(f'recall: {recall_score(pred.actual, pred.pred_4)}')

Age, Sex, pclass, Is alone
accuracy: 0.7790262172284644
precision: 0.7447916666666666
recall: 0.6745283018867925


4. Choose you best model and evaluate it on the test dataset. Is it overfit?

In [20]:
X_train = validate[['age_not_scaled', 'alone', 'encode_sex', 'pclass']]
y_train = validate.survived
model = LogisticRegression(random_state=123)
model.fit(X_train, y_train)
validate['validate_4'] = model.predict(X_train)



In [21]:
print('Validated:  Age, Sex, pclass, Is alone')
print(f'accuracy: {accuracy_score(validate.survived, validate.validate_4)}')
print(f'precision: {precision_score(validate.survived, validate.validate_4)}')
print(f'recall: {recall_score(validate.survived, validate.validate_4)}')

Validated:  Age, Sex, pclass, Is alone
accuracy: 0.8202247191011236
precision: 0.7796610169491526
recall: 0.7076923076923077


5. Bonus How do different strategies for handling the missing values in the age column affect model performance?

In [22]:
df = get_titanic_data()

In [23]:
train1, test = train_test_split(df, random_state=123, train_size=.8)
train1, validate = train_test_split(train1, random_state=123, train_size=.8)
train1.shape, test.shape, validate.shape

((569, 13), (179, 13), (143, 13))

In [24]:
train1.drop(columns={'deck'}, inplace=True)
train_drop = train1.dropna()

In [25]:
train_drop['encode_sex'] = LabelEncoder().fit_transform(train_drop.sex)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [26]:
X_train = train_drop[['age', 'alone', 'encode_sex', 'pclass']]
y_train = train_drop.survived
model = LogisticRegression(random_state=123)
model.fit(X_train, y_train)
train_drop['predict_drop'] = model.predict(X_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [27]:
print('Dropped Values:  Age, Sex, pclass, Is alone')
print(f'accuracy: {accuracy_score(train_drop.survived, train_drop.predict_drop)}')
print(f'precision: {precision_score(train_drop.survived, train_drop.predict_drop)}')
print(f'recall: {recall_score(train_drop.survived, train_drop.predict_drop)}')

Dropped Values:  Age, Sex, pclass, Is alone
accuracy: 0.7871396895787139
precision: 0.7784431137724551
recall: 0.6878306878306878


In [28]:
train.age.value_counts()

0.369602    110
0.371701     18
0.271174     18
0.346569     17
0.233476     16
           ... 
0.566474      1
0.723549      1
0.428248      1
0.811510      1
1.000000      1
Name: age, Length: 79, dtype: int64

In [29]:
train.age.median(), train.age.mean()

(0.36960175674061874, 0.37141810653298873)

In [30]:
train.age.mode()

0    0.369602
dtype: float64

6. Bonus: How do different strategies for encoding sex affect model performance?

7. Bonus: scikit-learn's LogisticRegression classifier is actually applying a regularization penalty to the coefficients by default. This penalty causes the magnitude of the coefficients in the resulting model to be smaller than they otherwise would be. This value can be modified with the C hyper parameter. Small values of C correspond to a larger penalty, and large values of C correspond to a smaller penalty. Try out the following values for C and note how the coefficients and the model's performance on both the dataset it was trained on and on the validate split are affected.

> C = .01, .1, 1, 10, 100, 1000

In [31]:
X_train = train[['age_not_scaled', 'alone', 'encode_sex', 'pclass']]
y_train = train.survived
model = LogisticRegression(random_state=123, C=.01)
model.fit(X_train, y_train)
pred['pred_4_01'] = model.predict(X_train)
pred.head()



Unnamed: 0,actual,pred_mini,pred_1,pred_2,pred_3,pred_4,pred_4_01
444,1,0,0,0,0,0,0
30,0,1,1,0,0,0,0
137,0,1,1,0,0,0,0
692,1,0,0,0,0,0,0
190,1,0,0,1,1,1,0


In [32]:
print('C=.01 : Age, Sex, pclass, Is alone')
print(f'accuracy: {accuracy_score(pred.actual, pred.pred_4_01)}')
print(f'precision: {precision_score(pred.actual, pred.pred_4_01)}')
print(f'recall: {recall_score(pred.actual, pred.pred_4_01)}')

C=.01 : Age, Sex, pclass, Is alone
accuracy: 0.702247191011236
precision: 0.9818181818181818
recall: 0.25471698113207547


In [36]:
X_train = train[['age_not_scaled', 'alone', 'encode_sex', 'pclass']]
y_train = train.survived
model = LogisticRegression(random_state=123, C=.1)
model.fit(X_train, y_train)
pred['pred_4_dot1'] = model.predict(X_train)
pred.head()



Unnamed: 0,actual,pred_mini,pred_1,pred_2,pred_3,pred_4,pred_4_01,pred_4_.1,pred_4_dot1
444,1,0,0,0,0,0,0,0,0
30,0,1,1,0,0,0,0,0,0
137,0,1,1,0,0,0,0,0,0
692,1,0,0,0,0,0,0,0,0
190,1,0,0,1,1,1,0,1,1


In [37]:
print('C=.1 : Age, Sex, pclass, Is alone')
print(f'accuracy: {accuracy_score(pred.actual, pred.pred_4_dot1)}')
print(f'precision: {precision_score(pred.actual, pred.pred_4_dot1)}')
print(f'recall: {recall_score(pred.actual, pred.pred_4_dot1)}')

C=.1 : Age, Sex, pclass, Is alone
accuracy: 0.7715355805243446
precision: 0.78125
recall: 0.589622641509434


In [39]:
X_train = train[['age_not_scaled', 'alone', 'encode_sex', 'pclass']]
y_train = train.survived
model = LogisticRegression(random_state=123, C=10)
model.fit(X_train, y_train)
pred['pred_4_10'] = model.predict(X_train)
pred.head()



Unnamed: 0,actual,pred_mini,pred_1,pred_2,pred_3,pred_4,pred_4_01,pred_4_.1,pred_4_dot1,pred_4_1,pred_4_10
444,1,0,0,0,0,0,0,0,0,0,0
30,0,1,1,0,0,0,0,0,0,0,0
137,0,1,1,0,0,0,0,0,0,0,0
692,1,0,0,0,0,0,0,0,0,0,0
190,1,0,0,1,1,1,0,1,1,1,1


In [41]:
print('Age, Sex, pclass, Is alone')
print(f'accuracy: {accuracy_score(pred.actual, pred.pred_4)}')
print(f'precision: {precision_score(pred.actual, pred.pred_4)}')
print(f'recall: {recall_score(pred.actual, pred.pred_4)}')

Age, Sex, pclass, Is alone
accuracy: 0.7790262172284644
precision: 0.7447916666666666
recall: 0.6745283018867925


In [40]:
print('C=10 : Age, Sex, pclass, Is alone')
print(f'accuracy: {accuracy_score(pred.actual, pred.pred_4_10)}')
print(f'precision: {precision_score(pred.actual, pred.pred_4_10)}')
print(f'recall: {recall_score(pred.actual, pred.pred_4_10)}')

C=10 : Age, Sex, pclass, Is alone
accuracy: 0.7790262172284644
precision: 0.735
recall: 0.6933962264150944


In [43]:
X_train = train[['age_not_scaled', 'alone', 'encode_sex', 'pclass']]
y_train = train.survived
model = LogisticRegression(random_state=123, C=100)
model.fit(X_train, y_train)
pred['pred_4_100'] = model.predict(X_train)



In [44]:
print('C=100 : Age, Sex, pclass, Is alone')
print(f'accuracy: {accuracy_score(pred.actual, pred.pred_4_100)}')
print(f'precision: {precision_score(pred.actual, pred.pred_4_100)}')
print(f'recall: {recall_score(pred.actual, pred.pred_4_100)}')

C=100 : Age, Sex, pclass, Is alone
accuracy: 0.7846441947565543
precision: 0.7412935323383084
recall: 0.7028301886792453


In [45]:
X_train = train[['age_not_scaled', 'alone', 'encode_sex', 'pclass']]
y_train = train.survived
model = LogisticRegression(random_state=123, C=1000)
model.fit(X_train, y_train)
pred['pred_4_1000'] = model.predict(X_train)



In [47]:
print('C=1000 : Age, Sex, pclass, Is alone')
print(f'accuracy: {accuracy_score(pred.actual, pred.pred_4_1000)}')
print(f'precision: {precision_score(pred.actual, pred.pred_4_1000)}')
print(f'recall: {recall_score(pred.actual, pred.pred_4_1000)}')

C=1000 : Age, Sex, pclass, Is alone
accuracy: 0.7846441947565543
precision: 0.7412935323383084
recall: 0.7028301886792453


In [48]:
X_train = train[['age_not_scaled', 'alone', 'encode_sex', 'pclass']]
y_train = train.survived
model = LogisticRegression(random_state=123, C=.000001)
model.fit(X_train, y_train)
pred['pred_4_000001'] = model.predict(X_train)




In [49]:
print('C=.00001 : Age, Sex, pclass, Is alone')
print(f'accuracy: {accuracy_score(pred.actual, pred.pred_4_000001)}')
print(f'precision: {precision_score(pred.actual, pred.pred_4_000001)}')
print(f'recall: {recall_score(pred.actual, pred.pred_4_000001)}')

C=.00001 : Age, Sex, pclass, Is alone
accuracy: 0.602996254681648
precision: 0.0
recall: 0.0


  'precision', 'predicted', average, warn_for)
