In [10]:

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression

In [6]:
import os

In [7]:
def new_titanic_data():
    sql_query = 'SELECT * FROM passengers'
    df = pd.read_sql(sql_query, get_connection('titanic_db'))
    df.to_csv('titanic_df.csv')
    return df

In [8]:
def get_titanic_data(cached=False):
    if cached or os.path.isfile('titanic_df.csv') == False:
        df = new_titanic_data()
    else:
        df = pd.read_csv('titanic_df.csv', index_col=0)
    return df

In [20]:
titanic = get_titanic_data()

In [21]:
titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [18]:
def titanic_split(df):

    train_validate, test = train_test_split(df, test_size=.2,
                                        random_state=123,
                                        stratify=df.survived)
    train, validate = train_test_split(train_validate, test_size=.3,
                                        random_state=123,
                                        stratify=train_validate.survived)
    return train, validate, test

def impute_mean_age(train, validate, test):

    imputer = SimpleImputer(strategy = 'mean')

    train['age'] = imputer.fit_transform(train[['age']])

    validate['age'] = imputer.transform(validate[['age']])

    test['age'] = imputer.transform(test[['age']])

    return train, validate, test

def prep_titanic_data(cached=True):
#     df = get_titanic_data(cached)
    df = df[~df.embarked.isnull()]
    titanic_dummies = pd.get_dummies(df.embarked, drop_first=True)
    df = pd.concat([df, titanic_dummies], axis=1)
    df = df.drop(columns='deck')

    train, validate, test = titanic_split(df)

    train, validate, test = impute_mean_age(train, validate, test)

    return train, validate, test

In [22]:
titanic = prep_titanic_data(titanic)



UnboundLocalError: local variable 'df' referenced before assignment

In [23]:
 titanic = titanic[~titanic.embarked.isnull()]

In [35]:
titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,Q,S
0,0,0,3,male,22.0,1,0,7.25,S,Third,Southampton,0,0,1
1,1,1,1,female,38.0,1,0,71.2833,C,First,Cherbourg,0,0,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,Southampton,1,0,1
3,3,1,1,female,35.0,1,0,53.1,S,First,Southampton,0,0,1
4,4,0,3,male,35.0,0,0,8.05,S,Third,Southampton,1,0,1


In [37]:
titanic_dummies = pd.get_dummies(titanic[['sex', 'embark_town', 'class']], drop_first=True)


In [61]:
titanic_dummies = pd.get_dummies(titanic['pclass'], drop_first=True)


In [62]:
titanic = pd.concat([titanic, titanic_dummies], axis=1)


In [47]:
titanic = titanic.drop(columns=['embark_town','class'])


In [54]:
titanic = titanic.drop(columns='sex')

In [69]:
titanic = titanic.drop(columns='pclass')

In [70]:
train, validate, test = titanic_split(titanic)


In [71]:
train, validate, test = impute_mean_age(train, validate, test)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [72]:
train.shape

(497, 13)

In [73]:
validate.shape

(214, 13)

In [74]:
test.shape

(178, 13)

In [75]:
test.head()

Unnamed: 0,survived,age,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton,class_Second,class_Third,2,3
561,0,40.0,0,0,7.8958,1,1,0,1,0,1,0,1
328,1,31.0,1,1,20.525,0,0,0,1,0,1,0,1
643,1,29.916875,0,0,56.4958,1,1,0,1,0,1,0,1
498,0,25.0,1,2,151.55,0,0,0,1,0,0,0,0
875,1,15.0,0,0,7.225,1,0,0,0,0,1,0,1


In [76]:
train.survived.value_counts(normalize=True)


0    0.617706
1    0.382294
Name: survived, dtype: float64

In [77]:
logit = LogisticRegression()


In [78]:
X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [79]:
logit = logit.fit(X_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [80]:
X_train.columns


Index([                    'age',                   'sibsp',
                         'parch',                    'fare',
                         'alone',                'sex_male',
        'embark_town_Queenstown', 'embark_town_Southampton',
                  'class_Second',             'class_Third',
                               2,                         3],
      dtype='object')

In [81]:
print(logit.coef_)


[[-3.00263702e-02 -5.54052471e-01 -2.42205064e-01  1.85202548e-03
  -9.80949835e-01 -2.43119347e+00  6.92273924e-01  1.02410521e-01
  -4.37990618e-01 -1.02302377e+00 -4.37990618e-01 -1.02302377e+00]]


In [82]:
y_pred = logit.predict(X_train)
y_pred_proba = logit.predict_proba(X_train)

In [83]:
y_pred_proba

array([[0.62395644, 0.37604356],
       [0.12461106, 0.87538894],
       [0.95510622, 0.04489378],
       [0.10794509, 0.89205491],
       [0.06114476, 0.93885524],
       [0.69107827, 0.30892173],
       [0.79102839, 0.20897161],
       [0.74924724, 0.25075276],
       [0.85036343, 0.14963657],
       [0.97628274, 0.02371726],
       [0.42782999, 0.57217001],
       [0.75946327, 0.24053673],
       [0.08793068, 0.91206932],
       [0.67056584, 0.32943416],
       [0.74221537, 0.25778463],
       [0.85117886, 0.14882114],
       [0.91106935, 0.08893065],
       [0.33365248, 0.66634752],
       [0.78076496, 0.21923504],
       [0.33654412, 0.66345588],
       [0.31443044, 0.68556956],
       [0.9111106 , 0.0888894 ],
       [0.13819651, 0.86180349],
       [0.24757618, 0.75242382],
       [0.66804223, 0.33195777],
       [0.89277059, 0.10722941],
       [0.10738896, 0.89261104],
       [0.18492761, 0.81507239],
       [0.56158807, 0.43841193],
       [0.74268006, 0.25731994],
       [0.

In [84]:
logit.score(X_train, y_train)


0.8048289738430584

In [85]:
print(confusion_matrix(y_train, y_pred))


[[268  39]
 [ 58 132]]


In [86]:
print(classification_report(y_train, y_pred))


              precision    recall  f1-score   support

           0       0.82      0.87      0.85       307
           1       0.77      0.69      0.73       190

    accuracy                           0.80       497
   macro avg       0.80      0.78      0.79       497
weighted avg       0.80      0.80      0.80       497



In [87]:
titanic2 = get_titanic_data()

In [88]:
titanic2.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [93]:
titanic2.age.isnull().values.any()

True