# Prepare data

Import data and create train/test set

In [3]:
import pandas as pd
import numpy as np
# import Titanic data
df = pd.read_csv("../data/titanicDataSet.csv")

# drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# choose only columns we may want to use in Analysis
df = df[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare','Boarded']]

# choose only columns where data is complete for all features
df = df[(df['Pclass'].notnull()) & (df['Age'].notnull()) & (df['SibSp'].notnull()) & (df['Parch'].notnull()) & (df['Fare'].notnull()) & (df['Boarded'].notnull()) & (df['Sex'].notnull())]

In [4]:
# make train/test Set
df_train = df[df['Survived'].notnull()]
df_test = df[df['Survived'].isnull()]

# set features and target
X_train = df_train.drop('Survived', axis=1)
y_train = df_train['Survived']
X_test = df_test.drop('Survived', axis=1)
y_test = df_test['Survived']
feature_column_names = X_train.columns

# get indices for train/test sets
index_values_train = X_train.index
index_values_test = X_test.index


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [5]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder

# separate categorical and numeric data to encode categorical data
categorical_X_train = df_train[['Sex', 'Boarded']]
numeric_X_train = df_train[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']]
categorical_X_test = df_test[['Sex', 'Boarded']]
numeric_X_test = df_test[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']]

# encode categorical data
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit(categorical_X_train)
cat_encoded_X_train = ordinal_encoder.transform(categorical_X_train)
cat_encoded_X_test = ordinal_encoder.transform(categorical_X_test)

# make categorical dataframes to join with numeric dataframes
cat_encoded_X_train = pd.DataFrame(data = cat_encoded_X_train, index = index_values_train, columns = ['Sex', 'Boarded'])
cat_encoded_X_test = pd.DataFrame(data = cat_encoded_X_test, index = index_values_test, columns = ['Sex', 'Boarded'])

# join categorical encoded data with numeric data
encoded_X_train = numeric_X_train.join(cat_encoded_X_train)
encoded_X_test = numeric_X_test.join(cat_encoded_X_test)

# encode train target values
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)

# scale X data
from sklearn.preprocessing import MinMaxScaler
X_scaler = MinMaxScaler().fit(encoded_X_train)
X_train_scaled = X_scaler.transform(encoded_X_train)
X_test_scaled = X_scaler.transform(encoded_X_test)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Optimize the model features

Find the features that affect the model output

In [6]:
# RFE Selection
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
model = LogisticRegression()
model.fit(X_train_scaled, encoded_y_train)
selector = RFE(model, n_features_to_select=7, step=1)
selector = selector.fit(X_train_scaled, y_train)
ss = selector.support_
selectedFeatures = list(feature_column_names[ss])

# make X of only important parameters
X_train = X_train[selectedFeatures]
X_test = X_test[selectedFeatures]

# rescale to X of only important parameters
X_scaler = MinMaxScaler().fit(encoded_X_train)
X_train_scaled = X_scaler.transform(encoded_X_train)
X_test_scaled = X_scaler.transform(encoded_X_test)

# Train the Model

Score the model without hyperparameter tuning

In [7]:
# fit model
model = LogisticRegression()
model.fit(X_train_scaled, encoded_y_train)

# model accuracy
print('Train Acc: %.3f' % model.score(X_train_scaled, encoded_y_train))

Train Acc: 0.791


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [8]:
# create the GridSearchCV model from LR model
from sklearn.model_selection import GridSearchCV
parameters = {
                'penalty':['l1', 'l2', 'elasticnet', 'none'],
                'fit_intercept':[True, False],
                'C':[1, 5, 10],
                'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
            }
# train the model with GridSearch
grid = GridSearchCV(model, parameters)
grid.fit(X_train_scaled, encoded_y_train)

Traceback (most recent call last):
  File "C:\Users\Owen\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Owen\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Owen\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "C:\Users\Owen\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Owen\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\U

GridSearchCV(estimator=LogisticRegression(),
             param_grid={'C': [1, 5, 10], 'fit_intercept': [True, False],
                         'penalty': ['l1', 'l2', 'elasticnet', 'none'],
                         'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag',
                                    'saga']})

# Train the Model

Score the model with hyperparameter tuning

In [9]:
# get parameters to optimize LR model
bestParamsDict = grid.best_params_
bestModel = LogisticRegression(penalty=bestParamsDict['penalty'], fit_intercept=bestParamsDict['fit_intercept'], C=bestParamsDict['C'], solver=bestParamsDict['solver'])
bestModel.fit(X_train_scaled, encoded_y_train)

# Model Accuracy
print('Train Acc: %.3f' % bestModel.score(X_train_scaled, encoded_y_train))

# Calculate classification report
from sklearn.metrics import classification_report
print(classification_report(y_true = encoded_y_train, y_pred = bestModel.predict(X_train_scaled), target_names=['Survived', 'Deceased']))

Train Acc: 0.801
              precision    recall  f1-score   support

    Survived       0.82      0.86      0.84       424
    Deceased       0.78      0.72      0.75       290

    accuracy                           0.80       714
   macro avg       0.80      0.79      0.79       714
weighted avg       0.80      0.80      0.80       714



# Predict output for test data

In [10]:
# make dataframes with indices associated with numeric data
encoded_X_train = pd.DataFrame(data = X_train_scaled, index = index_values_train, columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare','Boarded'])
encoded_X_test = pd.DataFrame(data = X_test_scaled, index = index_values_test, columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare','Boarded'])

# combine test and train set to predict all values
encoded_X = encoded_X_train.append(encoded_X_test)
encoded_X = encoded_X.sort_index(axis=0)
y_pred = bestModel.predict(encoded_X)

In [11]:
# add predicted column to original data
df['predicted_survival'] = y_pred

In [12]:
# save results to csv
df.to_csv('lr_results.csv')