In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm

In [2]:
# Locations of the raw CSV files, relative to the notebook.
TRAIN_CSV = '../data/train.csv'
TEST_CSV = '../data/test.csv'

# Read both frames in one step; every later cell works on these two names.
df_train, df_test = pd.read_csv(TRAIN_CSV), pd.read_csv(TEST_CSV)

### Data cleanup

In [3]:
# Median age per (Sex, Pclass) group. It is computed from the training frame
# only and reused for the test frame.
df_median_age = (
    df_train.groupby(['Sex', 'Pclass'])['Age']
    .median()
    .reset_index()
    .rename(columns={'Age': 'Median_Age'})
)


def _fill_age_from_group_median(frame, group_medians):
    """Return `frame` with missing `Age` values replaced by the group median.

    The medians are attached with a left merge on (Sex, Pclass), used to fill
    the gaps in `Age`, and the helper column is dropped again, so the result
    has the same columns as the input.
    """
    merged = frame.merge(group_medians, on=['Sex', 'Pclass'], how='left')
    merged['Age'] = merged['Age'].fillna(merged['Median_Age'])
    return merged.drop(columns='Median_Age')


df_train = _fill_age_from_group_median(df_train, df_median_age)
df_test = _fill_age_from_group_median(df_test, df_median_age)

In [4]:
def _add_cabin_features(frame):
    """Return `frame` with `Cabin` replaced by two derived columns.

    `Cabin_Type` is the first character of the cabin string and `Cabin_Nos`
    is the number of whitespace-separated entries in it. Rows with a missing
    `Cabin` get NaN in both new columns.
    """
    cabin = frame['Cabin']
    with_features = frame.assign(
        Cabin_Type=cabin.map(lambda value: value[0], na_action='ignore'),
        Cabin_Nos=cabin.map(lambda value: len(value.split()), na_action='ignore'),
    )
    return with_features.drop(columns='Cabin')


df_train = _add_cabin_features(df_train)
df_test = _add_cabin_features(df_test)

In [5]:
# One-hot encode `Sex`. The indicator columns keep the raw category values as
# their names and are appended on the right; the original column is removed.
new_df_train = pd.get_dummies(df_train['Sex'])
new_df_test = pd.get_dummies(df_test['Sex'])

df_train = pd.concat([df_train.drop(columns=['Sex']), new_df_train], axis=1)
df_test = pd.concat([df_test.drop(columns=['Sex']), new_df_test], axis=1)

In [6]:
# Passengers with no recorded port of embarkation are assigned 'S'. The column
# is then replaced by one `Embarked_<port>` indicator per port seen in the frame.
embarked_train = df_train['Embarked'].fillna('S')
embarked_test = df_test['Embarked'].fillna('S')

new_df_train = pd.get_dummies(embarked_train, prefix='Embarked')
new_df_test = pd.get_dummies(embarked_test, prefix='Embarked')

df_train = pd.concat([df_train.drop(columns=['Embarked']), new_df_train], axis=1)
df_test = pd.concat([df_test.drop(columns=['Embarked']), new_df_test], axis=1)

In [7]:
# One-hot encode the cabin deck letter as `Cabin_Type_<letter>` columns.
# Rows with a missing `Cabin_Type` get 0 in every indicator, because
# get_dummies ignores NaN by default.
new_df_train = pd.get_dummies(df_train['Cabin_Type'], prefix='Cabin_Type')
new_df_test = pd.get_dummies(df_test['Cabin_Type'], prefix='Cabin_Type')

# The two frames are encoded independently, so a deck letter that appears in
# only one of them would give the frames different indicator columns. Align
# the test indicators to the training ones: letters unseen in the test set
# become all-zero columns, and letters unseen in training are dropped.
new_df_test = new_df_test.reindex(columns=new_df_train.columns, fill_value=0)

df_train = pd.concat([df_train, new_df_train], axis=1).drop(columns=['Cabin_Type'])
df_test = pd.concat([df_test, new_df_test], axis=1).drop(columns=['Cabin_Type'])

In [8]:
# Median fare per passenger class, taken from the training frame and used to
# fill missing fares in both frames.
class_fare_dict = df_train.groupby('Pclass')['Fare'].median().to_dict()

# Look up each row's class median. A class missing from the dict raises
# KeyError rather than silently producing NaN.
train_class_median = df_train['Pclass'].map(lambda pclass: class_fare_dict[pclass])
test_class_median = df_test['Pclass'].map(lambda pclass: class_fare_dict[pclass])

df_train['Fare'] = df_train['Fare'].fillna(train_class_median)
df_test['Fare'] = df_test['Fare'].fillna(test_class_median)

In [9]:
def _one_hot_replace(frame, column):
    """Return `frame` with `column` swapped for `<column>_<value>` indicators.

    The indicator columns are appended on the right, one per distinct value
    present in `frame[column]`.
    """
    indicators = pd.get_dummies(frame[column], prefix=column)
    return pd.concat([frame.drop(columns=[column]), indicators], axis=1)


df_train = _one_hot_replace(df_train, 'Pclass')
df_test = _one_hot_replace(df_test, 'Pclass')

In [10]:
# Rows whose cabin count is missing are given a count of 1.
df_train = df_train.assign(Cabin_Nos=df_train['Cabin_Nos'].fillna(1))
df_test = df_test.assign(Cabin_Nos=df_test['Cabin_Nos'].fillna(1))

In [11]:
# Give the test frame an all-zero `Cabin_Type_T` indicator so it has the same
# cabin-type columns as the training frame.
# NOTE(review): presumably no test passenger has a 'T' cabin, so the per-frame
# encoding never creates this column - confirm against the data.
df_test = df_test.assign(Cabin_Type_T=0)

In [51]:
# Columns of the training frame that are not fed to the model, grouped by
# the reason they are left out.
_EXCLUDED_COLUMNS = (
    # identifiers, target and free-text fields
    'PassengerId', 'Survived', 'Name', 'Ticket',
    # passenger-class indicators
    'Pclass_1', 'Pclass_2', 'Pclass_3',
    # cabin-type indicators
    'Cabin_Type_A', 'Cabin_Type_B', 'Cabin_Type_C', 'Cabin_Type_D',
    'Cabin_Type_E', 'Cabin_Type_F', 'Cabin_Type_G', 'Cabin_Type_T',
    # embarkation indicators
    'Embarked_C', 'Embarked_Q', 'Embarked_S',
    # remaining unused features
    'Parch',
    'Cabin_Nos',
)

# Start from every training column, in frame order, and strike the excluded
# ones. `list.remove` raises ValueError if an expected column is missing, which
# surfaces upstream schema drift immediately.
inp_col_list = df_train.columns.tolist()
for _column in _EXCLUDED_COLUMNS:
    inp_col_list.remove(_column)

In [52]:
# Target and feature matrix for the selected input columns.
Y = df_train['Survived']
X = df_train[inp_col_list]

# Min-max scale every feature. The scaler is fitted on the full training
# frame, i.e. before the hold-out split below is made.
min_max_scaler = preprocessing.MinMaxScaler()
x = X.to_numpy()
x_scaled = min_max_scaler.fit(x).transform(x)
X_scaled = pd.DataFrame(data=x_scaled)

# Hold out 20% of the rows for evaluation; the seed is fixed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    X_scaled,
    Y,
    random_state=0,
    test_size=0.2,
)

In [53]:
# Fit a logistic regression with scikit-learn's default settings on the
# training split. `fit` returns the estimator itself, so it can be bound directly.
logreg = LogisticRegression().fit(x_train, y_train)
logreg

LogisticRegression()

In [54]:
# Evaluate on the held-out split: `score` is the classifier's accuracy on
# (x_test, y_test), and `predictions` holds the predicted class labels.
score = logreg.score(x_test, y_test)
predictions = logreg.predict(x_test)
print(score)

0.7877094972067039


In [57]:
# Fitted coefficients of the scikit-learn model, one per feature column in
# the order given by `inp_col_list`.
logreg.coef_

array([[-0.81489491, -1.9909154 ,  2.4980033 ,  1.23834088, -1.23831256]])

In [55]:
# Fit the same regression with statsmodels to get standard errors and
# p-values. No explicit constant column is added to `x_train` here.
log_reg = sm.Logit(y_train, x_train).fit()

Optimization terminated successfully.
         Current function value: 0.482049
         Iterations 6


In [56]:
# Show the statsmodels fit summary. Rows are labelled by the positional
# column index of the scaled feature frame, not by feature name.
log_reg.summary()

0,1,2,3
Dep. Variable:,Survived,No. Observations:,712.0
Model:,Logit,Df Residuals:,707.0
Method:,MLE,Df Model:,4.0
Date:,"Wed, 18 May 2022",Pseudo R-squ.:,0.2759
Time:,12:06:07,Log-Likelihood:,-343.22
converged:,True,LL-Null:,-473.99
Covariance Type:,nonrobust,LLR p-value:,2.1210000000000002e-55

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
0.0,-1.7628,0.623,-2.830,0.005,-2.984,-0.542
1.0,-3.9220,0.898,-4.365,0.000,-5.683,-2.161
2.0,6.7434,1.482,4.552,0.000,3.840,9.647
3.0,1.5712,0.282,5.564,0.000,1.018,2.125
4.0,-0.9847,0.257,-3.835,0.000,-1.488,-0.481


In [50]:
# Display the unscaled training feature frame.
# NOTE(review): the recorded output includes a `Cabin_Nos` column and carries
# an earlier execution count, so it looks like it was produced before
# `Cabin_Nos` was removed from `inp_col_list` - re-run to refresh.
X

Unnamed: 0,Age,SibSp,Fare,Cabin_Nos,female,male
0,22.0,1,7.2500,1.0,0,1
1,38.0,1,71.2833,1.0,1,0
2,26.0,0,7.9250,1.0,1,0
3,35.0,1,53.1000,1.0,1,0
4,35.0,0,8.0500,1.0,0,1
...,...,...,...,...,...,...
886,27.0,0,13.0000,1.0,0,1
887,19.0,0,30.0000,1.0,1,0
888,21.5,1,23.4500,1.0,1,0
889,26.0,0,30.0000,1.0,0,1


In [193]:
# Build the test feature matrix from the same input columns as training.
X = df_test[inp_col_list]
x = X.values

# Reuse the scaler that was already fitted on the training features. Fitting
# a fresh MinMaxScaler on the test set would rescale each column using the
# test set's own min/max, so the same raw value would be encoded differently
# than it was during training and the fitted coefficients would be applied to
# inconsistently scaled inputs. Test values outside the training range simply
# fall outside [0, 1].
x_scaled = min_max_scaler.transform(x)
X_scaled = pd.DataFrame(x_scaled)

In [194]:
# Predict a class label for every row of the scaled test feature frame,
# using the scikit-learn model fitted earlier.
predictions = logreg.predict(X_scaled)

In [195]:
# Store the predicted labels on the test frame as the `Survived` column.
df_test = df_test.assign(Survived=predictions)

In [196]:
# Write the submission file: only the id and the predicted label, without
# the frame's index.
df_test.to_csv(
    '../submissions/logistic_regression_submission.csv',
    columns=['PassengerId', 'Survived'],
    index=False,
)