In [299]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [300]:
passengers = pd.read_csv('../datasets/titanic_dataset.csv')

In [301]:
passengers.head()
passengers = passengers.drop(['Cabin'], axis=1)

In [302]:
median = passengers.Age.median()
passengers.Age = passengers.Age.fillna(median)

In [303]:
mode = passengers.Embarked.mode()
passengers.Embarked = passengers.Embarked.fillna(mode)

In [304]:
passengers = passengers.drop(['PassengerId','Name', 'Ticket'], axis=1)

In [305]:
passengers.Sex = passengers.Sex.map({"male":0, "female":1})

In [306]:
passengers = passengers.drop_duplicates()

In [307]:
dummy_columns = ["Pclass", "SibSp", "Parch", "Embarked"]
passengers["Pclass"] = passengers["Pclass"].astype(object)
passengers["SibSp"] = passengers["SibSp"].astype(object)
passengers["Parch"] = passengers["Parch"].astype(object)

In [308]:
dummies = pd.get_dummies(passengers[dummy_columns], drop_first = True, dtype = int)
passengers = pd.concat([passengers, dummies], axis = 1)

In [309]:
passengers = passengers.drop(dummy_columns, axis = 1)

In [310]:
X = passengers.drop(['Survived'], axis = 1)
y = passengers.Survived

In [311]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 100)

In [312]:
print('Train X Shape: ', X_train.shape)
print('Test X Shape: ', X_test.shape)
print('Train y Shape: ', y_train.shape)
print('Test y Shape: ', y_test.shape)

Train X Shape:  (620, 19)
Test X Shape:  (155, 19)
Train y Shape:  (620,)
Test y Shape:  (155,)


In [313]:
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [314]:
X_train_scaled = pd.DataFrame(X_train_scaled, columns = X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns = X_test.columns)

In [315]:
print('Train X Shape: ', X_train.shape)
print('Test X Shape: ', X_test.shape)
print('Train y Shape: ', y_train.shape)
print('Test y Shape: ', y_test.shape)

Train X Shape:  (620, 19)
Test X Shape:  (155, 19)
Train y Shape:  (620,)
Test y Shape:  (155,)


In [316]:
estimator = LogisticRegression()
selector = RFE(estimator, n_features_to_select=8)
selector.fit(X_train_scaled, y_train)
selected_cols = X_train_scaled.columns[selector.support_]

In [317]:
X_train = X_train_scaled[selected_cols]
X_test = X_test_scaled[selected_cols]

In [318]:
print('Train X Shape: ', X_train.shape)
print('Test X Shape: ', X_test.shape)
print('Train y Shape: ', y_train.shape)
print('Test y Shape: ', y_test.shape)

Train X Shape:  (620, 8)
Test X Shape:  (155, 8)
Train y Shape:  (620,)
Test y Shape:  (155,)


In [319]:
X_train_sm = sm.add_constant(X_train, has_constant="add")
X_test_sm = sm.add_constant(X_test, has_constant="add")

model1 = sm.GLM(np.array(y_train),X_train_sm,family=sm.families.Binomial())
result = model1.fit()
result.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,620.0
Model:,GLM,Df Residuals:,611.0
Model Family:,Binomial,Df Model:,8.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-283.84
Date:,"Sun, 01 Sep 2024",Deviance:,567.69
Time:,22:43:51,Pearson chi2:,650.0
No. Iterations:,21,Pseudo R-squ. (CS):,0.3514
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,1.4294,0.400,3.571,0.000,0.645,2.214
Sex,2.5259,0.223,11.319,0.000,2.089,2.963
Age,-3.7285,0.727,-5.130,0.000,-5.153,-2.304
Pclass_2,-1.0576,0.303,-3.489,0.000,-1.652,-0.463
Pclass_3,-2.2601,0.289,-7.832,0.000,-2.826,-1.695
SibSp_3,-2.1711,0.960,-2.261,0.024,-4.054,-0.289
SibSp_4,-1.7191,0.750,-2.292,0.022,-3.189,-0.249
SibSp_5,-22.0974,1.95e+04,-0.001,0.999,-3.82e+04,3.81e+04
Parch_5,-21.8008,2.41e+04,-0.001,0.999,-4.73e+04,4.72e+04


In [320]:
X_train_sm = X_train_sm.drop(['Parch_5'], axis = 1)
X_test_sm = X_test_sm.drop(['Parch_5'], axis = 1)

model2 = sm.GLM(np.array(y_train),X_train_sm,family=sm.families.Binomial())
result = model2.fit()
result.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,620.0
Model:,GLM,Df Residuals:,612.0
Model Family:,Binomial,Df Model:,7.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-285.17
Date:,"Sun, 01 Sep 2024",Deviance:,570.33
Time:,22:43:51,Pearson chi2:,657.0
No. Iterations:,20,Pseudo R-squ. (CS):,0.3486
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,1.4744,0.400,3.687,0.000,0.691,2.258
Sex,2.5073,0.222,11.273,0.000,2.071,2.943
Age,-3.8134,0.727,-5.249,0.000,-5.237,-2.389
Pclass_2,-1.0654,0.303,-3.516,0.000,-1.659,-0.471
Pclass_3,-2.2938,0.288,-7.960,0.000,-2.859,-1.729
SibSp_3,-2.1687,0.960,-2.259,0.024,-4.050,-0.287
SibSp_4,-1.7095,0.749,-2.282,0.022,-3.178,-0.241
SibSp_5,-21.0853,1.18e+04,-0.002,0.999,-2.32e+04,2.32e+04


In [321]:
X_train_sm = X_train_sm.drop(['SibSp_5'], axis = 1)
X_test_sm = X_test_sm.drop(['SibSp_5'], axis = 1)

model3 = sm.GLM(np.array(y_train),X_train_sm,family=sm.families.Binomial())
result = model3.fit()
result.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,620.0
Model:,GLM,Df Residuals:,613.0
Model Family:,Binomial,Df Model:,6.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-287.44
Date:,"Sun, 01 Sep 2024",Deviance:,574.88
Time:,22:43:51,Pearson chi2:,661.0
No. Iterations:,5,Pseudo R-squ. (CS):,0.3439
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,1.4062,0.395,3.557,0.000,0.631,2.181
Sex,2.5076,0.222,11.317,0.000,2.073,2.942
Age,-3.6727,0.716,-5.126,0.000,-5.077,-2.269
Pclass_2,-1.0485,0.302,-3.471,0.001,-1.641,-0.456
Pclass_3,-2.3080,0.288,-8.012,0.000,-2.873,-1.743
SibSp_3,-2.1217,0.960,-2.209,0.027,-4.004,-0.239
SibSp_4,-1.6395,0.748,-2.193,0.028,-3.105,-0.174


In [322]:
vif = pd.DataFrame()
vif["Feature"] = X_train_sm.columns
vif['VIF'] = [variance_inflation_factor(X_train_sm, i) for i in range(X_train_sm.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by= 'VIF', ascending=False)
vif

Unnamed: 0,Feature,VIF
0,const,14.74
4,Pclass_3,1.7
3,Pclass_2,1.5
2,Age,1.28
6,SibSp_4,1.09
1,Sex,1.03
5,SibSp_3,1.02


In [325]:
y_pred_train = result.predict(X_train_sm)
y_pred_test = result.predict(X_test_sm)

In [329]:
y_pred_train
train = pd.DataFrame(y_pred_train, columns=["Predicted_prob"])
train["Actual"] = y_train
train["Predicted_Class"] = np.where(train["Predicted_prob"] > 0.5, 1.0)
# train = train.dropna()

ValueError: either both or neither of x and y should be given