In [1]:
import os
import sys
import matplotlib.pyplot as plt  

Review Dataset, Drop Cols or Impute Missing Values,and One-Hot Encode Categorical Columns 
==========================================================================================

In [65]:
import pandas as pd
import numpy as np 

TRAIN_DS_PATH = '../data/titanic_train.csv'
TEST_DS_PATH = '../data/titanic_test.csv'

In [96]:
train_ds = pd.read_csv(TRAIN_DS_PATH)

train_ds.info()

train_ds = train_ds.drop(columns=['PassengerId', 'Ticket'])
train_ds.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


Our training dataset consists of 891 entries. We have 5 numerical and 4 categorical input/feature columns. 'PassangerId', 'Ticket', and 'Name' will not be useful for our model. The target column is going to be 'Survived'.

In [89]:
train_ds.isna().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [90]:
train_ds['Embarked'].mode()

0    S
Name: Embarked, dtype: object

Of the 11 feature columns 3 contain missing values. 'Age' has 177/891 missing values and 'Embarked' contains 2/891 missing values. We can imput the mean for the numeric column and mode in the categorical column. Since 'Cabin' is missing a majority of it's values, we can drop it entirely. 

In [91]:
train_ds.select_dtypes('object').nunique()

Name        891
Sex           2
Cabin       147
Embarked      3
dtype: int64

Sex is either 'male' or 'female' and embarked is either 'S', 'Q', or 'C'. We can one-hot encode these values. Name can be searched for special titles.

In [97]:
n = len(train_ds)

train_ds['Age'].fillna(train_ds['Age'].median(), inplace=True)
train_ds['Embarked'].fillna(train_ds['Embarked'].mode(), inplace=True)

train_ds['male'] = (train_ds['Sex'] == 'male').astype(int)
train_ds['female'] = (train_ds['Sex'] == 'female').astype(int)

train_ds['Q'] = (train_ds['Embarked'] == 'Q').astype(int)
train_ds['S'] = (train_ds['Embarked'] == 'S').astype(int)
train_ds['C'] = (train_ds['Embarked'] == 'C').astype(int)

train_ds['FamilySize'] = train_ds['SibSp'] + train_ds['Parch'] + 1
train_ds['IsAlone'] = (train_ds['FamilySize'] == 1).astype(int)

train_ds['AgeBin'] = pd.qcut(train_ds['Age'], q=4, labels=[0, 1, 2, 3]).astype(int)
train_ds['FareBin'] = pd.qcut(train_ds['Fare'], q=4, labels=[0, 1, 2, 3]).astype(int)

train_ds.drop(columns=['Sex', 'Embarked', 'Cabin', 'Name'], inplace=True)
train_ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    891 non-null    int64  
 1   Pclass      891 non-null    int64  
 2   Age         891 non-null    float64
 3   SibSp       891 non-null    int64  
 4   Parch       891 non-null    int64  
 5   Fare        891 non-null    float64
 6   male        891 non-null    int64  
 7   female      891 non-null    int64  
 8   Q           891 non-null    int64  
 9   S           891 non-null    int64  
 10  C           891 non-null    int64  
 11  FamilySize  891 non-null    int64  
 12  IsAlone     891 non-null    int64  
 13  AgeBin      891 non-null    int64  
 14  FareBin     891 non-null    int64  
dtypes: float64(2), int64(13)
memory usage: 104.5 KB


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_ds['Age'].fillna(train_ds['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_ds['Embarked'].fillna(train_ds['Embarked'].mode(), inplace=True)


Solution
====================================

In [79]:
train_ds.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,male,female,Q,S,C,FamilySize,IsAlone,AgeBin,FareBin
0,0,3,22.0,1,0,7.25,1,0,0,1,0,2,0,0,0
1,1,1,38.0,1,0,71.2833,0,1,0,0,1,2,0,3,3
2,1,3,26.0,0,0,7.925,0,1,0,1,0,1,1,1,1
3,1,1,35.0,1,0,53.1,0,1,0,1,0,2,0,2,3
4,0,3,35.0,0,0,8.05,1,0,0,1,0,1,1,2,1


Manual Solution
=============================================

In [98]:
from sklearn.metrics import log_loss

y = train_ds['Survived'].to_numpy()
X = train_ds.drop(columns='Survived').copy()
X['Intercept'] = 1
X = X.to_numpy()
m, n = X.shape

a = 0.0035
tolerance = 1e-6
beta = np.zeros(n)
m = len(y)

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

itr = 10000
for i in range(itr):
    z = X @ beta
    preds = sigmoid(z)
    err = preds - y
    gradient = (1 / m) * (X.T @ err)
    beta -= a * gradient

    clipped = np.clip(preds, 1e-15, 1 - 1e-15) # prevent integer overflow
    bce = log_loss(y, clipped)
    if i % 100 == 0:
        # print(f"Binary Cross Entrophy Loss: {bce}")
        ...

    if np.linalg.norm(gradient) < tolerance:
        print(f"Converged in {i+1} iters with gradient norm={np.linalg.norm(gradient):.2e}")
        break

print(f"Beta: {beta}")
print(f"Final Binary Cross Entrophy Loss: {bce}")

Beta: [-0.21673466 -0.03264057 -0.35487152 -0.11180154  0.00347077 -1.00983288
  1.33298021  0.16685903 -0.0852813   0.22616958 -0.14352572 -0.08787531
  0.15930362  0.58565525  0.32314734]
Final Binary Cross Entrophy Loss: 0.4461553924313609


SKLearn Solution
=============

In [None]:
from sklearn.linear_model import LogisticRegression

y = train_ds['Survived'].to_numpy()
X = train_ds.drop(columns='Survived').to_numpy()

model = LogisticRegression()
model.fit(X, y)

probs = model.predict_proba(X)[:, 1]

bce = log_loss(y, probs)
beta = model.coef_

print(f"Beta: {beta}")
print(f"Final Binary Cross Entrophy Loss: {bce}")

Beta: [[-7.19188853e-01 -7.49852259e-02 -7.18303954e-01 -5.06220806e-01
   1.16967982e-03 -6.56016604e-01  2.07104807e+00  6.05704092e-01
   1.54775536e-01  5.96319791e-01  1.90506710e-01 -1.01574168e-01
   5.01189477e-01  4.06528007e-01]]
Final Binary Cross Entrophy Loss: 0.43358194672924244


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
