In [205]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder , StandardScaler
from sklearn.model_selection import train_test_split

In [206]:
data = pd.read_csv("titanic.csv")

In [207]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [208]:
df = data.copy()

In [209]:
df.shape

(418, 12)

In [210]:
df['Sex'] = df['Sex'].replace({'male':1, 'female': 2})

In [211]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",1,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",2,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",1,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",1,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",2,22.0,1,1,3101298,12.2875,,S


In [212]:
df['Cabin'].isna().sum()

327

In [213]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Survived     418 non-null    int64  
 2   Pclass       418 non-null    int64  
 3   Name         418 non-null    object 
 4   Sex          418 non-null    int64  
 5   Age          332 non-null    float64
 6   SibSp        418 non-null    int64  
 7   Parch        418 non-null    int64  
 8   Ticket       418 non-null    object 
 9   Fare         417 non-null    float64
 10  Cabin        91 non-null     object 
 11  Embarked     418 non-null    object 
dtypes: float64(2), int64(6), object(4)
memory usage: 39.3+ KB


In [214]:
df['Cabin'].unique()

array([nan, 'B45', 'E31', 'B57 B59 B63 B66', 'B36', 'A21', 'C78', 'D34',
       'D19', 'A9', 'D15', 'C31', 'C23 C25 C27', 'F G63', 'B61', 'C53',
       'D43', 'C130', 'C132', 'C101', 'C55 C57', 'B71', 'C46', 'C116',
       'F', 'A29', 'G6', 'C6', 'C28', 'C51', 'E46', 'C54', 'C97', 'D22',
       'B10', 'F4', 'E45', 'E52', 'D30', 'B58 B60', 'E34', 'C62 C64',
       'A11', 'B11', 'C80', 'F33', 'C85', 'D37', 'C86', 'D21', 'C89',
       'F E46', 'A34', 'D', 'B26', 'C22 C26', 'B69', 'C32', 'B78',
       'F E57', 'F2', 'A18', 'C106', 'B51 B53 B55', 'D10 D12', 'E60',
       'E50', 'E39 E41', 'B52 B54 B56', 'C39', 'B24', 'D28', 'B41', 'C7',
       'D40', 'D38', 'C105'], dtype=object)

In [215]:
data['Embarked'].unique()

array(['Q', 'S', 'C'], dtype=object)

In [216]:
data['Embarked'].isna().value_counts()

False    418
Name: Embarked, dtype: int64

In [217]:
label_encoder = LabelEncoder()
df['Embarked'] = label_encoder.fit_transform(df['Embarked'])
df['Name'] = label_encoder.fit_transform(df['Name'])
df['Cabin'] = label_encoder.fit_transform(df['Cabin'])
df['Ticket'] = label_encoder.fit_transform(df['Ticket'])

In [218]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,206,1,34.5,0,0,152,7.8292,76,1
1,893,1,3,403,2,47.0,1,0,221,7.0,76,2
2,894,0,2,269,1,62.0,0,0,73,9.6875,76,1
3,895,0,3,408,1,27.0,0,0,147,8.6625,76,2
4,896,1,3,178,2,22.0,1,1,138,12.2875,76,2


In [219]:
df.fillna({
    'Age': df['Age'].mode().mean(),
    'Fare': df['Fare'].mode().mean(),
    'Cabin': df.Cabin.mode().mean()
},inplace = True)

In [220]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Survived     418 non-null    int64  
 2   Pclass       418 non-null    int64  
 3   Name         418 non-null    int32  
 4   Sex          418 non-null    int64  
 5   Age          418 non-null    float64
 6   SibSp        418 non-null    int64  
 7   Parch        418 non-null    int64  
 8   Ticket       418 non-null    int32  
 9   Fare         418 non-null    float64
 10  Cabin        418 non-null    int32  
 11  Embarked     418 non-null    int32  
dtypes: float64(2), int32(4), int64(6)
memory usage: 32.8 KB


In [221]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0
mean,1100.5,0.363636,2.26555,208.5,1.363636,28.673445,0.447368,0.392344,180.944976,35.560497,67.437799,1.401914
std,120.810458,0.481622,0.841838,120.810458,0.481622,13.020267,0.89676,0.981429,107.533763,55.857145,19.091405,0.854496
min,892.0,0.0,1.0,0.0,1.0,0.17,0.0,0.0,0.0,0.0,0.0,0.0
25%,996.25,0.0,1.0,104.25,1.0,22.5,0.0,0.0,85.25,7.8958,76.0,1.0
50%,1100.5,0.0,3.0,208.5,1.0,24.0,0.0,0.0,181.0,14.4542,76.0,2.0
75%,1204.75,1.0,3.0,312.75,2.0,35.75,1.0,0.0,279.75,31.471875,76.0,2.0
max,1309.0,1.0,3.0,417.0,2.0,76.0,8.0,9.0,362.0,512.3292,76.0,2.0


In [222]:
df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,206,1,34.5,0,0,152,7.8292,76,1
1,893,1,3,403,2,47.0,1,0,221,7.0,76,2
2,894,0,2,269,1,62.0,0,0,73,9.6875,76,1
3,895,0,3,408,1,27.0,0,0,147,8.6625,76,2
4,896,1,3,178,2,22.0,1,1,138,12.2875,76,2


In [223]:
# df.drop(['PassengerId','Name'],axis =1, inplace=True)

In [224]:
X = df.drop(['Survived'], axis =1)
y = df['Survived']

In [225]:
X

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,206,1,34.5,0,0,152,7.8292,76,1
1,893,3,403,2,47.0,1,0,221,7.0000,76,2
2,894,2,269,1,62.0,0,0,73,9.6875,76,1
3,895,3,408,1,27.0,0,0,147,8.6625,76,2
4,896,3,178,2,22.0,1,1,138,12.2875,76,2
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,353,1,22.5,0,0,267,8.0500,76,2
414,1306,1,283,2,39.0,0,0,324,108.9000,22,0
415,1307,3,332,1,38.5,0,0,346,7.2500,76,2
416,1308,3,384,1,22.5,0,0,220,8.0500,76,2


In [226]:
x_train, x_test, y_train, y_test = train_test_split(X,y, test_size=0.2,random_state=42)

In [227]:
x_train = np.array(x_train)
x_test = np.array(x_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

In [228]:
def sigmoid(z):
    z = np.clip(z, -500, 500)
    g = 1/(1 + np.exp(-z))
    return g

In [229]:
def costFunc(X,y,w,b):
    m = X.shape[0]
    cost = 0.0
    epsilon = 1e-15  # A small constant to avoid division by zero or negative values

    for i in range(m):
        z = np.dot(X[i],w)+ b
        y_hat = sigmoid(z)
        cost += -y[i] * np.log(y_hat + epsilon) - (1 - y[i] ) * np.log(1 - y_hat + epsilon)
    cost = cost/m
    
    return cost

In [230]:
# # w = np.random.rand(11)
# # w_init = [round(i, 2) for i in w]
# w_init = [0.24, 0.15, 0.15, 0.57, 0.54, 0.92, 0.09, 0.25, 0.29, 0.21, 0.03]
# b_tmp = 0
# w_tmp =np.array(w_init)
# costFunc(x_train, y_train, w_tmp, b_tmp)

In [231]:
def computeGradient(X,y,w,b):
    m,n = X.shape
    
    dj_dw = np.zeros((n,))
    dj_db = 0.0
    
    for i in range(m):
        y_hat = np.dot(X[i],w)+ b
        err = y_hat - y[i]
        for j in range(n):
            dj_dw[j] += dj_dw[j] + err * X[i,j]
        dj_db += err
        
    dj_dw = dj_dw/m
    dj_db = dj_db/m
    
    return dj_dw, dj_db

In [232]:
def gradientDescent(X,y,w,b,lr,itr):
    J_hist = []
    p_hist = []
    
    m = X.shape[0]
    
    for i in range(m):
        dj_dw , dj_db = computeGradient(X,y,w,b)
    w = w - lr * dj_dw
    b = b - lr * dj_db
    
    if i < itr:
        J_hist.append(costFunc(X,y,w,b))
        
    if i% math.ceil(itr / 10) == 0:
        print(f"Iteration {i:4d}: Cost {J_history[-1]}   ")
    return w, b , J_hist

In [233]:
b = -1
w =np.zeros_like(x_train[0])
lr = 0.001
iterations = 10000

wf, bf, cf = gradientDescent(x_train,y_train,w,b,lr,iterations)
print(f"The final w is {wf}")
print(f"The final b is: {bf:.2f}")
print(f"The final Cost is: {cf[-1]:3f}")

The final w is [1.16582241e+98 2.04298749e+95 3.33572680e+97 1.04855449e+95
 2.89226221e+96 6.04600446e+94 5.11813846e+92 1.40462270e+97
 2.28155476e+96 7.54600035e+96 1.94459300e+95]
The final b is: -1.00
The final Cost is: 22.336454
