In [12]:
# Download the Kaggle "titanic" competition archive into the current directory (-p .).
# Per the cell output, the CLI skips the download when a more recently modified
# local copy of titanic.zip exists unless --force is passed.
# NOTE(review): presumably requires the kaggle CLI to be installed with API
# credentials configured — confirm for a fresh environment.
!kaggle competitions download -c titanic -p .

titanic.zip: Skipping, found more recently modified local copy (use --force to force download)


In [13]:
import numpy as np
import pandas as pd
import zipfile


In [14]:
# Unpack every member of the downloaded archive into the working directory.
with zipfile.ZipFile("titanic.zip", mode="r") as archive:
    archive.extractall(path=".")


In [115]:
# Load the two CSV files into DataFrames: `train` from train.csv and `test`
# from test.csv.
train = pd.read_csv(filepath_or_buffer="train.csv")
test = pd.read_csv(filepath_or_buffer="test.csv")

In [116]:
def _title_from_name(full_name):
    """Return the title embedded in a passenger name.

    Takes the text before the first '.', then the piece that follows the
    first ',' in it, with surrounding whitespace removed
    (e.g. "Braund, Mr. Owen Harris" -> "Mr").
    """
    before_period = full_name.split('.')[0]
    return before_period.split(',')[1].strip()


# Derive a Title column from Name in both splits, then show the train counts.
train['Title'] = train['Name'].apply(_title_from_name)
test['Title'] = test['Name'].apply(_title_from_name)
train['Title'].value_counts()

Title
Mr              517
Miss            182
Mrs             125
Master           40
Dr                7
Rev               6
Mlle              2
Major             2
Col               2
the Countess      1
Capt              1
Ms                1
Sir               1
Lady              1
Mme               1
Don               1
Jonkheer          1
Name: count, dtype: int64

In [117]:
# Titles kept as their own category; everything else is pooled together.
allowed_titles = ['Mr', 'Miss', 'Master', 'Mrs']


def _bucket_title(title):
    """Keep an allowed title unchanged; map any other value to 'Misc'."""
    if title in allowed_titles:
        return title
    return 'Misc'


# Apply the same bucketing to both splits.
for split in (train, test):
    split['Title'] = split['Title'].apply(_bucket_title)

# Show the resulting category counts for train, then test.
for split in (train, test):
    print(split['Title'].value_counts())

Title
Mr        517
Miss      182
Mrs       125
Master     40
Misc       27
Name: count, dtype: int64
Title
Mr        240
Miss       78
Mrs        72
Master     21
Misc        7
Name: count, dtype: int64


In [118]:
# Columns that are not used as model features.
dropped_columns = ['PassengerId', 'Ticket', 'Name', 'Cabin', 'SibSp', 'Parch']
train = train.drop(columns=dropped_columns)
test = test.drop(columns=dropped_columns)

# Encode sex as a binary indicator (male -> 0, female -> 1).
sex_codes = {"male": 0, "female": 1}
train["Sex"] = train["Sex"].map(sex_codes)
test["Sex"] = test["Sex"].map(sex_codes)

# Every imputation statistic is computed on the training split only and then
# applied to both splits, so train and test go through the same transform
# (previously Embarked and Age in `test` were filled from test-set statistics
# while Fare was filled from the train median).
embarked_mode = train["Embarked"].mode()[0]
fare_median = train['Fare'].median()
age_median = train['Age'].median()
title_age_median = train.groupby('Title')['Age'].median()

train["Embarked"] = train["Embarked"].fillna(embarked_mode)
test["Embarked"] = test["Embarked"].fillna(embarked_mode)

train['Fare'] = train['Fare'].fillna(fare_median)
test['Fare'] = test['Fare'].fillna(fare_median)

# A missing Age takes the median age of training passengers with the same
# Title. The overall training median is the fallback when a Title has no
# usable median (absent from train, or all of its ages missing).
train['Age'] = train['Age'].fillna(train['Title'].map(title_age_median)).fillna(age_median)
test['Age'] = test['Age'].fillna(test['Title'].map(title_age_median)).fillna(age_median)


In [119]:
# Replace Age and Fare with log(1 + x) in both splits.
numeric_cols = ['Age','Fare']
for split in (train, test):
    split[numeric_cols] = np.log1p(split[numeric_cols])



In [120]:
# One-hot encode whatever get_dummies treats as categorical and cast every
# column to float.
train = pd.get_dummies(train).astype(float)
test = pd.get_dummies(test).astype(float)

# Separate the features from the Survived target.
feature_frame = train.drop(columns=['Survived'])

# Give the test frame exactly the training feature columns, in the same order;
# a column present only in train is created in test and filled with 0.
feature_frame, test_frame = feature_frame.align(test, join='left', axis=1, fill_value=0)

# Hand plain NumPy arrays to the model code.
X_train = feature_frame.values
y_train = train['Survived'].values
X_test = test_frame.values


In [121]:
def sigmoid(z):
    """Numerically stable logistic function, 1 / (1 + exp(-z)).

    Parameters
    ----------
    z : scalar or array-like
        Input value(s); converted to a float array internally.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Element-wise sigmoid of ``z`` with the same shape as the input
        (a NumPy scalar for scalar input).
    """
    z = np.asarray(z, dtype=float)
    # exp(-|z|) is always in (0, 1], so it can never overflow, unlike exp(-z)
    # for large negative z.
    decay = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)) — the same value
    # written so that only the non-overflowing exponential is needed.
    result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () unwraps a 0-d result to a scalar and is a no-op otherwise.
    return result[()]

def fit(X,y, learning_rate=0.01,epochs=1000):
    """Fit logistic-regression parameters with full-batch gradient descent.

    Parameters
    ----------
    X : array-like of shape (m, n)
        Feature matrix, one row per sample.
    y : array-like with m elements
        Target values; flattened to one dimension before use.
    learning_rate : float, default 0.01
        Step size applied to each gradient update.
    epochs : int, default 1000
        Number of gradient-descent iterations over the full data set.

    Returns
    -------
    weights : numpy.ndarray of shape (n,)
        Learned coefficient for each feature column.
    bias : float
        Learned intercept.

    Raises
    ------
    ValueError
        If ``X`` is not two-dimensional, has no rows, or ``y`` does not hold
        exactly one value per row of ``X``.
    """
    X = np.asarray(X, dtype=float)
    # Flatten the labels: a column vector of shape (m, 1) would otherwise
    # broadcast against the (m,) predictions into an (m, m) matrix and
    # silently yield wrong gradients.
    y = np.asarray(y, dtype=float).ravel()

    m, n = X.shape
    if m == 0:
        raise ValueError("X must contain at least one sample")
    if y.shape[0] != m:
        raise ValueError(
            f"X has {m} samples but y has {y.shape[0]} labels"
        )

    weights = np.zeros(n)
    bias = 0.0
    inv_m = 1 / m
    for _ in range(epochs):
        predictions = sigmoid(np.dot(X, weights) + bias)
        # The residual drives both gradients, so compute it once per epoch.
        residual = predictions - y
        dw = inv_m * np.dot(X.T, residual)
        db = inv_m * np.sum(residual)
        weights -= learning_rate * dw
        bias -= learning_rate * db
    return weights,bias

def predict(X,weights,bias):
    """Return a 0/1 class label for each row of ``X``.

    The linear score ``X . weights + bias`` is passed through ``sigmoid``;
    a resulting probability of at least 0.5 is labelled 1, otherwise 0.
    """
    probabilities = sigmoid(np.dot(X, weights) + bias)
    is_positive = probabilities >= 0.5
    return is_positive.astype(int)


In [122]:
# Run 500,000 gradient-descent epochs at fit's default learning rate, then
# print the learned weights followed by the bias.
weights, bias = fit(X_train, y_train, epochs=500_000)
print(weights, bias, sep="\n")

[-1.26983973  3.63072777 -0.4137324  -0.17476546  0.93074158  0.89364274
  0.32045106  2.31593002  0.11413606 -0.61773652  0.36084967 -0.02834385]
2.144835369307404


In [112]:
y_pred = predict(X_test, weights, bias)

# PassengerId was dropped from `test` during preprocessing. Read the real ids
# back from the source file instead of rebuilding them from the row index plus
# a hard-coded offset, which only works while ids are contiguous and the index
# is the default 0..n-1 range.
passenger_ids = pd.read_csv("test.csv", usecols=["PassengerId"])["PassengerId"]
if len(passenger_ids) != len(y_pred):
    raise ValueError(
        f"test.csv has {len(passenger_ids)} rows but {len(y_pred)} predictions were produced"
    )

submission = pd.DataFrame({'PassengerId': passenger_ids.to_numpy(), 'Survived': y_pred})
submission.to_csv('submission.csv', index=False)