## Import Libraries

In [74]:
import numpy as np
import pandas as pd
import zipfile

## Data Downloading

In [75]:
# where the downloaded archive lives and where its CSVs get unpacked
zip_path = "/content/titanic.zip"
extract_to = "data/"

# only the two competition files are needed from the archive
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    for member in ("train.csv", "test.csv"):
        zip_ref.extract(member, extract_to)

In [76]:
# load the extracted train and test splits into DataFrames
train_data, test_data = map(pd.read_csv, ("data/train.csv", "data/test.csv"))

## Data Preprocessing

In [77]:
def _extract_title(names, common_titles=('Mr', 'Miss', 'Master', 'Mrs')):
    """Return the honorific of each passenger name, collapsing rare ones to 'Misc'.

    Names that do not match the "<surname>, <title>. <rest>" pattern also
    become 'Misc'.
    """
    # the title sits between the comma after the surname and the first period
    titles = names.str.extract(r',\s*([^\.]+)\.')[0]
    return titles.where(titles.isin(list(common_titles)), 'Misc')


def preprocess_data(train, test):
    """Clean the train and test frames and return the cleaned copies.

    The inputs are NOT modified: all work happens on copies, so the raw
    frames stay available to later cells and the function can be called
    repeatedly on the same inputs.

    Steps:
      * add a 'Title' column derived from 'Name' (rare titles -> 'Misc')
      * drop 'PassengerId', 'Ticket', 'Name', 'SibSp', 'Parch', 'Cabin'
      * map 'Sex' to 0 (male) / 1 (female)
      * fill missing 'Embarked' with the train mode
      * fill missing 'Fare' with the train median
      * fill missing 'Age' with the train median age of the same title,
        falling back to the overall train median when a title has no
        usable median in the training set

    All imputation statistics come from the training frame only, so no
    information from the test frame is used.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames holding at least 'Name', 'Sex', 'Embarked', 'Fare' and 'Age'.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The processed train and test frames.
    """
    # work on copies so the caller's frames are left untouched
    train = train.copy()
    test = test.copy()

    train['Title'] = _extract_title(train['Name'])
    test['Title'] = _extract_title(test['Name'])

    # drop unnecessary columns (missing ones are ignored)
    drop_cols = ['PassengerId', 'Ticket', 'Name', 'SibSp', 'Parch', 'Cabin']
    train = train.drop(columns=drop_cols, errors='ignore')
    test = test.drop(columns=drop_cols, errors='ignore')

    # encode "Sex" as 0 / 1
    sex_codes = {"male": 0, "female": 1}
    train["Sex"] = train["Sex"].map(sex_codes)
    test["Sex"] = test["Sex"].map(sex_codes)

    # "Embarked" stays categorical here; only its gaps are filled (train mode)
    embarked_mode = train["Embarked"].mode()[0]
    train["Embarked"] = train["Embarked"].fillna(embarked_mode)
    test["Embarked"] = test["Embarked"].fillna(embarked_mode)

    # fill missing Fare values using the train median
    fare_median = train['Fare'].median()
    train['Fare'] = train['Fare'].fillna(fare_median)
    test['Fare'] = test['Fare'].fillna(fare_median)

    # fill missing Age values with the per-title train median; the overall
    # train median covers titles that have no median in the training set
    overall_age_median = train['Age'].median()
    title_age_median = train.groupby('Title')['Age'].median()
    for frame in (train, test):
        age_by_title = frame['Title'].map(title_age_median)
        frame['Age'] = frame['Age'].fillna(age_by_title).fillna(overall_age_median)

    return train, test


## Model Implementation
#### Logistic regression using the sigmoid function and batch gradient descent

In [78]:
# sigmoid activation function
def sigmoid(z):
    """Numerically stable logistic function 1 / (1 + exp(-z)).

    The naive formula overflows in ``np.exp`` for large negative ``z``.
    Here the exponential is only ever taken of a non-positive number, so it
    stays within [0, 1] and cannot overflow.

    Parameters
    ----------
    z : scalar or array-like of numbers

    Returns
    -------
    numpy.float64 for scalar input, otherwise a numpy.ndarray shaped like ``z``,
    with values in [0, 1].
    """
    z = np.asarray(z, dtype=float)
    decay = np.exp(-np.abs(z))  # always in (0, 1], never overflows
    # z >= 0: 1 / (1 + e^-z)      z < 0: e^z / (1 + e^z)  (same value, safe form)
    result = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))
    # indexing with () turns a 0-d array into a scalar and leaves n-d arrays as arrays
    return result[()]

# train logistic regression using gradient descent
def fit(X, y, learning_rate=0.01, epochs=20000):
    """Fit logistic-regression parameters with full-batch gradient descent.

    Parameters
    ----------
    X : array-like, two-dimensional (rows are samples, columns are features)
    y : array-like of targets, one per row of ``X``
    learning_rate : step size applied to each gradient update
    epochs : number of gradient-descent iterations

    Returns
    -------
    (weights, bias) : the learned weight vector and the intercept
    """
    # work on float NumPy copies of the inputs
    features = np.array(X).astype(float)
    targets = np.array(y).astype(float)

    n_samples, n_features = features.shape

    # start from an all-zero model
    weights = np.zeros(n_features)
    bias = 0

    for _ in range(epochs):
        # gap between the predicted probabilities and the targets
        residual = sigmoid(np.dot(features, weights) + bias) - targets

        # gradients averaged over all samples
        grad_weights = (1 / n_samples) * np.dot(features.T, residual)
        grad_bias = (1 / n_samples) * np.sum(residual)

        # step against the gradient
        weights -= learning_rate * grad_weights
        bias -= learning_rate * grad_bias

    return weights, bias

# make predictions using the trained logistic regression model
def predict(X, weights, bias):
    """Return 0/1 class labels for ``X`` from trained weights and bias.

    A sample is labelled 1 when its sigmoid probability is at least 0.5,
    otherwise 0. The result is an integer NumPy array.
    """
    features = np.array(X).astype(float)
    # probability of the positive class for every sample
    probabilities = sigmoid(np.dot(features, weights) + bias)
    # threshold at 0.5 to obtain hard labels
    is_positive = probabilities >= 0.5
    return is_positive.astype(int)


In [79]:
# run the shared preprocessing on both splits
train_processed, test_processed = preprocess_data(train_data, test_data)

# compress the numeric columns with log(1 + x)
numeric_cols = ['Age', 'Fare']
for frame in (train_processed, test_processed):
    frame[numeric_cols] = np.log1p(frame[numeric_cols])

# one-hot encode the remaining categorical columns and cast everything to float
train_processed = pd.get_dummies(train_processed).astype(float)
test_processed = pd.get_dummies(test_processed).astype(float)

# split the target off from the feature columns
y_train = train_processed['Survived']
X_train = train_processed.drop(columns='Survived')

# give the test set exactly the train feature columns (absent ones become 0)
X_train, X_test = X_train.align(test_processed, join='left', axis=1, fill_value=0)

# hand plain NumPy arrays to the model
X_train, y_train, X_test = X_train.to_numpy(), y_train.to_numpy(), X_test.to_numpy()

## Training Process

In [80]:
# fit the logistic regression parameters on the training features
weights, bias = fit(X=X_train, y=y_train, epochs=800_000)

# label every test passenger with the fitted parameters
y_pred = predict(X=X_test, weights=weights, bias=bias)


## Export CSV file

In [81]:
# create a DataFrame for submission
# Read the real passenger ids from the raw test file rather than assuming they
# start at 892 and run consecutively in row order.
passenger_ids = pd.read_csv("data/test.csv", usecols=["PassengerId"])["PassengerId"]
if len(passenger_ids) != len(y_pred):
    raise ValueError(
        f"expected {len(passenger_ids)} predictions, got {len(y_pred)}"
    )

submission = pd.DataFrame({
    'PassengerId': passenger_ids.to_numpy(),  # ids exactly as given in the test file
    'Survived': y_pred  # predictions
})
# write the submission file
submission.to_csv("submission.csv", index=False)
print("Submission file created!")

Submission file created!


## KAGGLE SUBMISSION
#### Link submission: https://drive.google.com/file/d/1BX7FpIXofd55rRdFm2bUyLGIg2RO35yK/view?usp=sharing