## Import Libraries

In [1]:
import re
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score


## Load Data

In [2]:
train_data = pd.read_csv("Data/train.csv")
test_data = pd.read_csv("Data/test.csv")


## Feature Engineering

In [3]:
def get_title(name: str) -> str:
    match = re.search(r",\s*([^\.]+)\.", name)
    return match.group(1).strip() if match else "Unknown"

def build_features(data: pd.DataFrame) -> pd.DataFrame:
    data = data.copy()

    data["Title"] = data["Name"].apply(get_title)

    replacements = {
        "Mlle": "Miss",
        "Ms": "Miss",
        "Mme": "Mrs",
        "Lady": "Rare",
        "Countess": "Rare",
        "Capt": "Rare",
        "Col": "Rare",
        "Don": "Rare",
        "Dr": "Rare",
        "Major": "Rare",
        "Rev": "Rare",
        "Sir": "Rare",
        "Jonkheer": "Rare",
        "Dona": "Rare",
    }
    data["Title"] = data["Title"].replace(replacements)

    data["FamilySize"] = data["SibSp"] + data["Parch"] + 1
    data["IsAlone"] = (data["FamilySize"] == 1).astype(int)

    data["HasCabin"] = data["Cabin"].notna().astype(int)

    data["TicketGroupSize"] = data.groupby("Ticket")["Ticket"].transform("count")

    return data

train_data = build_features(train_data)
test_data = build_features(test_data)


## Target and Feature Selection

In [4]:
y_train = train_data["Survived"].astype(int)

cols_to_drop = ["Survived", "PassengerId", "Name", "Cabin", "Ticket"]
x_train = train_data.drop(columns=cols_to_drop)
x_test = test_data.drop(columns=["PassengerId", "Name", "Cabin", "Ticket"])


## Missing Value Imputation

In [5]:
def fill_missing_age(data: pd.DataFrame) -> pd.DataFrame:
    data = data.copy()
    medians = data.groupby(["Sex", "Pclass"])["Age"].median()

    def age_filler(row):
        if pd.notna(row["Age"]):
            return row["Age"]
        return medians.loc[(row["Sex"], row["Pclass"])]

    data["Age"] = data.apply(age_filler, axis=1)
    return data

x_train = fill_missing_age(x_train)
x_test = fill_missing_age(x_test)

x_train["Fare"] = x_train["Fare"].fillna(x_train["Fare"].median())
x_test["Fare"] = x_test["Fare"].fillna(x_test["Fare"].median())

x_train["Fare"] = np.log1p(x_train["Fare"])
x_test["Fare"] = np.log1p(x_test["Fare"])


## Preprocessing Pipelines

In [6]:
num_cols = ["Age", "Fare", "SibSp", "Parch", "FamilySize", "IsAlone", "HasCabin", "TicketGroupSize"]
cat_cols = ["Pclass", "Sex", "Embarked", "Title"]

num_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

num_pipe_raw = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
])

cat_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

pre_scaled = ColumnTransformer(
    transformers=[
        ("num", num_pipe, num_cols),
        ("cat", cat_pipe, cat_cols),
    ],
    remainder="drop"
)

pre_basic = ColumnTransformer(
    transformers=[
        ("num", num_pipe_raw, num_cols),
        ("cat", cat_pipe, cat_cols),
    ],
    remainder="drop"
)


## Model Definitions

In [7]:
models = {
    "Support Vector Machines": (SVC(kernel="rbf", C=1.0, gamma="scale", random_state=42), pre_scaled),
    "KNN": (KNeighborsClassifier(n_neighbors=5), pre_scaled),
    "Logistic Regression": (LogisticRegression(max_iter=2000, random_state=42), pre_scaled),
    "Random Forest": (RandomForestClassifier(n_estimators=300, random_state=42), pre_basic),
    "Naive Bayes": (GaussianNB(), pre_basic),
    "Perceptron": (Perceptron(random_state=42), pre_scaled),
    "Stochastic Gradient Decent": (SGDClassifier(loss="hinge", random_state=42), pre_scaled),
    "Linear SVC": (LinearSVC(max_iter=2000, random_state=42), pre_scaled),
    "Decision Tree": (DecisionTreeClassifier(random_state=42), pre_basic),
}


## Model Evaluation

In [8]:
kaggle_scores = {
    "Random Forest": 86.76,
    "Decision Tree": 86.76,
    "KNN": 84.85,
    "Logistic Regression": 80.36,
    "Linear SVC": 78.90,
    "Perceptron": 78.34,
    "Support Vector Machines": 78.23,
    "Stochastic Gradient Decent": 74.86,
    "Naive Bayes": 72.28,
}

results = []

for name, (clf, prep) in models.items():
    pipe = Pipeline(steps=[
        ("preprocess", prep),
        ("model", clf)
    ])

    if name == "Naive Bayes":
        tr = prep.fit_transform(x_train)
        if hasattr(tr, "toarray"):
            tr = tr.toarray()
        clf.fit(tr, y_train)
        score = round(clf.score(tr, y_train) * 100, 2)
    else:
        pipe.fit(x_train, y_train)
        score = round(pipe.score(x_train, y_train) * 100, 2)

    baseline = kaggle_scores[name]
    results.append((name, baseline, score))

results_df = pd.DataFrame(results, columns=["Model", "Kaggle Score (%)", "Improved Score (%)"]).sort_values(
    by="Improved Score (%)", ascending=False
)

print(results_df.to_string(index=False))


                     Model  Kaggle Score (%)  Improved Score (%)
             Random Forest             86.76               98.65
             Decision Tree             86.76               98.65
                       KNN             84.85               85.86
   Support Vector Machines             78.23               84.18
                Linear SVC             78.90               83.50
       Logistic Regression             80.36               83.28
Stochastic Gradient Decent             74.86               83.05
               Naive Bayes             72.28               82.49
                Perceptron             78.34               72.50
