# Titanic

The main goal of the competition is to predict which passengers survived the Titanic shipwreck.

* Link to the competition: https://www.kaggle.com/competitions/titanic

## Get Data

We have two different files:
* train.csv
* test.csv

We need to predict the value of the `Survived` feature for the test dataset.

In [None]:
pip install kaggle

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler


In [None]:
import os

from google.colab import userdata

# Retrieve the Kaggle credentials stored as Colab secrets
KAGGLE_KEY = userdata.get('KAGGLE_KEY')
KAGGLE_USERNAME = userdata.get('KAGGLE_USERNAME')

# Export them as environment variables so the `!kaggle` commands below inherit them.
# os.environ is used instead of `%env VAR=$VALUE` because %env echoes
# "env: VAR=value" into the cell output, which would store the API key in the
# saved notebook.
os.environ["KAGGLE_USERNAME"] = KAGGLE_USERNAME
os.environ["KAGGLE_KEY"] = KAGGLE_KEY

In [None]:
# Download the files of the "titanic" competition with the kaggle CLI
!kaggle competitions download -c titanic

In [None]:
!unzip /content/titanic.zip

## Inspect Data

In [None]:
import pandas as pd

# Read the two competition files
train_df = pd.read_csv('/content/train.csv')
test_df = pd.read_csv('/content/test.csv')

# Keep an independent copy of the test ids; they are needed for the submission file
test_ids = test_df["PassengerId"].copy()

In [None]:
# Display the training DataFrame (the last expression of a cell is rendered as its output)
train_df

In [None]:
# Summary statistics of the training data
train_df.describe()

In [None]:
# Column dtypes and non-null counts of the training data
train_df.info()

In [None]:
# Count the missing values per column in the training data
train_df.isnull().sum()

In [None]:
# Count the missing values per column in the test data
test_df.isnull().sum()

## Prepare Data

In [None]:
# Imputation values are computed on the training set only and reused for the
# test set, so the test data does not influence the preprocessing.
age_median = train_df["Age"].median()
fare_median = train_df["Fare"].median()
# mode() returns a Series; [0] takes its first entry
embarked_mode = train_df["Embarked"].mode()[0]

# Fill NA values by assigning the filled column back to the DataFrame.
# `df[col].fillna(value, inplace=True)` is chained assignment: it operates on an
# intermediate Series, emits a FutureWarning on pandas 2.x and does not update
# the DataFrame at all once copy-on-write is enabled.
train_df["Age"] = train_df["Age"].fillna(age_median)
test_df["Age"] = test_df["Age"].fillna(age_median)

test_df["Fare"] = test_df["Fare"].fillna(fare_median)

train_df["Embarked"] = train_df["Embarked"].fillna(embarked_mode)

In [None]:
# Derive two cabin features on both datasets:
#   Has_cabin - 1 when a cabin is recorded, 0 otherwise
#   Deck      - first letter of the cabin, or "M" when the cabin is missing
for _frame in (train_df, test_df):
    _cabin = _frame["Cabin"]
    _frame["Has_cabin"] = _cabin.notna().astype(int)
    _frame["Deck"] = _cabin.str[0].fillna("M")

In [None]:
# Extract Title from Name
def extract_title(name):
  """Return the title found in a name formatted as "Last, Title. First".

  Args:
    name: Passenger name; may be missing (None/NaN).

  Returns:
    The text between the first comma and the next period, stripped of
    surrounding whitespace, or "Unknown" when the name is missing, is not a
    string, or contains no comma.
  """
  if pd.isna(name):
    return "Unknown"
  try:
    # Last, Title. First
    return name.split(",")[1].split(".")[0].strip()
  except (AttributeError, IndexError, TypeError):
    # AttributeError/TypeError: the value is not a text string;
    # IndexError: the name has no comma. Any other error is unexpected and
    # is allowed to propagate instead of being silently swallowed.
    return "Unknown"

# Add the Title column to both datasets
for _frame in (train_df, test_df):
  _frame["Title"] = _frame["Name"].apply(extract_title)

In [None]:
# Group infrequent titles under a single "Rare" label
RARE_TITLE_MIN_COUNT = 10  # titles seen fewer times than this in train count as rare

title_counts = train_df['Title'].value_counts()
rare_titles = set(title_counts[title_counts < RARE_TITLE_MIN_COUNT].index)
# Whitelist of titles that are frequent in the training data. Mapping everything
# outside this set to "Rare" also covers titles that appear only in the test
# set; those are not in rare_titles and would otherwise become dummy columns
# that are all-zero in the training data.
common_titles = set(title_counts.index) - rare_titles

train_df['Title'] = train_df['Title'].where(train_df['Title'].isin(common_titles), 'Rare')
test_df['Title'] = test_df['Title'].where(test_df['Title'].isin(common_titles), 'Rare')


In [None]:
# Frequency of each title in the training data after grouping
train_df['Title'].value_counts()

In [None]:
# Create Family features
def _family_group(size):
    """Bucket a family size: 1 -> "Alone", below 5 -> "Small", otherwise "Big"."""
    if size == 1:
        return "Alone"
    if size < 5:
        return "Small"
    return "Big"


for _frame in (train_df, test_df):
    # Siblings/spouses + parents/children + the passenger themselves
    _frame["Family_Size"] = _frame["SibSp"] + _frame["Parch"] + 1
    # Family group based on family size
    _frame["Family_Group"] = _frame["Family_Size"].apply(_family_group)



In [None]:
# Drop columns we don't want as model inputs.
# PassengerId is an identifier, not a predictor: it was previously left in the
# feature matrix and fed to the models. The test ids needed for the submission
# are already saved in `test_ids`.
drop_train_cols = ['PassengerId', 'Name', 'Ticket', 'Cabin']
drop_test_cols  = ['PassengerId', 'Name', 'Ticket', 'Cabin']

# errors='ignore' skips columns that are already gone, so the cell can be re-run
train_df = train_df.drop(columns=drop_train_cols, errors='ignore')
test_df  = test_df.drop(columns=drop_test_cols, errors='ignore')


In [None]:
# Concat and get_dummies
# Keep the target aside; it is reattached after encoding
y = train_df['Survived'].astype(int)
n_train = len(train_df)

# Stack the train features (without Survived) on top of the test rows so that
# both are one-hot encoded together and end up with the same dummy columns.
# Deck's missing values were already filled with 'M', so no dummy_na is needed.
_stacked = pd.concat([train_df.drop(columns=['Survived']), test_df], axis=0, sort=False)
combined = pd.get_dummies(_stacked, drop_first=True)

# Split back by position: the first n_train rows are the training rows
train_proc = combined.iloc[:n_train].copy()
test_proc  = combined.iloc[n_train:].copy()

# Reattach Survived to train_proc (by position, via .values)
train_proc['Survived'] = y.values

# Safety: ensure test has the same columns as train (excluding Survived)
_feature_cols = [col for col in train_proc.columns if col != 'Survived']
test_proc = test_proc.reindex(columns=_feature_cols, fill_value=0)



## Split data

In [None]:
# Separate the labels from the features of the processed training data
y_full = train_proc['Survived']
X_full = train_proc.drop('Survived', axis=1)

# Independent copy of the processed test features, used for the final predictions
X_test_for_submission = test_proc.copy()

In [None]:
# Hold out 20% of the training data for validation, stratified on the label
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X_full,
    y_full,
    random_state=42,
    stratify=y_full,
    test_size=0.2,
)


In [None]:
# Scale the numeric columns, fitting the scaler on X_train only
from sklearn.preprocessing import StandardScaler

numerical_cols = ['Age', 'Fare']

# Fail early if an expected numeric column is missing from the training features
for c in numerical_cols:
    if c in X_train.columns:
        continue
    raise KeyError(f"Numeric column {c} not present in training features: {X_train.columns.tolist()[:20]}")

# Learn mean/std from the training split, then apply the same transform to the
# training, validation and submission features
scaler = StandardScaler()
scaler.fit(X_train[numerical_cols])
for _frame in (X_train, X_val, X_test_for_submission):
    _frame.loc[:, numerical_cols] = scaler.transform(_frame[numerical_cols])


## Start Modelling

In [None]:
# Import necessary libraries
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# Baseline classifiers with default hyperparameters, keyed by display name
models = dict([
    ("KNN", KNeighborsClassifier()),
    ("Logistic Regression", LogisticRegression()),
    ("Random Forest", RandomForestClassifier()),
    ("Gradient Boosting", GradientBoostingClassifier()),
])

In [None]:
# Create a function to fit and score models
def fit_and_score(models, X_train, X_test, y_train, y_test):
    """
    Fit every model on the training data and score it on the test data.

    models : dict mapping a name to an object with fit(X, y) and score(X, y)
    X_train : training data (no labels)
    X_test : testing data (no labels)
    y_train : training labels
    y_test : test labels

    Returns a dict mapping each model name to the value of its score() call.
    The models are fitted in place, and NumPy's global random seed is set to 42.
    """
    # Seed the global NumPy RNG before any model is fitted
    np.random.seed(42)

    def _evaluate(estimator):
        # Fit first, then score; the two calls are kept separate
        estimator.fit(X_train, y_train)
        return estimator.score(X_test, y_test)

    return {label: _evaluate(estimator) for label, estimator in models.items()}

In [None]:
# Count the missing values per column in train_df
train_df.isnull().sum()

In [None]:
# Fit every model on the training split and score it on the validation split
model_results = fit_and_score(models, X_train, X_val, y_train, y_val)

In [None]:
# Show the validation score of each model
model_results

In [None]:
from sklearn.model_selection import GridSearchCV

# Define the grid of parameters to search.
# "saga" is the only solver listed because the grid includes both l1 and l2 penalties.
param_grid = {
    "penalty": ["l1", "l2"],
    "C": [0.01, 0.1, 1, 10],
    "solver": ["saga"],
    "max_iter": [200, 500]
}

# Instantiate the grid search object.
# The estimator is seeded so that repeated runs of this cell select the same
# parameters, and so that the search uses the same random_state (42) as the
# final model that is trained with the selected parameters.
gscv = GridSearchCV(
    estimator=LogisticRegression(random_state=42),
    param_grid=param_grid,
    cv=5,
    verbose=1,
    scoring="accuracy"
)

gscv.fit(X_train, y_train)

In [None]:
# Train the final model on the full training set with the best parameters found
best_params = gscv.best_params_
final_params = {
    **best_params,
    "random_state": 42,
    # Give the solver at least 300 iterations, whatever the grid selected
    "max_iter": max(best_params.get("max_iter", 100), 300),
}
final_model = LogisticRegression(**final_params)

# X_full still holds the raw Age/Fare values: only X_train, X_val and
# X_test_for_submission were scaled. Scale a copy with the scaler fitted on
# X_train so that the model is trained on the same feature scale it later
# predicts on.
X_full_scaled = X_full.copy()
X_full_scaled.loc[:, numerical_cols] = scaler.transform(X_full[numerical_cols])

final_model.fit(X_full_scaled, y_full)


## Make predictions

In [None]:
# Prepare the test set for prediction.
# The test features were aligned and scaled in earlier cells; make sure no
# Survived column is among them (errors='ignore' makes this a no-op when absent).
X_test_for_submission = X_test_for_submission.drop(columns=['Survived'], errors='ignore')

# Predict class labels for the test set
final_preds = final_model.predict(X_test_for_submission)

## Create a submission file

In [None]:
# Build the submission DataFrame: the saved test ids next to their predicted labels
submission = test_ids.to_frame(name="PassengerId")
submission["Survived"] = final_preds.astype(int)

# Write the submission without the index column
submission.to_csv('submission.csv', index=False)
print("Submission saved to submission.csv")
