# 🧪 Baseline Model: Titanic Survival Prediction

This notebook contains basic preprocessing and a baseline model (Logistic Regression) for the Titanic Kaggle competition.

## 1. Load Libraries and Data

In [23]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Load data
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")

## 2. Quick Look at Missing Values

In [24]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [25]:
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

## 3. Basic Data Cleaning & Feature Engineering

In [26]:
# Fill missing Age values with median
train["Age"].fillna(train["Age"].median(), inplace=True)
test["Age"].fillna(test["Age"].median(), inplace=True)

# Fill missing Embarked with mode
train["Embarked"].fillna(train["Embarked"].mode()[0], inplace=True)

# Fill missing Fare in test set with median
test["Fare"].fillna(test["Fare"].median(), inplace=True)

# Convert Sex to numeric
train["Sex"] = train["Sex"].map({"male": 0, "female": 1})
test["Sex"] = test["Sex"].map({"male": 0, "female": 1})


# Convert Embarked using one-hot encoding
train = pd.get_dummies(train, columns=["Embarked"], drop_first=True)
test = pd.get_dummies(test, columns=["Embarked"], drop_first=True)

# Align columns
# train, test = train.align(test, join="left", axis=1, fill_value=0)


# Drop unnecessary columns
train.drop(["Name", "Ticket", "Cabin", "PassengerId"], axis=1, inplace=True)
test_passenger_ids = test["PassengerId"]
test.drop(["Name", "Ticket", "Cabin", "PassengerId"], axis=1, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train["Age"].fillna(train["Age"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test["Age"].fillna(test["Age"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we 

In [27]:
display(train.head())
display(test.head())

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S
0,0,3,0,22.0,1,0,7.25,False,True
1,1,1,1,38.0,1,0,71.2833,False,False
2,1,3,1,26.0,0,0,7.925,False,True
3,1,1,1,35.0,1,0,53.1,False,True
4,0,3,0,35.0,0,0,8.05,False,True


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S
0,3,0,34.5,0,0,7.8292,True,False
1,3,1,47.0,1,0,7.0,False,True
2,2,0,62.0,0,0,9.6875,True,False
3,3,0,27.0,0,0,8.6625,False,True
4,3,1,22.0,1,1,12.2875,False,True


## 4. Train a Baseline Logistic Regression Model

In [28]:
# Split training data into features and target
X = train.drop("Survived", axis=1)
y = train["Survived"]
X, test = X.align(test, join="left", axis=1, fill_value=0)

# split for local validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)

# Evaluate on validation set
val_preds = model.predict(X_val)
val_accuracy = accuracy_score(y_val, val_preds)
print(f"Validation Accuracy: {val_accuracy:.4f}")

Validation Accuracy: 0.8101


## 5. Make Predictions and Prepare Submission File

In [29]:
test_preds = model.predict(test)

submission = pd.DataFrame({
    "PassengerId": test_passenger_ids,
    "Survived": test_preds
})

submission.to_csv("../submissions/submission1.csv", index=False)