# Section A: Problem Definition & Data Preparation

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

## 1. Target Selection

The primary objective of this project is to predict passenger survival on the Titanic, making the 'Survived' column our classification target.

## 2. Data Preprocessing Pipeline

In [2]:
# Load the raw passenger records; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='Titanic-Dataset.csv')

### Drop unnecessary columns

In [3]:
# Remove identifier-like and free-text columns that are not used as features.
# Note: `df` is rebound here, so re-running this cell on the already-trimmed
# frame raises KeyError (the labels no longer exist).
df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

### Impute missing values

In [4]:
# Fill missing values: median for Age, most frequent value for Embarked.
# NOTE(review): both statistics are computed over the full frame, i.e. before
# the train/test split performed later in this notebook — confirm that using
# test rows in the imputation statistics is acceptable for this analysis.
df = df.assign(
    Age=df['Age'].fillna(df['Age'].median()),
    Embarked=df['Embarked'].fillna(df['Embarked'].mode()[0]),
)

### Encode categorical variables

In [5]:
# One-hot encode the two categorical columns; drop_first=True omits the first
# level of each so the resulting indicator columns are not redundant.
df = pd.get_dummies(
    data=df,
    columns=['Sex', 'Embarked'],
    drop_first=True,
)

### Create stratified train/test split

In [6]:
# Separate the target from the feature matrix.
y = df['Survived']
X = df.drop(columns='Survived')

# Hold out 20% of rows for testing. Stratifying on y keeps the class
# proportions of 'Survived' the same in both partitions; the fixed seed makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

## 3. Expected Output

In [7]:
# Report the dimensions of each partition, one per line.
print(
    f'X_train shape: {X_train.shape}',
    f'X_test shape: {X_test.shape}',
    f'y_train shape: {y_train.shape}',
    f'y_test shape: {y_test.shape}',
    sep='\n',
)

X_train shape: (712, 8)
X_test shape: (179, 8)
y_train shape: (712,)
y_test shape: (179,)
