In [2]:
import pandas as pd
import numpy as np

# Load data
train = pd.read_csv("../data/raw/train.csv")
test = pd.read_csv("../data/raw/test.csv")


In [3]:

# Combine for consistent preprocessing (except 'Survived')
full_data = pd.concat([train.drop('Survived', axis=1), test], axis=0, ignore_index=True)


In [4]:

# ===== 1. Fill missing 'Embarked' with mode =====
full_data['Embarked'] = full_data['Embarked'].fillna(full_data['Embarked'].mode()[0])


In [5]:

# ===== 2. Fill missing 'Fare' in test with median by Pclass & Embarked =====
full_data['Fare'] = full_data.groupby(['Pclass', 'Embarked'])['Fare'] \
                             .apply(lambda x: x.fillna(x.median()))


TypeError: incompatible index of inserted column with frame index

In [6]:

# ===== 3. Fill missing 'Age' using median of Title & Pclass =====
full_data['Age'] = full_data.groupby(['Title', 'Pclass'])['Age'] \
                            .apply(lambda x: x.fillna(x.median()))


KeyError: 'Title'

In [7]:

# ===== 4. Fill missing 'Deck' with 'U' (Unknown) =====
full_data['Deck'] = full_data['Deck'].fillna('U')


KeyError: 'Deck'

In [None]:

# ===== 5. Fill missing 'Cabin' with 'Unknown' =====
full_data['Cabin'] = full_data['Cabin'].fillna('Unknown')


In [None]:

# ===== 6. Fill missing 'Ticket_prefix' with 'NONE' =====
full_data['Ticket_prefix'] = full_data['Ticket_prefix'].fillna('NONE')



In [None]:

# ===== 7. Fill missing 'Ticket_number' with -1 =====
full_data['Ticket_number'] = full_data['Ticket_number'].fillna(-1)


In [None]:

# ===== 8. Fill any remaining NaN in categorical columns with 'Unknown' =====
cat_cols = full_data.select_dtypes(include='object').columns
full_data[cat_cols] = full_data[cat_cols].fillna('Unknown')


In [None]:

# ===== 9. Fill any remaining NaN in numeric columns with median =====
num_cols = full_data.select_dtypes(include=[np.number]).columns
full_data[num_cols] = full_data[num_cols].fillna(full_data[num_cols].median())


In [None]:

# ===== 10. Split back into train/test =====
train_processed = full_data.iloc[:len(train), :]
test_processed = full_data.iloc[len(train):, :]


In [None]:

# Add back target variable
train_processed['Survived'] = train['Survived']


In [None]:

# Save cleaned files
train_processed.to_csv("train_clean.csv", index=False)
test_processed.to_csv("test_clean.csv", index=False)