In [60]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Load the dfset
df_train = pd.read_csv('../data/raw/train.csv')
df_test = pd.read_csv('../data/raw/test.csv')

df = pd.concat([df_train.drop('Survived', axis=1), df_test], axis=0, ignore_index=True)

# Display the first few rows
df.head()


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [61]:
import re
# Title extraction from Name (e.g., 'Mr', 'Mrs', 'Miss', etc.)
def extract_title(name: str) -> str:
    m = re.search(r',\s*([^\.]+)\.', str(name))
    return m.group(1).strip() if m else 'Unknown'

df['Title'] = df['Name'].map(extract_title)
title_map = {
    'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs',
    'Lady': 'Royal', 'Countess': 'Royal', 'Dona': 'Royal', 'Sir': 'Royal', 'Don': 'Royal',
    'Jonkheer': 'Rare', 'Capt': 'Rare', 'Col': 'Rare', 'Dr': 'Rare', 'Major': 'Rare', 'Rev': 'Rare'
}
df['Title'] = df['Title'].replace(title_map)

df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr


In [62]:
df['Surname'] = df['Name'].str.split(',').str[0]
df['Title_First_Middle'] = df['Name'].str.split(',').str[1].str.strip()
df['Title_Raw'] = df['Title_First_Middle'].str.split(' ').str[0]
df['First_Middle'] = df['Title_First_Middle'].str.split(' ').str[1:].str.join(' ')
df['First_Middle'] = df['First_Middle'].replace('', pd.NA)  # replace empty strings with NaN
df['First_Middle'] = df['First_Middle'].fillna('Unknown')  # fill NaN with 'Unknown'
df['Surname'] = df['Surname'].str.strip()  # remove leading and trailing spaces
df['Title_Raw'] = df['Title_Raw'].str.replace('.', '', regex=False).str.strip()  # clean raw title
df['First_Middle'] = df['First_Middle'].str.strip()  # remove leading and
df['MaidenName'] = df['Name'].str.extract(r'\((.*?)\)')

df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Surname,Title_First_Middle,Title_Raw,First_Middle,MaidenName
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr,Braund,Mr. Owen Harris,Mr,Owen Harris,
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,Cumings,Mrs. John Bradley (Florence Briggs Thayer),Mrs,John Bradley (Florence Briggs Thayer),Florence Briggs Thayer
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,Heikkinen,Miss. Laina,Miss,Laina,
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs,Futrelle,Mrs. Jacques Heath (Lily May Peel),Mrs,Jacques Heath (Lily May Peel),Lily May Peel
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr,Allen,Mr. William Henry,Mr,William Henry,


In [63]:
# mark zero fares but keep rows
df['ZeroFare'] = (df['Fare'] == 0).astype(int)
df.head()


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Surname,Title_First_Middle,Title_Raw,First_Middle,MaidenName,ZeroFare
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr,Braund,Mr. Owen Harris,Mr,Owen Harris,,0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,Cumings,Mrs. John Bradley (Florence Briggs Thayer),Mrs,John Bradley (Florence Briggs Thayer),Florence Briggs Thayer,0
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,Heikkinen,Miss. Laina,Miss,Laina,,0
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs,Futrelle,Mrs. Jacques Heath (Lily May Peel),Mrs,Jacques Heath (Lily May Peel),Lily May Peel,0
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr,Allen,Mr. William Henry,Mr,William Henry,,0


In [64]:
# Extract Ticket prefix
df['Ticket_prefix'] = (
    df['Ticket']
    .astype(str)
    .str.replace(r'\d+', '', regex=True)   # remove digits
    .str.replace('.', '', regex=False)     # remove dots
    .str.strip()                           # trim spaces
)

# Replace empty prefixes with 'NUMBER'
df['Ticket_prefix'] = df['Ticket_prefix'].replace('', 'NUMBER')

# Extract numeric part of the Ticket
df['Ticket_number'] = (
    df['Ticket']
    .astype(str)
    .str.extract(r'(\d+)$')[0]             # extract last group of digits
    .astype(float)                         # convert to numeric
)

# Optional: check results
df.head(10)


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Surname,Title_First_Middle,Title_Raw,First_Middle,MaidenName,ZeroFare,Ticket_prefix,Ticket_number
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr,Braund,Mr. Owen Harris,Mr,Owen Harris,,0,A/,21171.0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,Cumings,Mrs. John Bradley (Florence Briggs Thayer),Mrs,John Bradley (Florence Briggs Thayer),Florence Briggs Thayer,0,PC,17599.0
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,Heikkinen,Miss. Laina,Miss,Laina,,0,STON/O,3101282.0
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs,Futrelle,Mrs. Jacques Heath (Lily May Peel),Mrs,Jacques Heath (Lily May Peel),Lily May Peel,0,NUMBER,113803.0
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr,Allen,Mr. William Henry,Mr,William Henry,,0,NUMBER,373450.0
5,6,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,Mr,Moran,Mr. James,Mr,James,,0,NUMBER,330877.0
6,7,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,Mr,McCarthy,Mr. Timothy J,Mr,Timothy J,,0,NUMBER,17463.0
7,8,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,Master,Palsson,Master. Gosta Leonard,Master,Gosta Leonard,,0,NUMBER,349909.0
8,9,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,Mrs,Johnson,Mrs. Oscar W (Elisabeth Vilhelmina Berg),Mrs,Oscar W (Elisabeth Vilhelmina Berg),Elisabeth Vilhelmina Berg,0,NUMBER,347742.0
9,10,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,Mrs,Nasser,Mrs. Nicholas (Adele Achem),Mrs,Nicholas (Adele Achem),Adele Achem,0,NUMBER,237736.0


In [65]:
# Deck from Cabin (first letter); many missing
df['Deck'] = df['Cabin'].astype(str).str[0]
df['Deck'] = df['Deck'].where(df['Deck'].isin(list('ABCDEFGT')), other='U')

df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,...,Title,Surname,Title_First_Middle,Title_Raw,First_Middle,MaidenName,ZeroFare,Ticket_prefix,Ticket_number,Deck
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,...,Mr,Braund,Mr. Owen Harris,Mr,Owen Harris,,0,A/,21171.0,U
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,...,Mrs,Cumings,Mrs. John Bradley (Florence Briggs Thayer),Mrs,John Bradley (Florence Briggs Thayer),Florence Briggs Thayer,0,PC,17599.0,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,...,Miss,Heikkinen,Miss. Laina,Miss,Laina,,0,STON/O,3101282.0,U
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,...,Mrs,Futrelle,Mrs. Jacques Heath (Lily May Peel),Mrs,Jacques Heath (Lily May Peel),Lily May Peel,0,NUMBER,113803.0,C
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,...,Mr,Allen,Mr. William Henry,Mr,William Henry,,0,NUMBER,373450.0,U


In [66]:
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,...,Title_First_Middle,Title_Raw,First_Middle,MaidenName,ZeroFare,Ticket_prefix,Ticket_number,Deck,FamilySize,IsAlone
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,...,Mr. Owen Harris,Mr,Owen Harris,,0,A/,21171.0,U,2,0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,...,Mrs. John Bradley (Florence Briggs Thayer),Mrs,John Bradley (Florence Briggs Thayer),Florence Briggs Thayer,0,PC,17599.0,C,2,0
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,...,Miss. Laina,Miss,Laina,,0,STON/O,3101282.0,U,1,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,...,Mrs. Jacques Heath (Lily May Peel),Mrs,Jacques Heath (Lily May Peel),Lily May Peel,0,NUMBER,113803.0,C,2,0
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,...,Mr. William Henry,Mr,William Henry,,0,NUMBER,373450.0,U,1,1


In [67]:
# Ticket group size: passengers sharing same ticket number (may indicate family/group)
ticket_counts = df['Ticket'].value_counts()
df['TicketGroupSize'] = df['Ticket'].map(ticket_counts)
df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,...,Title_Raw,First_Middle,MaidenName,ZeroFare,Ticket_prefix,Ticket_number,Deck,FamilySize,IsAlone,TicketGroupSize
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,...,Mr,Owen Harris,,0,A/,21171.0,U,2,0,1
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,...,Mrs,John Bradley (Florence Briggs Thayer),Florence Briggs Thayer,0,PC,17599.0,C,2,0,2
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,...,Miss,Laina,,0,STON/O,3101282.0,U,1,1,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,...,Mrs,Jacques Heath (Lily May Peel),Lily May Peel,0,NUMBER,113803.0,C,2,0,2
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,...,Mr,William Henry,,0,NUMBER,373450.0,U,1,1,1


In [68]:
df.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked', 'Title', 'Surname',
       'Title_First_Middle', 'Title_Raw', 'First_Middle', 'MaidenName',
       'ZeroFare', 'Ticket_prefix', 'Ticket_number', 'Deck', 'FamilySize',
       'IsAlone', 'TicketGroupSize'],
      dtype='object')

In [69]:

# Combine for consistent preprocessing (except 'Survived')


# ===== 1. Fill missing 'Embarked' with mode =====
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# ===== 2. Fill missing 'Fare' in test with median by Pclass & Embarked =====
df['Fare'] = df.groupby(['Pclass', 'Embarked'])['Fare'] \
                             .apply(lambda x: x.fillna(x.median()))

# ===== 3. Fill missing 'Age' using median of Title & Pclass =====
df['Age'] = df.groupby(['Title', 'Pclass'])['Age'] \
                            .apply(lambda x: x.fillna(x.median()))

# ===== 4. Fill missing 'Deck' with 'U' (Unknown) =====
df['Deck'] = df['Deck'].fillna('U')

# ===== 5. Fill missing 'Cabin' with 'Unknown' =====
df['Cabin'] = df['Cabin'].fillna('Unknown')

# ===== 6. Fill missing 'Ticket_prefix' with 'NONE' =====
df['Ticket_prefix'] = df['Ticket_prefix'].fillna('NONE')

# ===== 7. Fill missing 'Ticket_number' with -1 =====
df['Ticket_number'] = df['Ticket_number'].fillna(-1)

# ===== 8. Fill any remaining NaN in categorical columns with 'Unknown' =====
cat_cols = df.select_dtypes(include='object').columns
df[cat_cols] = df[cat_cols].fillna('Unknown')

# ===== 9. Fill any remaining NaN in numeric columns with median =====
num_cols = df.select_dtypes(include=[np.number]).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# ===== 10. Split back into train/test =====
train_processed = df.iloc[:len(df), :]
test_processed = df.iloc[len(df):, :]

# Add back target variable
train_processed['Survived'] = df['Survived']

# Save cleaned files
train_processed.to_csv("train_clean.csv", index=False)
test_processed.to_csv("test_clean.csv", index=False)


TypeError: incompatible index of inserted column with frame index