In [40]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Load the df_trainset
df_train = pd.read_csv('../data/raw/train.csv')
df_test = pd.read_csv('../data/raw/test.csv')


# Display the first few rows
df_train.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [41]:
import re
# Title extraction from Name (e.g., 'Mr', 'Mrs', 'Miss', etc.)
def extract_title(name: str) -> str:
    m = re.search(r',\s*([^\.]+)\.', str(name))
    return m.group(1).strip() if m else 'Unknown'

df_train['Title'] = df_train['Name'].map(extract_title)
title_map = {
    'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs',
    'Lady': 'Royal', 'Countess': 'Royal', 'Dona': 'Royal', 'Sir': 'Royal', 'Don': 'Royal',
    'Jonkheer': 'Rare', 'Capt': 'Rare', 'Col': 'Rare', 'Dr': 'Rare', 'Major': 'Rare', 'Rev': 'Rare'
}
df_train['Title'] = df_train['Title'].replace(title_map)

df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr


In [42]:
df_train['Surname'] = df_train['Name'].str.split(',').str[0]
df_train['Title_First_Middle'] = df_train['Name'].str.split(',').str[1].str.strip()
df_train['Title'] = df_train['Title_First_Middle'].str.split(' ').str[0]
df_train['First_Middle'] = df_train['Title_First_Middle'].str.split(' ').str[1:].str.join(' ')
df_train['First_Middle'] = df_train['First_Middle'].replace('', pd.NA)  # replace empty strings with NaN
df_train['First_Middle'] = df_train['First_Middle'].fillna('Unknown')  # fill NaN with 'Unknown'
df_train['Surname'] = df_train['Surname'].str.strip()  # remove leading and trailing spaces
df_train['Title'] = df_train['Title'].str.strip()  # remove leading and trailing spaces
df_train['First_Middle'] = df_train['First_Middle'].str.strip()  # remove leading and
df_train['MaidenName'] = df_train['Name'].str.extract(r'\((.*?)\)')

df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Surname,Title_First_Middle,First_Middle,MaidenName
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr.,Braund,Mr. Owen Harris,Owen Harris,
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs.,Cumings,Mrs. John Bradley (Florence Briggs Thayer),John Bradley (Florence Briggs Thayer),Florence Briggs Thayer
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss.,Heikkinen,Miss. Laina,Laina,
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs.,Futrelle,Mrs. Jacques Heath (Lily May Peel),Jacques Heath (Lily May Peel),Lily May Peel
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr.,Allen,Mr. William Henry,William Henry,


In [43]:
# remove Fare = 0, as it may not be useful for analysis
df_train = df_train[df_train['Fare'] != 0].copy()
df_train[df_train['Fare'] == 0].head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Surname,Title_First_Middle,First_Middle,MaidenName


In [44]:
# Extract Ticket prefix
df_train['Ticket_prefix'] = (
    df_train['Ticket']
    .astype(str)
    .str.replace(r'\d+', '', regex=True)   # remove digits
    .str.replace('.', '', regex=False)     # remove dots
    .str.strip()                           # trim spaces
)

# Replace empty prefixes with 'NUMBER'
df_train['Ticket_prefix'] = df_train['Ticket_prefix'].replace('', 'NUMBER')

# Extract numeric part of the Ticket
df_train['Ticket_number'] = (
    df_train['Ticket']
    .astype(str)
    .str.extract(r'(\d+)$')[0]             # extract last group of digits
    .astype(float)                         # convert to numeric
)

# Optional: check results
df_train.head(10)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Surname,Title_First_Middle,First_Middle,MaidenName,Ticket_prefix,Ticket_number
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr.,Braund,Mr. Owen Harris,Owen Harris,,A/,21171.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs.,Cumings,Mrs. John Bradley (Florence Briggs Thayer),John Bradley (Florence Briggs Thayer),Florence Briggs Thayer,PC,17599.0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss.,Heikkinen,Miss. Laina,Laina,,STON/O,3101282.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs.,Futrelle,Mrs. Jacques Heath (Lily May Peel),Jacques Heath (Lily May Peel),Lily May Peel,NUMBER,113803.0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr.,Allen,Mr. William Henry,William Henry,,NUMBER,373450.0
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,Mr.,Moran,Mr. James,James,,NUMBER,330877.0
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,Mr.,McCarthy,Mr. Timothy J,Timothy J,,NUMBER,17463.0
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,Master.,Palsson,Master. Gosta Leonard,Gosta Leonard,,NUMBER,349909.0
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,Mrs.,Johnson,Mrs. Oscar W (Elisabeth Vilhelmina Berg),Oscar W (Elisabeth Vilhelmina Berg),Elisabeth Vilhelmina Berg,NUMBER,347742.0
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,Mrs.,Nasser,Mrs. Nicholas (Adele Achem),Nicholas (Adele Achem),Adele Achem,NUMBER,237736.0


In [45]:
# Deck from Cabin (first letter); many missing
df_train['Deck'] = df_train['Cabin'].astype(str).str[0]
df_train['Deck'] = df_train['Deck'].where(df_train['Deck'].isin(list('ABCDEFGT')), other='U')

df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Surname,Title_First_Middle,First_Middle,MaidenName,Ticket_prefix,Ticket_number,Deck
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr.,Braund,Mr. Owen Harris,Owen Harris,,A/,21171.0,U
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs.,Cumings,Mrs. John Bradley (Florence Briggs Thayer),John Bradley (Florence Briggs Thayer),Florence Briggs Thayer,PC,17599.0,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss.,Heikkinen,Miss. Laina,Laina,,STON/O,3101282.0,U
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs.,Futrelle,Mrs. Jacques Heath (Lily May Peel),Jacques Heath (Lily May Peel),Lily May Peel,NUMBER,113803.0,C
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr.,Allen,Mr. William Henry,William Henry,,NUMBER,373450.0,U


In [46]:
df_train['FamilySize'] = df_train['SibSp'] + df_train['Parch'] + 1
df_train['IsAlone'] = (df_train['FamilySize'] == 1).astype(int)
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Title,Surname,Title_First_Middle,First_Middle,MaidenName,Ticket_prefix,Ticket_number,Deck,FamilySize,IsAlone
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,...,Mr.,Braund,Mr. Owen Harris,Owen Harris,,A/,21171.0,U,2,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,...,Mrs.,Cumings,Mrs. John Bradley (Florence Briggs Thayer),John Bradley (Florence Briggs Thayer),Florence Briggs Thayer,PC,17599.0,C,2,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,...,Miss.,Heikkinen,Miss. Laina,Laina,,STON/O,3101282.0,U,1,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,...,Mrs.,Futrelle,Mrs. Jacques Heath (Lily May Peel),Jacques Heath (Lily May Peel),Lily May Peel,NUMBER,113803.0,C,2,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,...,Mr.,Allen,Mr. William Henry,William Henry,,NUMBER,373450.0,U,1,1


In [47]:
# Ticket group size: passengers sharing same ticket number (may indicate family/group)
ticket_counts = df_train['Ticket'].value_counts()
df_train['TicketGroupSize'] = df_train['Ticket'].map(ticket_counts)
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Surname,Title_First_Middle,First_Middle,MaidenName,Ticket_prefix,Ticket_number,Deck,FamilySize,IsAlone,TicketGroupSize
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,...,Braund,Mr. Owen Harris,Owen Harris,,A/,21171.0,U,2,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,...,Cumings,Mrs. John Bradley (Florence Briggs Thayer),John Bradley (Florence Briggs Thayer),Florence Briggs Thayer,PC,17599.0,C,2,0,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,...,Heikkinen,Miss. Laina,Laina,,STON/O,3101282.0,U,1,1,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,...,Futrelle,Mrs. Jacques Heath (Lily May Peel),Jacques Heath (Lily May Peel),Lily May Peel,NUMBER,113803.0,C,2,0,2
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,...,Allen,Mr. William Henry,William Henry,,NUMBER,373450.0,U,1,1,1


In [48]:
df_train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Title', 'Surname',
       'Title_First_Middle', 'First_Middle', 'MaidenName', 'Ticket_prefix',
       'Ticket_number', 'Deck', 'FamilySize', 'IsAlone', 'TicketGroupSize'],
      dtype='object')

In [49]:

# Combine for consistent preprocessing (except 'Survived')
full_data = pd.concat([df_train.drop('Survived', axis=1), df_test], axis=0, ignore_index=True)

# ===== 1. Fill missing 'Embarked' with mode =====
full_data['Embarked'] = full_data['Embarked'].fillna(full_data['Embarked'].mode()[0])

# ===== 2. Fill missing 'Fare' in test with median by Pclass & Embarked =====
full_data['Fare'] = full_data.groupby(['Pclass', 'Embarked'])['Fare'] \
                             .apply(lambda x: x.fillna(x.median()))

# ===== 3. Fill missing 'Age' using median of Title & Pclass =====
full_data['Age'] = full_data.groupby(['Title', 'Pclass'])['Age'] \
                            .apply(lambda x: x.fillna(x.median()))

# ===== 4. Fill missing 'Deck' with 'U' (Unknown) =====
full_data['Deck'] = full_data['Deck'].fillna('U')

# ===== 5. Fill missing 'Cabin' with 'Unknown' =====
full_data['Cabin'] = full_data['Cabin'].fillna('Unknown')

# ===== 6. Fill missing 'Ticket_prefix' with 'NONE' =====
full_data['Ticket_prefix'] = full_data['Ticket_prefix'].fillna('NONE')

# ===== 7. Fill missing 'Ticket_number' with -1 =====
full_data['Ticket_number'] = full_data['Ticket_number'].fillna(-1)

# ===== 8. Fill any remaining NaN in categorical columns with 'Unknown' =====
cat_cols = full_data.select_dtypes(include='object').columns
full_data[cat_cols] = full_data[cat_cols].fillna('Unknown')

# ===== 9. Fill any remaining NaN in numeric columns with median =====
num_cols = full_data.select_dtypes(include=[np.number]).columns
full_data[num_cols] = full_data[num_cols].fillna(full_data[num_cols].median())

# ===== 10. Split back into train/test =====
train_processed = full_data.iloc[:len(df_train), :]
test_processed = full_data.iloc[len(df_train):, :]

# Add back target variable
train_processed['Survived'] = df_train['Survived']

# Save cleaned files
train_processed.to_csv("train_clean.csv", index=False)
test_processed.to_csv("test_clean.csv", index=False)


TypeError: incompatible index of inserted column with frame index