# Imports

In [1]:
import numpy as np
import pandas as pd
import os
import sys

import seaborn as sns
import matplotlib.pyplot as plt

# Add the parent folder to sys.path
sys.path.append(os.path.abspath(os.path.join('..')))
import src.data_processing as dpr
import src.feature_engineering as fen

# Load Data

In [2]:
# Read the raw training split (path is relative to the working directory)
# and preview its first five rows.
train_data = pd.read_csv(filepath_or_buffer="../data/raw/train.csv")
train_data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Read the raw test split (path is relative to the working directory)
# and preview its first five rows.
test_data = pd.read_csv(filepath_or_buffer="../data/raw/test.csv")
test_data.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [4]:
# (rows, columns) of each split. The test split has one column fewer because
# it does not contain the `Survived` target.
train_data.shape, test_data.shape

((891, 12), (418, 11))

In [5]:
# Work on independent copies so the frames loaded from disk stay untouched
# while features are added during the exploration below.
titanic_train, titanic_test = train_data.copy(), test_data.copy()

# Data Exploration

### Missing Values and Data Types

In [None]:
# Per-column summary of the training frame: missing and zero counts/shares,
# number of distinct values, mode, dtype and the helper's feature-type label.
dpr.describe_dataframe(titanic_train)

Unnamed: 0,Missing Count,Missing Percentage,Zero Count,Zero Percentage,Distinct Values,Mode,Data Type,Feature Type
PassengerId,0,0.0,0,0.0,891,1,int64,Numerical
Survived,0,0.0,549,0.616162,2,0,int64,Categorical
Pclass,0,0.0,0,0.0,3,3,int64,Categorical
Name,0,0.0,0,0.0,891,"Abbing, Mr. Anthony",object,Categorical
Sex,0,0.0,0,0.0,2,male,object,Categorical
Age,177,0.198653,0,0.0,88,24.0,float64,Numerical
SibSp,0,0.0,608,0.682379,7,0,int64,Categorical
Parch,0,0.0,678,0.760943,7,0,int64,Categorical
Ticket,0,0.0,0,0.0,681,1601,object,Categorical
Fare,0,0.0,15,0.016835,248,8.05,float64,Numerical


: 

In [None]:
# Visualise missing values per column of the training frame.
# show_zeros=True presumably also includes zero-valued entries — confirm in src.data_processing.
dpr.plot_missing_values(titanic_train, show_zeros=True)

In [None]:
# Same per-column summary as for the training frame, now for the test split,
# so missing-value patterns can be compared between the two.
dpr.describe_dataframe(titanic_test)

In [None]:
# Visualise missing values per column of the test frame.
# show_zeros=True presumably also includes zero-valued entries — confirm in src.data_processing.
dpr.plot_missing_values(titanic_test, show_zeros=True)

### Target Distribution

In [None]:
# Plot the distribution of the binary target. The Series and its column name
# are passed separately (target_col is presumably used for labelling — confirm).
dpr.plot_binary_target_with_stats(titanic_train["Survived"], target_col="Survived")


### Feature Distributions

In [None]:
# Plot the distribution of every column except the identifier and the target.
# NOTE(review): n_cols=3 presumably sets the subplot grid width and
# cat_max_unique=10 the cardinality cut-off for treating a column as
# categorical — confirm against the helper.
dpr.show_feature_distributions(
    titanic_train,
    titanic_train.columns,
    n_cols=3,
    cat_max_unique=10,
    exclude_cols=["PassengerId", "Survived"]
)

### Survival Rate per Feature

Categorical x Target

In [None]:
# Survival rate per category of each categorical feature. The target is passed
# as a separate Series; the target itself and the identifier / free-text
# columns (PassengerId, Name, Ticket, Cabin) are left out.
dpr.plot_survival_rate_categorical(
    titanic_train,
    titanic_train["Survived"],
    exclude_cols=["Survived", "PassengerId", "Name", "Ticket", "Cabin"]
)

Numerical x Target

Age

In [None]:
# Compare the Age distribution of non-survivors (0) and survivors (1)
sns.boxplot(data=titanic_train, x="Survived", y="Age")
plt.title("Boxplot of Age grouped by Survival")
plt.show()

# Median Age per survival group (missing ages are skipped by pandas)
print("Median Age by Survival:")
print(titanic_train.groupby(by="Survived")["Age"].median())

Fare

In [None]:
# Compare the Fare distribution of non-survivors (0) and survivors (1)
sns.boxplot(data=titanic_train, x="Survived", y="Fare")
plt.title("Boxplot of Fare grouped by Survival")
plt.show()

# Median Fare per survival group
print("Median Fare by Survival:")
print(titanic_train.groupby(by="Survived")["Fare"].median())

### Individual Feature Analysis + Feature Creation

##### Name

Extract Title from Name

In [None]:
# Add the `Title` feature (presumably parsed from Name — confirm in
# src.feature_engineering), then plot survival per title.
# NOTE(review): titanic_train is rebound here, so re-running the cell re-applies
# the helper to an already-augmented frame — confirm the helper tolerates that.
titanic_train = fen.extract_title(titanic_train)
dpr.plot_cat_feature_survival(titanic_train, target="Survived", bins_col="Title")

##### Age

In [None]:
# Histogram of Age; judging by the helper's name it shows both absolute
# counts and percentages — confirm in src.data_processing.
dpr.plot_hist_count_and_percent(
    titanic_train,
    x="Age"
)

Outlier Detection

In [None]:
# Detect Age outliers separately within each Survived group.
# show=False presumably suppresses the helper's own output — confirm; the
# returned object is displayed as the cell result instead.
outliers = dpr.detect_outliers(titanic_train, col="Age", group_cols="Survived", show=False)
outliers

Age binning (TODO: not implemented yet)

##### SibSp + Parch

New Feature - Family Size

In [None]:
# get_famtype provides both `FamSize` (plotted here) and `FamType` (plotted in
# the next cell); presumably both are derived from SibSp and Parch — confirm.
titanic_train = fen.get_famtype(titanic_train)
dpr.plot_cat_feature_survival(titanic_train, target="Survived", bins_col="FamSize")

New Feature - Family Type

In [None]:
# Survival per family-type category; relies on the `FamType` column created
# by fen.get_famtype in the previous cell.
dpr.plot_cat_feature_survival(titanic_train, target="Survived", bins_col="FamType")

##### Ticket

Count Ticket Frequency

In [None]:
# How many passengers share each ticket number (most frequent first)
titanic_train["Ticket"].value_counts()

##### Fare

In [None]:
# Histogram of Fare for passengers with a strictly positive fare; the
# zero-fare rows are examined separately further down.
dpr.plot_hist_count_and_percent(
    titanic_train.loc[titanic_train["Fare"] > 0],
    x="Fare",
)

Divide fare into bins:
- 0
- 0<50
- 50<100
- 100<200
- 200<300
- 300+

In [None]:
# Bin Fare into the ranges listed above; the helper adds the `FareBinned`
# column that the cross-feature analysis reuses later.
titanic_train = fen.bin_fare(titanic_train)  # create FareBinned column
# Name the target and the binned column explicitly, like every other
# plot_cat_feature_survival call in this notebook, so the plot does not
# silently depend on the helper's default arguments.
dpr.plot_cat_feature_survival(titanic_train, target="Survived", bins_col="FareBinned")


TODO: consider applying a log transform to Fare?

Zero Values

In [None]:
# Inspect the passengers recorded with a fare of exactly zero (at most 30 rows)
titanic_train.loc[titanic_train["Fare"] == 0].head(30)

In [None]:
# Share of zero-fare passengers who survived, rounded to two decimals
print(
    "Mean survival chance for passengers with fare=0:",
    round(titanic_train.loc[titanic_train["Fare"] == 0, "Survived"].mean(), 2),
)

Outlier Detection

In [None]:
# Fare distribution of non-survivors (0) vs. survivors (1)
sns.boxplot(data=titanic_train, x="Survived", y="Fare")
plt.title("Boxplot of Fare grouped by Survival")
plt.show()

# Median Fare for each survival group
print("Median Fare by Survival:")
print(titanic_train.groupby(by="Survived")["Fare"].median())

# Fare outliers detected separately within each survival group; the result is
# kept in `outliers` and printed in full by the next cell.
outliers = dpr.detect_outliers(titanic_train, col="Fare", group_cols="Survived", show=False)

In [None]:
# Print the Fare outliers found in the previous cell via to_string()
# (assumes `outliers` is a pandas object, where this renders the complete
# table instead of the truncated default repr — confirm).
print(outliers.to_string())

In [None]:
# Stacked Fare histograms, one facet per Sex, coloured by survival outcome
g = sns.displot(
    titanic_train,
    x="Fare",
    col="Sex",                      # one panel per Sex
    hue="Survived",
    palette={0: "red", 1: "green"},
    multiple="stack",               # survived/died stacked within each bin
    bins=50,
    height=5,
    aspect=1,
)

g.set_titles("Sex: {col_name}")
g.set_axis_labels("Fare", "Count")
# NOTE(review): `_legend` is a private FacetGrid attribute; kept as in the
# original so behaviour does not depend on the installed seaborn version.
g._legend.set_title("Survived")
plt.show()


##### Cabin

Extract Deck

In [None]:
# Add the `Deck` feature (presumably the letter prefix of Cabin — confirm in
# src.feature_engineering), then plot survival per deck.
titanic_train = fen.extract_deck(titanic_train)
dpr.plot_cat_feature_survival(titanic_train, target="Survived", bins_col="Deck")

In [None]:
# Add the `NumCabins` feature (presumably the number of cabins listed in the
# Cabin field — confirm), then tabulate how often each value occurs.
titanic_train = fen.extract_num_cabins(titanic_train)
titanic_train["NumCabins"].value_counts()

In [None]:
# Survival per `NumCabins` value; relies on the column created in the previous cell.
dpr.plot_cat_feature_survival(titanic_train, target="Survived", bins_col="NumCabins")

### Cross Feature Analysis

In [None]:
# Number of passengers (non-missing Fare values) per port of embarkation and class
print(
    titanic_train
    .groupby(by=["Embarked", "Pclass"])["Fare"]
    .count()
)


In [None]:
# Passenger counts per port of embarkation, split by survival outcome,
# with one panel per Sex.
sns.catplot(
    data=titanic_train,
    kind="count",
    x="Embarked",
    hue="Survived",
    col="Sex",
    height=5,
    aspect=1,
)
plt.show()


In [None]:
# Survival rate (mean of Survived) for every Sex x Embarked combination
ct = pd.crosstab(
    titanic_train["Sex"],
    titanic_train["Embarked"],
    values=titanic_train["Survived"],
    aggfunc="mean",
)
sns.heatmap(data=ct, annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Survival rate by Sex and Embarked")
plt.show()

# Group sizes behind the rates above: "count" tallies the passengers in each
# Sex x Embarked cell (it is NOT a survival rate), which shows how much data
# each heatmap value rests on.
pd.crosstab(
    titanic_train["Sex"],       # rows
    titanic_train["Embarked"],  # columns
    values=titanic_train["Survived"],
    aggfunc="count",
)

In [None]:
# Survival rate (mean of Survived) for every Sex x Pclass combination
ct = pd.crosstab(
    titanic_train["Sex"],
    titanic_train["Pclass"],
    values=titanic_train["Survived"],
    aggfunc="mean",
)
sns.heatmap(data=ct, annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Survival rate by Sex and Pclass")
plt.show()

# Group sizes behind the rates above: "count" tallies the passengers in each
# Sex x Pclass cell (it is NOT a survival rate).
pd.crosstab(
    titanic_train["Sex"],     # rows
    titanic_train["Pclass"],  # columns
    values=titanic_train["Survived"],
    aggfunc="count",
)


In [None]:
# Survival rate (mean of Survived) for every Sex x FareBinned combination
ct = pd.crosstab(
    titanic_train["Sex"],
    titanic_train["FareBinned"],
    values=titanic_train["Survived"],
    aggfunc="mean",
)
sns.heatmap(data=ct, annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Survival rate by Sex and FareBinned")
plt.show()

# Group sizes behind the rates above: "count" tallies the passengers in each
# Sex x FareBinned cell (it is NOT a survival rate).
pd.crosstab(
    titanic_train["Sex"],         # rows
    titanic_train["FareBinned"],  # columns
    values=titanic_train["Survived"],
    aggfunc="count",
)


In [None]:
# Survival rate (mean of Survived) for every Sex x FamType combination
ct = pd.crosstab(
    titanic_train["Sex"],
    titanic_train["FamType"],
    values=titanic_train["Survived"],
    aggfunc="mean",
)
sns.heatmap(data=ct, annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Survival rate by Sex and FamType")
plt.show()

# Group sizes behind the rates above: "count" tallies the passengers in each
# Sex x FamType cell (it is NOT a survival rate).
pd.crosstab(
    titanic_train["Sex"],      # rows
    titanic_train["FamType"],  # columns
    values=titanic_train["Survived"],
    aggfunc="count",
)


In [None]:
# Survival rate (mean of Survived) for every Sex x Title combination
ct = pd.crosstab(
    titanic_train["Sex"],
    titanic_train["Title"],
    values=titanic_train["Survived"],
    aggfunc="mean",
)
sns.heatmap(data=ct, annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Survival rate by Sex and Title")
plt.show()

# Group sizes behind the rates above: "count" tallies the passengers in each
# Sex x Title cell (it is NOT a survival rate).
pd.crosstab(
    titanic_train["Sex"],    # rows
    titanic_train["Title"],  # columns
    values=titanic_train["Survived"],
    aggfunc="count",
)


In [None]:
# Survival rate (mean of Survived) for every Sex x Deck combination
ct = pd.crosstab(
    titanic_train["Sex"],
    titanic_train["Deck"],
    values=titanic_train["Survived"],
    aggfunc="mean",
)
sns.heatmap(data=ct, annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Survival rate by Sex and Deck")
plt.show()

# Group sizes behind the rates above: "count" tallies the passengers in each
# Sex x Deck cell (it is NOT a survival rate).
pd.crosstab(
    titanic_train["Sex"],   # rows
    titanic_train["Deck"],  # columns
    values=titanic_train["Survived"],
    aggfunc="count",
)


In [None]:
# Survival rate (mean of Survived) for every Sex x NumCabins combination
ct = pd.crosstab(
    titanic_train["Sex"],
    titanic_train["NumCabins"],
    values=titanic_train["Survived"],
    aggfunc="mean",
)
sns.heatmap(data=ct, annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Survival rate by Sex and NumCabins")
plt.show()

# Group sizes behind the rates above: "count" tallies the passengers in each
# Sex x NumCabins cell (it is NOT a survival rate).
pd.crosstab(
    titanic_train["Sex"],        # rows
    titanic_train["NumCabins"],  # columns
    values=titanic_train["Survived"],
    aggfunc="count",
)

### Age & Sex

Outlier Detection by Age

In [None]:
# Detect Outliers per Survived/Sex group
outliers = dpr.detect_outliers(titanic_train, col="Age", group_cols=["Survived", "Sex"], show=True)

Boxplot

In [None]:
# Age distribution per survival outcome, with one box per Sex inside each group
plt.figure(figsize=(8, 6))
sns.boxplot(data=titanic_train, x="Survived", y="Age", hue="Sex", showfliers=True)
plt.title("Boxplot of Age by Survival and Sex")
plt.xlabel("Survived")
plt.ylabel("Age")
plt.legend(title="Sex")
plt.show()

# Median Age for every (Survived, Sex) combination
print("Median Age by Survival and Sex:")
print(titanic_train.groupby(by=["Survived", "Sex"])["Age"].median())

Distribution

In [None]:
# Age in counts
dpr.plot_feature_survival_hist(titanic_train, feature="Age", percent=False)
# Age in percent
dpr.plot_feature_survival_hist(titanic_train, feature="Age", percent=True)

Female passengers show consistently higher survival proportions across nearly all age groups, while male survival is strongly concentrated among children.

### Fare & Sex

Outlier Detection by Fare

In [None]:
# Detect Fare outliers within each (Survived, Sex) group.
# show=True presumably makes the helper display its findings itself — confirm.
outliers = dpr.detect_outliers(titanic_train, col="Fare", group_cols=["Survived", "Sex"], show=True)

Boxplot

In [None]:
# Fare distribution per survival outcome, with one box per Sex inside each group
plt.figure(figsize=(8,6))
sns.boxplot(x='Survived', y='Fare', hue='Sex', data=titanic_train, showfliers=True)
# The title used to read "Boxplot of Age ..." (copied from the Age cell)
# although the y-axis shows Fare.
plt.title("Boxplot of Fare by Survival and Sex")
plt.ylabel("Fare")
plt.xlabel("Survived")
plt.legend(title="Sex")
plt.show()

# Median Fare for every (Survived, Sex) combination
print("Median Fare by Survival and Sex:")
print(titanic_train.groupby(["Survived", "Sex"])["Fare"].median())

Distribution

In [None]:
# Fare histograms by survival outcome, drawn twice: first with absolute
# counts, then with percentages (toggled via `percent`).
# Fare in counts
dpr.plot_feature_survival_hist(titanic_train, feature="Fare", percent=False)
# Fare in percent
dpr.plot_feature_survival_hist(titanic_train, feature="Fare", percent=True)

Fare > 500

In [None]:
# Passengers with the most extreme fares (strictly above 500)
titanic_train.loc[titanic_train["Fare"] > 500]

200 < Fare < 500

In [None]:
# Male passengers with a fare strictly between 200 and 500
titanic_train.loc[
    (titanic_train["Fare"] > 200)
    & (titanic_train["Fare"] < 500)
    & (titanic_train["Sex"] == "male")
]

In [None]:
# Female passengers with a fare strictly between 200 and 500
titanic_train.loc[
    (titanic_train["Fare"] > 200)
    & (titanic_train["Fare"] < 500)
    & (titanic_train["Sex"] == "female")
]

### Fare & Embarked

In [None]:
# Fare distribution per port of embarkation, with one box per Sex.
# The hue is Sex (not Survived), so the title and the printed label now say
# so; they previously mentioned "Survival", which this cell never uses.
sns.boxplot(x="Embarked", y="Fare", hue="Sex", data=titanic_train)
plt.title("Fare distribution by Embarked and Sex")
plt.show()

# Median Fare for every (Embarked, Sex) combination
print("Median Fare by Embarked and Sex:")
print(titanic_train.groupby(["Embarked", "Sex"])["Fare"].median())
# Group sizes, further broken down by class, to judge how reliable the medians are
print("Passenger count by Embarked, Sex and Pclass:")
print(titanic_train.groupby(["Embarked", "Sex", "Pclass"])["Fare"].count())


In [None]:
# Share of survivors vs. non-survivors within each Fare bin, one facet per
# port of embarkation. multiple="fill" normalises every bin to a total of 1,
# so the y-axis is a proportion rather than a count.
g = sns.displot(
    titanic_train,
    x="Fare",
    col="Embarked",
    hue="Survived",
    palette={0: "red", 1: "green"},
    multiple="fill",
    bins=20,
)
g.set_titles("Port: {col_name}")
g.set_axis_labels("Fare", "Proportion")
plt.show()
