In [None]:
from typing import Sequence, Tuple, Union

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

# Load datasets

In [None]:
# Locations of the two CSV splits, relative to the notebook's working directory.
PATH_TRAIN_DATASET = "../datasets/train.csv"
PATH_TEST_DATASET = "../datasets/test.csv"

# Both splits are read the same way: the first CSV column becomes the index.
train_df, test_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (PATH_TRAIN_DATASET, PATH_TEST_DATASET)
)

# Explore train data

In [None]:
# Peek at the first rows to see which columns the training split contains.
train_df.head()

From Kaggle:

**pclass**: A proxy for socio-economic status (SES)  
&emsp; 1st = Upper  
&emsp; 2nd = Middle  
&emsp; 3rd = Lower

**age**: Age is fractional if less than 1. If the age is estimated, it is in the form of xx.5

**sibsp**: The dataset defines family relations in this way...  
&emsp; Sibling = brother, sister, stepbrother, stepsister  
&emsp; Spouse = husband, wife (mistresses and fiancés were ignored)

**parch**: The dataset defines family relations in this way...  
&emsp; Parent = mother, father  
&emsp; Child = daughter, son, stepdaughter, stepson  
&emsp; Some children travelled only with a nanny, therefore parch=0 for them.

**embarked**: Port of Embarkation  
&emsp; C = Cherbourg  
&emsp; Q = Queenstown  
&emsp; S = Southampton

In [None]:
# Column dtypes and non-null counts, used to spot missing values and non-numeric columns.
train_df.info()

`Age`, `Cabin` and `Embarked` columns have missing values.

`Name`, `Sex`, `Ticket`, `Cabin` and `Embarked` columns contain string values.

In [None]:
# Number of distinct ticket identifiers in the training split.
len(train_df["Ticket"].unique())

In [None]:
# Summary statistics of the training split.
train_df.describe()

In [None]:
# One histogram per column that DataFrame.hist can plot.
axes = train_df.hist(figsize=(10, 7))

# Light grid on every panel, kept behind the bars so it does not cover them.
for ax in axes.flat:
    ax.set_axisbelow(True)
    ax.grid(color="gainsboro")

In [None]:
def cmap_hist_by(
    *,
    value: np.ndarray,
    patches: mpl.container.BarContainer,
    ax: mpl.axes.Axes,
    clb_title: str,
    cm: str = "seismic_r",
):
    """Recolour histogram bars by a per-bar value and attach a colorbar.

    Parameters
    ----------
    value
        One number per bar. Each number is handed to the colormap as-is; no
        normalisation is applied here.
    patches
        The bars to recolour (e.g. the container returned by ``Axes.hist``).
    ax
        Axes next to which the colorbar is drawn.
    clb_title
        Label of the colorbar.
    cm
        Name of the matplotlib colormap to use.

    Notes
    -----
    ``value`` and ``patches`` are paired with ``zip``, so if their lengths
    differ the extra items of the longer one are silently ignored.
    """
    colormap = plt.get_cmap(cm)

    # Paint each bar with the colour its value maps to.
    for bar_value, bar in zip(value, patches):
        bar.set_facecolor(colormap(bar_value))

    # The bars themselves are not a mappable, so the colorbar is built from a
    # stand-alone ScalarMappable that shares the same colormap.
    colorbar = plt.colorbar(mpl.cm.ScalarMappable(cmap=colormap), ax=ax)
    colorbar.set_label(clb_title)

In [None]:
def get_survived_cuts_mean(
    df: pd.DataFrame,
    column_cutted: str,
    bins: Union[int, Sequence[float]],
    retbins: bool = True,
) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
    """Mean of ``Survived`` for the rows of ``df`` grouped into bins of a column.

    Parameters
    ----------
    df
        Frame containing ``column_cutted`` and a ``Survived`` column.
    column_cutted
        Name of the column that is binned with ``pd.cut``.
    bins
        Number of equal-width bins, or explicit bin edges; forwarded to
        ``pd.cut``. Intervals are closed on the left (``right=False``).
    retbins
        If true (default) the bin edges are returned together with the means.

    Returns
    -------
    np.ndarray or (np.ndarray, np.ndarray)
        The per-bin mean of ``Survived`` (one entry per bin, ``NaN`` for bins
        that contain no rows) and, when ``retbins`` is true, the bin edges.
    """
    # Always request the edges from pd.cut: with retbins=False it returns a
    # single object, which cannot be unpacked into (cuts, edges).
    survived_cuts, survived_bins = pd.cut(
        df[column_cutted], bins=bins, right=False, retbins=True
    )

    # observed=False keeps empty bins in the result (as NaN), so the output
    # always has exactly one value per bin and stays aligned with the edges.
    survived_cuts_mean = (
        df.groupby(survived_cuts, observed=False)["Survived"].mean().to_numpy()
    )

    if retbins:
        return survived_cuts_mean, survived_bins
    return survived_cuts_mean

In [None]:
# Numeric feature columns: everything numeric except the target itself.
train_df_numeric = train_df.select_dtypes("number")
_columns = train_df_numeric.columns.drop("Survived")

# 3x2 grid of panels sharing the y axis. The bottom-right panel is switched off
# up front; zip() below stops at the shorter of (panels, feature columns).
fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(2 * 5, 3 * 3.1), sharey=True)
axs[-1, -1].set_axis_off()

for ax, col in zip(axs.flat, _columns):
    # The histogram reuses the edges of the survival bins so that bars and
    # survival means line up.
    survived_cuts_mean, survived_bins = get_survived_cuts_mean(train_df, col, bins=10)
    n, bins, patches = ax.hist(train_df_numeric[col], bins=survived_bins)

    # Colour each bar by the mean of Survived in its bin.
    cmap_hist_by(
        value=survived_cuts_mean, patches=patches, ax=ax, clb_title="Survived"
    )

    ax.set(xlabel=col, ylabel="Number of people")

In [None]:
# Split the passengers by sex; "Sex" becomes the index of each resulting frame.
male, female = (
    train_df.set_index("Sex").loc[sex_label] for sex_label in ("male", "female")
)

In [None]:
# Distinct embarkation values, with missing entries left out.
train_df["Embarked"].dropna().unique()

In [None]:
# Mean of Survived per sex; column 1 of the (Sex, Survived) array holds the means.
train_df.groupby("Sex", as_index=False)["Survived"].mean().to_numpy()[:, 1]

In [None]:
# Mean of Survived per category, as arrays whose rows are (label, mean).
sex_survived_mean = (
    train_df.groupby("Sex", as_index=False)["Survived"].mean().to_numpy()
)
embarked_survived_mean = (
    train_df.groupby("Embarked", as_index=False)["Survived"].mean().to_numpy()
)

# Two side-by-side panels sharing the y axis: by sex (left), by port (right).
fig, axs = plt.subplots(ncols=2, figsize=(2 * 6, 4), sharey=True)

for ax, label_and_mean in zip(axs, (sex_survived_mean, embarked_survived_mean)):
    ax.bar(label_and_mean[:, 0], label_and_mean[:, 1])
    ax.set_ylim(0, 1)
    ax.set_axisbelow(True)  # keep the grid behind the bars
    ax.grid(color="gainsboro")

# The y axis is shared, so only the left panel gets the label.
axs[0].set_ylabel("Survived")

# Pre-process data to feed ML model

`Cabin` has so many missing values that the best approach is probably to <font color='red'>remove</font> this column.  
`Name` holds no critical information, so the best approach is to <font color='red'>remove</font> this column.  
`Ticket` has 681 unique values among the 891 rows, so we probably won't find any correlation between  
survival and the ticket; the best approach is to <font color='red'>remove</font> this column. 

In [None]:
# Columns that are not fed to the model; the same set is removed from both splits.
# Note: both frames are rebound, so re-running this cell once the columns are
# gone raises a KeyError.
_unused_columns = ["Name", "Cabin", "Ticket"]

train_df = train_df.drop(columns=_unused_columns)
test_df = test_df.drop(columns=_unused_columns)

In [None]:
# One encoder per column: refitting a single OrdinalEncoder replaces the
# categories learned for the previous column, which would make it impossible to
# apply the Sex mapping learned on train_df to test_df later on.
sex_encoder = OrdinalEncoder()
embarked_encoder = OrdinalEncoder()

# Despite the "*_ord_encoder" names (kept for compatibility), these variables
# hold the encoded values as 1-D arrays, not the encoder objects.
sex_ord_encoder = sex_encoder.fit_transform(
    train_df.Sex.to_numpy().reshape(-1, 1)  # the encoder expects a 2-D input
).flatten()

# NOTE(review): Embarked has missing values; how OrdinalEncoder treats them
# depends on the scikit-learn version -- consider imputing them first.
embarked_ord_encoder = embarked_encoder.fit_transform(
    train_df.Embarked.to_numpy().reshape(-1, 1)
).flatten()

# Backward-compatible alias: the formerly shared encoder ended up fitted on
# Embarked, so `ord_encoder` keeps referring to that fitted state.
ord_encoder = embarked_encoder

In [None]:
# Display the encoded Embarked values to check the result of the encoding.
embarked_ord_encoder