In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl


from typing import Literal, Any, Union, Callable


# Global plot styling: light grey grid drawn underneath the plotted artists.
mpl.rcParams.update(
    {
        "axes.grid": True,
        "axes.axisbelow": True,
        "grid.color": "gainsboro",
    }
)

# Load datasets

In [None]:
PATH_TRAIN_DATASET = "../../datasets/train.csv"
PATH_TEST_DATASET = "../../datasets/test.csv"

# Both files are read the same way: the first CSV column becomes the index.
train_df, test_df = (
    pd.read_csv(path, index_col=0)
    for path in (PATH_TRAIN_DATASET, PATH_TEST_DATASET)
)

# Explore train data

In [None]:
# Peek at the first rows to see the available columns and value formats.
train_df.head()

From Kaggle:

**pclass**: A proxy for socio-economic status (SES)  
&emsp; 1st = Upper  
&emsp; 2nd = Middle  
&emsp; 3rd = Lower

**age**: Age is fractional if less than 1. If the age is estimated, it is in the form of xx.5.

**sibsp**: The dataset defines family relations in this way...  
&emsp; Sibling = brother, sister, stepbrother, stepsister  
&emsp; Spouse = husband, wife (mistresses and fiancés were ignored)

**parch**: The dataset defines family relations in this way...  
&emsp; Parent = mother, father  
&emsp; Child = daughter, son, stepdaughter, stepson  
&emsp; Some children travelled only with a nanny, therefore parch=0 for them.

**embarked**: Port of Embarkation  
&emsp; C = Cherbourg  
&emsp; Q = Queenstown  
&emsp; S = Southampton

In [None]:
# Column dtypes and non-null counts, used to spot columns with missing values.
train_df.info()

`Age`, `Cabin` and `Embarked` columns have missing values.

`Name`, `Sex`, `Ticket`, `Cabin` and `Embarked` columns contain string values.

In [None]:
# Number of distinct ticket values; compare it with the row count to judge cardinality.
train_df.Ticket.unique().size

In [None]:
# Summary statistics of the training columns.
train_df.describe()

In [None]:
# One histogram per column of the training frame; `axs` keeps the returned axes.
axs = train_df.hist(figsize=(10, 7))

In [None]:
def cmap_hist_by(
    *,
    value: np.ndarray,
    patches: mpl.container.BarContainer,
    ax: mpl.axes.Axes,
    clb_title: str,
    cm: str = "seismic_r",
):
    """Colour histogram bars by a per-bar value and attach a colorbar.

    Parameters
    ----------
    value
        One number per bar; each is passed straight to the colormap to pick
        that bar's face colour.
    patches
        The bars to recolour, paired with ``value`` in order.
    ax
        Axes the colorbar is attached to.
    clb_title
        Label written on the colorbar.
    cm
        Name of the matplotlib colormap to use.
    """
    colormap = plt.get_cmap(cm)

    # Bars and values are paired positionally; zip stops at the shorter one.
    for bar_value, bar in zip(value, patches):
        bar.set_facecolor(colormap(bar_value))

    # The mappable carries only the colormap (no explicit norm is given).
    colorbar = plt.colorbar(mpl.cm.ScalarMappable(cmap=colormap), ax=ax)
    colorbar.set_label(clb_title)

In [None]:
def get_survived_cuts_mean(
    df: pd.DataFrame,
    column_cutted: str,
    bins: Union[int, list],
    retbins: bool = True,
) -> Union[np.ndarray, tuple]:
    """Mean of ``Survived`` inside each bin of a numeric column.

    Parameters
    ----------
    df
        Frame holding both ``column_cutted`` and a ``Survived`` column.
    column_cutted
        Name of the column to split into bins.
    bins
        Forwarded to ``pd.cut``: either a number of equal-width bins or an
        explicit sequence of bin edges.
    retbins
        When true (default) the bin edges are returned alongside the means.

    Returns
    -------
    np.ndarray or (np.ndarray, np.ndarray)
        One mean per bin, in bin order (``NaN`` for a bin with no rows), and,
        if ``retbins`` is true, the array of bin edges.
    """
    # Always request the edges so the unpacking below is valid whatever the
    # caller passed for `retbins`; that flag only controls what is returned.
    survived_cuts, survived_bins = pd.cut(
        df[column_cutted], bins=bins, right=False, retbins=True
    )

    # observed=False keeps empty bins in the result, so the means stay aligned
    # one-to-one with the bins regardless of the pandas default.
    survived_cuts_mean = (
        df.groupby(survived_cuts, observed=False).Survived.mean().to_numpy()
    )

    if retbins:
        return survived_cuts_mean, survived_bins
    return survived_cuts_mean

In [None]:
train_df_numeric = train_df.select_dtypes("number")
_columns = train_df_numeric.columns.drop("Survived")

# 3x2 grid of histograms; the bottom-right panel is switched off
# (presumably there are five numeric feature columns, leaving one axes unused).
fig, axs = plt.subplots(3, 2, figsize=(2 * 5, 3 * 3.1), sharey=True)
axs[2, 1].set_axis_off()

for ax, col in zip(axs.flatten(), _columns):
    # Bin the column once, then reuse the same edges for the histogram so each
    # bar matches exactly one survival-rate value.
    survived_cuts_mean, survived_bins = get_survived_cuts_mean(train_df, col, bins=10)
    n, bins, patches = ax.hist(train_df_numeric[col], bins=survived_bins)

    cmap_hist_by(
        value=survived_cuts_mean, patches=patches, ax=ax, clb_title="Survived"
    )

    ax.set(ylabel="Number of people", xlabel=col)

In [None]:
fig, axs = plt.subplots(1, 2, figsize=(2 * 6, 4), sharey=True)

# Each array has one row per category: column 0 is the label, column 1 the
# mean of `Survived` for that category.
sex_survived_mean = train_df.groupby("Sex", as_index=False).Survived.mean().to_numpy()
embarked_survived_mean = (
    train_df.groupby("Embarked", as_index=False).Survived.mean().to_numpy()
)

panels = {"Sex": sex_survived_mean, "Embarked": embarked_survived_mean}

for ax, (feature, rates) in zip(axs.flatten(), panels.items()):
    ax.bar(rates[:, 0], rates[:, 1])
    # Name the grouping column on the x-axis, as the histogram grid does.
    ax.set_xlabel(feature)
    # The plotted value is a mean of 0/1 labels, so fix the y-range to [0, 1].
    ax.set_ylim(0, 1)

# The y-axis is shared, so one label on the left panel is enough.
axs[0].set_ylabel("Survived")

# Pre-process data to feed ML model

`Cabin` has so many missing values that the best approach is probably to <font color='red'>remove</font> this column.

`Name` holds no critical information, so the best approach is to <font color='red'>remove</font> this column.  

`Ticket` has 681 unique values among the 891 rows, so we are unlikely to find any correlation between  
survival and ticket values; the best approach is to <font color='red'>remove</font> this column.  

`Embarked` has 2 missing values. We will fill these values with the <font color='yellow'>most frequent value</font> for  
this column.  

`Age` has almost 200 missing values. We'll fill these values with the <font color='yellow'>median age</font>.

`Sex` and `Embarked` are columns containing categorical values. We need to convert these  
categories to numbers using an encoder.

The Ordinal Encoder does not seem like a good idea, since neither `Sex` nor `Embarked` has a relation  
of _order_ or _scale_. We will stick to the <font color="yellow">One Hot Encoder</font>.

In [None]:
# Columns dropped from the data before modelling.
COLUMNS_TO_REMOVE = ["Name", "Cabin", "Ticket"]
# Columns whose missing values are replaced by the column's most frequent value.
COLUMNS_TO_FILL_MISSING_VALUES_MODE = ["Embarked"]
# Columns whose missing values are replaced by the column's median.
COLUMNS_TO_FILL_MISSING_VALUES_MEDIAN = ["Age"]


def fillna(
    df: pd.DataFrame,
    subset: Union[str, list],
    method: Literal["median", "mode"] = "median",
) -> pd.DataFrame:
    """Return a copy of ``df`` with missing values in ``subset`` filled in.

    Parameters
    ----------
    df
        Input frame. It is not modified.
    subset
        A column name or a list of column names to fill.
    method
        ``"median"`` fills each column with its median; ``"mode"`` fills each
        column with its most frequent value.

    Returns
    -------
    pd.DataFrame
        A new frame with the selected columns filled.

    Raises
    ------
    ValueError
        If ``method`` is neither ``"median"`` nor ``"mode"``.
    """
    selected = df[subset]

    if method == "median":
        fill_value = selected.median()
    elif method == "mode":
        # mode() can return several rows on ties; the first row is used.
        fill_value = selected.mode().iloc[0]
    else:
        # Fail loudly instead of silently returning the data unfilled.
        raise ValueError(
            f"Unknown fill method {method!r}; expected 'median' or 'mode'."
        )

    # Work on a copy so the caller's frame is never mutated.
    filled = df.copy()
    filled[subset] = selected.fillna(fill_value)
    return filled


# Drop unused columns, then impute: median for the numeric gaps, mode for the
# categorical ones.
train_clean = (
    train_df
    .drop(columns=COLUMNS_TO_REMOVE)
    .pipe(fillna, subset=COLUMNS_TO_FILL_MISSING_VALUES_MEDIAN, method="median")
    .pipe(fillna, subset=COLUMNS_TO_FILL_MISSING_VALUES_MODE, method="mode")
)

# Split the cleaned frame into the target vector and the feature frame.
train_labels = train_clean["Survived"].to_numpy()
train_data = train_clean.drop(columns="Survived")

In [None]:
def transform_columns(
    df: pd.DataFrame,
    column: Union[str, list],
    transformer,
    fit: bool = True,
) -> np.ndarray:
    """Run a transformer over the selected column(s) of ``df``.

    Parameters
    ----------
    df
        Frame holding the data.
    column
        A column name or a list of column names passed to the transformer.
    transformer
        Object exposing ``fit_transform`` and ``transform``.
    fit
        When true (default) the transformer is fitted on this data and then
        applied. Pass ``False`` to reuse a transformer that was already fitted
        on other data (for example the training set), so it is not re-fitted.

    Returns
    -------
    np.ndarray
        Whatever the transformer returns for the selected data (annotated as
        an ndarray; the actual type depends on the transformer).
    """
    selected = df[column]
    if fit:
        return transformer.fit_transform(selected)
    return transformer.transform(selected)


def concatenate_transformed_column(df: pd.DataFrame, column: np.ndarray) -> np.ndarray:
    """Append ``column`` to the right of ``df``'s values as extra columns.

    ``df`` is converted to a NumPy array and ``column`` (one or more columns
    with the same number of rows) is stacked after it, column-wise.
    """
    base_values = df.to_numpy()
    return np.column_stack((base_values, column))

In [None]:
from sklearn.preprocessing import OneHotEncoder

COLUMNS_TO_ENCODE = ["Sex", "Embarked"]

# handle_unknown="ignore": a category not seen during fit does not raise at
# transform time.
# sparse_output=False: return a dense array that can be concatenated with the
# remaining numeric columns. (This keyword was named `sparse` before
# scikit-learn 1.2; the old name was removed in 1.4.)
one_hot_enc = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# Fit the encoder on the training data; the fitted `one_hot_enc` is what should
# later be applied to the test data.
cat_one_hot_train = transform_columns(train_data, COLUMNS_TO_ENCODE, one_hot_enc)
train_data_trans = concatenate_transformed_column(
    train_data.drop(columns=COLUMNS_TO_ENCODE), cat_one_hot_train
)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline model: k-nearest neighbours with the library's default settings.
# NOTE(review): the features are not scaled before fitting, so columns with
# large magnitudes may dominate the distance — confirm this is intended.
knn = KNeighborsClassifier()
knn.fit(train_data_trans, train_labels)

In [None]:
# Column dtypes and non-null counts of the test set, to see which columns need imputing.
test_df.info()

In [None]:
# The test set is median-filled on `Fare` as well as `Age`
# (presumably `Fare` has missing values there — see the `info()` output above).
_COLUMNS_TO_FILL_MISSING_VALUES_MEDIAN = ["Age", "Fare"]

# NOTE(review): the fill values below are computed from the test set itself;
# consider reusing the training-set median/mode instead.
test_data = (
    test_df
    .drop(columns=COLUMNS_TO_REMOVE)
    .pipe(fillna, subset=_COLUMNS_TO_FILL_MISSING_VALUES_MEDIAN, method="median")
    .pipe(fillna, subset=COLUMNS_TO_FILL_MISSING_VALUES_MODE, method="mode")
)

# Reuse the encoder fitted on the training data (transform only, no re-fit) so
# the one-hot columns have the same meaning and order as during training.
cat_one_hot_test = one_hot_enc.transform(test_data[COLUMNS_TO_ENCODE])
test_data_trans = concatenate_transformed_column(
    test_data.drop(columns=COLUMNS_TO_ENCODE), cat_one_hot_test
)

# The model was fitted on `train_data_trans`; the test matrix must have the
# same number of feature columns.
assert test_data_trans.shape[1] == train_data_trans.shape[1], (
    "train/test feature matrices have different numbers of columns"
)

# Show only a few rows instead of the whole matrix.
test_data_trans[:5]

In [None]:
# Predict the label of every row of the transformed test matrix.
predictions = knn.predict(test_data_trans)
predictions

In [None]:
# Pair every test-set index value (the passenger id) with its predicted label.
passenger_ids = test_data.index.to_numpy()
predictions_df = pd.DataFrame(
    {"PassengerId": passenger_ids, "Survived": predictions}
)

predictions_df

In [None]:
# Write the two submission columns to disk, without the frame's index.
PATH_RESULTS = "../../datasets/results.csv"
predictions_df[["PassengerId", "Survived"]].to_csv(PATH_RESULTS, index=False)