In [None]:
import pandas as pd
from pathlib import Path as path
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import yaml

def get_config(config_file_path:path=path('../config.yml')) -> dict:
    """Load the project configuration from a YAML file.

    Args:
        config_file_path: Location of the YAML file. Defaults to
            ``../config.yml``, resolved against the current working directory.

    Returns:
        The object produced by ``yaml.safe_load``. Callers in this notebook
        index it as a nested mapping (``settings['paths'][...]``), so the file
        is expected to hold a mapping at the top level.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # locale encoding, which would garble non-ASCII values on some systems.
    with open(config_file_path, encoding='utf-8') as config_file:
        config = yaml.safe_load(config_file)
    return config

# Load the project settings from the default config location and pull out the
# directory that holds the datasets.
settings = get_config()
dataset_path = settings['paths']['dataset_path']

# Echo the resolved location as a quick sanity check.
print(dataset_path)

In [None]:
# Seaborn theme used by every figure below. Valid style names are
# darkgrid, whitegrid, dark, white and ticks.
sns.set_style('darkgrid')

# Global matplotlib font sizes, applied after the seaborn style is set.
plt.rc('axes', titlesize=18, labelsize=14)   # axes title / x and y labels
for tick_axis in ('xtick', 'ytick'):
    plt.rc(tick_axis, labelsize=13)          # tick labels on both axes
plt.rc('legend', fontsize=13)                # legend entries
plt.rc('font', size=13)                      # default text size

In [None]:
# Locate the Titanic training CSV under the configured dataset directory and
# load it into a DataFrame.
train_data_path = path(dataset_path) / 'Titanic' / 'train.csv'
train_df = pd.read_csv(train_data_path)

# Summary statistics as the cell output.
train_df.describe()

In [None]:
# Column dtypes and non-null counts, useful for spotting columns with missing values.
train_df.info()

In [None]:
# One 50-bin histogram per column that DataFrame.hist can plot, on a large canvas.
train_df.hist(figsize=(20, 15), bins=50)

In [None]:
# Pairwise correlations between the numeric columns only. The frame also
# holds string columns (e.g. 'Sex'); on pandas >= 2.0 calling .corr() on them
# raises instead of silently dropping them, so select the numeric columns
# explicitly. This form behaves the same on older pandas versions too.
heatmap_data = train_df.select_dtypes(include='number').corr()

# Correlation of every numeric feature with the target column.
print(heatmap_data['Survived'])

names = heatmap_data.columns.to_list()

fig,ax = plt.subplots(figsize=(12,10))
# Draw on the Axes created above rather than relying on the implicit current Axes.
ax = sns.heatmap(heatmap_data,cbar_kws={"orientation": "vertical"},cmap='viridis',ax=ax)
plt.xticks(rotation=45)
plt.yticks(rotation='horizontal')

In [None]:
from pandas.plotting import scatter_matrix

# Names of every column that is neither object- nor bool-typed.
numerical_attrs = train_df.select_dtypes(exclude=['object', 'bool']).columns.to_list()

# Pairwise scatter plots of those columns, with a KDE curve on the diagonal.
scatter_matrix(
    train_df[numerical_attrs],
    diagonal='kde',
    figsize=(15, 12),
)
plt.suptitle('Scatter Matrix of Numerical Features')

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Encode 'Sex' and 'Survived' as integer codes.
# NOTE(review): with OrdinalEncoder's default settings the codes presumably
# follow sorted category order (female -> 0, male -> 1) -- confirm via
# ordinal_encoder.categories_.
ordinal_encoder = OrdinalEncoder()
encoded_df = ordinal_encoder.fit_transform(train_df[['Sex', 'Survived']])
# Reuse train_df's index so the column-wise concat below aligns row-for-row
# even if train_df does not carry a default 0..n-1 index. A fresh RangeIndex
# would otherwise misalign rows and introduce NaNs.
encoded_df = pd.DataFrame(encoded_df, index=train_df.index)

# Swap the original columns for their encoded counterparts.
ordinal_encoded_df = (pd.concat([train_df,encoded_df], axis=1)
                    .drop(['Sex', 'Survived'],axis=1)
                    .rename(columns={0:'Sex_enc',1:'Survived_enc'})
                    .astype({'Sex_enc':int,'Survived_enc':int}))

In [None]:
# Preview the first five rows to check the new *_enc columns.
ordinal_encoded_df.head(n=5)

In [None]:
# Same per-column histograms as before, now including the encoded columns.
ordinal_encoded_df.hist(figsize=(20, 15), bins=50)

In [None]:
# Bar-style histogram of how many passengers travelled in each class.
fig,ax = plt.subplots(figsize=(12,10))

# No `palette` here: seaborn only applies a palette when a `hue` variable is
# assigned and otherwise ignores it with a warning. The plot is drawn on the
# Axes created above instead of the implicit current Axes.
ax = sns.histplot(data=ordinal_encoded_df, x='Pclass', stat='count', discrete=True, ax=ax)

ax.set(title='Passengers per Class',
        xticks = np.arange(1,4,1),
        xticklabels = [f'Pclass {x}' for x in range(1,4,1)],
        xlabel = None,  # the tick labels already name the classes
        xlim = (0,4),
        ylabel = 'Passengers',
        yticks = (np.arange(0,500,50))
        )

plt.show()

In [None]:
# 2x3 grid with one column per passenger class (1-3).
#   row 0: survived vs deceased counts
#   row 1: the same counts split by encoded sex, bars drawn side by side
fig,axs = plt.subplots(figsize=(24,10), nrows=2, ncols=3, sharey=True, tight_layout=True, dpi=750)

# Only these histplot arguments differ between the two rows; everything else
# is shared, so the plotting call is written once.
row_hist_kwargs = [
    {'hue': 'Survived_enc'},
    {'hue': 'Sex_enc', 'multiple': 'dodge', 'shrink': 0.7},
]

for idx_row, hist_kwargs in enumerate(row_hist_kwargs):
    for idx_column in range(axs.shape[1]):
        pclass = idx_column + 1
        panel = axs[idx_row, idx_column]
        # Build the mask from the frame being plotted rather than from
        # train_df, so the filter cannot drift out of sync with the data.
        class_rows = ordinal_encoded_df[ordinal_encoded_df['Pclass'] == pclass]
        sns.histplot(data=class_rows, x='Survived_enc',
                     stat='count', discrete=True, palette='deep', alpha=1,
                     ax=panel, **hist_kwargs)
        panel.set(title=f'Class {pclass} Survived vs Deceased',
                  xticks = np.arange(0,2,1),
                  ylabel = 'Passenger count',
                  )

filename = "PClass_Graphs.png"
overwrite = False

# Write the figure when no file exists yet, or when overwriting is explicitly
# requested. The previous guard (`exists() and overwrite`) could never create
# the file in the first place.
if overwrite or not path(filename).exists():
    fig.savefig(path(filename))
plt.show()