In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 

# Switch on seaborn's default theme so the figures drawn below share one look.
sns.set_theme()

## Unzip the Titanic dataset

In [None]:
import zipfile
import os 

# All paths are resolved against the current working directory.
current_dir = os.getcwd()
dataset_path = os.path.join(current_dir, "dataset")
path_to_zip_file = os.path.join(current_dir, "titanic.zip")

# Create the extraction target up front; exist_ok lets the cell be re-run.
os.makedirs(dataset_path, exist_ok=True)

# Unpack every member of the archive into the dataset directory.
with zipfile.ZipFile(path_to_zip_file, mode="r") as archive:
    archive.extractall(path=dataset_path)

## Analyzing the dataset

### Step 1 — Read the dataset

In [None]:
# Load train.csv from the extraction directory; the trailing bare name makes
# the notebook render the frame as the cell output.
train_csv_path = os.path.join(dataset_path, "train.csv")
dataset = pd.read_csv(train_csv_path)
dataset

### Step 2 — Analyze the dataset

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
dataset.info()

In [None]:
# Count the missing values in each column.
dataset.isna().sum()

In [None]:
# Class balance of the target: how many rows fall under each Survived value.
dataset.loc[:, "Survived"].value_counts()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
dataset.describe()

In [None]:
# Pairwise plots of the numeric columns (PassengerId left out), coloured by Survived.
sns.pairplot(data=dataset.drop(columns=["PassengerId"]), hue="Survived")
plt.show()

In [None]:
# One row of plots: every numeric column (PassengerId left out) on the x axis
# against Survived on the y axis.
sns.pairplot(data=dataset.drop(columns=["PassengerId"]), y_vars="Survived")
plt.show()

In [None]:
# Survived split within each Sex category; multiple="fill" stretches every bar
# to full height so the segments read as proportions.
sns.displot(dataset, x="Sex", hue="Survived", multiple="fill", height=5, aspect=1.2)

In [None]:
# Absolute passenger counts per Sex, split by Survived; the title is set on the
# axes returned by seaborn rather than through pyplot's implicit current axes.
ax = sns.countplot(data=dataset, x="Sex", hue="Survived")
ax.set_title("Number of Survived vs Not Survived based on Gender")
plt.show()

### Step 3 — Clean and prepare the dataset for training

In [None]:
# Remove the Cabin column from the working frame.
# The explicit membership test keeps the cell re-runnable without a bare
# `except:`, which would also hide unrelated failures (e.g. `dataset` not being
# defined, or an interrupted kernel) behind the "already removed" message.
if "Cabin" in dataset.columns:
    dataset.drop(columns=["Cabin"], inplace=True)
else:
    print("Already removed cabin column")

# Show the remaining schema to confirm the column is gone.
print(dataset.columns)
dataset.info()

In [None]:
# Survived split across the Age distribution; multiple="fill" stretches every
# bin to full height so the segments read as proportions.
sns.displot(dataset, x="Age", hue="Survived", multiple="fill", height=5, aspect=1.2)

# Remove Age and PassengerId from the working frame, then show what is left.
dataset.drop(columns=["Age", "PassengerId"], inplace=True)
print(dataset.columns)
dataset.info()

In [None]:
# Remove the Ticket column from the working frame.
dataset.drop(columns="Ticket", inplace=True)

In [None]:
# Survived split within each Embarked category; multiple="fill" stretches every
# bar to full height so the segments read as proportions.
sns.displot(dataset, x="Embarked", hue="Survived", multiple="fill", height=5, aspect=1.2)

# Print the current schema and non-null counts.
dataset.info()

In [None]:
# Remove the Embarked column from the working frame.
dataset.drop(columns="Embarked", inplace=True)

In [None]:
# Remove the Name column from the working frame.
dataset.drop(columns="Name", inplace=True)

# Encode Sex numerically. Series.map turns any value that is not a key of the
# mapping into NaN.
dataset["Sex"] = dataset["Sex"].map(
    {
        "male": 0,
        "female": 1,
    }
)
dataset.head()

In [None]:
from sklearn.preprocessing import StandardScaler

# Separate the predictors from the target once instead of re-dropping the
# Survived column for every call below.
feature_frame = dataset.drop(columns=["Survived"])

# Fit the scaler on the predictors and standardise them in a single step.
scaler = StandardScaler()
dataset_normalized = scaler.fit_transform(feature_frame)

# Rebuild a labelled frame from the scaled array. Reusing the original index
# keeps the rows aligned when the unscaled Survived column is attached below,
# even if `dataset` does not carry a default 0..n-1 index.
dataset_normalized_df = pd.DataFrame(
    data=dataset_normalized,
    columns=feature_frame.columns,
    index=feature_frame.index,
)
dataset_normalized_df["Survived"] = dataset["Survived"]
dataset_normalized_df.head()

In [None]:
# Pairwise plots of the standardised features, coloured by Survived.
sns.pairplot(data=dataset_normalized_df, hue="Survived")

In [None]:
# Summary statistics of the processed frame (scaled feature columns plus the Survived column).
dataset_normalized_df.describe()

In [None]:
import pickle

# Save the processed dataset alongside the extracted CSV files.
# NOTE: to_csv writes the row index as an unnamed first column by default.
dataset_normalized_df.to_csv(os.path.join(dataset_path, "processed_dataset.csv"))

# Persist the fitted scaler. The target directory is created first so the cell
# does not fail with FileNotFoundError when "model" does not exist yet.
scaler_path = "model/scaler.pkl"
os.makedirs(os.path.dirname(scaler_path), exist_ok=True)
with open(scaler_path, "wb") as f:
    pickle.dump(scaler, f)