In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, \
    f1_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from torch.utils.data import DataLoader, Dataset
from xgboost import XGBClassifier, XGBRegressor

In [None]:
# Labeled rows: the 'Survived' target is split off from this frame in a later cell.
df = pd.read_csv('./data/Titanic/train.csv')
# NOTE: despite its name, df_train holds the rows read from test.csv, i.e. the
# frame that the fitted classifier predicts on at the end of the notebook.
df_train = pd.read_csv('./data/Titanic/test.csv')
# Keep the ids before 'PassengerId' is dropped; they are combined with the
# predictions when the submission frame is built.
passenger_id = df_train['PassengerId']

In [None]:
# Columns removed from both frames. 'Name' is intentionally kept here because
# a later cell derives the 'MarriedFemale' feature from it before dropping it.
unnecessary_columns = ['PassengerId', 'Ticket']
target_column = 'Survived'

df = df.drop(unnecessary_columns, axis=1)
df_train = df_train.drop(unnecessary_columns, axis=1)

# Separate the label from the feature matrix.
y = df[target_column]
X = df.drop(target_column, axis=1)

In [None]:
# Flag every passenger whose name contains the substring 'Mrs', then discard
# the raw 'Name' column. The same transformation is applied to both frames.
X = (
    X
    .assign(MarriedFemale=X['Name'].map(lambda name: 'Mrs' in name))
    .drop(columns=['Name'])
)

df_train = (
    df_train
    .assign(MarriedFemale=df_train['Name'].map(lambda name: 'Mrs' in name))
    .drop(columns=['Name'])
)

In [None]:
def extract_cabin_level(cabin):
    """Return the first character of the first cabin listed in ``cabin``.

    Parameters
    ----------
    cabin : object
        Raw cabin value. Anything that is not a ``str`` (for example the
        float NaN pandas uses for missing entries) is treated as missing.

    Returns
    -------
    str or None
        The leading character of the first whitespace-separated token, or
        ``None`` when ``cabin`` is not a string or contains no token
        (empty or whitespace-only string).
    """
    if not isinstance(cabin, str):
        return None
    # split() without arguments ignores leading/trailing/repeated whitespace,
    # so ' C85' no longer yields an empty first token (and an IndexError).
    cabins = cabin.split()
    if not cabins:
        return None
    return cabins[0][0]


def extract_cabin_count(cabin):
    """Return how many whitespace-separated cabins are listed in ``cabin``.

    Parameters
    ----------
    cabin : object
        Raw cabin value. Anything that is not a ``str`` (for example the
        float NaN pandas uses for missing entries) is treated as missing.

    Returns
    -------
    int or None
        Number of tokens in ``cabin``, or ``None`` when ``cabin`` is not a
        string or contains no token (empty or whitespace-only string).
    """
    if not isinstance(cabin, str):
        return None
    # split() without arguments collapses runs of whitespace, so stray or
    # doubled spaces are not counted as extra (empty) cabins.
    cabins = cabin.split()
    if not cabins:
        return None
    return len(cabins)

# Derive the two cabin features from the raw 'Cabin' column and then remove
# the raw column. Both frames receive exactly the same treatment.
X = (
    X
    .assign(
        CabinLevel=X['Cabin'].map(extract_cabin_level),
        CabinCount=X['Cabin'].map(extract_cabin_count),
    )
    .drop(columns=['Cabin'])
)

df_train = (
    df_train
    .assign(
        CabinLevel=df_train['Cabin'].map(extract_cabin_level),
        CabinCount=df_train['Cabin'].map(extract_cabin_count),
    )
    .drop(columns=['Cabin'])
)

In [None]:
# Column groups used by the remaining cells: the first list is cast to the
# pandas 'category' dtype, the second to float.
categorical_columns = ['Pclass', 'Sex', 'SibSp', 'Parch', 'CabinLevel', 'CabinCount', 'Embarked', 'MarriedFemale']
numerical_columns = ['Age', 'Fare']

# Apply the identical casts to the labeled frame and to the prediction frame.
for frame in (X, df_train):
    frame[categorical_columns] = frame[categorical_columns].astype('category')
    frame[numerical_columns] = frame[numerical_columns].astype('float')

X.info()

In [None]:
# Percentage of missing values in each column of X.
X.isna().sum(axis=0) / len(X) * 100

In [None]:
# The cabin-derived features are dropped instead of being imputed.
# errors='ignore' and the list rebuild (instead of list.remove) keep this cell
# re-runnable: a second execution no longer raises KeyError / ValueError.
cabin_feature_columns = ['CabinLevel', 'CabinCount']
X = X.drop(columns=cabin_feature_columns, errors='ignore')
df_train = df_train.drop(columns=cabin_feature_columns, errors='ignore')
categorical_columns = [
    column for column in categorical_columns
    if column not in cabin_feature_columns
]

# Fill statistics are computed on the labeled frame X only and then reused for
# df_train (the rows from test.csv), so both frames are preprocessed with the
# same values rather than each with its own mode / median.
for column in categorical_columns:
    fill_value = X[column].mode().iloc[0]
    X[column] = X[column].fillna(fill_value)

    test_values = df_train[column]
    # A categorical column can only be filled with a value it already lists as
    # a category, so register the training mode first when it is absent.
    if (
        isinstance(test_values.dtype, pd.CategoricalDtype)
        and fill_value not in test_values.cat.categories
    ):
        test_values = test_values.cat.add_categories([fill_value])
    df_train[column] = test_values.fillna(fill_value)

for column in numerical_columns:
    fill_value = X[column].median()
    X[column] = X[column].fillna(fill_value)
    df_train[column] = df_train[column].fillna(fill_value)

# # Impute Age
# column = 'Age'

# nan_mask = X[column].isna()
# X_imp_fit = X[~nan_mask].drop(columns=[column])
# y_imp_fit = X[~nan_mask][column]

# imputer = XGBRegressor(enable_categorical=True)
# imputer.fit(X_imp_fit, y_imp_fit)
# X_imp_pred = X[nan_mask].drop(columns=[column])
# y_fill = imputer.predict(X_imp_pred)

# X.loc[nan_mask, column] = y_fill

# # Impute Embarked
# column = 'Embarked'

# encoder = LabelEncoder()

# nan_mask = X[column].isna()
# X_imp_fit = X[~nan_mask].drop(columns=[column])
# y_imp_fit = encoder.fit_transform(X[~nan_mask][[column]])

# imputer = XGBClassifier(enable_categorical=True)
# imputer.fit(X_imp_fit, y_imp_fit)
# X_imp_pred = X[nan_mask].drop(columns=[column])
# y_fill = encoder.inverse_transform(imputer.predict(X_imp_pred)[:, np.newaxis])

# X.loc[nan_mask, column] = y_fill

# # Impute CabinLevel
# column = 'CabinLevel'

# encoder = LabelEncoder()

# nan_mask = X[column].isna()
# X_imp_fit = X[~nan_mask].drop(columns=[column])
# y_imp_fit = encoder.fit_transform(X[~nan_mask][[column]])

# imputer = XGBClassifier(enable_categorical=True)
# imputer.fit(X_imp_fit, y_imp_fit)
# X_imp_pred = X[nan_mask].drop(columns=[column])
# y_fill = encoder.inverse_transform(imputer.predict(X_imp_pred)[:, np.newaxis])

# X.loc[nan_mask, column] = y_fill

# # Impute CabinCount
# column = 'CabinCount'

# encoder = LabelEncoder()

# nan_mask = X[column].isna()
# X_imp_fit = X[~nan_mask].drop(columns=[column])
# y_imp_fit = encoder.fit_transform(X[~nan_mask][[column]])

# imputer = XGBClassifier(enable_categorical=True)
# imputer.fit(X_imp_fit, y_imp_fit)
# X_imp_pred = X[nan_mask].drop(columns=[column])
# y_fill = encoder.inverse_transform(imputer.predict(X_imp_pred)[:, np.newaxis])

# X.loc[nan_mask, column] = y_fill

In [None]:
# Re-assert the column dtypes after imputation.
X[categorical_columns] = X[categorical_columns].astype('category')
X[numerical_columns] = X[numerical_columns].astype('float')

# Cast the prediction frame with the *training* categorical dtypes so that both
# frames share identical category sets (and therefore identical integer codes).
# Casting each frame to 'category' independently derives the categories from
# that frame's own values, which can differ between the two files.
# Values that never occur in X become NaN in df_train.
for column in categorical_columns:
    df_train[column] = df_train[column].astype(X[column].dtype)
df_train[numerical_columns] = df_train[numerical_columns].astype('float')

In [None]:
# Re-check the percentage of missing values in each column of X after imputation.
X.isna().sum(axis=0) / len(X) * 100

In [None]:
# Print the column summary of X after imputation and dtype casting.
X.info()

In [None]:
# Hold out a stratified validation split so the metrics below are measured on
# rows the classifier has not seen (scoring on the training rows themselves
# reports over-optimistic numbers). random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Train classifier on the training split and predict the held-out rows
clf = XGBClassifier(enable_categorical=True)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Probability of the positive class; ROC AUC is a ranking metric and needs
# scores rather than hard 0/1 labels.
y_score = clf.predict_proba(X_test)[:, 1]

# Compute metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_score)
confusion_mat = confusion_matrix(y_test, y_pred)

# Print metrics
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1: {f1}')
print(f'ROC AUC: {roc_auc}')
print('Confusion Matrix:')
print(f'{confusion_mat}')

# Refit on every labeled row so that the classifier used by the following
# cells is trained on the full data set, as before.
clf = XGBClassifier(enable_categorical=True)
clf.fit(X, y)

In [None]:
# Predict labels for df_train, which (despite its name) holds the preprocessed
# rows read from test.csv. This overwrites the y_pred from the evaluation cell.
y_pred = clf.predict(df_train)

In [None]:
# Two-column submission frame: the saved passenger ids followed by the
# predicted 'Survived' label for each of them.
df_submission = passenger_id.to_frame().assign(Survived=y_pred)

In [None]:
# Write the submission file without the index column. The output directory is
# created first because to_csv does not create missing parent directories.
submission_path = Path('../results/Titanic/submission.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
df_submission.to_csv(submission_path, index=False)