In [25]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier
from sklearn.model_selection import ParameterGrid, train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.impute import KNNImputer

import itertools

from tqdm import tqdm

# Directory holding the Kaggle Titanic CSV files (train.csv / test.csv).
data_dir="/kaggle/input/titanic"

# When False, the model cells skip GridSearchCV and use fixed hyper-parameters instead.
do_grid_search = False

# Load data

In [26]:
# Read the labelled data and hold out 20% of it as a validation split
# (fixed seed so the split is reproducible); the unlabelled test set is read as-is.
titanic_labelled = pd.read_csv(f"{data_dir}/train.csv")
df_train, df_val = train_test_split(titanic_labelled, test_size=0.2, random_state=42)
df_test = pd.read_csv(f"{data_dir}/test.csv")

for frame in (df_train, df_val, df_test):
    display(frame)

# Basic train data info

In [27]:
# Column dtypes and non-null counts of the training split (reveals which columns have gaps).
df_train.info()

In [28]:
# Summary statistics for the numeric columns of the training split.
df_train.describe()

# Age

In [29]:
# Kernel density estimate of passenger age in the training split.
sns.displot(df_train, x="Age", kind="kde")

In [30]:
# Median age per (Pclass, Sex) group; impute_age fills missing ages per the same groups.
df_train.groupby(['Pclass','Sex'])['Age'].median().to_frame()

In [31]:
# Look at two of the (Pclass, Sex) groups among the rows that have no recorded age.
age_missing = df_train["Age"].isnull()
first_class_women = (df_train["Pclass"] == 1) & (df_train["Sex"] == "female")
second_class_men = (df_train["Pclass"] == 2) & (df_train["Sex"] == "male")

display(df_train[first_class_women & age_missing])
display(df_train[second_class_men & age_missing])

In [32]:
def impute_age(df, group_ages=None):
    """Fill missing ``Age`` values with a fixed age per (Pclass, Sex) group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Pclass``, ``Sex`` (still the strings "female"/"male")
        and ``Age`` columns. It is modified in place.
    group_ages : dict, optional
        Mapping ``(pclass, sex) -> age`` used for the fill. When omitted, the
        original hard-coded per-group values are used. Groups absent from the
        mapping are left untouched.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenient reassignment.
    """
    if group_ages is None:
        # Default fill values, one per (Pclass, Sex) combination.
        group_ages = {
            (1, "female"): 35,
            (1, "male"): 41,
            (2, "female"): 28,
            (2, "male"): 30,
            (3, "female"): 22,
            (3, "male"): 26,
        }

    # Groups are disjoint, so the missing-age mask can be computed once up front.
    age_missing = df["Age"].isnull()
    for (pclass, sex), age in group_ages.items():
        in_group = (df["Pclass"] == pclass) & (df["Sex"] == sex)
        df.loc[in_group & age_missing, "Age"] = age

    return df

# Apply the same age imputation to all three splits.
df_train, df_val, df_test = (impute_age(frame) for frame in (df_train, df_val, df_test))

In [33]:
# Spot-check a few rows after imputation.
# NOTE(review): these are hard-coded index labels; .loc raises KeyError if any of them
# is not in the training split — confirm they still exist if the split seed changes.
df_train.loc[[256, 166, 2, 1, 55]]

## Discretize Age

In [34]:
# Age bucket edges; labels=False makes pd.cut return the integer bucket index.
age_bins = [0, 3, 13, 19, 29, 45, 60, np.inf]

# Replace the raw age with its bucket index in every split.
for frame in (df_train, df_val, df_test):
    frame["Age"] = pd.cut(frame["Age"], bins=age_bins, labels=False)

# Fare

In [35]:
# Quintile edges computed from the training fares only.
fare_bins = pd.qcut(df_train["Fare"], q=5, retbins=True)[1]
# Open both outer edges so fares outside the training range still fall into a bin.
fare_bins[[0, -1]] = [-np.inf, np.inf]
fare_bins

In [36]:
# Replace the raw fare with its quintile index (edges come from the training split).
for frame in (df_train, df_val, df_test):
    frame["Fare"] = pd.cut(frame["Fare"], bins=fare_bins, labels=False)

# Sex Encoding

In [37]:
# Encode Sex as an integer flag: female -> 1, male -> 0.
sex_codes = {'female': 1, 'male': 0}
for frame in (df_train, df_val, df_test):
    frame["Sex"] = frame["Sex"].map(sex_codes).astype(int)

# Embarked

In [38]:
# Number of training passengers per embarkation port.
df_train.groupby("Embarked")["PassengerId"].count().reset_index(name="embark_count")

In [39]:
# Missing ports are filled with "S", then ports are encoded as integers.
embarked_fill = {"Embarked": "S"}
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}

df_train, df_val, df_test = (
    frame.fillna(embarked_fill) for frame in (df_train, df_val, df_test)
)

for frame in (df_train, df_val, df_test):
    frame["Embarked"] = frame["Embarked"].map(embarked_codes).astype(int)

# Name

In [40]:
# Pull the honorific out of the name: the word that follows a space and ends with a period.
# Raw string: "\." is not a valid Python string escape, so the non-raw form triggers an
# invalid-escape-sequence warning on recent Python versions.
title_pattern = r" ([A-Za-z]+)\."

for frame in (df_train, df_val, df_test):
    frame["Title"] = frame["Name"].str.extract(title_pattern, expand=False)

df_train["Title"].unique()

In [41]:
# Integer codes for the title groups:
#   Unknown = 0
#   Mr      = 1
#   Miss    = 2
#   Officer = 3
#   Mrs     = 4
#   Master  = 5
#   Royalty = 6   (Rev and Dr also share this code in the mapping below)
title_mapping = {
    "Mr": 1,
    "Miss": 2,
    "Major": 3,
    "Mrs": 4,
    "Master": 5,
    "Rev": 6,
    "Dr": 6,
    "Col": 3,
    "Mlle": 2,
    "Capt": 3,
    "Mme": 4,
    "Ms": 4,
    "Countess": 6,
    "Lady": 6,
    "Unknown": 0,
}

known_titles = list(title_mapping.keys())

# Relabel every title that is not in the mapping (including names where no title was
# extracted) as "Unknown" in ALL splits. The training split must be included too:
# otherwise .map() leaves NaN for its unmapped titles and .astype(int) raises.
for frame in (df_train, df_val, df_test):
    frame.loc[~frame["Title"].isin(known_titles), "Title"] = "Unknown"
    frame["Title"] = frame["Title"].map(title_mapping).astype(int)

# Remove Ticket, Name, Cabin

In [42]:
# Drop the three free-text columns from every split.
text_columns = ["Ticket", "Name", "Cabin"]
df_train, df_val, df_test = (
    frame.drop(columns=text_columns) for frame in (df_train, df_val, df_test)
)

# Add FamilySize Feature

In [43]:
# Family size = siblings/spouses + parents/children + the passenger themself.
for frame in (df_train, df_val, df_test):
    frame["family_size"] = frame["SibSp"] + frame["Parch"] + 1

# Final data check

In [44]:
# Show each processed split followed by its schema summary.
# DataFrame.info() prints its report itself and returns None, so it is called directly;
# wrapping it in display() only added a stray "None" line to the output.
for frame in (df_train, df_val, df_test):
    display(frame)
    frame.info()

# Fill missing Fare in test set

In [45]:
# Test rows whose Fare is still missing (Fare was already replaced by its bin index above).
df_test[df_test["Fare"].isnull()]

In [46]:
# Joint density of (binned) Fare against Pclass in the training split.
sns.displot(df_train, x="Fare", y="Pclass", kind="kde", aspect=3)

In [47]:
# Central-tendency candidates for filling the missing test fare.
train_fare = df_train["Fare"]
print(f"Mean: {train_fare.mean()}")
print(f"Median: {train_fare.median()}")
print(f"Mode: {train_fare.mode()[0]}")

In [48]:
# Median (binned) training fare per passenger class.
pclass_fare_medians = df_train.groupby("Pclass")["Fare"].median()
display(pclass_fare_medians.to_frame())

# Select class 3 by its label rather than by row position, so the lookup does not
# silently pick the wrong class if some class is absent from the training split.
pclass_3_median_train_fare = pclass_fare_medians.loc[3]
print(pclass_3_median_train_fare)

In [49]:
# Fill the missing test Fare with the median (binned) Fare of Pclass 3 in the train data.
df_test = df_test.fillna({"Fare": pclass_3_median_train_fare})

In [50]:
# Class balance of the target in the training split.
df_train['Survived'].value_counts().to_frame()

In [51]:
# Pairwise correlations between all (now numeric) training columns.
fig, ax = plt.subplots(figsize=(14, 10))

sns.heatmap(df_train.corr(), annot=True, ax=ax)

# Split features and target for train, validation & test

In [52]:
# Columns excluded from the feature matrix of the labelled splits.
labelled_exclusions = ["PassengerId", "Survived", "Embarked"]

x_train = df_train.drop(columns=labelled_exclusions)
y_train = df_train["Survived"].to_frame()

print("****************************** TRAIN DATA ***************************\n")
for shown in (x_train, x_train.describe(), y_train):
    display(shown)


x_val = df_val.drop(columns=labelled_exclusions)
y_val = df_val["Survived"].to_frame()

print("\n\n****************************** VAL DATA ***************************\n")
for shown in (x_val, x_val.describe(), y_val):
    display(shown)


# The test set has no target column; passenger ids are kept aside for the submission file.
x_test = df_test.drop(columns=["PassengerId", "Embarked"])
x_passenger_ids = df_test["PassengerId"].to_frame()

print("\n\n****************************** TEST DATA ***************************\n")
for shown in (x_test, x_test.describe(), x_passenger_ids):
    display(shown)


# PCA

In [53]:
# Project the standardised training features onto their first two principal components.
pca = PCA(n_components=2)

principalComponents = pca.fit_transform(StandardScaler().fit_transform(x_train, y_train))

# Reuse x_train's index for the component frame: y_train keeps the shuffled index from
# the train/validation split, and pd.concat(axis=1) aligns on index labels. A fresh
# 0..n-1 index here would pair components with the wrong labels and introduce NaN rows.
principalDf = pd.DataFrame(data=principalComponents, columns=['pca1', 'pca2'], index=x_train.index)

finalPrincipalDf = pd.concat([principalDf, y_train[['Survived']]], axis=1)

sns.scatterplot(data=finalPrincipalDf, x="pca1", y="pca2", hue="Survived", palette="deep")
print(f"Explained variance ration: {pca.explained_variance_ratio_}")

In [54]:
def grid_search(model, params, x_train, y_train):
    """Tune ``model`` with a cross-validated grid search on standardised features.

    The model is wrapped in a ``StandardScaler -> model`` pipeline, so the keys of
    ``params`` must address the model through its pipeline step name
    (e.g. ``'svc__C'``). Scoring is accuracy over a shuffled, seeded 10-fold
    stratified split.

    Parameters
    ----------
    model : estimator to tune.
    params : dict passed to ``GridSearchCV`` as ``param_grid``.
    x_train : feature DataFrame (converted with ``to_numpy()``).
    y_train : target DataFrame (flattened with ``values.ravel()``).

    Returns
    -------
    (score, model) : the best cross-validation score and the second step (the model)
        of the best pipeline found by the search.
    """
    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    search = GridSearchCV(
        make_pipeline(StandardScaler(), model),
        param_grid=params,
        cv=folds,
        scoring="accuracy",
        n_jobs=-1,
        verbose=1,
    )
    search.fit(x_train.to_numpy(), y_train.values.ravel())

    score = search.best_score_
    # Step 0 of the best pipeline is the scaler; step 1 is the tuned model.
    model = search.best_estimator_[1]

    print(f"Training: {score:.4f} {model}")

    return score, model

def evaluate(model, x_train, y_train, x_val, y_val):
    """Fit ``model`` on the scaled training data and score it on the validation data.

    The scaler is fitted on the training features only and then applied to both
    splits. The passed ``model`` object itself is fitted.

    Returns
    -------
    (score, model) : the value of ``model.score`` on the validation split and the
        fitted model.
    """
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(x_train, y_train)
    val_scaled = scaler.transform(x_val)

    model = model.fit(train_scaled, y_train.values.ravel())
    score = model.score(val_scaled, y_val.values.ravel())

    print(f"Evaluation: {score:.4f} {model}")

    return score, model

def get_test_predictions(model, x_train, y_train, x_val, y_val, x_test):
    """Refit ``model`` on the scaled training data and predict the test set.

    ``x_val`` and ``y_val`` are accepted for a signature parallel to the other
    helpers but are not used. The scaler is fitted on the training features only.

    Returns
    -------
    The output of ``model.predict`` on the scaled test features.
    """
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(x_train, y_train)
    test_scaled = scaler.transform(x_test)

    fitted = model.fit(train_scaled, y_train.values.ravel())

    return fitted.predict(test_scaled)

def confusion_matrix(model, x_train, y_train, x_val, y_val):
    """Refit ``model`` on the scaled training data and draw its validation confusion matrix.

    The scaler is fitted on the training features only. Nothing is returned; the
    plot is produced by ``ConfusionMatrixDisplay.from_estimator``.
    """
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(x_train, y_train)
    val_scaled = scaler.transform(x_val)

    fitted = model.fit(train_scaled, y_train.values.ravel())

    ConfusionMatrixDisplay.from_estimator(fitted, val_scaled, y_val)

# Baseline Classifier

In [55]:
# Baseline: a DummyClassifier run through the same CV and validation helpers (empty
# parameter grid), giving a reference score for the real models.
train_score, model = grid_search(DummyClassifier(random_state=42), {}, x_train, y_train)
val_score, model = evaluate(model, x_train, y_train, x_val, y_val)

In [56]:
# Collects one dict per tuned model (type, fitted model, validation and CV scores).
models = []

# KNN

In [57]:
# Search space for KNN; keys use the pipeline step name created inside grid_search.
param_grid = {
    'kneighborsclassifier__n_neighbors': np.arange(1, 200, 1),
    'kneighborsclassifier__weights': ["uniform", "distance"]
}

if do_grid_search:
    train_score, model = grid_search(KNeighborsClassifier(), param_grid, x_train, y_train)
else:
    # No search was run, so there is no CV score for this model. Reset it explicitly;
    # otherwise the score of the previously searched model would be recorded here.
    train_score = np.nan
    model = KNeighborsClassifier(n_neighbors=9)

In [58]:
# Validation accuracy and confusion matrix for the KNN model.
val_score, model = evaluate(model, x_train, y_train, x_val, y_val)
confusion_matrix(model, x_train, y_train, x_val, y_val)

In [59]:
# Record the KNN result for the later ranking / voting step.
knn_entry = {"type": "KNN", "model": model, "val_score": val_score, "train_score": train_score}
models.append(knn_entry)

# Random Forest

In [60]:
# Search space for the random forest; keys use the pipeline step name from grid_search.
param_grid = {
    'randomforestclassifier__max_depth': np.arange(2, 10, 1),
    'randomforestclassifier__n_estimators': np.arange(5, 100, 1)
}

if do_grid_search:
    train_score, model = grid_search(RandomForestClassifier(random_state=42), param_grid, x_train, y_train)
else:
    # No search was run, so there is no CV score for this model. Reset it explicitly;
    # otherwise the score of the previously searched model would be recorded here.
    train_score = np.nan
    model = RandomForestClassifier(max_depth=7, n_estimators=31, random_state=42)

In [61]:
# Validation accuracy and confusion matrix for the random forest.
val_score, model = evaluate(model, x_train, y_train, x_val, y_val)
confusion_matrix(model, x_train, y_train, x_val, y_val)

In [62]:
# Record the random forest result for the later ranking / voting step.
forest_entry = {"type": "RandomForest", "model": model, "val_score": val_score, "train_score": train_score}
models.append(forest_entry)

# GradientBoostingClassifier

In [63]:
# Search space for gradient boosting; keys use the pipeline step name from grid_search.
param_grid = {
    'gradientboostingclassifier__learning_rate': np.arange(0.1, 0.85, 0.15),
    'gradientboostingclassifier__n_estimators': np.arange(1, 80, 2),
    'gradientboostingclassifier__max_depth': np.arange(2, 7, 1)
}

if do_grid_search:
    train_score, model = grid_search(GradientBoostingClassifier(random_state=42), param_grid, x_train, y_train)
else:
    # No search was run, so there is no CV score for this model. Reset it explicitly;
    # otherwise the score of the previously searched model would be recorded here.
    train_score = np.nan
    model = GradientBoostingClassifier(learning_rate=0.7, n_estimators=5, random_state=42)

In [64]:
# Validation accuracy and confusion matrix for gradient boosting.
val_score, model = evaluate(model, x_train, y_train, x_val, y_val)
confusion_matrix(model, x_train, y_train, x_val, y_val)

In [65]:
# Record the gradient boosting result for the later ranking / voting step.
boosting_entry = {"type": "GradientBoosting", "model": model, "val_score": val_score, "train_score": train_score}
models.append(boosting_entry)

# SVM

In [66]:
# Search space for the SVM; keys use the pipeline step name from grid_search.
param_grid = {
    'svc__C': np.arange(0.01, 1, 0.01),
    'svc__gamma':  list(np.arange(0.01, 1, 0.1)) + ['scale', 'auto']
}
if do_grid_search:
    train_score, model = grid_search(SVC(random_state=42), param_grid, x_train, y_train)
else:
    # No search was run, so there is no CV score for this model. Reset it explicitly;
    # otherwise the score of the previously searched model would be recorded here.
    train_score = np.nan
    model = SVC(C=0.35000000000000003, gamma=0.31000000000000005, random_state=42)

In [67]:
# Validation accuracy and confusion matrix for the SVM.
val_score, model = evaluate(model, x_train, y_train, x_val, y_val)
confusion_matrix(model, x_train, y_train, x_val, y_val)

In [68]:
# Record the SVM result for the later ranking / voting step.
svm_entry = {"type": "SVM", "model": model, "val_score": val_score, "train_score": train_score}
models.append(svm_entry)

# AdaBoost

In [69]:
# Search space for AdaBoost; keys use the pipeline step name from grid_search.
param_grid = {
    'adaboostclassifier__n_estimators': np.arange(20, 100, 1),
    'adaboostclassifier__learning_rate': np.arange(0.2, 0.9, 0.1)
}

if do_grid_search:
    train_score, model = grid_search(AdaBoostClassifier(random_state=42), param_grid, x_train, y_train)
else:
    # No search was run, so there is no CV score for this model. Reset it explicitly;
    # otherwise the score of the previously searched model would be recorded here.
    train_score = np.nan
    model = AdaBoostClassifier(learning_rate=0.6, n_estimators=21, random_state=42)


In [70]:
# Validation accuracy and confusion matrix for AdaBoost.
val_score, model = evaluate(model, x_train, y_train, x_val, y_val)
confusion_matrix(model, x_train, y_train, x_val, y_val)

In [71]:
# Record the AdaBoost result for the later ranking / voting step.
adaboost_entry = {"type": "AdaBoost", "model": model, "val_score": val_score, "train_score": train_score}
models.append(adaboost_entry)

In [72]:
def _validation_score(entry):
    """Sort key: the validation score stored in a model record."""
    return entry['val_score']


# Best validation score first, then show the ranking as a table.
models.sort(key=_validation_score, reverse=True)
pd.DataFrame(models)

In [73]:
# (name, estimator) pairs in ranking order, as expected by VotingClassifier.
possible_voting_estimators = [(entry['type'], entry['model']) for entry in models]

# Every odd-sized subset (1, 3 or 5 members) of the candidate estimators.
possible_voting_estimator_combinations = [
    combination
    for subset_size in (1, 3, 5)
    for combination in itertools.combinations(possible_voting_estimators, subset_size)
]

for estimators in possible_voting_estimator_combinations:
    print([name for name, _estimator in estimators])

In [74]:
# Treat the choice of ensemble members as the hyper-parameter: the grid search tries each
# estimator combination built above and keeps the one with the best CV accuracy.
param_grid = {
    'votingclassifier__estimators': possible_voting_estimator_combinations
}

# The empty estimators list is only a placeholder; it is overridden by every grid entry.
train_score, model = grid_search(VotingClassifier(estimators=[]), param_grid, x_train, y_train)

In [75]:
# Validation accuracy and confusion matrix for the selected voting ensemble.
val_score, model = evaluate(model, x_train, y_train, x_val, y_val)
confusion_matrix(model, x_train, y_train, x_val, y_val)

In [76]:
# Predict the test set with the voting ensemble (refitted on the training split inside
# get_test_predictions) and attach the predictions to the passenger ids.
y_test_pred = get_test_predictions(model, x_train, y_train, x_val, y_val, x_test)

x_passenger_ids["Survived"] = y_test_pred
# Sanity check: distribution of predicted classes.
x_passenger_ids['Survived'].value_counts()

In [77]:
# Write the submission file (PassengerId, Survived) without the index column.
# Plain string literal: the f-prefix had no placeholders to format.
x_passenger_ids.to_csv("submission.csv", index=False)