##### Preparations

In [None]:
# If on kaggle, copy the contents of the "../input/" directory to the cwd of kaggle
def copy_if_kaggle():
    """Copy the first Kaggle input dataset into the working directory.

    Does nothing unless the current working directory is ``/kaggle/working``.
    The datasets under ``../input`` are sorted so the choice is deterministic,
    and an empty input directory is tolerated. Sub-directories are copied
    recursively; plain files are copied as before.
    """
    import os
    import shutil

    if os.getcwd() != "/kaggle/working":
        return

    datasets = sorted(os.listdir("../input"))
    if not datasets:
        # Nothing attached to the notebook, so there is nothing to copy
        return

    # Set the source directory
    src_dir = os.path.join("../input", datasets[0])
    # Copy every entry from the source directory to the current directory
    for entry in os.listdir(src_dir):
        src_path = os.path.join(src_dir, entry)
        if os.path.isdir(src_path):
            # shutil.copy raises on directories, so copy them as a tree
            shutil.copytree(src_path, os.path.join(".", entry), dirs_exist_ok=True)
        else:
            shutil.copy(src_path, ".")


copy_if_kaggle()
# You can write up to 20GB to the cwd (/kaggle/working) that gets preserved as output when you push a version
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


In [None]:
# Import libraries
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import SelectPercentile
from sklearn.svm import LinearSVC
from sklearn import set_config
set_config(transform_output="pandas")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mplticker
%matplotlib inline
import seaborn as sns
import plotly.io as pio
pio.templates.default = "plotly_dark"
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
from scipy.spatial.distance import squareform
from statsmodels.stats.outliers_influence import variance_inflation_factor
import optuna
# Disable trial logging
# optuna.logging.set_verbosity(optuna.logging.WARNING)
import re

import warnings
warnings.filterwarnings('ignore')
import os
os.environ["PYTHONWARNINGS"] = "ignore"

# %%capture --no-stdout --no-display
# warnings.simplefilter("ignore")

In [None]:
# Plot styling
def my_dark_style():
    """Apply the notebook's dark matplotlib theme by updating the global rcParams."""
    from cycler import cycler

    background = "#23272e"
    palette = [
        "#1c90d4",
        "#ad0026",
        "#530fff",
        "#429900",
        "#d55e00",
        "#ff47ac",
        "#42baff",
        "#009e73",
        "#fff133",
        "#0072b2",
    ]

    # Start from matplotlib's defaults, then layer the two built-in styles
    plt.style.use("default")
    plt.style.use(["dark_background", "bmh"])
    plt.rcParams.update(
        {
            "axes.facecolor": background,
            "figure.facecolor": background,
            "axes.prop_cycle": cycler("color", palette),
            # "figure.figsize": (9, 7),  # left disabled, as in the original
            "figure.autolayout": True,
        }
    )


# Light-grey colour for every box-plot artist; unpacked into plotting calls as **box_kws
box_kws = {
    f"{artist}props": {prop: "#b2b2b2"}
    for artist, prop in (
        ("box", "edgecolor"),
        ("cap", "color"),
        ("flier", "markeredgecolor"),
        ("median", "color"),
        ("whisker", "color"),
    )
}
# # plt.rcParams['boxplot.boxprops.edgecolor'] = '#b2b2b2' no such rcParam
# plt.rcParams['boxplot.capprops.color'] = '#b2b2b2'
# plt.rcParams['boxplot.flierprops.markeredgecolor'] = '#b2b2b2'
# plt.rcParams['boxplot.medianprops.color'] = '#b2b2b2'
# plt.rcParams['boxplot.whiskerprops.color'] = '#b2b2b2'

my_dark_style()


In [None]:
# Load dataset
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")


def joint_data():
    """Return the current train and test frames stacked row-wise, train first.

    Row labels are kept as they are, so labels repeat across the two parts.
    """
    frames = [train_data, test_data]
    return pd.concat(frames)


all_data = joint_data()


# Keep the test-set passenger IDs aside; pop() also removes the column from test_data
PassengerId = test_data.pop("PassengerId")
del train_data["PassengerId"]

# Initializing random seed (integer) and/or state (instance)
# pass seed to CV splitters (KFold, RepeatedStratifiedKFold, etc.)
seed = 42
# pass rng to estimators and everything else;
# initialize a new rng for each estimator in order to prevent them from influencing each other by consuming the RNG
# rng = np.random.RandomState(seed)
# rng = np.random.default_rng(seed) # new numpy random Generator, not currently supported by sklearn

# If an integer is passed, calling fit or split multiple times always yields the same results.
# If a RandomState instance is passed: fit and split will yield different results each time they are called, and the succession of calls explores all sources of entropy.

### Overview

Nominal: `Survived, Sex, Embarked, Ticket`  
Ordinal: `Pclass`  
Continuous: `Age, Fare`  
Discrete: `SibSp, Parch` 

`Survived` - 0 = No, 1 = Yes  
`Pclass` is the ticket class - 1 = 1st, 2 = 2nd, 3 = 3rd  
`SibSp` is the number of siblings / the number of spouses aboard the Titanic  
`Parch` is the number of parents / the number of children aboard the Titanic  
`Embarked` is the port of embarkation	- C = Cherbourg, Q = Queenstown, S = Southampton  

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
train_data.info()
train_data.sample(3)


In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
test_data.info()
test_data.sample(3)


In [None]:
# Summary statistics for every training column (include="all" covers non-numeric ones too)
train_data.describe(include="all")


### Exploratory Data Analysis and Data Cleaning

##### Basic visualizations

Passengers with first-class tickets had the highest survival rate, while those in third class had the lowest survival rate.

In [None]:
# Passenger counts per ticket class, split by survival
sns.countplot(data=train_data, x="Pclass", hue="Survived")


 The survival rate of females was significantly higher than that of men.

In [None]:
# Passenger counts per sex, split by survival
sns.countplot(data=train_data, x="Sex", hue="Survived")


 Passengers with no siblings/spouses/parents/children on board with them seem to have had a lower survival rate than those with a few companions.

In [None]:
# One count plot per companion feature, side by side
_, axs = plt.subplots(1, 2, figsize=(12, 5))
for panel, col in zip(axs, ["SibSp", "Parch"]):
    sns.countplot(data=train_data, x=col, hue="Survived", ax=panel)


Those who embarked from the Southampton port had the lowest survival rate. Considering the numbers and order of embarkation (S->C->Q), it is also reasonable to assume that the majority of 3rd class passengers embarked from the Southampton port.

In [None]:
# Passenger counts per port of embarkation, split by survival
sns.countplot(data=train_data, x="Embarked", hue="Survived")


Children and elderly passengers had the highest survival rates, whereas those between the age of 20 and 30 had the lowest chance of surviving.

In [None]:
# Age histogram in 10-year bins over 0-80, split by survival, with a KDE overlay
sns.displot(
    data=train_data,
    x="Age",
    hue="Survived",
    kde=True,
    binrange=(0, 80),
    binwidth=10,
)


Those with the cheapest tickets had the lowest survival rate. The distribution is strongly skewed to the right: most tickets cost below 10, and few cost above 100.

In [None]:
_, axs = plt.subplots(1, 2, figsize=(12, 5))
ax_raw, ax_log = axs
# Arguments shared by both panels
fare_hist_kws = dict(x="Fare", hue="Survived", kde=True)

# Left: raw fares in bins of 50 over 0-600
sns.histplot(
    data=train_data, binwidth=50, binrange=(0, 600), ax=ax_raw, **fare_hist_kws
)
# Right: log-scaled axis; zero fares are filtered out first
sns.histplot(
    data=train_data[train_data.Fare > 0], log_scale=True, ax=ax_log, **fare_hist_kws
)
ax_raw.set_title("Original")
ax_log.set_title("Log scale")


##### Missing values

There are missing values in the `Age, Fare, Cabin` and `Embarked` features. All imputation of missing values is done in a manner that avoids leakage, so some steps are included in a pipeline.

In [None]:
# Missing-value count per column, one output column per data set
missing_counts = [frame.isna().sum() for frame in (train_data, test_data)]
pd.DataFrame(missing_counts, index=["Train", "Test"]).T


There is only one passenger with a missing `Fare` value. `Fare` is related to `Pclass`, `Embarked` and family size (`Parch` and `SibSp`) features. Median `Fare` value of a third class ticket with S as the port of embarkation for a passenger with no family is a logical choice to fill the missing value with.


In [None]:
# Show the test-set row(s) whose Fare is missing
test_data[test_data["Fare"].isna()]


In [None]:
# Fill the missing test-set fare with the median training fare of comparable
# passengers: third class, embarked at Southampton, no parents/children or
# siblings/spouse aboard. The accompanying text calls for the median; the previous
# code used the mean.
fare_fill_value = (
    train_data.groupby(["Pclass", "Embarked", "Parch", "SibSp"])["Fare"]
    .median()
    .loc[(3, "S", 0, 0)]
)
# Assign the filled column back instead of calling fillna(inplace=True) on a column
# selection, which leaves the frame untouched when pandas copy-on-write is active.
test_data["Fare"] = test_data["Fare"].fillna(fare_fill_value)


There are only 2 missing values for `Embarked`. They are filled with the most frequent value.

In [None]:
# Assign the filled column back: fillna(inplace=True) on a column selection does not
# update the frame when pandas copy-on-write is active.
train_data["Embarked"] = train_data["Embarked"].fillna("S")


`Age` missing values are imputed using the median value grouped by `Pclass` and `Sex` (as they are likely indicators of age)

In [None]:
class AgeImputer(BaseEstimator, TransformerMixin):
    """Fill missing ``Age`` values with the median age of the (sex, ``Pclass``) group.

    The sex column is the first column whose name starts with ``"Sex"``, so both
    a raw ``Sex`` column and a one-hot encoded dummy such as ``Sex_male`` work.

    Attributes set by ``fit``:
        age_median_group: Series of median ages indexed by (sex, ``Pclass``).
        age_median_overall: median age of all fitted rows; used when a group was
            not seen during ``fit`` or had no known ages.
    """

    @staticmethod
    def _sex_column(X):
        """Return the name of the first column whose name starts with ``Sex``."""
        return [col for col in X.columns if re.match(r"^Sex", col)][0]

    def fit(self, X, y=None):
        """Learn the per-group and overall median ages from ``X``; ``y`` is ignored."""
        sex_column = self._sex_column(X)
        self.age_median_group = X.groupby([sex_column, "Pclass"])["Age"].median()
        self.age_median_overall = X["Age"].median()
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` whose missing ``Age`` values are imputed."""
        sex_column = self._sex_column(X)
        X = X.copy()
        # Vectorised lookup of each row's group median, replacing a row-wise apply
        group_keys = pd.MultiIndex.from_frame(X[[sex_column, "Pclass"]])
        group_median = self.age_median_group.reindex(group_keys)
        # Unseen groups, and groups with no known ages, fall back to the overall median
        fill_values = group_median.fillna(self.age_median_overall).to_numpy()
        X["Age"] = X["Age"].where(X["Age"].notna(), fill_values)
        return X


# This instance is reused later as the first step of the modelling pipelines
age_imputer = AgeImputer()

# Age distribution per ticket class and sex
sns.catplot(kind="box", data=train_data, x="Pclass", y="Age", hue="Sex", **box_kws)
plt.show()

# Display the fitted per-group medians
age_imputer.fit(train_data).age_median_group


In [None]:
# `Age` missing values are imputed using the mean value from the nearest neighbors (using `Pclass` and `Sex` as features, as they are likely indicators of age)

In [None]:
# _, axs = plt.subplots(1, 4, figsize=(15, 5))
# for ax, col in enumerate(["Pclass", "Sex"]):
#     sns.boxplot(
#         train_data,
#         x=col,
#         y="Age",
#         ax=axs[ax],
#         # boxprops={"edgecolor": "#b2b2b2"},
#         # capprops={"color": "#b2b2b2"},
#         # flierprops={"markeredgecolor": "#b2b2b2"},
#         # medianprops={"color": "#b2b2b2"},
#         # whiskerprops={"color": "#b2b2b2"},
#         **box_kws
#     )


In [None]:
# train_data[age_na := train_data["Age"].isna()].head(3)


In [None]:
# neighbors = ["Age", "Pclass", "Sex"]
# # Encode `Sex`
# ord_enc = OneHotEncoder(drop="first", sparse_output=False)
# train_data["Sex"] = ord_enc.fit_transform(train_data[["Sex"]])
# test_data["Sex"] = ord_enc.transform(test_data[["Sex"]])
# # Scale
# ss = StandardScaler()
# train_data[neighbors] = ss.fit_transform(train_data[neighbors])
# test_data[neighbors] = ss.transform(test_data[neighbors])
# # Impute `Age`
# knn_imp = KNNImputer(n_neighbors=3, weights="distance")
# train_data[neighbors] = knn_imp.fit_transform(train_data[neighbors])
# test_data[neighbors] = knn_imp.transform(test_data[neighbors])
# # Temporarily format back to previous scale and decode `Sex` back to male/female for interpretability
# for data in [train_data, test_data]:
#     data[neighbors] = ss.inverse_transform(data[neighbors])
#     data[neighbors[1:]] = data[neighbors[1:]].round().astype(int)
#     data["Sex"] = ord_enc.inverse_transform(data[["Sex"]])
# # CV flag


In [None]:
# train_data[age_na].head(3)


There are far too many missing `Cabin` values to meaningfully impute them, however the feature can't be ignored because some of the cabins might have higher survival rates. Another point of interest is that some passengers had multiple cabins (all on the same deck). Those with several cabins paid a significantly higher fare. Some cabin codes are preceded by an F. It is difficult to determine what this means, but judging by the price, the F is not a separate cabin, so it is removed and the record is treated as a single cabin, given the low number of records containing it.

In [None]:
# Passengers (train + test) whose Cabin entry lists more than one cabin code
joint_data().loc[lambda frame: frame["Cabin"].str.split().str.len() > 1].head(10)


In [None]:
# Strip the literal "F " prefix from cabin codes; missing values stay missing
for data in [train_data, test_data]:
    data["Cabin"] = data["Cabin"].str.replace("F ", "", regex=False)


An extra feature `NCabins` is created which contains the cabin count per passenger.

In [None]:
for data in [train_data, test_data]:
    # Count the whitespace-separated cabin codes; passengers without a recorded
    # cabin count as 1. The column is built in one assignment because
    # `data.NCabins.fillna(..., inplace=True)` does not update the frame when
    # pandas copy-on-write is active.
    data["NCabins"] = data["Cabin"].str.split().str.len().fillna(1)

joint_data().NCabins.value_counts()


Passengers with multiple cabins have a higher survival rate compared to those with only one cabin.

In [None]:
# Share of survivors per cabin count (bars normalised to 1)
sns.displot(
    data=train_data, x="NCabins", hue="Survived", discrete=True, multiple="fill"
)
plt.ylabel("Survival Ratio")
# Tick only at whole cabin counts
integer_ticks = mplticker.MultipleLocator(1)
plt.gca().xaxis.set_major_locator(integer_ticks)


In [None]:
# joint_data().groupby("Cabin", as_index=False).Ticket.nunique()[
#     joint_data()
#     .groupby("Cabin", as_index=False)
#     .Ticket.count()["Cabin"]
#     .map(lambda x: len(x.split()), na_action="ignore")
#     > 1
# ]


The first letter of each cabin code corresponds to the deck level where the cabin is located. `Deck` is extracted from the `Cabin` and the missing values simply encoded as "M". This way the missing values can be dealt with as a separate category of the `Deck` feature by the final model.

In [None]:
for data in [train_data, test_data]:
    # Encode missing cabins as "M" and keep only the deck letter (first character).
    # The column is assigned back because fillna(inplace=True) on a column selection
    # does not update the frame when pandas copy-on-write is active.
    data["Cabin"] = data["Cabin"].fillna("M").str[0]
    # Renamed in place so the module-level frames themselves are updated
    data.rename(columns={"Cabin": "Deck"}, inplace=True)

joint_data().Deck.value_counts()


**TODO:** decide whether `Deck` should be treated as an ordinal or a nominal feature.

`Deck` is an ordinal categorical feature. A is the topmost deck, G is the lowest passenger deck, T is the lowest deck of a ship (where the engines and boiler rooms are).  
Passengers on the middle decks B through F had the highest survival rates, whereas those on the lowest passenger deck, and those whose cabin codes are missing, had the lowest survival rate. The only passenger whose cabin was on the Tank Top deck (below the Orlop Deck) did not survive.

In [None]:
# Sort by deck letter so the categories appear alphabetically on the x-axis
deck_sorted = train_data.sort_values(by="Deck")
sns.displot(data=deck_sorted, x="Deck", hue="Survived", multiple="fill")
plt.ylabel("Survival Ratio")


All missing values have been dealt with.

### Feature Engineering

`Family_Size` is created by adding `SibSp, Parch` and 1. Those who traveled entirely alone don't seem to have had the highest chance of survival.

In [None]:
for data in [train_data, test_data]:
    # Siblings/spouses + parents/children + the passenger themselves
    data["Family_Size"] = data["SibSp"].add(data["Parch"]).add(1)

sns.countplot(data=train_data, x="Family_Size", hue="Survived")


`Ticket_Freq` is created by encoding the frequency with which a ticket occurs, which gives an idea of the size of the group in which one was traveling.

In [None]:
# Number of passengers (train + test) sharing each ticket, computed once
ticket_counts = joint_data().groupby("Ticket")["Ticket"].transform("count")
# joint_data() stacks the training rows first, so split at the actual number of
# training rows rather than a hard-coded 891
n_train_rows = len(train_data)
train_data["Ticket_Freq"] = ticket_counts.iloc[:n_train_rows].to_numpy()
test_data["Ticket_Freq"] = ticket_counts.iloc[n_train_rows:].to_numpy()

sns.countplot(train_data, x="Ticket_Freq", hue="Survived")


`Title` is extracted from the name. All titles with less than 10 samples are joined into a separate category 'Other'. Those with the title Mr. had the lowest survival rate. Mrs. had a higher survival rate than Miss.

In [None]:
for data in [train_data, test_data]:
    # Names are split on ", " and then on "."; the title sits between the two
    after_surname = data["Name"].str.split(", ").str[1]
    data["Title"] = after_surname.str.split(".").str[0]
train_data.Title.value_counts()

In [None]:
# Training titles that occur fewer than 10 times are merged into "Other"
title_counts = train_data["Title"].map(train_data["Title"].value_counts())
train_data.loc[title_counts < 10, "Title"] = "Other"
# Test titles that are not among the remaining training titles also become "Other"
is_known_title = test_data["Title"].isin(train_data["Title"])
test_data.loc[~is_known_title, "Title"] = "Other"
train_data.Title.value_counts()

In [None]:
# Passenger counts per title, split by survival
sns.countplot(data=train_data, x="Title", hue="Survived")

In [None]:
# def extract_surname(name):
#     if "(" in name:
#         name_no_bracket = name.split("(")[0]
#     else:
#         name_no_bracket = name
#     family = name_no_bracket.split(",")[0]
#     return family


# for data in [train_data, test_data]:
#     data["Name"] = data.Name.apply(extract_surname)

# train_data["Surname_Freq"] = (
#     joint_data().groupby("Name")["Name"].transform("count")[:891]
# )
# test_data["Surname_Freq"] = (
#     joint_data().groupby("Name")["Name"].transform("count")[891:]
# )

# sns.countplot(train_data, x="Surname_Freq", hue="Survived")


`Name` and `Ticket` are dropped as they are no longer needed and contain no useful information.

In [None]:
for data in [train_data, test_data]:
    # Remove the columns from the frames themselves
    for column in ("Name", "Ticket"):
        del data[column]


It is worth noting that some passengers have a `Fare` of 0. Ismay traveled on a complimentary ticket, as well as his servants (Fry and Harrison), Andrews and the Guarantee Group, and Reuchlin. Assuming that the rest of the passengers who hadn't paid for their tickets also received complimentary tickets, a separate 1/0 (True/False) feature `Complimentary` is created to indicate this.

In [None]:
# Every passenger (train and test) whose Fare is exactly 0
joint_data()[joint_data().Fare == 0]


In [None]:
for data in [train_data, test_data]:
    # 1 when the fare is exactly 0, otherwise 0
    data["Complimentary"] = data["Fare"].eq(0).astype("int64")

joint_data().loc[lambda frame: frame.Fare == 0].sample(3)


`Fare` is transformed to log scale to deal with right skewness.

In [None]:
# Inspect the 50 lowest training fares
train_data.Fare.sort_values().head(50)

In [None]:
for data in [train_data, test_data]:
    # log(1 + Fare), which is defined for the zero fares as well
    data["Log_Fare"] = np.log1p(data["Fare"])

_, axs = plt.subplots(1, 2, figsize=(12, 5))
ax_raw, ax_log = axs
# Left: raw fares in bins of 50 over 0-600
sns.histplot(
    data=train_data,
    x="Fare",
    hue="Survived",
    kde=True,
    binrange=(0, 600),
    binwidth=50,
    ax=ax_raw,
)
# Right: the transformed fares
sns.histplot(data=train_data, x="Log_Fare", hue="Survived", kde=True, ax=ax_log)
ax_raw.set_title("Original")
ax_log.set_title("Transformed to Log scale")

# The raw column is only removed after it has been plotted above
for data in [train_data, test_data]:
    del data["Fare"]


In [None]:
# Reciprocal of (Log_Fare + 1) for the 50 smallest training Log_Fare values
1 / (train_data.Log_Fare.sort_values().head(50) + 1)


The ratio between the price and quantity could prove to be a useful feature, so it is created by dividing the `Log_Fare` by `Ticket_Freq`.

In [None]:
for data in [train_data, test_data]:
    # Log fare divided by the number of passengers sharing the ticket
    data["Log_Fare/Ticket_Freq"] = data["Log_Fare"].div(data["Ticket_Freq"])

sns.histplot(data=train_data, x="Log_Fare/Ticket_Freq", hue="Survived", kde=True)


##### Encoding

Nominal: `Survived, Sex, Deck, Embarked, Title, Complimentary`  
Ordinal: `Pclass`  
Continuous: `Age, Log_Fare, Log_Fare/Ticket_Freq`  
Discrete: `SibSp, Parch, NCabins, Family_Size, Ticket_Freq` 

In [None]:
# Might come in handy
# Column names grouped by measurement type. The nominal list holds the names from
# before one-hot encoding and includes the target `Survived`.
nominal = ["Survived", "Sex", "Deck", "Embarked", "Title", "Complimentary"]
ordinal = ["Pclass"]
continuous = ["Age", "Log_Fare", "Log_Fare/Ticket_Freq"]
discrete = ["SibSp", "Parch", "NCabins", "Family_Size", "Ticket_Freq"]


In [None]:
# Preview the first rows of the stacked train + test frame
joint_data().head()


All features except `Sex, Deck, Embarked` and `Title` are already properly encoded, so only they are encoded (as a one-hot numeric array). Dummy variables include redundant information, so to overcome the Dummy variable Trap, one dummy per categorical variable is dropped. The choice of which dummy variable to drop is arbitrary and doesn't affect the model's overall performance, so the first is dropped automatically by OneHotEncoder. 

In [None]:
feat = ["Sex", "Deck", "Embarked", "Title"]
ohe = OneHotEncoder(drop="first", sparse_output=False)
# The encoder is fitted on the training data only and then reused for the test data
train_dummies = ohe.fit_transform(train_data[feat])
test_dummies = ohe.transform(test_data[feat])
# Replace the raw categorical columns with their dummy columns
train_data = train_data.drop(columns=feat).join(train_dummies)
test_data = test_data.drop(columns=feat).join(test_dummies)
# for data in [train_data, test_data]:
#     data.drop(columns=["Deck_T", "Embarked_Q"], inplace=True)

joint_data().info()


##### Dealing with Multicollinearity

In [None]:
class SelectCorrelationClusters(BaseEstimator, TransformerMixin):
    """Keep one representative feature per cluster of mutually correlated features.

    Constant columns are dropped, the remaining columns are clustered
    hierarchically on ``1 - |correlation|``, and from each cluster the feature
    with the largest summed absolute correlation to its cluster is kept.

    Parameters:
        threshold: distance at which the dendrogram is cut (``fcluster`` with
            ``criterion="distance"``).
        corr_method: method passed to ``DataFrame.corr``.
        linkage_method: method passed to ``scipy.cluster.hierarchy.linkage``.
        optimal_ordering: passed through to ``linkage``.

    Attributes set by ``fit``:
        constant_cols: positions of the constant columns in the input.
        corr_: correlation matrix of the non-constant columns.
        Z: linkage matrix.
        kept_features: positions of the kept columns among the non-constant
            columns, in cluster-id order.
    """

    def __init__(
        self,
        threshold=0.7,
        corr_method="spearman",
        linkage_method="ward",
        optimal_ordering=True,
    ):
        self.threshold = threshold
        self.corr_method = corr_method
        self.linkage_method = linkage_method
        self.optimal_ordering = optimal_ordering

    def fit(self, X, y=None):
        """Cluster the features of ``X`` and choose one per cluster; ``y`` is ignored."""
        X = pd.DataFrame(X)
        # Remove constant columns and calculate correlation
        self.constant_cols = np.where(X.nunique() == 1)[0]
        self.corr_ = X.drop(columns=X.columns[self.constant_cols]).corr(
            method=self.corr_method
        )
        # Convert the correlation matrix to a distance matrix before performing hierarchical clustering
        distance_matrix = 1 - self.corr_.abs()
        self.Z = linkage(
            squareform(distance_matrix, checks=False),
            # Honour the configured method (this used to be hard-coded to "ward")
            self.linkage_method,
            optimal_ordering=self.optimal_ordering,
        )
        # Extract the cluster labels for each feature based on the specified threshold
        clusters = fcluster(self.Z, self.threshold, criterion="distance")
        # Keep the feature with the largest summed absolute correlation in each cluster
        self.kept_features = []
        for cluster_id in np.unique(clusters):
            in_cluster = clusters == cluster_id
            corr_vals = self.corr_.iloc[in_cluster, in_cluster]
            best_feature = corr_vals.abs().sum(axis=1).idxmax()
            # The position must refer to the full correlation matrix, because
            # transform() indexes the full set of non-constant columns with it.
            self.kept_features.append(self.corr_.columns.get_loc(best_feature))
        return self

    def transform(self, X, y=None):
        """Return ``X`` without constant columns, reduced to the kept features."""
        X = pd.DataFrame(X)
        return X.drop(columns=X.columns[self.constant_cols]).iloc[:, self.kept_features]

    def plot(self, annot=True, cmap="RdBu", split=True):
        """Plot the fitted correlation structure.

        With ``split=True`` a dendrogram is drawn next to a heatmap ordered by
        the dendrogram leaves; otherwise a single seaborn clustermap is drawn.
        """
        if not split:
            # Plot clustermap
            sns.clustermap(
                self.corr_.round(2),
                row_linkage=self.Z,
                col_linkage=self.Z,
                cmap=cmap,
                annot=annot,
                annot_kws={"size": 8},
                vmin=-1,
                vmax=1,
                figsize=(15, 12),
                dendrogram_ratio=0.2,
            )
        else:
            # Plot dendrogram with correlation heatmap
            fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(17, 8))
            dendro = dendrogram(
                self.Z,
                labels=self.corr_.columns,
                ax=ax1,
                leaf_rotation=90,
            )
            # Reorder the heatmap so it lines up with the dendrogram leaves
            order = dendro["leaves"]
            sns.heatmap(
                self.corr_.iloc[order, order],
                cmap=cmap,
                annot=annot,
                vmin=-1,
                vmax=1,
                # linewidths=0.01,
                # linecolor="#23272e",
                ax=ax2,
            )
            fig.tight_layout()
        plt.show()


def calculate_vif(X):
    """Return a frame with the variance inflation factor of every column of ``X``.

    A constant column named ``const`` is appended before the computation, so it
    appears in the result as well. The result has the columns ``Variable`` and
    ``VIF``.
    """
    design = X.assign(const=1)
    scores = [
        variance_inflation_factor(design.values, position)
        for position in range(design.shape[1])
    ]
    return pd.DataFrame({"Variable": design.columns, "VIF": scores})


When a dummy variable that represents more than two categories has a high VIF, multicollinearity does not necessarily exist. The variables will always have high VIFs if there is a small portion of cases in the category, regardless of whether the categorical variables are correlated to other variables, so dummy encoded nominal variables are ignored.  
`Family_Size, SibSp` and `Parch` have an infinite VIF score and are perfectly multicollinear. They all have high correlation scores, along with `Ticket_Freq`. Considering their MI scores, `Family_Size` is kept and the rest are dropped, along with `Log_Fare`, which also has high correlation scores with those previously mentioned and with `Log_Fare/Ticket_Freq`, with the exception that it also has high correlation with `Pclass` (which the rest do not), which results in a higher VIF score than `Log_Fare/Ticket_Freq`.

In [None]:
# A copy of the training data with Age already imputed, used only for the
# demonstrations below; in the actual workflow Age is imputed later, during CV
impd_age = age_imputer.fit(train_data).transform(train_data)

In [None]:
# %%capture --no-display
# VIF table and correlation clusters for the non-dummy features
vif_columns = discrete + continuous + ordinal
display(calculate_vif(impd_age[vif_columns]))
SelectCorrelationClusters().fit(impd_age[vif_columns]).plot()

# Mutual information of every feature with the target; only the three listed
# columns are treated as continuous
mi_feature_names = train_data.drop(columns="Survived").columns
mi_continuous = ["Age", "Log_Fare", "Log_Fare/Ticket_Freq"]
mutinf = pd.Series(
    mutual_info_classif(
        impd_age.drop(columns="Survived"),
        train_data["Survived"],
        discrete_features=[col not in mi_continuous for col in mi_feature_names],
        # discrete_features=True,
        random_state=seed,
    ),
    index=mi_feature_names,
).sort_values(ascending=False)
sns.barplot(x=mutinf, y=mutinf.index.astype(str))
plt.xlabel("Mutual Information score")
plt.tight_layout()
plt.show()

In [None]:
# Features removed because of multicollinearity
collinear_features = ["SibSp", "Parch", "Ticket_Freq", "Log_Fare"]
for data in [train_data, test_data]:
    data.drop(columns=collinear_features, inplace=True)

# Might come in handy
nominal = ["Survived", "Sex", "Deck", "Embarked", "Title", "Complimentary"]
ordinal = ["Pclass"]
continuous = ["Age", "Log_Fare/Ticket_Freq"]
discrete = ["NCabins", "Family_Size"]

Now all VIF scores are within reasonable bounds.

In [None]:
# VIF for the reduced discrete + continuous + ordinal feature lists, taken from impd_age
calculate_vif(impd_age[discrete + continuous + ordinal])

##### Reciprocal

Next the multiplicative inverse of ordinal/continuous/discrete features is added.

In [None]:
class ReciprocalTransformer(BaseEstimator, TransformerMixin):
    """Append the reciprocal of every non-binary column as ``<name>_inverse``.

    Columns that contain only 0 and 1 are skipped. Zeros in the remaining
    columns are replaced, before inversion, by a per-column threshold: the
    largest power of ten not exceeding the smallest non-zero magnitude seen
    during ``fit``.
    """

    def fit(self, X, y=None):
        """Choose the columns to invert and learn their zero-replacement thresholds."""
        # Thresholds keyed by column name
        self.threshold = {}
        # A column is binary when every value is 0 or 1 (e.g. one-hot dummies)
        is_binary = np.all((X == 0) | (X == 1), axis=0)
        self.selected_columns = X.loc[:, ~is_binary].columns
        # get_thresholds stores its result on self; apply is used only to iterate columns
        X[self.selected_columns].apply(self.get_thresholds)
        return self

    def transform(self, X, y=None):
        """Return ``X`` joined with the reciprocals of the selected columns."""
        zero_free = X[self.selected_columns].apply(self.transform_zeros)
        return X.join(1 / zero_free, rsuffix="_inverse")

    def get_thresholds(self, column):
        """Store the zero-replacement threshold for ``column`` under its name."""
        # Smallest magnitude among the non-zero values
        nonzero_magnitudes = column[column != 0].abs()
        exponent = np.floor(np.log10(np.min(nonzero_magnitudes)))
        # Largest power of ten that does not exceed that magnitude
        self.threshold[column.name] = 10 ** exponent

    def transform_zeros(self, column):
        """Return a copy of ``column`` with zeros replaced by its fitted threshold."""
        column_values = column.copy()
        is_zero = column_values == 0
        column_values.loc[is_zero] = self.threshold[column_values.name]
        return column_values


# Preview only the newly added reciprocal columns
reciprocal_demo = ReciprocalTransformer().fit(train_data).transform(train_data)
reciprocal_demo.filter(regex="_inverse$").head()


### Model training and selection

In [None]:
# Split the target from the features, then hold out a stratified 20% test set
X = train_data.copy()
y = X.pop("Survived")
split_kws = dict(test_size=0.2, random_state=seed, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kws)


In [None]:
# SelectFromModel(
#     ExtraTreesClassifier(
#         random_state=seed,
#     ),
#     threshold="0.01*mean",
# ).fit(
#     impd_age, y
# ).transform(impd_age)

In [None]:
# Define score function to go into SelectPercentile
# Boolean mask over X's columns: True marks a feature treated as discrete by
# mutual_info_classif. The reciprocal names use the "_inverse" suffix that
# ReciprocalTransformer appends (they were previously spelled "_inverted").
discrete_features = [
    col
    not in (
        "Age",
        "Log_Fare/Ticket_Freq",
        "Age_inverse",
        "Log_Fare/Ticket_Freq_inverse",
    )
    for col in X.columns
]


def mut_info(X, y):
    """Score function for SelectPercentile: mutual information of each feature with ``y``.

    Uses the module-level ``discrete_features`` mask and ``seed``.
    """
    scores = mutual_info_classif(
        X, y, discrete_features=discrete_features, random_state=seed
    )
    return scores


# Define transformer to clean up after adding feature interactions and standardizing
class RemoveDuplicates(BaseEstimator, TransformerMixin):
    """Drop columns whose values duplicate those of an earlier column."""

    def fit(self, X, y=None):
        """Record the labels of the columns that are not duplicates of an earlier one."""
        frame = pd.DataFrame(X)
        is_duplicate = frame.T.duplicated()
        self.not_dup = list(frame.columns[~is_duplicate])
        return self

    def transform(self, X):
        """Return only the columns recorded during ``fit``."""
        frame = pd.DataFrame(X)
        return frame.loc[:, self.not_dup]  # .to_numpy()


# Define function to create a DataFrame from top scores
def get_top_scores(study, score_count=7):
    """Return the ``score_count`` best trials of ``study`` as a DataFrame.

    Rows are sorted by ``value`` in descending order, the ``params_`` prefix is
    stripped from the column names, and ``duration`` is converted to seconds.
    """
    trials = study.trials_dataframe(attrs=("value", "duration", "params"))
    best = trials.sort_values(by="value", ascending=False).head(score_count)
    # remove params_ from col names
    best = best.rename(columns=lambda name: re.sub(r"^params_", "", name))
    best["duration"] = best["duration"].dt.total_seconds()
    return best


# Define function to calculate validation/test scores for top sets of hyperparameters
def model_test_score(model, study_name, top_scores, results_storage):
    """Refit ``model`` for every row of ``top_scores`` and record its hold-out accuracy.

    For each row the estimator parameters are applied to ``model``, the pipeline
    is rebuilt with the optional steps that row switches on, fitted on
    ``X_train``/``y_train`` and scored on ``X_test``/``y_test``. The row
    ``results_storage.loc[study_name]`` is then set to the CV values, the test
    scores and the durations, in that order, and a cv/test/duration table is
    displayed.

    Relies on the module-level ``age_imputer``, ``standardize``, ``cleanup``,
    ``inverse``, ``mut_info``, ``seed`` and the train/test split.
    NOTE(review): the index labels of that results row are assumed to look like
    "cv_0", "test_0", "duration_0" - confirm against where results_storage is built.
    """
    # Columns of top_scores that configure the pipeline rather than the estimator
    pipeline_only_keys = [
        "value",
        "duration",
        "MI_percentile",
        "reciprocal",
        "selection_l1_C",
        "l1_C",
        "selection_RF_FI",
        "ET_threshold",
    ]
    scores = top_scores.value.to_list()
    for position in range(len(top_scores)):
        trial = top_scores.iloc[position]
        # Assign model parameters to estimator
        model.set_params(**trial.drop(pipeline_only_keys).to_dict())
        # Create pipeline
        pipe = make_pipeline(
            age_imputer,
            SelectPercentile(mut_info, percentile=trial.MI_percentile),
            standardize,
            cleanup,
            model,
        )
        # Update pipeline with extra steps if needed
        if trial.reciprocal:
            # Placed right after the mutual-information selection
            pipe.steps.insert(2, ("reciprocal", inverse))
        if trial.selection_l1_C:
            l1_selector = SelectFromModel(
                LinearSVC(
                    C=trial.l1_C,
                    penalty="l1",
                    dual=False,
                    max_iter=100000,
                    random_state=seed,
                )
            )
            # Placed just before the final estimator
            pipe.steps.insert(-1, ("l1_selection", l1_selector))
        if trial.selection_RF_FI:
            et_selector = SelectFromModel(
                ExtraTreesClassifier(
                    random_state=seed,
                ),
                threshold=f"{trial.ET_threshold}*mean",
            )
            pipe.steps.insert(-1, ("ET_selection", et_selector))
        # Get and store score results
        pipe.fit(X_train, y_train)
        scores.append(pipe.score(X_test, y_test))
    # Add duration
    scores.extend(top_scores.duration.to_list())
    # Assign results to proper index (model)
    results_storage.loc[study_name] = scores
    # Format for display
    summary = results_storage.loc[study_name].to_frame().reset_index()
    summary[["col", "index"]] = summary["index"].str.split("_", expand=True)
    summary = summary.pivot(index="index", columns="col")
    summary.columns = summary.columns.droplevel()
    summary.columns.name, summary.index.name = None, None
    display(summary.reindex(["cv", "test", "duration"], axis=1))


# Define function to generate n top submissions
def submissions_from_model(
    model, study_name, top_scores, results_storage, n_submissions=3
):
    """Write submission CSVs for the configurations that scored best on the test split.

    The rows of ``top_scores`` are re-ranked by the test scores stored for
    ``study_name`` in ``results_storage``.  For each of the best
    ``n_submissions`` rows the pipeline is rebuilt, fitted on the full
    ``X``/``y`` and used to predict ``test_data``; the predictions are written
    to ``{study_name}_submission_{rank}.csv`` in the working directory.

    Parameters
    ----------
    model : estimator
        Estimator to tune; its parameters are overwritten in place through
        ``set_params`` for every submission.
    study_name : str
        Row label in ``results_storage`` and prefix of the CSV file names.
    top_scores : pandas.DataFrame
        One row per candidate configuration.  Besides the estimator
        parameters it carries the bookkeeping/pipeline columns listed in
        ``pipeline_columns`` below.
    results_storage : pandas.DataFrame
        Score table; positions ``[len(top_scores), 2 * len(top_scores))`` of
        the ``study_name`` row are read as the test scores, in the same order
        as the rows of ``top_scores``.
    n_submissions : int, default 3
        Number of CSV files to write (capped at the number of candidates).

    Notes
    -----
    Relies on the notebook-level objects ``age_imputer``, ``mut_info``,
    ``standardize``, ``cleanup``, ``inverse``, ``seed``, ``X``, ``y``,
    ``test_data`` and ``PassengerId``.
    """
    # Columns of `top_scores` that describe the trial or the pipeline layout
    # rather than hyperparameters of `model`
    pipeline_columns = [
        "value",
        "duration",
        "MI_percentile",
        "reciprocal",
        "selection_l1_C",
        "l1_C",
        "selection_RF_FI",
        "ET_threshold",
    ]

    def _enabled(row, flag):
        # A missing or NaN flag counts as "off" (a bare NaN would be truthy)
        value = row.get(flag)
        return bool(pd.notna(value) and value)

    # Rank the candidates by test score, read from the table passed in by the
    # caller (the original read the global `model_results` here)
    n_scores = len(top_scores)
    test_scores = (
        results_storage.loc[study_name].iloc[n_scores : 2 * n_scores].to_numpy()
    )
    scores_by_test = (
        top_scores.assign(sort=test_scores)
        .sort_values(by="sort", ascending=False)
        .drop(columns="sort")
    )

    for i in range(min(n_submissions, n_scores)):
        # Use the test-ranked row; the original indexed the unsorted
        # `top_scores`, which made the ranking above dead code
        row = scores_by_test.iloc[i]

        # Assign the remaining columns to the estimator as hyperparameters
        model_params = row.drop(pipeline_columns, errors="ignore").to_dict()
        model.set_params(**model_params)

        # Create pipeline
        pipe = make_pipeline(
            age_imputer,
            SelectPercentile(mut_info, percentile=row.MI_percentile),
            standardize,
            cleanup,
            model,
        )
        # Optional steps: the reciprocal transform goes right after the
        # percentile selection, the model-based selectors right before the model
        if _enabled(row, "reciprocal"):
            pipe.steps.insert(2, ("reciprocal", inverse))
        if _enabled(row, "selection_l1_C"):
            l1_selector = SelectFromModel(
                LinearSVC(
                    C=row.l1_C,
                    penalty="l1",
                    dual=False,
                    max_iter=100000,
                    random_state=seed,
                )
            )
            pipe.steps.insert(-1, ("l1_selection", l1_selector))
        if _enabled(row, "selection_RF_FI"):
            et_selector = SelectFromModel(
                ExtraTreesClassifier(random_state=seed),
                threshold=f"{row.ET_threshold}*mean",
            )
            pipe.steps.insert(-1, ("ET_selection", et_selector))

        # Generate predictions
        predictions = pipe.fit(X, y).predict(test_data)
        # Create submission CSV
        predictions_df = pd.DataFrame(
            {"PassengerId": PassengerId, "Survived": predictions}
        )
        predictions_df.to_csv(
            f"{study_name}_submission_{i+1}.csv", header=True, index=False
        )

In [None]:
# Pipeline steps that are shared by every study
inverse = ReciprocalTransformer()
standardize = StandardScaler()
cleanup = RemoveDuplicates()

# SQLite database in which Optuna stores the tuning results
db_name = "titanic_study"
storage_name = "sqlite:///" + db_name + ".db"

# Score table: one row per model, with `score_count` columns for each of the
# cv_*, test_* and duration_* groups (in that order)
score_count = 7
model_results = pd.DataFrame(
    columns=[
        f"{prefix}_{i}"
        for prefix in ("cv", "test", "duration")
        for i in range(1, score_count + 1)
    ]
)

In [None]:
# Turn off optuna log notes.
optuna.logging.set_verbosity(optuna.logging.WARN)


# Define a function to output a log only when the best value is updated
def logging_callback(study, frozen_trial):
    """Optuna callback that prints a line only when the study's best value changes.

    The best value seen so far is remembered in the study's user attribute
    ``"previous_best_value"``.

    Parameters
    ----------
    study : optuna.study.Study
        Study being optimised.
    frozen_trial : optuna.trial.FrozenTrial
        Trial that has just finished; its number, value and params are printed.
    """
    try:
        best_value = study.best_value
    except ValueError:
        # No trial has completed yet (e.g. the first trial failed and was
        # swallowed by `catch=`): there is no best value to report
        return
    previous_best_value = study.user_attrs.get("previous_best_value", None)
    if previous_best_value != best_value:
        study.set_user_attr("previous_best_value", best_value)
        print(
            "Trial {} finished with best value: {} and parameters: {}. ".format(
                frozen_trial.number,
                frozen_trial.value,
                frozen_trial.params,
            )
        )

In [None]:
# Define the objective of the study without the model
def cv_pipe(trial, model):
    """Assemble the pipeline sampled by `trial` around `model` and score it.

    The trial chooses the mutual-information percentile and whether to add the
    reciprocal transform, an L1-penalised LinearSVC selector and an ExtraTrees
    selector.  Returns the mean of `cross_val_score` on `X_train`/`y_train`
    (the estimator's default scorer) over a single 2-fold stratified split.
    """
    # Base pipeline: impute -> MI percentile selection -> scale -> cleanup -> model
    mi_percentile = trial.suggest_int("MI_percentile", 1, 100)
    pipe = make_pipeline(
        age_imputer,
        SelectPercentile(mut_info, percentile=mi_percentile),
        standardize,
        cleanup,
        model,
    )

    # Optional reciprocal transform, placed right after the percentile selection
    use_reciprocal = trial.suggest_categorical("reciprocal", [True, False])
    if use_reciprocal:
        pipe.steps.insert(2, ("reciprocal", inverse))

    # Optional L1-based feature selection, placed right before the model
    use_l1_selection = trial.suggest_categorical("selection_l1_C", [True, False])
    if use_l1_selection:
        l1_svc = LinearSVC(
            C=trial.suggest_float("l1_C", 1e-2, 20, log=True),
            penalty="l1",
            dual=False,
            max_iter=100000,
            random_state=seed,
        )
        pipe.steps.insert(-1, ("l1_selection", SelectFromModel(l1_svc)))

    # Optional ExtraTrees importance selection, also right before the model
    use_et_selection = trial.suggest_categorical("selection_RF_FI", [True, False])
    if use_et_selection:
        et_threshold = trial.suggest_float("ET_threshold", 1e-2, 1)
        et_selector = SelectFromModel(
            ExtraTreesClassifier(random_state=seed),
            threshold=f"{et_threshold}*mean",
        )
        pipe.steps.insert(-1, ("ET_selection", et_selector))

    # Mean cross-validated score of the assembled pipeline
    splitter = RepeatedStratifiedKFold(n_splits=2, n_repeats=1, random_state=seed)
    fold_scores = cross_val_score(pipe, X_train, y_train, cv=splitter, n_jobs=-1)
    return fold_scores.mean()


##### Logistic Regression

In [None]:
rng = np.random.RandomState(seed)
# Base estimator; C and solver are filled in for every trial
model = LogisticRegression(max_iter=100000, n_jobs=-1, random_state=rng)


def objective_w_model(trial):
    """Optuna objective: sample C and solver for the logistic regression, return the CV score."""
    solver_choices = ["liblinear", "lbfgs", "newton-cg", "sag", "saga"]
    c_value = trial.suggest_float("C", 1e-4, 1000, log=True)
    solver_name = trial.suggest_categorical("solver", solver_choices)
    model.set_params(C=c_value, solver=solver_name)
    return cv_pipe(trial, model)


In [None]:
# Define unique identifier of the study
study_name = "Logistic_Regression"

study = optuna.create_study(
    storage=storage_name,
    study_name=study_name,
    sampler=optuna.samplers.TPESampler(
        multivariate=True, seed=seed, warn_independent_sampling=False
    ),
    direction="maximize",
    load_if_exists=True,
)
print(f"Sampler: {study.sampler.__class__.__name__}")

In [None]:
# # Perform hyperparameter optimization search
# study.optimize(
#     objective_w_model,
#     n_trials=30,
#     timeout=300,
#     # catch=(ValueError),
#     # callbacks=[logging_callback],
# )


In [None]:
# Perform hyperparameter optimization search
study.optimize(
    objective_w_model,
    n_trials=700,
    timeout=3600,
    catch=(ValueError),
    # callbacks=[logging_callback],
)


In [None]:
top_scores = get_top_scores(study)
top_scores


In [None]:
model_test_score(model, study_name, top_scores, model_results)


In [None]:
submissions_from_model(model, study_name, top_scores, model_results)


In [None]:
optuna.visualization.plot_optimization_history(study)

In [None]:
optuna.visualization.plot_slice(study)

In [None]:
optuna.visualization.plot_edf(study)

In [None]:
optuna.visualization.plot_param_importances(study)

In [None]:
optuna.visualization.plot_param_importances(
    study, target=lambda t: t.duration.total_seconds(), target_name="duration"
)

##### K-Nearest Neighbours

In [None]:
rng = np.random.RandomState(seed)
# Base estimator; the number of neighbours is set for every trial
model = KNeighborsClassifier(n_jobs=-1)


def objective_w_model(trial):
    """Optuna objective: sample n_neighbors for the KNN classifier, return the CV score."""
    neighbor_count = trial.suggest_int("n_neighbors", 4, 40)
    model.set_params(n_neighbors=neighbor_count)
    return cv_pipe(trial, model)


In [None]:
# Define unique identifier of the study
study_name = "K-Nearest_Neighbors"

study = optuna.create_study(
    storage=storage_name,
    study_name=study_name,
    sampler=optuna.samplers.TPESampler(seed=seed),
    direction="maximize",
    load_if_exists=True,
)
print(f"Sampler: {study.sampler.__class__.__name__}")


In [None]:
# Perform hyperparameter optimization search
study.optimize(
    objective_w_model,
    n_trials=700,
    timeout=3600,
    catch=(ValueError),
    # callbacks=[logging_callback],
)


In [None]:
top_scores = get_top_scores(study)
top_scores


In [None]:
model_test_score(model, study_name, top_scores, model_results)


In [None]:
submissions_from_model(model, study_name, top_scores, model_results)


In [None]:
optuna.visualization.plot_optimization_history(study)

In [None]:
optuna.visualization.plot_slice(study)

In [None]:
optuna.visualization.plot_edf(study)

In [None]:
optuna.visualization.plot_param_importances(study)

In [None]:
optuna.visualization.plot_param_importances(
    study, target=lambda t: t.duration.total_seconds(), target_name="duration"
)

##### Random Forest

In [None]:
rng = np.random.RandomState(seed)
# Base estimator; tree-shape parameters are set for every trial
model = RandomForestClassifier(
    n_jobs=-1,
    random_state=rng,
)


def objective_w_model(trial):
    """Optuna objective: sample the random-forest tree parameters, return the CV score."""
    depth = trial.suggest_int("max_depth", 5, 50)
    leaf_size = trial.suggest_int("min_samples_leaf", 3, 150)
    feature_fraction = trial.suggest_float("max_features", 0.2, 0.8)
    pruning_alpha = trial.suggest_float("ccp_alpha", 1e-9, 0.1)
    model.set_params(
        max_depth=depth,
        min_samples_leaf=leaf_size,
        max_features=feature_fraction,
        ccp_alpha=pruning_alpha,
    )
    return cv_pipe(trial, model)

In [None]:
# Define unique identifier of the study
study_name = "Random_Forest"

study = optuna.create_study(
    storage=storage_name,
    study_name=study_name,
    sampler=optuna.samplers.TPESampler(seed=seed),
    direction="maximize",
    load_if_exists=True,
)
print(f"Sampler: {study.sampler.__class__.__name__}")


In [None]:
# Perform hyperparameter optimization search
study.optimize(
    objective_w_model,
    n_trials=700,
    timeout=3600,
    catch=(ValueError),
    # callbacks=[logging_callback],
)


In [None]:
top_scores = get_top_scores(study)
top_scores


In [None]:
model_test_score(model, study_name, top_scores, model_results)


In [None]:
submissions_from_model(model, study_name, top_scores, model_results)


In [None]:
optuna.visualization.plot_optimization_history(study)

In [None]:
optuna.visualization.plot_slice(study)

In [None]:
optuna.visualization.plot_edf(study)

In [None]:
optuna.visualization.plot_param_importances(study)

In [None]:
optuna.visualization.plot_param_importances(
    study, target=lambda t: t.duration.total_seconds(), target_name="duration"
)

##### Support Vector Machine

In [None]:
rng = np.random.RandomState(seed)
# Base estimator; C and kernel are set for every trial
model = SVC(
    gamma="scale",
    max_iter=100000,
    random_state=rng,
    cache_size=1000,
)


def objective_w_model(trial):
    """Optuna objective: sample C and kernel for the SVC, return the CV score."""
    kernel_choices = ["linear", "rbf", "sigmoid"]
    penalty_c = trial.suggest_float("C", 1e-4, 1000, log=True)
    kernel_name = trial.suggest_categorical("kernel", kernel_choices)
    model.set_params(C=penalty_c, kernel=kernel_name)
    return cv_pipe(trial, model)

In [None]:
# Define unique identifier of the study
study_name = "Support Vector Machine"

study = optuna.create_study(
    storage=storage_name,
    study_name=study_name,
    sampler=optuna.samplers.TPESampler(seed=seed),
    direction="maximize",
    load_if_exists=True,
)
print(f"Sampler: {study.sampler.__class__.__name__}")


In [None]:
# Perform hyperparameter optimization search
study.optimize(
    objective_w_model,
    n_trials=700,
    timeout=3600,
    catch=(ValueError),
    # callbacks=[logging_callback],
)


In [None]:
top_scores = get_top_scores(study)
top_scores


In [None]:
model_test_score(model, study_name, top_scores, model_results)


In [None]:
submissions_from_model(model, study_name, top_scores, model_results)


In [None]:
optuna.visualization.plot_optimization_history(study)

In [None]:
optuna.visualization.plot_slice(study)

In [None]:
optuna.visualization.plot_edf(study)

In [None]:
optuna.visualization.plot_param_importances(study)

In [None]:
optuna.visualization.plot_param_importances(
    study, target=lambda t: t.duration.total_seconds(), target_name="duration"
)

##### AdaBoost

In [None]:
rng = np.random.RandomState(seed)
# Base estimator: AdaBoost over decision trees; depth, size and rate are set per trial
model = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(),
    random_state=rng,
)


def objective_w_model(trial):
    """Optuna objective: sample the AdaBoost parameters, return the CV score."""
    sampled_params = {
        "estimator__max_depth": trial.suggest_int("estimator__max_depth", 1, 3),
        "n_estimators": trial.suggest_int("n_estimators", 10, 100),
        "learning_rate": trial.suggest_float("learning_rate", 1e-7, 1, log=True),
    }
    model.set_params(**sampled_params)
    return cv_pipe(trial, model)

In [None]:
# Define unique identifier of the study
study_name = "AdaBoost"

study = optuna.create_study(
    storage=storage_name,
    study_name=study_name,
    sampler=optuna.samplers.TPESampler(seed=seed),
    direction="maximize",
    load_if_exists=True,
)
print(f"Sampler: {study.sampler.__class__.__name__}")


In [None]:
# Perform hyperparameter optimization search
study.optimize(
    objective_w_model,
    n_trials=700,
    timeout=3600,
    catch=(ValueError),
    # callbacks=[logging_callback],
)


In [None]:
top_scores = get_top_scores(study)
top_scores


In [None]:
model_test_score(model, study_name, top_scores, model_results)


In [None]:
submissions_from_model(model, study_name, top_scores, model_results)


In [None]:
optuna.visualization.plot_optimization_history(study)

In [None]:
optuna.visualization.plot_slice(study)

In [None]:
optuna.visualization.plot_edf(study)

In [None]:
optuna.visualization.plot_param_importances(study)

In [None]:
optuna.visualization.plot_param_importances(
    study, target=lambda t: t.duration.total_seconds(), target_name="duration"
)

##### Gradient Boosting

In [None]:
rng = np.random.RandomState(seed)
# Initialize estimator
model = GradientBoostingClassifier(
    random_state=rng,
)


# Define objective with the model
def objective_w_model(trial):
    """Optuna objective: sample the gradient-boosting parameters, return the CV score.

    Every trial parameter name equals the estimator keyword it is assigned
    to, so a stored trial can be replayed with ``model.set_params(**params)``.
    """
    # Set estimator parameters
    model.set_params(
        n_estimators=trial.suggest_int("n_estimators", 30, 400),
        learning_rate=trial.suggest_float("learning_rate", 1e-4, 1, log=True),
        subsample=trial.suggest_float("subsample", 0.1, 1),
        # Fix: the value sampled as "min_samples_leaf" used to be passed to
        # `min_samples_split`, so replaying the stored parameters set a
        # different hyperparameter than the one that was tuned.
        # NOTE: trials stored before this fix were scored with the value
        # applied to `min_samples_split`.
        min_samples_leaf=trial.suggest_float("min_samples_leaf", 1e-3, 1e-1, log=True),
        max_depth=trial.suggest_int("max_depth", 2, 30),
        max_features=trial.suggest_float("max_features", 0.1, 0.9),
        ccp_alpha=trial.suggest_float("ccp_alpha", 1e-9, 0.1),
    )
    return cv_pipe(trial, model)

In [None]:
# Define unique identifier of the study
study_name = "Gradient Boosting"

study = optuna.create_study(
    storage=storage_name,
    study_name=study_name,
    sampler=optuna.samplers.TPESampler(seed=seed),
    direction="maximize",
    load_if_exists=True,
)
print(f"Sampler: {study.sampler.__class__.__name__}")


In [None]:
# Perform hyperparameter optimization search
study.optimize(
    objective_w_model,
    n_trials=700,
    timeout=3600,
    catch=(ValueError),
    # callbacks=[logging_callback],
)


In [None]:
top_scores = get_top_scores(study)
top_scores


In [None]:
model_test_score(model, study_name, top_scores, model_results)


In [None]:
submissions_from_model(model, study_name, top_scores, model_results)


In [None]:
optuna.visualization.plot_optimization_history(study)

In [None]:
optuna.visualization.plot_slice(study)

In [None]:
optuna.visualization.plot_edf(study)

In [None]:
optuna.visualization.plot_param_importances(study)

In [None]:
optuna.visualization.plot_param_importances(
    study, target=lambda t: t.duration.total_seconds(), target_name="duration"
)

##### XGBoost

In [None]:
rng = np.random.RandomState(seed)
# Initialize estimator
model = XGBClassifier(booster="gbtree", random_state=rng, verbosity=0)


# Define objective with the model
def objective_w_model(trial):
    """Optuna objective: sample the XGBoost parameters, return the CV score.

    Every trial parameter name equals the estimator keyword it is assigned
    to, so a stored trial can be replayed with ``model.set_params(**params)``.
    """
    # Set estimator parameters
    model.set_params(
        n_estimators=trial.suggest_int("n_estimators", 30, 500),
        eta=trial.suggest_float("eta", 1e-4, 3, log=True),
        # Fix: the trial parameter was misspelled "subsmaple", so replaying the
        # stored parameters never restored the tuned `subsample` value.
        # NOTE: trials stored before this fix still carry the misspelled name.
        subsample=trial.suggest_float("subsample", 0.1, 1),
        max_depth=trial.suggest_int("max_depth", 2, 30),
        min_child_weight=trial.suggest_float("min_child_weight", 0, 50),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.5, 1),
        scale_pos_weight=trial.suggest_float("scale_pos_weight", 1e-4, 2),
        reg_lambda=trial.suggest_float("reg_lambda", 1e-4, 10),
        reg_alpha=trial.suggest_float("reg_alpha", 1e-4, 10),
    )
    return cv_pipe(trial, model)

In [None]:
# Define unique identifier of the study
study_name = "XGBoost"

study = optuna.create_study(
    storage=storage_name,
    study_name=study_name,
    sampler=optuna.samplers.TPESampler(seed=seed),
    direction="maximize",
    load_if_exists=True,
)
print(f"Sampler: {study.sampler.__class__.__name__}")


In [None]:
# Perform hyperparameter optimization search
study.optimize(
    objective_w_model,
    n_trials=700,
    timeout=3600,
    catch=(ValueError),
    # callbacks=[logging_callback],
)


In [None]:
top_scores = get_top_scores(study)
top_scores


In [None]:
model_test_score(model, study_name, top_scores, model_results)

In [None]:
submissions_from_model(model, study_name, top_scores, model_results)


In [None]:
optuna.visualization.plot_optimization_history(study)

In [None]:
optuna.visualization.plot_slice(study)

In [None]:
optuna.visualization.plot_edf(study)

In [None]:
optuna.visualization.plot_param_importances(study)

In [None]:
optuna.visualization.plot_param_importances(
    study, target=lambda t: t.duration.total_seconds(), target_name="duration"
)

In [None]:
model_results

In [None]:
# Row-wise summary of the cross-validation and test scores of every model
model_results["mean_cv_score"] = model_results.filter(regex=r"^cv_").mean(axis=1)
model_results["std_cv_score"] = model_results.filter(regex=r"^cv_").std(axis=1)
model_results["mean_test_score"] = model_results.filter(regex=r"^test_").mean(axis=1)
model_results["std_test_score"] = model_results.filter(regex=r"^test_").std(axis=1)

# Collapse the duration_* columns into their mean.  Guarded so that re-running
# the cell does not overwrite `mean_duration` with NaN once the raw columns
# have already been dropped.
duration_columns = model_results.filter(regex=r"^duration_").columns
if len(duration_columns) > 0:
    model_results["mean_duration"] = model_results[duration_columns].mean(axis=1)
    model_results.drop(columns=duration_columns, inplace=True)


In [None]:
# mean_diff = []
# for i in range(1, len(model_results) + 1):
#     mean_diff.append(
#         (
#             model_results.filter(regex=r"^test_").T.reset_index().iloc[:, i]
#             - model_results.filter(regex=r"^cv_").T.reset_index().iloc[:, i]
#         ).mean()
#     )
# model_results["mean_diff"] = mean_diff

In [None]:
model_results.sort_values(by="mean_test_score", ascending=False, inplace=True)

In [None]:
model_results

In [None]:
# NOTE(review): a bare `break` outside a loop is a SyntaxError; presumably it is
# placed here on purpose to halt "Run All" before the cells below - confirm.
break


In [None]:
# Define the objective of the study without the model
def cv_pipe(trial, model):
    """Assemble the pipeline sampled by `trial` around `model` and score it.

    Redefines the `cv_pipe` declared earlier in the notebook: this version has
    no ExtraTrees selection step (left commented out below) and evaluates with
    a 7-fold stratified split repeated twice.

    The trial chooses the mutual-information percentile and whether to add the
    reciprocal transform and an L1-penalised LinearSVC feature selector.
    Returns the mean of `cross_val_score` on `X_train`/`y_train`.
    """
    # Base pipeline: impute -> MI percentile selection -> scale -> cleanup -> model
    pipe = make_pipeline(
        age_imputer,
        SelectPercentile(
            mut_info, percentile=trial.suggest_int("MI_percentile", 1, 100)
        ),
        standardize,
        cleanup,
        # Disabled experiments (permutation-importance selection, PCA):
        # SelectFromPI(percentile=trial.suggest_int("perm_perc", 0, 100)),
        # PCA(n_components=trial.suggest_float('PCA_components', 1e-2, 1)),  # float? variance explained is greater than percentage
        model,
    )
    # Optional reciprocal transform; index 2 puts it right after the
    # percentile selection and before the scaler
    if trial.suggest_categorical("reciprocal", [True, False]):
        pipe.steps.insert(
            2,
            (
                "reciprocal",
                inverse,
            ),
        )
    # Optional L1-based feature selection; index -1 puts it right before the model
    if trial.suggest_categorical("selection_l1_C", [True, False]):
        pipe.steps.insert(
            -1,
            (
                "l1_selection",
                SelectFromModel(
                    LinearSVC(
                        C=trial.suggest_float("l1_C", 1e-2, 20, log=True),
                        penalty="l1",
                        dual=False,
                        max_iter=100000,
                        random_state=seed,
                    )
                ),
            ),
        )
    # Disabled: ExtraTrees feature-importance selection
    # if trial.suggest_categorical("selection_RF_FI", [True, False]):
    #     pipe.steps.insert(
    #         -1,
    #         (
    #             "ET_selection",
    #             SelectFromModel(
    #                 ExtraTreesClassifier(
    #                     random_state=seed,
    #                 ),
    #                 threshold=f"{trial.suggest_float('ET_threshold', 1e-2, 1)}*mean",
    #             ),
    #         ),
    #     )
    # Mean cross-validated score over 7 stratified folds x 2 repeats
    cv_score = cross_val_score(
        pipe,
        X_train,
        y_train,
        cv=RepeatedStratifiedKFold(n_splits=7, n_repeats=2, random_state=seed),
        n_jobs=-1,
    ).mean()
    return cv_score


##### Logistic Regression

In [None]:
rng = np.random.RandomState(seed)
# Logistic regression whose C and solver are chosen by each trial
model = LogisticRegression(max_iter=100000, n_jobs=-1, random_state=rng)


def objective_w_model(trial):
    """Optuna objective for logistic regression: set the sampled C/solver, return the CV score."""
    sampled_params = {
        "C": trial.suggest_float("C", 1e-4, 1000, log=True),
        "solver": trial.suggest_categorical(
            "solver", ["liblinear", "lbfgs", "newton-cg", "sag", "saga"]
        ),
    }
    model.set_params(**sampled_params)
    return cv_pipe(trial, model)


In [None]:
# Define unique identifier of the study
study_name = "Logistic_Regression"

study = optuna.create_study(
    storage=storage_name,
    study_name=study_name,
    sampler=optuna.samplers.TPESampler(multivariate=True, seed=seed),
    direction="maximize",
    load_if_exists=True,
)
print(f"Sampler: {study.sampler.__class__.__name__}")


In [None]:
# Perform hyperparameter optimization search
study.optimize(
    objective_w_model,
    n_trials=700,
    timeout=3600,
    catch=(ValueError),
    # callbacks=[logging_callback],
)


In [None]:
top_scores = get_top_scores(study)
top_scores


In [None]:
model_test_score(model, study_name, top_scores, model_results)


In [None]:
submissions_from_model(model, study_name, top_scores, model_results)


In [None]:
optuna.visualization.plot_optimization_history(study)

In [None]:
# optuna.visualization.plot_parallel_coordinate(study)

In [None]:
# optuna.visualization.plot_contour(study)

In [None]:
optuna.visualization.plot_slice(study)

In [None]:
optuna.visualization.plot_edf(study)

In [None]:
optuna.visualization.plot_param_importances(study)

In [None]:
optuna.visualization.plot_param_importances(
    study, target=lambda t: t.duration.total_seconds(), target_name="duration"
)

In [None]:
# NOTE(review): a bare `break` outside a loop is a SyntaxError; presumably it is
# placed here on purpose to halt "Run All" before the cells below - confirm.
break

In [None]:
class SelectFromPI:
    """Keep the top `percentile` of features ranked by permutation importance.

    Thin wrapper around `SelectPercentile` whose score function is the mean
    permutation importance of the notebook-level `model`.  If the wrapped
    selector keeps no feature, `transform` falls back to the single
    best-scoring one.
    """

    def __init__(self, percentile):
        """Store `percentile` and build the wrapped `SelectPercentile`.

        The selector is created once here, so changing `self.percentile`
        afterwards does not affect it.
        """
        self.percentile = percentile
        self.selector = SelectPercentile(
            score_func=self.permutation_importance_from_model,
            percentile=self.percentile,
        )

    @staticmethod
    def permutation_importance_from_model(X, y):
        """Fit the global `model` on (X, y) and return its mean permutation importances.

        Uses the notebook-level `model` and `seed`; note that `model` is
        fitted in place as a side effect.
        """
        model.fit(X, y)
        return permutation_importance(model, X, y, random_state=seed).importances_mean

    def fit(self, X, y):
        """Fit the wrapped selector on (X, y) and return `self`."""
        self.selector.fit(X, y)

        # Disabled draft of the "no feature selected" fallback (now done in `transform`):
        # Check if no features selected
        # if self.selector.get_support().sum() == 0:
        # Select the best feature based on the provided score function
        # best_feature_idx = self.selector.scores_.argmax()
        # print(self.selector.scores_)
        # print(best_feature_idx)
        # self.selector.get_support()[best_feature_idx] = True
        return self

    def transform(self, X):
        """Return the selected columns of X, or the best-scoring column if none was selected."""
        # Check if no features selected
        if self.selector.get_support().sum() == 0:
            # Select the best feature based on the provided score function
            best_feature_idx = self.selector.scores_.argmax()
            return pd.DataFrame(X).iloc[:, [best_feature_idx]]
        return self.selector.transform(X)


In [None]:
perm_rf = permutation_importance(
    rf_fit, X, y, random_state=seed, n_repeats=3, n_jobs=-1
)
perm_pipe = permutation_importance(
    pipe_fit, X, y, random_state=seed, n_repeats=3, n_jobs=-1
)


In [None]:
perm_imp = (
    pd.DataFrame(perm_pipe.importances, index=X.columns)
    .reset_index()
    .melt(id_vars="index")
)


In [None]:
sns.barplot(
    perm_imp,
    x="value",
    y="index",
    order=perm_imp.groupby("index")["value"].mean().sort_values(ascending=False).index,
)


In [None]:
sns.barplot(
    x=perm_pipe.importances_mean,
    y=X.columns,
    order=[
        item[1]
        for item in sorted(zip(perm_pipe.importances_mean, X.columns), reverse=True)
    ],
)


In [None]:
val_key = sorted(zip(randf.feature_importances_, randf.feature_names_in_), reverse=True)
val_key = pd.DataFrame(val_key)


In [None]:
sns.barplot(x=val_key[0], y=val_key[1])


In [None]:
val_key = sorted(zip(model.feature_importances_), reverse=True)
val_key = pd.DataFrame(val_key)


In [None]:
# val_key


In [None]:
sns.barplot(x=val_key[0], y=np.arange(val_key.shape[0]).astype(str))


In [None]:
model.n_features_in_
