In [2]:
from sklearn.base import BaseEstimator, TransformerMixin

In [3]:
def create_countplot(
    df, ax, col, colors, plot_xlabel="", plot_ylabel="", percent=False
):
    """
    Draw a vertical bar chart of the value counts of ``df[col]`` on ``ax``.

    Args:
        df: DataFrame holding the column to summarise.
        ax: Axes object to draw on (must provide ``bar``, ``annotate``,
            ``spines`` and the usual label/title setters).
        col: Name of the column whose values are counted.
        colors: Bar colour(s), forwarded to ``ax.bar``.
        plot_xlabel: Text for the x-axis label.
        plot_ylabel: Text for the y-axis label.
        percent: When true, write each bar's share of the total count
            (one decimal place) just above the bar.

    Returns:
        None. The chart is drawn on ``ax`` as a side effect.
    """
    frequencies = df[col].value_counts()
    tick_labels = list(map(str, frequencies.index))
    rects = ax.bar(tick_labels, frequencies.values, color=colors)

    # Axis cosmetics: labels, no tick marks, no grid.
    ax.set_xlabel(plot_xlabel)
    ax.set_ylabel(plot_ylabel)
    ax.tick_params(axis="both", which="both", length=0)
    ax.grid(False)

    # Hide the top/right borders, then paint every spine black.
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)
    for border in ax.spines.values():
        border.set_color("black")

    if percent:
        grand_total = frequencies.sum()
        label_style = {
            "ha": "center",
            "va": "bottom",
            "fontsize": 10,
            "color": "black",
            "xytext": (0, 2),
            "textcoords": "offset points",
        }
        for rect in rects:
            bar_height = rect.get_height()
            bar_centre = rect.get_x() + rect.get_width() / 2
            share = bar_height / grand_total
            ax.annotate(f"{share:.1%}", (bar_centre, bar_height), **label_style)

    # Re-apply the current title text at a vertical position of 0.5.
    ax.set_title(ax.get_title(), y=0.5)

In [6]:
def create_stacked_barchart(
    df, ax, col_x, col_hue, colors, plot_xlabel="", plot_ylabel="", percent=False
):
    """
    Draw a stacked bar chart of ``col_hue`` proportions within each ``col_x`` value.

    The proportions come from a row-normalised cross-tabulation, so the
    segments of every bar sum to 1.

    Args:
        df: DataFrame holding both columns.
        ax: Axes object to draw on.
        col_x: Column whose values become the bars.
        col_hue: Column whose values become the stacked segments.
        colors: Segment colour(s), forwarded to the pandas plot call.
        plot_xlabel: Text for the x-axis label.
        plot_ylabel: Text for the y-axis label.
        percent: When true, write each segment's proportion (one decimal
            place, white text) at the centre of the segment.

    Returns:
        None. The chart is drawn on ``ax`` as a side effect.
    """
    proportions = pd.crosstab(df[col_x], df[col_hue], normalize="index")
    plotted = proportions.plot(kind="bar", stacked=True, color=colors, ax=ax)

    # Axis cosmetics: labels, no x tick marks, no grid.
    ax.set_xlabel(plot_xlabel)
    ax.set_ylabel(plot_ylabel)
    ax.tick_params(axis="x", which="both", length=0)
    ax.grid(False)

    # Detach whatever legend is currently attached to the axes.
    ax.legend_ = None

    # Hide the top/right borders, then paint every spine black.
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)
    for border in ax.spines.values():
        border.set_color("black")

    if percent:
        label_style = {
            "ha": "center",
            "va": "center",
            "fontsize": 10,
            "color": "white",
            "xytext": (0, 2),
            "textcoords": "offset points",
        }
        for segment in plotted.patches:
            seg_width = segment.get_width()
            seg_height = segment.get_height()
            left, bottom = segment.get_xy()
            centre = (left + seg_width / 2, bottom + seg_height / 2)
            ax.annotate(f"{seg_height:.1%}", centre, **label_style)

    # Re-apply the current title text at a vertical position of 0.5.
    ax.set_title(ax.get_title(), y=0.5)

In [8]:
class SplitCabin(BaseEstimator, TransformerMixin):
    """Split the ``Cabin`` column into ``CabinDeck``, ``CabinNumber`` and ``Side``."""

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` so the step can sit in a pipeline."""
        return self

    def transform(self, X):
        """
        Return a copy of ``X`` with the three cabin components added.

        ``Cabin`` values are split on ``"/"``; the first three pieces become
        ``CabinDeck``, ``CabinNumber`` and ``Side``. Rows whose ``Cabin`` is
        missing, or has fewer pieces, get missing values in the affected
        columns.

        Args:
            X: DataFrame with a string ``Cabin`` column.

        Returns:
            A new DataFrame; the input is left unmodified.
        """
        # Work on a copy so the caller's DataFrame is not changed in place.
        X = X.copy()

        # The split produces only as many columns as the longest value needs
        # (a batch where every Cabin is missing yields fewer than three), so
        # reindex to guarantee columns 0, 1 and 2 always exist.
        cabin_parts = (
            X["Cabin"].str.split("/", expand=True).reindex(columns=[0, 1, 2])
        )
        X["CabinDeck"] = cabin_parts[0]
        X["CabinNumber"] = cabin_parts[1]
        X["Side"] = cabin_parts[2]
        return X

In [10]:
class GroupFeatures(BaseEstimator, TransformerMixin):
    """Derive travel-group features from the ``PassengerId`` column."""

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` so the step can sit in a pipeline."""
        return self

    def transform(self, X):
        """
        Return a copy of ``X`` with group features added.

        Adds:
            GroupNumber: the part of ``PassengerId`` before the first ``"_"``,
                stored as a categorical.
            GroupSize: number of rows in ``X`` sharing that ``GroupNumber``
                (counted within the frame being transformed).
            TravellingSolo: ``True`` where ``GroupSize`` is below 2.

        Args:
            X: DataFrame with a string ``PassengerId`` column.

        Returns:
            A new DataFrame; the input is left unmodified.
        """
        # Work on a copy so the caller's DataFrame is not changed in place.
        X = X.copy()

        X["GroupNumber"] = (
            X["PassengerId"].str.split("_", expand=True)[0].astype("category")
        )
        # observed=True groups only categories that actually occur; the
        # row-aligned transform result is the same, and the call no longer
        # depends on the version-specific default for categorical groupers.
        X["GroupSize"] = X.groupby("GroupNumber", observed=True)[
            "PassengerId"
        ].transform("count")
        X["TravellingSolo"] = X["GroupSize"] < 2
        return X

In [12]:
class ExpenditureFeatures(BaseEstimator, TransformerMixin):
    """Aggregate the per-amenity spending columns into summary features."""

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` so the step can sit in a pipeline."""
        return self

    def transform(self, X):
        """
        Return a copy of ``X`` with spending aggregates added.

        Adds:
            TotalExpenditure: row sum of all five spending columns.
            ServiceExpenditure: row sum of ``RoomService``, ``Spa``, ``VRDeck``.
            ShoppingExpenditure: row sum of ``ShoppingMall``, ``FoodCourt``.
            NoSpending: ``True`` where ``TotalExpenditure`` equals 0.

        Row sums use the pandas default of skipping missing values, so a row
        with missing amounts is summed over the values that are present.

        Args:
            X: DataFrame with the five spending columns.

        Returns:
            A new DataFrame; the input is left unmodified.
        """
        service_cols = ["RoomService", "Spa", "VRDeck"]
        shopping_cols = ["ShoppingMall", "FoodCourt"]
        all_cols = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]

        # Work on a copy so the caller's DataFrame is not changed in place.
        X = X.copy()

        X["TotalExpenditure"] = X[all_cols].sum(axis=1)
        X["ServiceExpenditure"] = X[service_cols].sum(axis=1)
        X["ShoppingExpenditure"] = X[shopping_cols].sum(axis=1)
        X["NoSpending"] = X["TotalExpenditure"] == 0
        return X

In [14]:
class ExtractSurname(BaseEstimator, TransformerMixin):
    """Replace the ``Name`` column with a ``Surname`` column."""

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` so the step can sit in a pipeline."""
        return self

    def transform(self, X):
        """
        Return ``X`` without ``Name`` and with ``Surname`` added.

        ``Surname`` is the last space-separated token of ``Name``; rows with a
        missing ``Name`` get a missing ``Surname``.

        Args:
            X: DataFrame with a string ``Name`` column.

        Returns:
            A new DataFrame; the input is left unmodified.
        """
        # Compute the surname before touching any frame, then attach it to the
        # new frame returned by drop(), so nothing is written into the
        # caller's DataFrame.
        surname = X["Name"].str.split(" ").str[-1]
        X = X.drop(columns=["Name"])
        X["Surname"] = surname
        return X

In [16]:
class AgeBinning(BaseEstimator, TransformerMixin):
    """Bucket the ``Age`` column into labelled groups stored in ``AgeGroup``."""

    def __init__(self, bins, labels):
        """
        Args:
            bins: Bin edges, forwarded to ``pd.cut``.
            labels: Labels for the resulting bins, forwarded to ``pd.cut``.
        """
        self.bins = bins
        self.labels = labels

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` so the step can sit in a pipeline."""
        return self

    def transform(self, X):
        """
        Return a copy of ``X`` with an ``AgeGroup`` column added.

        Bins are left-closed and right-open (``right=False``), so an age equal
        to an edge falls into the bin that starts at that edge.

        Args:
            X: DataFrame with a numeric ``Age`` column.

        Returns:
            A new DataFrame; the input is left unmodified.
        """
        # Work on a copy so the caller's DataFrame is not changed in place.
        X = X.copy()
        X["AgeGroup"] = pd.cut(
            X["Age"], bins=self.bins, labels=self.labels, right=False
        )
        return X

In [18]:
class MissingDataRules(BaseEstimator, TransformerMixin):
    """
    Apply a fixed sequence of rule-based fills to a passenger DataFrame.

    The rules run in order and later rules see the results of earlier ones.
    ``transform`` reads the columns ``CryoSleep``, ``NoSpending``, ``Age``,
    ``GroupNumber``, ``HomePlanet``, ``CabinDeck``, ``Surname``, ``VIP``,
    ``Side`` and the eight spending columns listed in ``columns_to_update``;
    all of them must already be present in the input.
    """

    def __init__(self):
        """No parameters."""
        pass

    def fit(self, X, y=None):
        """Nothing is learned; return ``self``."""
        return self

    def transform(self, X):
        """
        Return a copy of ``X`` with the rules applied and ``Surname`` dropped.

        The input frame is not modified.
        """
        df = X.copy()

        # Spending columns (raw and aggregated) affected by the rules below
        columns_to_update = [
            "RoomService",
            "FoodCourt",
            "ShoppingMall",
            "Spa",
            "VRDeck",
            "TotalExpenditure",
            "ShoppingExpenditure",
            "ServiceExpenditure",
        ]

        # Where CryoSleep is True, every spending column is set to 0
        # (existing values are overwritten, not only missing ones)
        df.loc[df["CryoSleep"] == True, columns_to_update] = 0

        # Where NoSpending is False, a missing CryoSleep is filled with False
        df.loc[df["NoSpending"] == False, "CryoSleep"] = df.loc[
            df["NoSpending"] == False, "CryoSleep"
        ].fillna(False)

        # Where Age < 13, missing spending values are filled with 0
        df.loc[df["Age"] < 13, columns_to_update] = df.loc[
            df["Age"] < 13, columns_to_update
        ].fillna(0)

        # Missing HomePlanet is filled with the group's most common HomePlanet.
        # The per-group value is the first entry returned by mode(), or None
        # when mode() is empty (no usable value in the group).
        most_common_homeplanet = df.groupby("GroupNumber")["HomePlanet"].apply(
            lambda x: x.mode()[0] if not x.mode().empty else None
        )
        # Row-wise lookup: only rows with a missing HomePlanet are changed
        df["HomePlanet"] = df.apply(
            lambda row: (
                most_common_homeplanet[row["GroupNumber"]]
                if pd.isna(row["HomePlanet"])
                else row["HomePlanet"]
            ),
            axis=1,
        )

        # HomePlanet rules based on CabinDeck:
        # decks A, B, C, T -> "Europa", applied only where HomePlanet is missing
        df.loc[
            df["CabinDeck"].isin(["A", "B", "C", "T"]) & df["HomePlanet"].isna(),
            "HomePlanet",
        ] = "Europa"
        # deck G -> "Earth" for every deck-G row, including rows that already
        # have a HomePlanet.
        # NOTE(review): unlike the rule above this one overwrites existing
        # values - confirm that is intended.
        df.loc[df["CabinDeck"] == "G", "HomePlanet"] = "Earth"

        # Remaining missing HomePlanet is filled with the most common
        # HomePlanet among rows sharing the same Surname (same mode-or-None
        # logic as for groups)
        most_common_homeplanet = df.groupby("Surname")["HomePlanet"].apply(
            lambda x: x.mode()[0] if not x.mode().empty else None
        )
        # Rows with a missing Surname are skipped, so no lookup is attempted
        # with a missing key
        df["HomePlanet"] = df.apply(
            lambda row: (
                most_common_homeplanet[row["Surname"]]
                if pd.notna(row["Surname"]) and pd.isna(row["HomePlanet"])
                else row["HomePlanet"]
            ),
            axis=1,
        )

        # Drop the Surname; it is not used after this point
        df = df.drop(columns=["Surname"])

        # Missing VIP with Age < 19 is filled with False
        df.loc[(df["VIP"].isna()) & (df["Age"] < 19), "VIP"] = False
        # HomePlanet == "Earth" forces VIP to False for every such row,
        # whether or not VIP was missing
        df.loc[df["HomePlanet"] == "Earth", "VIP"] = False

        # Missing Side is filled with the group's most common Side
        # (first mode() entry, or None when mode() is empty)
        most_common_side = df.groupby("GroupNumber")["Side"].apply(
            lambda x: x.mode()[0] if not x.mode().empty else None
        )
        df["Side"] = df.apply(
            lambda row: (
                most_common_side[row["GroupNumber"]]
                if pd.isna(row["Side"])
                else row["Side"]
            ),
            axis=1,
        )

        return df

In [20]:
class NumericalDataInput(BaseEstimator, TransformerMixin):
    """
    Drop unwanted columns and impute missing numerical values with KNN.

    The numerical features are standardised before imputation and converted
    back to their original scale afterwards.
    """

    def __init__(self, columns_to_drop, numerical_features, k=5):
        """
        Args:
            columns_to_drop: Columns removed at the start of ``transform``.
            numerical_features: Columns that are scaled and imputed.
            k: Value passed to the ``KNN`` imputer as its ``k`` argument.
        """
        self.columns_to_drop = columns_to_drop
        self.numerical_features = numerical_features
        self.k = k
        self.scaler = StandardScaler()
        # NOTE(review): KNN is defined outside this block; it only needs to
        # expose fit_transform - confirm which library provides it.
        self.imputer = KNN(k=self.k)

    def fit(self, X, y=None):
        """Fit the scaler on the numerical features of ``X`` and return ``self``."""
        self.scaler.fit(X[self.numerical_features])
        return self

    def transform(self, X):
        """
        Return ``X`` without ``columns_to_drop`` and with numerical gaps imputed.

        The imputer is re-fitted on the data passed in (``fit_transform``);
        only the scaler carries state from ``fit``.

        Args:
            X: DataFrame containing ``columns_to_drop`` and
                ``numerical_features``.

        Returns:
            A new DataFrame; the input is left unmodified because ``drop``
            returns a new frame.
        """
        import contextlib

        features = self.numerical_features

        # Drop unwanted columns (also gives us a frame we are free to modify).
        X = X.drop(columns=self.columns_to_drop)

        # Scale the numerical features.
        X[features] = self.scaler.transform(X[features])

        # Silence anything the imputer prints. The context managers close the
        # devnull handle and restore sys.stdout even if imputation raises.
        with open(os.devnull, "w") as devnull, contextlib.redirect_stdout(devnull):
            X[features] = self.imputer.fit_transform(X[features])

        # Undo the scaling so the features are back on their original scale.
        X[features] = self.scaler.inverse_transform(X[features])

        return X

In [22]:
class CategoricalImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in categorical columns with each column's most frequent value."""

    def __init__(self, cat_cols):
        """
        Args:
            cat_cols: Names of the categorical columns to impute.
        """
        self.cat_cols = cat_cols
        self.imputer = SimpleImputer(strategy="most_frequent")

    def fit(self, X, y=None):
        """
        Learn the most frequent value of every column in ``cat_cols``.

        ``None`` entries are converted to ``np.nan`` first, exactly as in
        ``transform``, so that fitting and transforming see the same notion
        of "missing" and ``None`` is not counted as a category.
        """
        self.imputer.fit(X[self.cat_cols].replace({None: np.nan}))
        return self

    def transform(self, X):
        """
        Return ``X`` with ``None`` replaced by ``np.nan`` everywhere and the
        columns in ``cat_cols`` imputed.

        The input is left unmodified because ``replace`` returns a new frame.
        """
        # Normalise None to np.nan across the frame, then impute.
        X = X.replace({None: np.nan})
        X[self.cat_cols] = self.imputer.transform(X[self.cat_cols])
        return X

In [24]:
class CleanFinalVersion(BaseEstimator, TransformerMixin):
    """Final tidy-up step: remove columns, then put the rest in a fixed order."""

    def __init__(self, columns_to_drop, new_column_order):
        """
        Args:
            columns_to_drop: Columns removed from the frame.
            new_column_order: Column labels, in order, for the output frame.
        """
        self.columns_to_drop = columns_to_drop
        self.new_column_order = new_column_order

    def fit(self, X, y=None):
        """Stateless step; there is nothing to learn."""
        return self

    def transform(self, X):
        """Return ``X`` minus ``columns_to_drop``, reindexed to ``new_column_order``."""
        trimmed = X.drop(columns=self.columns_to_drop)
        return trimmed.reindex(columns=self.new_column_order)

In [26]:
def calculate_confidence_interval(data, z_score=1.96):
    """
    Return the mean of ``data`` with a symmetric confidence interval around it.

    The interval is ``mean +/- z_score * standard error``, where the standard
    error comes from ``scipy.stats.sem``.

    Args:
        data: One-dimensional sample that has a ``.mean()`` method and is
            accepted by ``scipy.stats.sem`` (e.g. a NumPy array or pandas
            Series).
        z_score: Multiplier applied to the standard error. The default 1.96
            is the usual normal-approximation value for a two-sided 95%
            interval.

    Returns:
        Tuple ``(mean, lower_ci, upper_ci)``.
    """
    mean = data.mean()
    margin_of_error = z_score * stats.sem(data)
    return mean, mean - margin_of_error, mean + margin_of_error