In [1]:
# ------------------------------------------------------------------------------------------------------------------------------------------------------------
# Data Handling and Processing
import numpy as np
import pandas as pd
import math
from sklearn.impute import KNNImputer
from sklearn.utils import shuffle
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures, PowerTransformer
from sklearn.base import BaseEstimator, TransformerMixin
# ------------------------------------------------------------------------------------------------------------------------------------------------------------

# ------------------------------------------------------------------------------------------------------------------------------------------------------------
# Visualization 
import matplotlib.pyplot as plt
import seaborn as sns
import viztoolz as viz
import mltoolz as mlt
# ------------------------------------------------------------------------------------------------------------------------------------------------------------

# ------------------------------------------------------------------------------------------------------------------------------------------------------------
# Model Selection, Metrics & Evaluation
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import RFE, SelectPercentile, mutual_info_classif
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report, ConfusionMatrixDisplay

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier, VotingClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings('ignore', category=ConvergenceWarning)
# ------------------------------------------------------------------------------------------------------------------------------------------------------------

# ------------------------------------------------------------------------------------------------------------------------------------------------------------
# Pipeline Construction 
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
# ------------------------------------------------------------------------------------------------------------------------------------------------------------

# ------------------------------------------------------------------------------------------------------------------------------------------------------------
# Model Handling
import joblib
# ------------------------------------------------------------------------------------------------------------------------------------------------------------

In [2]:
train = pd.read_csv('../data/raw/train.csv')

In [3]:
# Transform PassengerId into InGroup and GroupSize
class TransformPassengerId(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X = X.copy()
        X['GroupId'] = X['PassengerId'].str.split('_').str[0]
        X['PassengerNumber'] = X['PassengerId'].str.split('_').str[1].astype(float)
        group_counts = X['GroupId'].value_counts()
        X['GroupSize'] = X['GroupId'].map(group_counts)
        X['InGroup'] = np.where(X['GroupSize'] > 1, 1, 0)
        return X


# Transform Cabin into Deck, CabinPosition and Side 
class TransformCabin(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X = X.copy()
        X['Deck'] = X['Cabin'].str.split('/').str[0]
        X['CabinNumber'] = X['Cabin'].str.split('/').str[1].astype(float)
        X['Side'] = X['Cabin'].str.split('/').str[2]
        bin_edges = np.linspace(X['CabinNumber'].min(), X['CabinNumber'].max(), 5)
        X['CabinPosition'] = pd.cut(X['CabinNumber'],
                                    bins=bin_edges,
                                    labels=['Front','Second','Third','Back'],
                                    include_lowest=True)
        return X


# Impute HomePlanet
class ImputeHomePlanet(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X = X.copy()
        group_modes = X.groupby('GroupId')['HomePlanet'].transform(lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan)
        X.loc[X['HomePlanet'].isna(), 'HomePlanet'] = group_modes[X['HomePlanet'].isna()]

        deck_modes = X.groupby('Deck')['HomePlanet'].transform(lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan)
        X.loc[X['HomePlanet'].isna(), 'HomePlanet'] = deck_modes[X['HomePlanet'].isna()]

        if 'VIP' in X.columns:
            vip_mode_homePlanet = X.loc[X['VIP'] == True, 'HomePlanet'].mode().iloc[0]
            X.loc[X['VIP'] & X['HomePlanet'].isna(), 'HomePlanet'] = vip_mode_homePlanet

        X['HomePlanet'].fillna(X['HomePlanet'].mode().iloc[0], inplace=True)

        return X


# Create proportional imputer and impute
class ProportionalImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X = X.copy()
        impute_cols = ['Destination', 'Deck', 'Side', 'CabinPosition', 'VIP', 'CryoSleep']
        for col in impute_cols:
            proportions = X.groupby('HomePlanet')[col].value_counts(normalize=True)

            def impute_values(row):
                if pd.isna(row[col]):
                    group = row['HomePlanet']
                    if pd.notna(group) and group in proportions.index:
                        group_proportions = proportions.loc[group].dropna()
                        return np.random.choice(group_proportions.index, p=group_proportions.values)
                return row[col]
        
            X[col] = X.apply(impute_values, axis=1)
        return X


# KNN Imputer and transformer
class KNNImputerTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.columns = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
        self.imputer = KNNImputer(n_neighbors=5)
        self.imputer.fit(X[self.columns])
        return self
    
    def transform(self, X, y=None):
        X = X.copy()
        X[self.columns] = self.imputer.transform(X[self.columns])
        return X


# Create TotalSpent column
class CreateTotalSpent(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X = X.copy()
        X['TotalSpent'] = X[['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']].sum(axis=1)
        return X


# Convert binary classes to int 
class ToInt(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X = X.copy()
        cols = ['InGroup', 'CryoSleep', 'VIP', 'Transported']
        for col in cols:
            if col in X.columns:
                X[col] = X[col].astype(int)
        return X


# Drop unwanted columns
class DropColumns(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X = X.copy()
        droppers = ['PassengerNumber', 'GroupId', 'Cabin', 'CabinNumber', 'Name']
        X.drop(columns=droppers, inplace=True, errors='ignore')
        return X

In [6]:
target = 'Transported'

train = shuffle(train, random_state=42)
train, val = train_test_split(train, test_size=0.3, stratify=train[target], random_state=42)

X_train, y_train = train.drop(target, axis=1), train[target]
X_val, y_val = val.drop(target, axis=1), val[target]

---
## What about a stacking model?

could also try the final estimator as a voting classifier

In [7]:
rf_tuned = joblib.load('../models/rf_tuned.joblib')
lgbm_tuned = joblib.load('../models/lgbm_tuned.joblib')
xgbm_tuned = joblib.load('../models/xgbm_tuned.joblib')

In [28]:
droppers = ['PassengerNumber', 'GroupId', 'Cabin', 'CabinNumber', 'Name']
num_cols = ['Age','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck','TotalSpent']
cat_cols = ['HomePlanet','Destination','Deck','Side','CabinPosition','GroupSize']
bin_cols = ['InGroup','CryoSleep','VIP']
droppers = ['PassengerNumber', 'GroupId', 'Cabin', 'CabinNumber', 'Name']


preprocess = Pipeline([
    ('TransformPassengerId', TransformPassengerId()),
    ('TransformCabin', TransformCabin()),
    ('ImputeHomePlanet', ImputeHomePlanet()),
    ('ProportionalImputer', ProportionalImputer()),
    ('KNNImputer', KNNImputerTransformer()),
    ('CreateTotalSpent', CreateTotalSpent()),
    ('ToInt', ToInt()),
    ('DropColumns', DropColumns())
])

ensure_dataframe = FunctionTransformer(lambda X: pd.DataFrame(X, columns=X_train.columns) if not isinstance(X, pd.DataFrame) else X)

preprocess_transformer = ColumnTransformer(
    transformers=[
        ('make_dataframe', ensure_dataframe, slice(None)),
        ('preprocess_pipeline', preprocess, slice(None))
])

scaling = Pipeline([
    ('power', PowerTransformer(method='yeo-johnson')),
    ('scale', StandardScaler())
])

manipulator = ColumnTransformer(force_int_remainder_cols=False,
        transformers=[
                ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols),
                ('transformScale', scaling, num_cols),
                ('binary_pass', 'passthrough', bin_cols)],
                remainder = 'passthrough')

lr_manipulator = ColumnTransformer(force_int_remainder_cols=False,
        transformers=[
                ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols),
                ('transformScale', scaling, num_cols + bin_cols),
                #('drop', 'drop', droppers)
                ],
                remainder = 'passthrough')

rfe = RFE(estimator=RandomForestClassifier(random_state=42, n_estimators=100), n_features_to_select=8)
select_percentile = SelectPercentile(mutual_info_classif, percentile=50)

pipe = Pipeline([
    ('process', preprocess),
    ('mainpulate', manipulator),
    ('rfe', rfe),
    ('model', _)
])

In [29]:
xgbm_pipe = Pipeline([
    ('preprocess_transform', preprocess_transformer),
    ('mainpulate', manipulator),
    ('model', xgbm_tuned)
])

lgbm_pipe = Pipeline([
    ('preprocess_transform', preprocess_transformer),
    ('mainpulate', manipulator),
    ('model', lgbm_tuned)
])

rf_pipe = Pipeline([
    ('preprocess_transform', preprocess_transformer),
    ('mainpulate', manipulator),
    ('model', rf_tuned)
])

In [30]:
sc = StackingClassifier(
    estimators = [
        ('rf_tuned', rf_pipe),
        ('xgbm_tuned', xgbm_pipe)],
        final_estimator = lgbm_tuned)

sc_pipe = Pipeline([
    ('preprocess_transform', preprocess_transformer),
    ('mainpulate', manipulator),
    ('clf', sc)
])

In [31]:
mlt.cv_evaluate(sc_pipe, X_train, y_train, scoring=['balanced_accuracy','f1','roc_auc'], print_scores=False)

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/utils/_indexing.py", line 338, in _get_column_indices
    all_columns = X.columns
                  ^^^^^^^^^
AttributeError: 'numpy.ndarray' object has no attribute 'columns'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/pipeline.py", line 469, in fit
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/pipeline.py", line 406, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/joblib/memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/pipeline.py", line 1310, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/utils/_set_output.py", line 313, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/compose/_column_transformer.py", line 968, in fit_transform
    self._validate_column_callables(X)
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/compose/_column_transformer.py", line 536, in _validate_column_callables
    transformer_to_input_indices[name] = _get_column_indices(X, columns)
                                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/utils/_indexing.py", line 340, in _get_column_indices
    raise ValueError(
ValueError: Specifying the columns using strings is only supported for dataframes.


In [None]:
vc = VotingClassifier(
    estimators=[
        ('rf_tuned', rf_tuned),
        ('lgbm_tuned', xgbm_tuned)
        ('xgbm_tuned', xgbm_tuned)],
        voting='soft')

sc = StackingClassifier(
    estimators = [
        ('rf_tuned', 'rf_tuned'),
        ('lgbm_tuned', 'xgbm_tuned')],
        final_estimator = vc)

In [None]:
mlt.cv_evaluate(pipe, X_train, y_train, scoring=['balanced_accuracy','f1','roc_auc'], print_scores=False)