In [9]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

from sklearn.impute import KNNImputer

In [3]:
# Transform PassengerId into InGroup and GroupSize
class TransformPassengerId(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X = X.copy()
        X['GroupId'] = X['PassengerId'].str.split('_').str[0]
        X['PassengerNumber'] = X['PassengerId'].str.split('_').str[1].astype(float)
        group_counts = X['GroupId'].value_counts()
        X['GroupSize'] = X['GroupId'].map(group_counts)
        X['InGroup'] = np.where(X['GroupSize'] > 1, 1, 0)
        return X


# Transform Cabin into Deck, CabinPosition and Side 
class TransformCabin(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X = X.copy()
        X['Deck'] = X['Cabin'].str.split('/').str[0]
        X['CabinNumber'] = X['Cabin'].str.split('/').str[1].astype(float)
        X['Side'] = X['Cabin'].str.split('/').str[2]
        bin_edges = np.linspace(X['CabinNumber'].min(), X['CabinNumber'].max(), 5)
        X['CabinPosition'] = pd.cut(X['CabinNumber'],
                                    bins=bin_edges,
                                    labels=['Front','Second','Third','Back'],
                                    include_lowest=True)
        return X


# Impute HomePlanet
class ImputeHomePlanet(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X = X.copy()
        group_modes = X.groupby('GroupId')['HomePlanet'].transform(lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan)
        X.loc[X['HomePlanet'].isna(), 'HomePlanet'] = group_modes[X['HomePlanet'].isna()]

        deck_modes = X.groupby('Deck')['HomePlanet'].transform(lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan)
        X.loc[X['HomePlanet'].isna(), 'HomePlanet'] = deck_modes[X['HomePlanet'].isna()]

        if 'VIP' in X.columns:
            vip_mode_homePlanet = X.loc[X['VIP'] == True, 'HomePlanet'].mode().iloc[0]
            X.loc[X['VIP'] & X['HomePlanet'].isna(), 'HomePlanet'] = vip_mode_homePlanet

        X['HomePlanet'].fillna(X['HomePlanet'].mode().iloc[0], inplace=True)

        return X


# Create proportional imputer and impute
class ProportionalImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X = X.copy()
        impute_cols = ['Destination', 'Deck', 'Side', 'CabinPosition', 'VIP', 'CryoSleep']
        for col in impute_cols:
            proportions = X.groupby('HomePlanet')[col].value_counts(normalize=True)

            def impute_values(row):
                if pd.isna(row[col]):
                    group = row['HomePlanet']
                    if pd.notna(group) and group in proportions.index:
                        group_proportions = proportions.loc[group].dropna()
                        return np.random.choice(group_proportions.index, p=group_proportions.values)
                return row[col]
        
            X[col] = X.apply(impute_values, axis=1)
        return X


# KNN Imputer and transformer
class KNNImputerTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.columns = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
        self.imputer = KNNImputer(n_neighbors=5)
        self.imputer.fit(X[self.columns])
        return self
    
    def transform(self, X, y=None):
        X = X.copy()
        X[self.columns] = self.imputer.transform(X[self.columns])
        return X


# Create TotalSpent column
class CreateTotalSpent(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X = X.copy()
        X['TotalSpent'] = X[['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']].sum(axis=1)
        return X


# Convert binary classes to int 
class ToInt(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X = X.copy()
        cols = ['InGroup', 'CryoSleep', 'VIP', 'Transported']
        for col in cols:
            if col in X.columns:
                X[col] = X[col].astype(int)
        return X


# Drop unwanted columns
class DropColumns(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X = X.copy()
        droppers = ['PassengerNumber', 'GroupId', 'Cabin', 'CabinNumber', 'Name']
        X.drop(columns=droppers, inplace=True, errors='ignore')
        return X

In [14]:
def transform_impute(X):
    transform_pipe = Pipeline([
        ('TransformPassengerId', TransformPassengerId()),
        ('TransformCabin', TransformCabin()),
        ('ImputeHomePlanet', ImputeHomePlanet()),
        ('ProportionalImputer', ProportionalImputer()),
        ('KNNImputer', KNNImputerTransformer()),
        ('CreateTotalSpent', CreateTotalSpent()),
        ('ToInt', ToInt()),
        ('DropColumns', DropColumns())
    ])
    return transform_pipe.fit_transform(X)

prep_pipe = FunctionTransformer(transform_impute)

num_cols = ['Age','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck','TotalSpent']
cat_cols = ['HomePlanet','Destination','Deck','Side','CabinPosition','GroupSize']
bin_cols = ['InGroup','CryoSleep','VIP']

num_cols_pipe = Pipeline([
        ('power', PowerTransformer(method='yeo-johnson')),
        ('scaler', StandardScaler())
])

cat_cols_pipe = Pipeline([
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

manipulator = ColumnTransformer([
    ('numeric_transforms', num_cols_pipe, num_cols),
    ('cat_transforms', cat_cols_pipe, cat_cols),
    ('binary_pass', 'passthrough', bin_cols)
])

full_pipe = Pipeline([
    ('transform_impute', prep_pipe),
    ('col_transform', manipulator),
    ('model', LogisticRegression(max_iter=1000))
])

In [7]:
train = pd.read_csv('../data/raw/train.csv')

target = 'Transported'

train = shuffle(train, random_state=42)
train, val = train_test_split(train, test_size=0.3, stratify=train[target], random_state=42)

X_train, y_train = train.drop(target, axis=1), train[target]
X_val, y_val = val.drop(target, axis=1), val[target]

In [12]:
full_pipe.fit(X_train, y_train)

In [13]:
full_pipe.get_feature_names_out

<bound method Pipeline.get_feature_names_out of Pipeline(steps=[('transform_impute',
                 FunctionTransformer(func=<function transform_impute at 0x1294bd580>)),
                ('col_tranform',
                 ColumnTransformer(transformers=[('numeric_transforms',
                                                  Pipeline(steps=[('power',
                                                                   PowerTransformer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['Age', 'RoomService',
                                                   'FoodCourt', 'ShoppingMall',
                                                   'Spa', 'VRDeck',
                                                   'TotalSpent']),
                                                 ('cat_transforms',
                                     

In [15]:
from catboost import CatBoostClassifier

In [18]:
class CatBoostWrapper(BaseEstimator, TransformerMixin):
    def __init__(self, cat_cols, **catboost_params):
        self.cat_cols = cat_cols
        self.catboost_params = catboost_params
        self.model = CatBoostClassifier(**catboost_params)

    def fit(self, X, y=None):
        self.cat_features_indices = [i for i, col in enumerate(X.columns) if any(cat in col for cat in self.cat_cols)]
        self.model.set_params(cat_features=self.cat_features_indices)
        self.mode.fit(X, y)
        return self
    
    def predict(self, X):
        return self.model.predict(X)
    
    def predict_proba(self, X):
        return self.model.predict_proba(X)

In [19]:
catb_manipulator = ColumnTransformer([
    ('numeric_transforms', num_cols_pipe, num_cols),
    ('ohe_pass', 'passthrough', cat_cols),
    ('binary_pass', 'passthrough', bin_cols)
])

catb_full_pipe = Pipeline([
    ('transform_impute', prep_pipe),
    ('col_transform', catb_manipulator),
    ('model', CatBoostWrapper(cat_cols=cat_cols))
])


In [21]:
def get_catboost_cols(X):
    transformed_features = catb_manipulator.get_feature_names_out()
    cat_idxs = [i for i, col in enumerate(transformed_features) if any(cat in col for cat in cat_cols)]
    return cat_idxs

def to_dataframe(X, columns):
    return pd.DataFrame(X, columns=columns)

In [20]:
catb_full_pipe.fit(X_train, y_train)

AttributeError: 'numpy.ndarray' object has no attribute 'columns'