In [1]:
# ------------------------------------------------------------------------------------------------------------------------------------------------------------
# Data Handling and Processing
import numpy as np
import pandas as pd
import math
from sklearn.impute import KNNImputer
from sklearn.utils import shuffle
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures, PowerTransformer
# ------------------------------------------------------------------------------------------------------------------------------------------------------------

# ------------------------------------------------------------------------------------------------------------------------------------------------------------
# Visualization 
import matplotlib.pyplot as plt
import seaborn as sns
import viztoolz as viz
import mltoolz as mlt
# ------------------------------------------------------------------------------------------------------------------------------------------------------------

# ------------------------------------------------------------------------------------------------------------------------------------------------------------
# Model Selection, Metrics & Evaluation
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, ConfusionMatrixDisplay

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
# ------------------------------------------------------------------------------------------------------------------------------------------------------------

# ------------------------------------------------------------------------------------------------------------------------------------------------------------
# Pipeline Construction 
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
# ------------------------------------------------------------------------------------------------------------------------------------------------------------

In [2]:
train = pd.read_csv('../data/raw/train.csv')
test = pd.read_csv('../data/raw/test.csv')

print('-'*16)
print(f'Train Set Shape:\n{train.shape}')
print('-'*16)
print(f'Test Set Shape:\n{test.shape}')
print('-'*16)

----------------
Train Set Shape:
(8693, 14)
----------------
Test Set Shape:
(4277, 13)
----------------


In [3]:
mlt.describe_and_suggest(train)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns)
dtypes: object(7), float64(6), bool(1)
memory usage: 3461.6 KB

Total Percentage of Null Values: 26.73%


Unnamed: 0,Data Type,Not-Null,Missing,Missing (%),Unique,Cardinality (%),Suggested Type
PassengerId,object,8693,0,0.0,8693,100.0,Categorical
HomePlanet,object,8492,201,2.31,3,0.03,Categorical
CryoSleep,object,8476,217,2.5,2,0.02,Binary
Cabin,object,8494,199,2.29,6560,75.46,Categorical
Destination,object,8511,182,2.09,3,0.03,Categorical
Age,float64,8514,179,2.06,80,0.92,Numerical Discrete
VIP,object,8490,203,2.34,2,0.02,Binary
RoomService,float64,8512,181,2.08,1273,14.64,Numerical Continuous
FoodCourt,float64,8510,183,2.11,1507,17.34,Numerical Continuous
ShoppingMall,float64,8485,208,2.39,1115,12.83,Numerical Continuous


In [32]:
# Transform 'PassengerId'
def transform_passengerId(df):
    df['GroupId'] = df['PassengerId'].str.split('_').str[0]
    df['PassengerNumber'] = df['PassengerId'].str.split('_').str[1].astype(float)
    group_counts = df['GroupId'].value_counts()
    df['GroupSize'] = df['GroupId'].map(group_counts)
    df['InGroup'] = np.where(df['GroupSize'] > 1, 1, 0)
    return df

# Transform 'Cabin'
def transform_Cabin(df):
    df['Deck'] = df['Cabin'].str.split('/').str[0]
    df['CabinNumber'] = df['Cabin'].str.split('/').str[1].astype(float)
    df['Side'] = df['Cabin'].str.split('/').str[2]
    bin_edges = np.linspace(df['CabinNumber'].min(), df['CabinNumber'].max(), 5)
    df['CabinPosition'] = pd.cut(df['CabinNumber'],
                                 bins=bin_edges,
                                 labels=['Front','Second','Third','Back'],
                                 include_lowest=True)
    return df

# Imputations for NaNs in 'HomePlanet'
def impute_homePlanet(df):
    group_modes = df.groupby('GroupId')['HomePlanet'].transform(lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan)
    df.loc[df['HomePlanet'].isna(), 'HomePlanet'] = group_modes[df['HomePlanet'].isna()]

    deck_modes = df.groupby('Deck')['HomePlanet'].transform(lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan)
    df.loc[df['HomePlanet'].isna(), 'HomePlanet'] = deck_modes[df['HomePlanet'].isna()]

    if 'VIP' in df.columns:
        vip_mode_homePlanet = df.loc[df['VIP'] == True, 'HomePlanet'].mode().iloc[0]
        df.loc[df['VIP'] & df['HomePlanet'].isna(), 'HomePlanet'] = vip_mode_homePlanet

    df['HomePlanet'].fillna(df['HomePlanet'].mode().iloc[0], inplace=True)

    return df

# Proportional imputer for categorical columns
def proportional_imputer(df, impute_cols):
    for col in impute_cols:
        proportions = df.groupby('HomePlanet')[col].value_counts(normalize=True)

        def impute_values(row):
            if pd.isna(row[col]):
                group = row['HomePlanet']
                if pd.notna(group) and group in proportions.index:
                    group_proportions = proportions.loc[group].dropna()
                    return np.random.choice(group_proportions.index, p=group_proportions.values)
            return row[col]
        
        # Apply the impute function to each column
        df[col] = df.apply(impute_values, axis=1)
    return df

# KNN imputation for numerical columns
def knn_imputer(df, columns):
    imputer = KNNImputer(n_neighbors=5)
    df[columns] = imputer.fit_transform(df[columns])
    return df

# Create 'TotalSpent' feature
def create_totalSpent(df):
    df['TotalSpent'] = df[['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']].sum(axis=1)
    return df

# Convert specific columns to integers
def convert_to_int(df):
    for col in ['InGroup', 'CryoSleep', 'VIP', 'Transported']:
        if col in df.columns:
            df[col] = df[col].astype(int)
    return df

# Drop unwanted columns
def drop_cols(df):
    droppers = ['PassengerNumber','GroupId','Cabin','CabinNumber','Name']
    df.drop(droppers, axis=1, inplace=True)
    return df

# Main function to process DataFrame in order
def process_dataframe(df):
    df = transform_passengerId(df)
    df = transform_Cabin(df)
    df = impute_homePlanet(df)
    df = proportional_imputer(df, impute_cols=['Destination', 'Deck', 'Side', 'CabinPosition', 'VIP', 'CryoSleep'])
    df = knn_imputer(df, columns=['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'])
    df = create_totalSpent(df)
    df = convert_to_int(df)
    return df

In [5]:
process_dataframe(train)
mlt.describe_and_suggest(train)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 18 columns)
dtypes: float64(7), object(6), int64(5)
memory usage: 3568.5 KB

Total Percentage of Null Values: 0.00%


Unnamed: 0,Data Type,Not-Null,Missing,Missing (%),Unique,Cardinality (%),Suggested Type
PassengerId,object,8693,0,0.0,8693,100.0,Categorical
HomePlanet,object,8693,0,0.0,3,0.03,Categorical
CryoSleep,int64,8693,0,0.0,2,0.02,Binary
Destination,object,8693,0,0.0,3,0.03,Categorical
Age,float64,8693,0,0.0,131,1.51,Numerical Discrete
VIP,int64,8693,0,0.0,2,0.02,Binary
RoomService,float64,8693,0,0.0,1344,15.46,Numerical Continuous
FoodCourt,float64,8693,0,0.0,1566,18.01,Numerical Continuous
ShoppingMall,float64,8693,0,0.0,1180,13.57,Numerical Continuous
Spa,float64,8693,0,0.0,1396,16.06,Numerical Continuous


In [55]:
# Transform 'PassengerId'
def transform_passengerId(df):
    df['GroupId'] = df['PassengerId'].str.split('_').str[0]
    df['PassengerNumber'] = df['PassengerId'].str.split('_').str[1].astype(float)
    group_counts = df['GroupId'].value_counts()
    df['GroupSize'] = df['GroupId'].map(group_counts)
    df['InGroup'] = np.where(df['GroupSize'] > 1, 1, 0)
    return df

# Transform 'Cabin'
def transform_Cabin(df):
    df['Deck'] = df['Cabin'].str.split('/').str[0]
    df['CabinNumber'] = df['Cabin'].str.split('/').str[1].astype(float)
    df['Side'] = df['Cabin'].str.split('/').str[2]
    bin_edges = np.linspace(df['CabinNumber'].min(), df['CabinNumber'].max(), 5)
    df['CabinPosition'] = pd.cut(df['CabinNumber'],
                                 bins=bin_edges,
                                 labels=['Front','Second','Third','Back'],
                                 include_lowest=True)
    return df

# Imputations for NaNs in 'HomePlanet'
def impute_homePlanet(df):
    group_modes = df.groupby('GroupId')['HomePlanet'].transform(lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan)
    df.loc[df['HomePlanet'].isna(), 'HomePlanet'] = group_modes[df['HomePlanet'].isna()]

    deck_modes = df.groupby('Deck')['HomePlanet'].transform(lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan)
    df.loc[df['HomePlanet'].isna(), 'HomePlanet'] = deck_modes[df['HomePlanet'].isna()]

    vip_mode_homePlanet = df.loc[df['VIP'] == True, 'HomePlanet'].mode().iloc[0]
    df.loc[df['VIP'] & df['HomePlanet'].isna(), 'HomePlanet'] = vip_mode_homePlanet

    df['HomePlanet'].fillna(df['HomePlanet'].mode().iloc[0], inplace=True)

    return df

# Create a proportional imputer
def proportional_imputer(df, impute_cols):
    for col in impute_cols:
        proportions = df.groupby('HomePlanet')[col].value_counts(normalize=True)

        def impute_values(row):
            if pd.isna(row[col]):
                group = row['HomePlanet']
                if pd.notna(group) and group in proportions.index:
                    group_proportions = proportions.loc[group].dropna()
                    return np.random.choice(group_proportions.index, p=group_proportions.values)
                return row[col]
            
        df[col] = df.apply(impute_values, axis=1)
    return df

def knn_imputer(df, columns):
    imputer = KNNImputer(n_neighbors=5)
    df[columns] = imputer.fit_transform(df[columns])
    return df

# Create a 'TotalSPent' feature
def create_totalSpent(df):
    df['TotalSpent'] = df[['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']].sum(axis=1)
    return df

# Convert data types
def convert_to_int(df):
    for col in ['InGroup', 'CryoSleep', 'VIP', 'Transported']:
        if col in df.columns:
            df[col] = df[col].astype(int)
    return df

In [33]:
target = 'Transported'

train = shuffle(train, random_state=42)
train, val = train_test_split(train, test_size=0.2, stratify=train[target], random_state=42)

X_train, y_train = train.drop(target, axis=1), train[target]
X_val, y_val = val.drop(target, axis=1), val[target]

In [None]:
cat_cols = info_df[info_df['Suggested Type'].isin(['Categorical','Binary'])].index.to_list()
num_cols = [col for col in train.columns if col not in cat_cols]
cat_cols.remove('Transported')

In [34]:
# The following variables will be used in the final pipelines
proportional_impute_cols = ['Destination','Deck','Side','CabinPosition','VIP','CryoSleep']
knnimputer_cols = ['Age','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']
ohe_cols = ['HomePlanet','Destination','Deck','Side','CabinPosition',,'InGroup','VIP','CryoSleep','GroupSize']
scaler_cols = ['Age','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck','TotalSpent']
polynomial_feats = ['TotalSpent']
droppers = ['PassengerNumber','GroupId','Cabin','CabinNumber','Name']

In [38]:
manipulator = ColumnTransformer(
        transformers=[
                ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False), ohe_cols),
                ('transform_distribution', PowerTransformer(method='yeo-johnson'), scaler_cols),
                ('scale', StandardScaler(), scaler_cols)],
                remainder = 'drop')

pipeline = Pipeline([
    ('process', FunctionTransformer(process_dataframe)),
    ('drop', FunctionTransformer(drop_cols)),
    ('manipulate', manipulator),
    ('lr', LogisticRegression())])

pipeline.fit(X_train, y_train)

KeyError: 'Cabin'

In [13]:
pipeline.get_feature_names_out

<bound method Pipeline.get_feature_names_out of Pipeline(steps=[('process',
                 FunctionTransformer(func=<function process_dataframe at 0x12d28fb00>)),
                ('manipulate',
                 ColumnTransformer(transformers=[('ohe',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse_output=False),
                                                  ['HomePlanet', 'Destination',
                                                   'Deck', 'Side',
                                                   'CabinPosition']),
                                                 ('transform_distribution',
                                                  PowerTransformer(),
                                                  ['Age', 'RoomService',
                                                   'FoodCourt', 'ShoppingMall',
                                                   'Spa'

In [20]:
lr = LogisticRegression(C=1)
rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
lgbm = LGBMClassifier(n_estimators=100, max_depth=5, random_state=42)
xgbm = XGBClassifier(n_estimators=100, max_depth=5, random_state=42)
catb = CatBoostClassifier(iterations=100, depth=5, cat_features=ohe_cols, l2_leaf_reg=2)
svm = SVC(C=1, kernel='sigmoid', gamma='scale')

models = {'lr':lr, 'rf':rf, 'lgbm':lgbm, 'xgbm':xgbm, 'catb':catb}

---
---
## Pipeline

1. Create functions for use with FunctionTransformers

In [42]:
# Transform 'PassengerId'
def transform_passengerId(df):
    df['GroupId'] = df['PassengerId'].str.split('_').str[0]
    df['PassengerNumber'] = df['PassengerId'].str.split('_').str[1].astype(float)
    group_counts = df['GroupId'].value_counts()
    df['GroupSize'] = df['GroupId'].map(group_counts)
    df['InGroup'] = np.where(df['GroupSize'] > 1, 1, 0)
    return df

# Transform 'Cabin'
def transform_Cabin(df):
    df['Deck'] = df['Cabin'].str.split('/').str[0]
    df['CabinNumber'] = df['Cabin'].str.split('/').str[1].astype(float)
    df['Side'] = df['Cabin'].str.split('/').str[2]
    bin_edges = np.linspace(df['CabinNumber'].min(), df['CabinNumber'].max(), 5)
    df['CabinPosition'] = pd.cut(df['CabinNumber'],
                                 bins=bin_edges,
                                 labels=['Front','Second','Third','Back'],
                                 include_lowest=True)
    return df

# Imputations for NaNs in 'HomePlanet'
def impute_homePlanet(df):
    group_modes = df.groupby('GroupId')['HomePlanet'].transform(lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan)
    df.loc[df['HomePlanet'].isna(), 'HomePlanet'] = group_modes[df['HomePlanet'].isna()]

    deck_modes = df.groupby('Deck')['HomePlanet'].transform(lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan)
    df.loc[df['HomePlanet'].isna(), 'HomePlanet'] = deck_modes[df['HomePlanet'].isna()]

    vip_mode_homePlanet = df.loc[df['VIP'] == True, 'HomePlanet'].mode().iloc[0]
    df.loc[df['VIP'] & df['HomePlanet'].isna(), 'HomePlanet'] = vip_mode_homePlanet

    df['HomePlanet'].fillna(df['HomePlanet'].mode().iloc[0], inplace=True)

    return df

# Create a proportional imputer
def proportional_imputer(df, impute_cols):
    for col in impute_cols:
        proportions = df.groupby('HomePlanet')[col].value_counts(normalize=True)

        def impute_values(row):
            if pd.isna(row[col]):
                group = row['HomePlanet']
                if pd.notna(group) and group in proportions.index:
                    group_proportions = proportions.loc[group].dropna()
                    return np.random.choice(group_proportions.index, p=group_proportions.values)
                return row[col]
            
        df[col] = df.apply(impute_values, axis=1)
    return df

def knn_imputer(df, columns):
    imputer = KNNImputer(n_neighbors=5)
    df[columns] = imputer.fit_transform(df[columns])
    return df

# Create a 'TotalSPent' feature
def create_totalSpent(df):
    df['TotalSpent'] = df[['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']].sum(axis=1)
    return df

# Convert data types
def convert_to_int(df):
    for col in ['InGroup', 'CryoSleep', 'VIP', 'Transported']:
        if col in df.columns:
            df[col] = df[col].astype(int)
    return df

2. Created a function to differentially select / add pipeline elements depending on the model being added to the pipeline

In [43]:
process_pipe = Pipeline(steps=[
        ('transform_passengerId', FunctionTransformer(transform_passengerId)),
        ('cabin_transformer', FunctionTransformer(transform_Cabin)),
        ('homeplanet_imputer', FunctionTransformer(impute_homePlanet)),
        ('proportional_imputer', FunctionTransformer(proportional_imputer, kw_args={'impute_cols':proportional_impute_cols})),
        ('knn_imputer', FunctionTransformer(knn_imputer, kw_args={'columns':knnimputer_cols})),
        ('create_totalSpent', FunctionTransformer(create_totalSpent)),
        ('dtype_conversions', FunctionTransformer(convert_to_int)),
        ('drop_cols', FunctionTransformer(lambda df: df.drop(['PassengerNumber','GroupId','Cabin','CabinNumber','Name'], axis=1, errors='ignore')))])

manipulator = ColumnTransformer(
        transformers=[
                ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False), ohe_cols),
                ('transform_distribution', PowerTransformer(method='yeo-johnson'), scaler_cols),
                ('scale', StandardScaler(), scaler_cols)],
                remainder = 'drop')

pipe_fin = Pipeline(steps=[
    ('process', process_pipe),
    ('manipulator', manipulator),
    ('mdel', rf)])

In [21]:
def get_pipeline(models):

    pipelines = {}

    for name, model in models.items():
        
        process_transformer = FunctionTransformer(process_dataframe)
        drop_cols = FunctionTransformer(drop_cols)

        if model.__class__.__name__ in ['LogisticRegression', 'SVC']:
            manipulator = ColumnTransformer(
                transformers=[
                ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False), ohe_cols),
                ('transform_distribution', PowerTransformer(method='yeo-johnson'), scaler_cols),
                ('scale', StandardScaler(), scaler_cols)],
                #('polynomials', PolynomialFeatures(degree=2, include_bias=False), polynomial_feats)
                remainder = 'passthrough')
            
        elif model.__class__.__name__ == 'CatBoostClassifier':
            manipulator = ColumnTransformer(
                transformers=[
                    ('scale', StandardScaler(), scaler_cols)],
                    remainder = 'passthrough')
            
        else:
            manipulator = ColumnTransformer(
                transformers=[
                    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False), ohe_cols),
                    ('scale', StandardScaler(), scaler_cols)],
                    remainder = 'passthrough')

        pipeline = Pipeline(steps=[
            ('preprocessor', process_transformer),
            ('manipulate', manipulator),
            ('drop', drop_cols),
            ('model', model)
            ])
        
        pipelines[name] = pipeline

    return pipelines

In [22]:
pipelines = get_pipeline(models)

UnboundLocalError: cannot access local variable 'drop_cols' where it is not associated with a value

In [9]:
pipelines['rf'].fit(X_train, y_train)

ValueError: could not convert string to float: 'F/830/S'

In [13]:
scores = cross_val_score(pipelines['rf'], X_train, y_train, cv=5, scoring='accuracy')

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/indexes/base.py", line 3802, in get_loc
    return self._engine.get_loc(casted_key)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "pandas/_libs/index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 165, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 5745, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 5753, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'Cabin'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/pipeline.py", line 469, in fit
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/pipeline.py", line 406, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/joblib/memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/pipeline.py", line 1310, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/utils/_set_output.py", line 313, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 1101, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/utils/_set_output.py", line 313, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/preprocessing/_function_transformer.py", line 252, in transform
    out = self._transform(X, func=self.func, kw_args=self.kw_args)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/preprocessing/_function_transformer.py", line 379, in _transform
    return func(X, **(kw_args if kw_args else {}))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/50/1z4vl3_96lj7wvqp_4kxvlm80000gn/T/ipykernel_9628/3222725985.py", line 82, in process_dataframe
    df = transform_Cabin(df)
         ^^^^^^^^^^^^^^^^^^^
  File "/var/folders/50/1z4vl3_96lj7wvqp_4kxvlm80000gn/T/ipykernel_9628/3222725985.py", line 12, in transform_Cabin
    df['Deck'] = df['Cabin'].str.split('/').str[0]
                 ~~^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/frame.py", line 3807, in __getitem__
    indexer = self.columns.get_loc(key)
              ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/core/indexes/base.py", line 3804, in get_loc
    raise KeyError(key) from err
KeyError: 'Cabin'
