In [1]:
# ------------------------------------------------------------------------------------------------------------------------------------------------------------
# Data Handling and Processing
import numpy as np
import pandas as pd
import math
from sklearn.impute import KNNImputer
from sklearn.utils import shuffle
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures, PowerTransformer
# ------------------------------------------------------------------------------------------------------------------------------------------------------------

# ------------------------------------------------------------------------------------------------------------------------------------------------------------
# Visualization 
import matplotlib.pyplot as plt
import seaborn as sns
import viztoolz as viz
import mltoolz as mlt
# ------------------------------------------------------------------------------------------------------------------------------------------------------------

# ------------------------------------------------------------------------------------------------------------------------------------------------------------
# Model Selection, Metrics & Evaluation
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, ConfusionMatrixDisplay

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
# ------------------------------------------------------------------------------------------------------------------------------------------------------------

# ------------------------------------------------------------------------------------------------------------------------------------------------------------
# Pipeline Construction 
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
# ------------------------------------------------------------------------------------------------------------------------------------------------------------

In [2]:
# Load the raw train/test splits from the project's data folder.
train = pd.read_csv('../data/raw/train.csv')
test = pd.read_csv('../data/raw/test.csv')

# Report both shapes in one call; sep='\n' places every piece on its own line,
# framed by 16-character dashed rules.
print(
    '-'*16,
    f'Train Set Shape:\n{train.shape}',
    '-'*16,
    f'Test Set Shape:\n{test.shape}',
    '-'*16,
    sep='\n',
)

----------------
Train Set Shape:
(8693, 14)
----------------
Test Set Shape:
(4277, 13)
----------------


---
### Initial view of train set DataFrame info

In [3]:
# Summarise the raw training frame. Judging by the output below, this project-local
# helper prints frame-level info (dtypes, memory, overall null percentage) and a
# per-column table with null counts, cardinality and a suggested variable type.
# NOTE(review): `mltoolz` is not visible here; confirm it does not modify `train`.
mlt.describe_and_suggest(train)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns)
dtypes: object(7), float64(6), bool(1)
memory usage: 3461.6 KB

Total Percentage of Null Values: 26.73%


Unnamed: 0,Data Type,Not-Null,Missing,Missing (%),Unique,Cardinality (%),Suggested Type
PassengerId,object,8693,0,0.0,8693,100.0,Categorical
HomePlanet,object,8492,201,2.31,3,0.03,Categorical
CryoSleep,object,8476,217,2.5,2,0.02,Binary
Cabin,object,8494,199,2.29,6560,75.46,Categorical
Destination,object,8511,182,2.09,3,0.03,Categorical
Age,float64,8514,179,2.06,80,0.92,Numerical Discrete
VIP,object,8490,203,2.34,2,0.02,Binary
RoomService,float64,8512,181,2.08,1273,14.64,Numerical Continuous
FoodCourt,float64,8510,183,2.11,1507,17.34,Numerical Continuous
ShoppingMall,float64,8485,208,2.39,1115,12.83,Numerical Continuous


---
### Functions required for imputations and transformations

In [4]:
# Transform 'PassengerId'
def transform_passengerId(df):
    """Derive group features from the 'PassengerId' column, modifying ``df`` in place.

    'PassengerId' is split on '_'. The first piece becomes 'GroupId'; the second
    piece, cast to float, becomes 'PassengerNumber'. 'GroupSize' is the number of
    rows sharing a 'GroupId', and 'InGroup' is 1 when that size exceeds 1, else 0.

    Returns the same DataFrame object that was passed in.
    """
    # Split once and reuse the pieces for both derived columns.
    id_parts = df['PassengerId'].str.split('_')
    df['GroupId'] = id_parts.str[0]
    df['PassengerNumber'] = id_parts.str[1].astype(float)

    # Map every row's group id to how often that id occurs in this frame.
    df['GroupSize'] = df['GroupId'].map(df['GroupId'].value_counts())
    df['InGroup'] = np.where(df['GroupSize'] > 1, 1, 0)
    return df

# Transform 'Cabin'
def transform_Cabin(df):
    """Split 'Cabin' into its parts and bucket the cabin number, modifying ``df`` in place.

    'Cabin' is split on '/': piece 0 becomes 'Deck', piece 1 (cast to float)
    becomes 'CabinNumber' and piece 2 becomes 'Side'. 'CabinPosition' assigns
    each cabin number to one of four equal-width bins labelled
    'Front', 'Second', 'Third', 'Back'.

    The bin edges are taken from the minimum and maximum 'CabinNumber' of the
    frame passed in, so two different frames can end up with different edges.

    Returns the same DataFrame object that was passed in.
    """
    cabin_parts = df['Cabin'].str.split('/')
    df['Deck'] = cabin_parts.str[0]
    df['CabinNumber'] = cabin_parts.str[1].astype(float)
    df['Side'] = cabin_parts.str[2]

    position_labels = ['Front', 'Second', 'Third', 'Back']
    lowest, highest = df['CabinNumber'].min(), df['CabinNumber'].max()
    # Four labels need five evenly spaced edges; include_lowest keeps the minimum in 'Front'.
    df['CabinPosition'] = pd.cut(
        df['CabinNumber'],
        bins=np.linspace(lowest, highest, len(position_labels) + 1),
        labels=position_labels,
        include_lowest=True,
    )
    return df

# Imputations for NaNs in 'HomePlanet'
def impute_homePlanet(df):
    """Fill missing 'HomePlanet' values using progressively coarser fallbacks.

    The frame is modified in place. Missing values are filled, in order, with:

    1. the most common 'HomePlanet' within the row's 'GroupId';
    2. the most common 'HomePlanet' within the row's 'Deck';
    3. for rows where 'VIP' is True (only if a 'VIP' column exists), the most
       common 'HomePlanet' among VIP rows;
    4. the most common 'HomePlanet' of the whole frame.

    A fallback that has no known value to offer is skipped instead of raising,
    so entries stay missing only when no step can supply a value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'HomePlanet', 'GroupId' and 'Deck'; 'VIP' is optional.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``.
    """
    def _most_common(values):
        # Series.mode() ignores NaN and is empty when every value is missing.
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else np.nan

    # Steps 1 and 2: per-group and per-deck modes, applied only to missing rows.
    for key in ('GroupId', 'Deck'):
        missing = df['HomePlanet'].isna()
        key_modes = df.groupby(key)['HomePlanet'].transform(_most_common)
        df.loc[missing, 'HomePlanet'] = key_modes[missing]

    # Step 3: VIP-specific mode. An explicit equality test yields a clean boolean
    # mask even when 'VIP' is an object column that still contains NaN.
    if 'VIP' in df.columns:
        is_vip = df['VIP'].eq(True)
        vip_modes = df.loc[is_vip, 'HomePlanet'].mode()
        if not vip_modes.empty:
            df.loc[is_vip & df['HomePlanet'].isna(), 'HomePlanet'] = vip_modes.iloc[0]

    # Step 4: overall mode. Assign the result back instead of calling
    # fillna(inplace=True) on the column selection, which does not update the
    # frame under pandas Copy-on-Write.
    overall_modes = df['HomePlanet'].mode()
    if not overall_modes.empty:
        df['HomePlanet'] = df['HomePlanet'].fillna(overall_modes.iloc[0])

    return df

# Proportional imputer for categorical columns
def proportional_imputer(df, impute_cols, random_state=None):
    """Fill missing categorical values by sampling from per-'HomePlanet' frequencies.

    For each column in ``impute_cols`` the relative frequency of every observed
    value is computed within each 'HomePlanet' group. A missing entry is then
    replaced by a value drawn at random, using those frequencies as
    probabilities. An entry is left as it is when the row's 'HomePlanet' is
    missing, when that planet has no frequency table, or when the table holds
    no usable probabilities.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing 'HomePlanet' and every column in ``impute_cols``.
        Modified in place.
    impute_cols : iterable of str
        Columns to impute, processed in the given order.
    random_state : int or numpy.random.Generator, optional
        Seed (or generator) that makes the draws reproducible. When omitted,
        numpy's global random state is used, as before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``.
    """
    rng = np.random if random_state is None else np.random.default_rng(random_state)

    for col in impute_cols:
        proportions = df.groupby('HomePlanet')[col].value_counts(normalize=True)
        # Per-planet probability tables, built on first use so the grouped
        # result is sliced once per planet rather than once per missing row.
        distributions = {}

        def group_distribution(group):
            if group not in distributions:
                table = None
                if group in proportions.index:
                    candidate = proportions.loc[group].dropna()
                    # An empty or all-zero table cannot be sampled from.
                    if not candidate.empty and candidate.sum() > 0:
                        table = candidate
                distributions[group] = table
            return distributions[group]

        def impute_values(row):
            if pd.isna(row[col]):
                group = row['HomePlanet']
                if pd.notna(group):
                    table = group_distribution(group)
                    if table is not None:
                        return rng.choice(np.asarray(table.index), p=table.values)
            return row[col]

        # Row-wise apply rebuilds the whole column, observed values included.
        df[col] = df.apply(impute_values, axis=1)
    return df

# KNN imputation for numerical columns
def knn_imputer(df, columns):
    """Fill missing values in ``columns`` with scikit-learn's ``KNNImputer``, in place.

    The imputer uses 5 neighbours and is fitted on the very columns it
    transforms; the imputed values overwrite those columns in ``df``.

    Returns the same DataFrame object that was passed in.
    """
    filled_values = KNNImputer(n_neighbors=5).fit_transform(df[columns])
    df[columns] = filled_values
    return df

# Create 'TotalSpent' feature
def create_totalSpent(df):
    """Add a 'TotalSpent' column holding the row-wise sum of the five spending columns.

    The columns summed are 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa'
    and 'VRDeck'. ``df`` is modified in place and returned.
    """
    spending_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    row_totals = df[spending_cols].sum(axis=1)
    df['TotalSpent'] = row_totals
    return df

# Create 'service_used' and 'big_spender' binary features
def create_serviceSpenders(df):
    """Add binary usage and big-spender flags for the spending columns, in place.

    For each of 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa' and 'VRDeck':
    '<col>_used' is 1 when the amount is greater than 0, else 0.
    For those five columns and 'TotalSpent': '<col>_big_spender' is 1 when the
    amount exceeds Q3 + 1.5 * (Q3 - Q1) of that column, else 0.

    Returns the same DataFrame object that was passed in.
    """
    service_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

    # Work out every upper fence before any flag column is added.
    upper_fences = {}
    for col in service_cols + ['TotalSpent']:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        upper_fences[col] = q3 + 1.5 * (q3 - q1)

    for col, fence in upper_fences.items():
        # 'TotalSpent' only gets a big-spender flag, not a usage flag.
        if col != 'TotalSpent':
            df[f'{col}_used'] = (df[col] > 0).astype(int)
        df[f'{col}_big_spender'] = (df[col] > fence).astype(int)

    return df

# Convert specific columns to integers
def convert_to_int(df):
    """Cast 'InGroup', 'CryoSleep', 'VIP' and 'Transported' to int where present.

    Columns missing from ``df`` are skipped. ``df`` is modified in place and returned.
    """
    candidates = ('InGroup', 'CryoSleep', 'VIP', 'Transported')
    present = [name for name in candidates if name in df.columns]
    for name in present:
        df[name] = df[name].astype(int)
    return df

# Drop unwanted columns
def drop_cols(df):
    """Remove the helper and raw columns that are no longer needed, in place.

    Drops 'PassengerNumber', 'GroupId', 'Cabin', 'CabinNumber' and 'Name';
    a missing column raises ``KeyError``. Returns the same DataFrame object.
    """
    df.drop(
        columns=['PassengerNumber', 'GroupId', 'Cabin', 'CabinNumber', 'Name'],
        inplace=True,  # the frame passed in is the one that must lose the columns
    )
    return df

# Main function to process DataFrame in order
def process_dataframe(df):
    """Run every transformation, imputation and feature step on ``df``, in order.

    Each step modifies the frame it receives and returns it, so the frame passed
    in is changed and is also the return value. Order matters: later steps read
    columns that earlier steps create ('GroupId', 'Deck', 'TotalSpent', ...),
    and the helper columns are dropped last.
    """
    return (
        df.pipe(transform_passengerId)
          .pipe(transform_Cabin)
          .pipe(impute_homePlanet)
          .pipe(
              proportional_imputer,
              impute_cols=['Destination', 'Deck', 'Side', 'CabinPosition', 'VIP', 'CryoSleep'],
          )
          .pipe(
              knn_imputer,
              columns=['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'],
          )
          .pipe(create_totalSpent)
          .pipe(create_serviceSpenders)
          .pipe(convert_to_int)
          .pipe(drop_cols)
    )

---
### Process train set and view info again

In [5]:
# process_dataframe changes the frame it is given and returns that same frame;
# binding the result makes the data flow explicit.
# Re-running this cell without reloading `train` fails, because 'Cabin' has
# already been dropped by the first run.
train = process_dataframe(train)
info_df = mlt.describe_and_suggest(train)
info_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 29 columns)
dtypes: int64(16), float64(7), object(6)
memory usage: 4315.5 KB

Total Percentage of Null Values: 0.00%


Unnamed: 0,Data Type,Not-Null,Missing,Missing (%),Unique,Cardinality (%),Suggested Type
PassengerId,object,8693,0,0.0,8693,100.0,Categorical
HomePlanet,object,8693,0,0.0,3,0.03,Categorical
CryoSleep,int64,8693,0,0.0,2,0.02,Binary
Destination,object,8693,0,0.0,3,0.03,Categorical
Age,float64,8693,0,0.0,131,1.51,Numerical Discrete
VIP,int64,8693,0,0.0,2,0.02,Binary
RoomService,float64,8693,0,0.0,1344,15.46,Numerical Continuous
FoodCourt,float64,8693,0,0.0,1566,18.01,Numerical Continuous
ShoppingMall,float64,8693,0,0.0,1180,13.57,Numerical Continuous
Spa,float64,8693,0,0.0,1396,16.06,Numerical Continuous


In [6]:
# Persist the processed training frame. `index` is left at its default, so the
# row index is written to the file as an extra, unnamed first column.
# NOTE(review): confirm downstream readers expect that column (e.g. read with
# index_col=0); otherwise consider passing index=False.
train.to_csv('../data/processed/train.csv')