## Feature Engineering

### Environment setting 

In [2]:
import pandas as pd
import numpy as np
import itertools
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

### Load data

In [3]:
# Read the training set from its CSV file into a DataFrame
train_csv_path = 'data/train.csv'
df_train = pd.read_csv(train_csv_path)

### Split 'Cabin' feature

In [4]:
# 'Cabin' takes the form deck/num/side (side is P for Port or S for Starboard);
# break it on '/' and store the three parts as separate columns.
cabin_parts = df_train['Cabin'].str.split('/', expand=True)
df_train[['Deck', 'Num', 'Side']] = cabin_parts

### Imputation

In [5]:
# Set every expense column to 0 for passengers flagged as CryoSleep
# and for passengers younger than 13.
expenses_features = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
in_cryosleep = df_train['CryoSleep'].eq(True)
under_13 = df_train['Age'].lt(13)
df_train.loc[in_cryosleep | under_13, expenses_features] = 0

In [6]:
# Simple imputation of the remaining missing values:
#   Age, expenses                                        --> median
#   VIP, Destination, HomePlanet, CryoSleep, Deck, Side  --> mode (most frequent)

# Define columns
num_features = ['Age'] + expenses_features
cat_features = ['VIP', 'Destination', 'HomePlanet', 'CryoSleep', 'Deck', 'Side']

# Create imputation pipelines
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_imputer, num_features),
        ('cat', cat_imputer, cat_features)
    ])

preprocessor.fit(df_train)

# Rebuild a frame from the transformer output. Reusing df_train's index keeps
# the write-back row-aligned even if df_train does not carry a default RangeIndex.
imputed = pd.DataFrame(
    preprocessor.transform(df_train),
    columns=num_features + cat_features,
    index=df_train.index,
)

# The transformer stacks the numeric and categorical outputs into one array,
# which comes back as object dtype; restore float for the numeric columns so
# later arithmetic (sums, scaling) operates on real numbers.
imputed[num_features] = imputed[num_features].astype(float)

# Apply to DataFrame
df_train[num_features + cat_features] = imputed

In [7]:
# TotalExpenses: row-wise sum over all expense columns
total_expenses = df_train[expenses_features].sum(axis='columns')
df_train['TotalExpenses'] = total_expenses

In [8]:
# Numerical standardization: fit a StandardScaler on Age, the expense columns
# and TotalExpenses, then overwrite those columns with the scaled values.
# TODO Transform log10 expenses features
num_features = ['Age', *expenses_features, 'TotalExpenses']
scaler_num = StandardScaler()
df_train[num_features] = scaler_num.fit_transform(df_train[num_features])

In [9]:
# Preview the first five rows after imputation and scaling
df_train.head(n=5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Deck,Num,Side,TotalExpenses
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,0.711945,False,-0.333105,-0.281027,-0.283579,-0.270626,-0.263003,Maham Ofracculy,False,B,0,P,-0.514066
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,-0.334037,False,-0.168073,-0.275387,-0.241771,0.217158,-0.224205,Juanna Vines,True,F,0,S,-0.251479
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,2.036857,True,-0.268001,1.959998,-0.283579,5.695623,-0.219796,Altark Susent,False,A,0,S,3.190333
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,0.293552,False,-0.333105,0.52301,0.336851,2.687176,-0.092818,Solam Susent,False,A,0,S,1.332604
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,-0.891895,False,0.125652,-0.237159,-0.031059,0.231374,-0.26124,Willy Santantines,True,F,1,S,-0.124824


In [10]:
# Categorical encoding: one-hot encode the categorical features and append the
# resulting indicator columns to df_train. With drop='if_binary', features that
# have exactly two categories are encoded as a single column.
cat_features = ['VIP', 'Destination', 'HomePlanet', 'CryoSleep', 'Deck', 'Side']
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='if_binary')

encoder.fit(df_train[cat_features])
encoded_data = encoder.transform(df_train[cat_features])

features_onehot = encoder.get_feature_names_out(cat_features)

# Carry df_train's index so the column-wise concat below aligns row by row
# instead of relying on both frames having a default RangeIndex.
df_encoded = pd.DataFrame(encoded_data, columns=features_onehot, index=df_train.index)

# Drop one-hot columns left over from a previous run of this cell so that
# re-running it does not append duplicate columns.
df_train = pd.concat(
    [df_train.drop(columns=list(features_onehot), errors='ignore'), df_encoded],
    axis=1,
)

In [None]:
# TODO New feature Age < 12 TRUE
# TODO New feature log10(RoomService) < 1 TRUE
# TODO Same for Spa, VRDeck and TotalExpenses
# TODO Save features into cache file

In [12]:
# Column marking decks other than B, C, E, F in the Deck feature:
# row-wise sum of the one-hot columns for decks A, D, G and T.
other_deck_columns = ['Deck_A', 'Deck_D', 'Deck_G', 'Deck_T']
df_train['Deck_notBCEF'] = df_train[other_deck_columns].sum(axis='columns')

In [11]:
# TODO Video Imputation Kaggle
# TODO 1. TotalExpenses vs expenses features / all vs sel
# TODO 2. Deck B / Deck feature: B, C, E, F + Others