## Feature Engineering

### Environment setup

In [1]:
import pandas as pd
import numpy as np
import os
import pickle
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

### Load data

In [2]:
# Read the raw training set into a DataFrame
train_csv_path = 'data/train.csv'
df_train = pd.read_csv(train_csv_path)

### Split 'Cabin' feature

In [3]:
# 'Cabin' is encoded as deck/num/side (side: P = Port, S = Starboard);
# break it into three separate columns.
cabin_parts = df_train['Cabin'].str.split('/', expand=True)
df_train[['Deck', 'Num', 'Side']] = cabin_parts

### Imputation

In [4]:
# Set every expense column to 0.0 for passengers who are in cryosleep
# or who are younger than 13.
expenses_features = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
in_cryosleep = df_train['CryoSleep'].eq(True)
under_13 = df_train['Age'].lt(13)
df_train.loc[in_cryosleep | under_13, expenses_features] = 0.0

In [5]:
# Simple imputation of the remaining missing values:
#   Age, expenses                                        --> median
#   VIP, Destination, HomePlanet, CryoSleep, Deck, Side  --> mode

# Columns handled by each strategy
num_features = ['Age'] + expenses_features
cat_features = ['VIP', 'Destination', 'HomePlanet', 'CryoSleep', 'Deck', 'Side']

# One imputer per column group
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_imputer, num_features),
        ('cat', cat_imputer, cat_features)
    ])

preprocessor.fit(df_train)

# The transformer returns one stacked array holding both the numeric and the
# categorical block, so the wrapping DataFrame comes back with object dtype.
# Keep df_train's own index so the column assignment lines up row by row,
# and restore a float dtype on the numeric columns before writing them back.
imputed = pd.DataFrame(
    preprocessor.transform(df_train),
    columns=num_features + cat_features,
    index=df_train.index,
)
imputed[num_features] = imputed[num_features].astype(float)
df_train[num_features + cat_features] = imputed

### New features

In [6]:
# Binary flag: 1.0 when the passenger is younger than 12, otherwise 0.0
df_train['Age12'] = (df_train['Age'] < 12).astype(float)

In [7]:
# Total spend per passenger: row-wise sum over all expense columns
df_train['TotalExpenses'] = df_train.loc[:, expenses_features].sum(axis='columns')

In [8]:
# log10(x + 1) of every expense column and of the total; each result is
# stored in a new column named 'log10_' + the source column name.
expenses_features_total = expenses_features + ['TotalExpenses']
expenses_features_log10 = ['log10_' + name for name in expenses_features_total]
shifted = (df_train[expenses_features_total] + 1).astype(np.float64)
df_train[expenses_features_log10] = np.log10(shifted)

In [9]:
# For each log10 expense feature, add a 1.0 / 0.0 flag telling whether the
# value is below 1; the flag column is the source name with a trailing '1'.
low_flags = df_train[expenses_features_log10].lt(1).astype(float).add_suffix('1')
df_train[list(low_flags.columns)] = low_flags

In [10]:
# Numerical standardization of Age, the raw expenses and the log10 expenses;
# the fitted scaler is kept in scaler_num.
num_features = ['Age'] + expenses_features + expenses_features_log10
scaler_num = StandardScaler()
df_train[num_features] = scaler_num.fit_transform(df_train[num_features])

In [11]:
# Categorical encoding: one-hot encode the categorical columns and append
# the resulting indicator columns to df_train.
cat_features = ['VIP', 'Destination', 'HomePlanet', 'CryoSleep', 'Deck', 'Side']
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='if_binary')

encoder.fit(df_train[cat_features])
encoded_data = encoder.transform(df_train[cat_features])

features_onehot = encoder.get_feature_names_out(cat_features)

# pd.concat(axis=1) aligns rows on index labels, so the encoded frame must
# carry df_train's index; a default RangeIndex would misalign rows (and
# introduce NaNs) whenever df_train's index is not 0..n-1.
df_encoded = pd.DataFrame(encoded_data, columns=features_onehot, index=df_train.index)
df_train = pd.concat([df_train, df_encoded], axis=1)

In [12]:
# Preview the first five rows, transposed so each feature is shown as a row
df_train.head(5).transpose()

Unnamed: 0,0,1,2,3,4
PassengerId,0001_01,0002_01,0003_01,0003_02,0004_01
HomePlanet,Europa,Earth,Europa,Europa,Earth
CryoSleep,False,False,False,False,False
Cabin,B/0/P,F/0/S,A/0/S,A/0/S,F/1/S
Destination,TRAPPIST-1e,TRAPPIST-1e,TRAPPIST-1e,TRAPPIST-1e,TRAPPIST-1e
Age,0.711945,-0.334037,2.036857,0.293552,-0.891895
VIP,False,False,True,False,False
RoomService,-0.333105,-0.168073,-0.268001,-0.333105,0.125652
FoodCourt,-0.281027,-0.275387,1.959998,0.52301,-0.237159
ShoppingMall,-0.283579,-0.241771,-0.283579,0.336851,-0.031059


### Save features

In [13]:
# Bundle the processed frame together with the fitted imputer, encoder and
# scaler, and pickle the bundle to cache/procData.pkl.
cache_data = dict(proc_data=df_train, preprocessor=preprocessor, encoder=encoder, scaler=scaler_num)
cache_file = "procData.pkl"
cache_dir = "cache"
# Create the cache directory on first run; open() would otherwise raise
# FileNotFoundError when it does not exist yet.
os.makedirs(cache_dir, exist_ok=True)
with open(os.path.join(cache_dir, cache_file), "wb") as f:
    pickle.dump(cache_data, f)
print("Wrote preprocessed data to cache file:", cache_file)

Wrote preprocessed data to cache file: procData.pkl
