In [31]:
# First code cell: import the libraries used throughout the notebook
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

pd.reset_option('^display.', silent=True)

In [14]:
def data_splitting(df: pd.DataFrame):
    """Split the cabin code into parts and add a per-passenger spending total.

    The input frame is not modified; all work is done on a copy.

    Parameters
    ----------
    df : pd.DataFrame
        Passenger table containing ``PassengerId``, ``HomePlanet``,
        ``CryoSleep``, ``Cabin``, ``Destination``, ``Age``, ``VIP``, ``Name``
        and the spending columns ``RoomService``, ``FoodCourt``,
        ``ShoppingMall``, ``Spa`` and ``VRDeck``. ``Transported`` is optional.

    Returns
    -------
    pd.DataFrame
        A new frame in which ``Cabin`` is replaced by ``Cabin_deck``,
        ``Cabin_num`` and ``Cabin_side``, missing spending values are 0 and
        ``Total_spending`` holds the sum of the five spending columns.
        ``Transported`` is kept as the last column when it is present.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    spending_cols = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]

    # Work on a copy so the caller's frame keeps its original columns and NaNs.
    df_new = df.copy()

    # handling the seat of passengers: the cabin string is split on '/' into three parts
    df_new[['Cabin_deck', 'Cabin_num', 'Cabin_side']] = df_new['Cabin'].str.split('/', expand=True)

    # handling spending of each passenger: a missing amount counts as 0
    df_new[spending_cols] = df_new[spending_cols].fillna(0)
    df_new['Total_spending'] = (
        df_new['RoomService'] + df_new['FoodCourt'] + df_new['ShoppingMall']
        + df_new['Spa'] + df_new['VRDeck']
    )

    output_cols = [
        'PassengerId', 'HomePlanet', 'CryoSleep',
        'Cabin_deck', 'Cabin_num', 'Cabin_side',
        'Destination', 'Age',
        'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Total_spending',
        'Name',
    ]
    # The target column only exists in labelled data.
    if 'Transported' in df_new.columns:
        output_cols.append('Transported')

    # Return an independent frame so later column assignments do not warn.
    return df_new[output_cols].copy()

In [15]:
def categorize_spending(df: pd.DataFrame):
    """Bin ``Total_spending`` into 'Low' / 'Medium' / 'High'.

    The bin edges are ``mean - 0.5 * std`` and ``mean + 0.5 * std`` of the
    ``Total_spending`` column of the frame passed in. The input frame is not
    modified.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a numeric ``Total_spending`` column.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` with an added ordered categorical column
        ``spending_Categ``.

    Raises
    ------
    KeyError
        If ``Total_spending`` is missing.
    """
    labels = ['Low', 'Medium', 'High']

    # Work on a copy so the caller's frame does not gain a column.
    df_new = df.copy()
    spending = df_new['Total_spending']

    mean_value = spending.mean()
    std_value = spending.std()

    if pd.isna(std_value) or std_value == 0:
        # With fewer than two values or constant spending the two inner edges
        # collapse (or are NaN) and pd.cut would reject the bins. Every present
        # value equals the mean, which belongs to the middle bin.
        collapsed = ['Medium' if present else None for present in spending.notna()]
        df_new['spending_Categ'] = pd.Categorical(collapsed, categories=labels, ordered=True)
        return df_new

    thresholds = [
        float('-inf'),
        mean_value - 0.5 * std_value,
        mean_value + 0.5 * std_value,
        float('inf')
    ]

    df_new['spending_Categ'] = pd.cut(spending, bins=thresholds, labels=labels)

    return df_new

In [16]:
def dropCols(df: pd.DataFrame):
    """Keep only the columns used as model features.

    Returns ``HomePlanet``, ``CryoSleep``, ``Cabin_deck``, ``Cabin_side``,
    ``Age`` and ``spending_Categ`` (in that order), followed by
    ``Transported`` when the input frame has it.
    """
    kept = [
        'HomePlanet', 'CryoSleep',
        'Cabin_deck', 'Cabin_side',
        'Age',
        'spending_Categ',
    ]

    # The target column is only present in labelled data.
    if 'Transported' in df.columns:
        kept.append('Transported')

    return df[kept]

In [17]:
def fill_null(df: pd.DataFrame):
    """Fill missing values in ``CryoSleep`` and ``Age``.

    Missing ``CryoSleep`` entries become ``False`` and missing ``Age`` entries
    become the mean of the ``Age`` column of the frame passed in. No other
    column is filled. The input frame is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with ``CryoSleep`` and ``Age`` columns.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` with the two columns filled.

    Raises
    ------
    KeyError
        If ``CryoSleep`` or ``Age`` is missing.
    """
    # Work on a copy so the caller's frame keeps its missing values.
    df_new = df.copy()

    df_new['CryoSleep'] = df_new['CryoSleep'].fillna(value=False)
    df_new['Age'] = df_new['Age'].fillna(df_new['Age'].mean())

    return df_new

In [18]:
def oneHot(df: pd.DataFrame):
    """One-hot encode the categorical feature columns.

    ``HomePlanet``, ``Cabin_deck``, ``Cabin_side`` and ``spending_Categ`` are
    encoded against fixed category lists, so every expected dummy column is
    present in the result even when a level does not occur in ``df`` (such a
    column is all zeros). Values outside the fixed lists, and missing values,
    produce a row of zeros for that feature. The input frame is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with ``CryoSleep``, ``HomePlanet``, ``Cabin_deck``,
        ``Cabin_side``, ``Age`` and ``spending_Categ``; ``Transported`` is
        optional.

    Returns
    -------
    pd.DataFrame
        ``CryoSleep`` (as bool), the dummy columns, ``Age`` and, when present
        in the input, ``Transported`` as the last column.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    levels = {
        'HomePlanet': ['Earth', 'Europa', 'Mars'],
        'Cabin_deck': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T'],
        'Cabin_side': ['P', 'S'],
        'spending_Categ': ['Low', 'Medium', 'High'],
    }

    # Work on a copy so the caller's frame keeps its original dtypes.
    encoded = df.copy()
    encoded['CryoSleep'] = encoded['CryoSleep'].astype(bool)

    # Fixing the categories up front makes get_dummies emit a column for every
    # level, including levels that never occur in this particular frame.
    for col, cats in levels.items():
        encoded[col] = pd.Categorical(encoded[col], categories=cats)

    # apply one-hot encoding to non-numeric data
    one_hot = pd.get_dummies(encoded)

    # reorder the dataframe
    ordered_cols = [
        'CryoSleep',
        'HomePlanet_Earth', 'HomePlanet_Europa', 'HomePlanet_Mars',
        'Cabin_deck_A', 'Cabin_deck_B', 'Cabin_deck_C',
        'Cabin_deck_D', 'Cabin_deck_E', 'Cabin_deck_F',
        'Cabin_deck_G', 'Cabin_deck_T',
        'Cabin_side_P', 'Cabin_side_S',
        'Age',
        'spending_Categ_Low', 'spending_Categ_Medium', 'spending_Categ_High',
    ]
    # The target column only exists in labelled data.
    if 'Transported' in df.columns:
        ordered_cols.append('Transported')

    return one_hot[ordered_cols]

In [19]:
def dataProcess(df: pd.DataFrame):
    """Run the full preprocessing pipeline on a raw passenger table.

    Stages, in order: ``data_splitting`` (cabin parts and total spending),
    ``categorize_spending`` (Low/Medium/High bins), ``dropCols`` (feature
    selection), ``fill_null`` (CryoSleep and Age) and ``oneHot`` (dummy
    encoding, which also casts ``CryoSleep`` to bool).

    Note that the spending bin edges and the Age fill value are computed from
    the frame passed in, so two frames processed separately use their own
    statistics.

    Parameters
    ----------
    df : pd.DataFrame
        Raw passenger table as read from the CSV file.

    Returns
    -------
    pd.DataFrame
        The encoded feature table (plus ``Transported`` when present in ``df``).
    """
    # Hand the pipeline a copy so the caller's raw frame is left exactly as loaded.
    df_new = data_splitting(df.copy())
    df_new = categorize_spending(df_new)
    df_new = dropCols(df_new)
    df_new = fill_null(df_new)
    # No explicit bool cast is needed here: fill_null has already replaced the
    # missing CryoSleep values and oneHot performs the cast itself.
    df_new = oneHot(df_new)

    return df_new

In [32]:
# Locations of the raw CSV files, relative to the notebook's working directory.
rawTrain_dataPath = '../spaceship-titanic_rawData/spaceship_train.csv'
rawTest_dataPath = '../spaceship-titanic_rawData/spaceship_test.csv'


# Load both raw tables.
df_public = pd.read_csv(rawTrain_dataPath)
df_private = pd.read_csv(rawTest_dataPath)

# Each table goes through the pipeline on its own, so the spending thresholds
# and the Age fill value are computed separately for each file.
pre_train = dataProcess(df_public)
pre_test = dataProcess(df_private)



In [34]:
# Export result
# The output files are named after this notebook.
# NOTE(review): `__vsc_ipynb_file__` appears to be provided by the VS Code
# notebook front end; under another front end this lookup raises KeyError.
filename = str(os.path.basename(globals()['__vsc_ipynb_file__'])).replace('.ipynb', '')

train_export = '../preprocess_train_dataset/' + filename + '_train.csv'
test_export = '../preprocess_test_dataset/' + filename + '_test.csv'

# to_csv does not create missing parent directories, so make sure they exist.
os.makedirs(os.path.dirname(train_export), exist_ok=True)
os.makedirs(os.path.dirname(test_export), exist_ok=True)

pre_train.to_csv(train_export, sep=',', encoding='utf-8', index=False)
pre_test.to_csv(test_export, sep=',', encoding='utf-8', index=False)