In [52]:
import pandas as pd
import numpy as np

In [2]:
train_df = pd.read_csv("data/train.csv")

In [40]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [3]:
train_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [65]:
def calculate_features(df: pd.DataFrame) -> pd.DataFrame:
    def map_age(df):
        df.Age = df.Age.fillna(28) # 28 - mean of age
        df.loc[ df['Age'] <= 16, 'Age'] = 0
        df.loc[(df['Age'] > 16) & (df['Age'] <= 32), 'Age'] = 1
        df.loc[(df['Age'] > 32) & (df['Age'] <= 48), 'Age'] = 2
        df.loc[(df['Age'] > 48) & (df['Age'] <= 64), 'Age'] = 3
        df.loc[ df['Age'] > 64, 'Age'] = 4

    def map_vip(df):
        df["IsVIP"] = df["VIP"].apply(lambda x: 1 if x else 0)

    def map_cabin(df):
        df["Cabin_deck"] = df["Cabin"].apply(lambda x: x.split("/")[0] if type(x) == str else 'UNKNOWN')
        df["Cabin_num"] = df["Cabin"].apply(lambda x: x.split("/")[1] if type(x) == str else 0)
        #df["Cabin_side"] = df["Cabin"].apply(lambda x: x.split("/")[2] if type(x) == str else 'UNKNOWN')
        df["Cabin_side_is_port"] = df["Cabin"].apply(lambda x: (1 if x.split("/")[2] == 'P' else 0) if type(x) == str else 0)

    def map_numbers_to_categories(df, column: str, buckets: int):
        df[column].fillna(np.mean(df[column]))
        df[column] = pd.qcut(df[column], buckets, labels=False, duplicates="drop")

    mapped_features = [
        "Age",
        "IsVIP",
        #"Cabin_num",
        "Cabin_side_is_port",
    ]

    label_encoded_features = []

    numbers_to_categories_features = [
        "RoomService",
        "FoodCourt",
        "ShoppingMall",
        "Spa",
        "VRDeck",
    ]

    dummy_features = [
        "HomePlanet",
        "Destination",
        "Cabin_deck",
        #"Cabin_side",
    ]

    df_copy = df.copy()
    map_age(df_copy)
    map_vip(df_copy)
    map_cabin(df_copy)
    for c in numbers_to_categories_features:
        map_numbers_to_categories(df_copy, c, 10)
    features = numbers_to_categories_features + label_encoded_features + dummy_features + mapped_features
    return pd.get_dummies(df_copy[features], columns=dummy_features)

In [66]:
train_features_df = calculate_features(train_df)
train_features_df.head()

Unnamed: 0,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Age,IsVIP,Cabin_side_is_port,HomePlanet_Earth,HomePlanet_Europa,...,Destination_TRAPPIST-1e,Cabin_deck_A,Cabin_deck_B,Cabin_deck_C,Cabin_deck_D,Cabin_deck_E,Cabin_deck_F,Cabin_deck_G,Cabin_deck_T,Cabin_deck_UNKNOWN
0,0.0,0.0,0.0,0.0,0.0,2.0,0,1,0,1,...,1,0,1,0,0,0,0,0,0,0
1,1.0,0.0,1.0,2.0,1.0,1.0,0,0,1,0,...,1,0,0,0,0,0,1,0,0,0
2,1.0,3.0,0.0,3.0,1.0,3.0,1,0,0,1,...,1,1,0,0,0,0,0,0,0,0
3,0.0,3.0,2.0,3.0,2.0,2.0,0,0,0,1,...,1,1,0,0,0,0,0,0,0,0
4,2.0,1.0,2.0,2.0,0.0,0.0,0,0,1,0,...,1,0,0,0,0,0,1,0,0,0


In [67]:
from pandas_profiling import ProfileReport
profile = ProfileReport(train_features_df, title="spaceship-titanic train features report")
train_features_df["Transported"] = train_df["Transported"]
profile.to_file("data_profiling_features.html")

Summarize dataset: 100%|██████████| 39/39 [00:05<00:00,  7.00it/s, Completed]                                  
Generate report structure: 100%|██████████| 1/1 [00:07<00:00,  7.98s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.04s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 142.94it/s]
