# Spaceship Titanic: Feature Selection
## Imports

In [1]:
import warnings
from pathlib import Path

import pandas as pd

In [2]:
warnings.simplefilter(action="ignore", category=FutureWarning)

## Read data

In [3]:
# The data is expected in ../input/spaceship-titanic relative to the current
# working directory.
data_dir = Path.cwd().parent / "input" / "spaceship-titanic"
# Fail fast and show the resolved path. is_dir() (rather than exists()) also
# rejects a regular file that happens to sit at that location.
assert data_dir.is_dir(), f"directory doesn't exist: {data_dir}"

In [4]:
# Load the imputed training set, indexed by PassengerId.
# Categorical columns get explicit dtypes so their levels are fixed up front;
# the level labels are listed as strings. Only `CompCntReduced` and the two
# `DiscretizedAge*` columns are declared as ordered categoricals.
train_csv_path = data_dir / "train_imputed.csv"
train_dtypes = {
    "CompCntReduced": pd.CategoricalDtype(["0", "1", "2", "3+"], ordered=True),
    "HomePlanetOrd": pd.CategoricalDtype(["0", "1", "2"]),
    "CabinDeckOrd": pd.CategoricalDtype(["0", "1", "2", "3", "4", "5"]),
    "DestinationOrd": pd.CategoricalDtype(["0", "1", "2"]),
    "DiscretizedAge4": pd.CategoricalDtype(["0", "1", "2", "3"], ordered=True),
    "DiscretizedAge5": pd.CategoricalDtype(["0", "1", "2", "3", "4"], ordered=True),
}
df_train = pd.read_csv(train_csv_path, index_col="PassengerId", dtype=train_dtypes)

# Preview the first ten passengers.
df_train.head(10)

Unnamed: 0_level_0,Alone,CompCntReduced,HomePlanetOrd,CryoSleep,CabinDeckOrd,CabinPort,DestinationOrd,DiscretizedAge4,DiscretizedAge5,VIP,PosRoomService,PosFoodCourt,PosShoppingMall,PosSpa,PosVRDeck,PTTotalSpent,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0001_01,True,0,1,False,0,True,2,3,3,False,False,False,False,False,False,-1.140331,False
0002_01,True,0,0,False,3,False,2,1,2,False,True,True,True,True,True,0.576497,True
0003_01,False,1,1,False,5,False,2,3,4,True,True,True,False,True,True,1.501611,False
0003_02,False,1,1,False,5,False,2,2,3,False,False,True,True,True,True,1.242461,False
0004_01,True,0,0,False,3,False,2,0,0,False,True,True,True,True,True,0.704061,True
0005_01,True,0,0,False,3,True,1,3,4,False,False,True,False,True,False,0.592626,True
0006_01,False,1,0,False,3,False,2,1,2,False,True,True,True,False,False,0.828008,True
0006_02,False,1,0,True,4,False,2,2,2,False,False,False,False,False,False,-1.140331,True
0007_01,True,0,0,False,3,False,2,2,3,False,False,True,True,True,False,0.681374,True
0008_01,False,2,1,True,0,True,0,0,0,False,False,False,False,False,False,-1.140331,True


In [5]:
# Print the per-column dtype and non-null count to confirm that the categorical
# dtypes requested at load time were applied and to spot any missing values.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8693 entries, 0001_01 to 9280_02
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   Alone            8693 non-null   bool    
 1   CompCntReduced   8693 non-null   category
 2   HomePlanetOrd    8693 non-null   category
 3   CryoSleep        8693 non-null   bool    
 4   CabinDeckOrd     8693 non-null   category
 5   CabinPort        8693 non-null   bool    
 6   DestinationOrd   8693 non-null   category
 7   DiscretizedAge4  8693 non-null   category
 8   DiscretizedAge5  8693 non-null   category
 9   VIP              8693 non-null   bool    
 10  PosRoomService   8693 non-null   bool    
 11  PosFoodCourt     8693 non-null   bool    
 12  PosShoppingMall  8693 non-null   bool    
 13  PosSpa           8693 non-null   bool    
 14  PosVRDeck        8693 non-null   bool    
 15  PTTotalSpent     8693 non-null   float64 
 16  Transported      8693 non-null   bool 

In [6]:
# Sanity check: the imputed training frame must not contain a single missing value.
assert not df_train.isna().any(axis=None)