# Spaceship Titanic
### Predict which passengers are transported to an alternate dimension



### Source link: https://www.kaggle.com/competitions/spaceship-titanic/overview

In [537]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from scipy.stats import chi2_contingency

### All binary columns in this notebook are encoded as integers: False = 0, True = 1

In [538]:
# Load the sample submission file (columns: PassengerId, Transported) and display it.
sample_submission = pd.read_csv('sample_submission.csv')
sample_submission

Unnamed: 0,PassengerId,Transported
0,0013_01,False
1,0018_01,False
2,0019_01,False
3,0021_01,False
4,0023_01,False
...,...,...
4272,9266_02,False
4273,9269_01,False
4274,9271_01,False
4275,9273_01,False


In [539]:
# Load the raw training set and display it.
train_data = pd.read_csv('train.csv')
train_data

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [540]:
# Count passengers per home planet; dropna=False keeps missing values as their own row.
train_data['HomePlanet'].value_counts(dropna=False)

HomePlanet
Earth     4602
Europa    2131
Mars      1759
NaN        201
Name: count, dtype: int64

In [541]:
# Count passengers per CryoSleep value; dropna=False keeps missing values as their own row.
train_data['CryoSleep'].value_counts(dropna=False)

CryoSleep
False    5439
True     3037
NaN       217
Name: count, dtype: int64

In [542]:
# Count passengers per destination; dropna=False keeps missing values as their own row.
train_data['Destination'].value_counts(dropna=False)

Destination
TRAPPIST-1e      5915
55 Cancri e      1800
PSO J318.5-22     796
NaN               182
Name: count, dtype: int64

In [543]:
# Count passengers per VIP value; dropna=False keeps missing values as their own row.
train_data['VIP'].value_counts(dropna=False)

VIP
False    8291
NaN       203
True      199
Name: count, dtype: int64

In [544]:
# Summary statistics (count, mean, std, quartiles, min/max) for the six numeric feature columns.
train_data[['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [545]:

# Encode the boolean columns as numbers (False -> 0, True -> 1).
# Values that are neither True nor False (i.e. missing entries) map to NaN.
for bool_col in ('Transported', 'VIP', 'CryoSleep'):
    train_data[bool_col] = train_data[bool_col].map({False: 0, True: 1})

In [546]:
# One-hot encode the two planet columns. Missing values are treated as a
# category of their own, which yields the `*_nan` indicator columns.
categorical_features = ['HomePlanet', 'Destination']
encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(train_data[categorical_features])
# transform() returns a sparse matrix; densify it so it can back a DataFrame.
encoded = encoder.transform(train_data[categorical_features]).toarray()

In [547]:
# Wrap the dense one-hot matrix in a frame that shares train_data's index,
# so the indicator columns line up row-for-row with the original data.
encoded_df = pd.DataFrame(
    data=encoded,
    index=train_data.index,
    columns=encoder.get_feature_names_out(categorical_features),
)

In [548]:
# Replace the raw planet columns with their one-hot indicators.
# Both frames share the same index, so the join is a plain column-wise append.
train_data_final = train_data.drop(columns=categorical_features).join(encoded_df)

In [549]:
# Normalise column names: lower-case them, then turn spaces, dots and hyphens
# into underscores (e.g. 'Destination_PSO J318.5-22' -> 'destination_pso_j318_5_22').
train_data_final.columns = (
    train_data_final.columns
    .str.lower()
    .str.replace(r"[ .-]", "_", regex=True)
)
train_data_final

Unnamed: 0,passengerid,cryosleep,cabin,age,vip,roomservice,foodcourt,shoppingmall,spa,vrdeck,name,transported,homeplanet_earth,homeplanet_europa,homeplanet_mars,homeplanet_nan,destination_55_cancri_e,destination_pso_j318_5_22,destination_trappist_1e,destination_nan
0,0001_01,0.0,B/0/P,39.0,0.0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0002_01,0.0,F/0/S,24.0,0.0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0003_01,0.0,A/0/S,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0003_02,0.0,A/0/S,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0004_01,0.0,F/1/S,16.0,0.0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,0.0,A/98/P,41.0,1.0,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
8689,9278_01,1.0,G/1499/S,18.0,0.0,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8690,9279_01,0.0,G/1500/S,26.0,0.0,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8691,9280_01,0.0,E/608/S,32.0,0.0,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [550]:
# Count how many passengers share each cabin string (missing cabins are excluded by default).
train_data_final['cabin'].value_counts()

cabin
G/734/S     8
F/1194/P    7
B/201/P     7
G/981/S     7
G/109/P     7
           ..
E/56/P      1
A/98/P      1
G/1499/S    1
G/1500/S    1
D/252/P     1
Name: count, Length: 6560, dtype: int64

In [551]:
# Split the 'deck/num/side' cabin string into three columns and drop the original.
# The pieces come from a string split, so `num` holds strings, not numbers.
train_data_final = train_data_final.drop(columns=['cabin']).join(
    train_data_final['cabin']
    .str.split('/', expand=True)
    .set_axis(['deck', 'num', 'side'], axis=1)
)

In [552]:
# Inspect the frame after `cabin` was replaced by `deck`, `num` and `side`.
train_data_final

Unnamed: 0,passengerid,cryosleep,age,vip,roomservice,foodcourt,shoppingmall,spa,vrdeck,name,...,homeplanet_europa,homeplanet_mars,homeplanet_nan,destination_55_cancri_e,destination_pso_j318_5_22,destination_trappist_1e,destination_nan,deck,num,side
0,0001_01,0.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,B,0,P
1,0002_01,0.0,24.0,0.0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,F,0,S
2,0003_01,0.0,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,A,0,S
3,0003_02,0.0,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,A,0,S
4,0004_01,0.0,16.0,0.0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,F,1,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,0.0,41.0,1.0,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,A,98,P
8689,9278_01,1.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,G,1499,S
8690,9279_01,0.0,26.0,0.0,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,G,1500,S
8691,9280_01,0.0,32.0,0.0,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,E,608,S


In [553]:
# One-hot encode the cabin deck and side. Note that this rebinds `encoder`
# and `encoded`, replacing the objects created for the planet columns.
categorical_feature_deck = ['deck', 'side']
encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(train_data_final[categorical_feature_deck])
# transform() returns a sparse matrix; densify it so it can back a DataFrame.
encoded = encoder.transform(train_data_final[categorical_feature_deck]).toarray()

In [554]:
# Wrap the dense deck/side indicators in a frame sharing train_data_final's index.
encoded_df = pd.DataFrame(
    data=encoded,
    index=train_data_final.index,
    columns=encoder.get_feature_names_out(categorical_feature_deck),
)

In [555]:
# Replace the raw `deck` and `side` columns with their one-hot indicators.
# Both frames share the same index, so the join is a plain column-wise append.
train_data_final = train_data_final.drop(columns=categorical_feature_deck).join(encoded_df)

In [556]:
# Inspect the frame after one-hot encoding `deck` and `side`.
train_data_final

Unnamed: 0,passengerid,cryosleep,age,vip,roomservice,foodcourt,shoppingmall,spa,vrdeck,name,...,deck_C,deck_D,deck_E,deck_F,deck_G,deck_T,deck_nan,side_P,side_S,side_nan
0,0001_01,0.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0002_01,0.0,24.0,0.0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0003_01,0.0,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0003_02,0.0,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0004_01,0.0,16.0,0.0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,0.0,41.0,1.0,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8689,9278_01,1.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
8690,9279_01,0.0,26.0,0.0,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
8691,9280_01,0.0,32.0,0.0,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [557]:
# Target column layout, built group by group so each one-hot family stays together.
desired_order = (
    # identifiers, target and demographics
    ['passengerid', 'transported', 'name', 'age']
    # binary flags
    + ['cryosleep', 'vip']
    # amounts per on-board service
    + ['roomservice', 'foodcourt', 'shoppingmall', 'spa', 'vrdeck']
    # home planet indicators
    + ['homeplanet_earth', 'homeplanet_europa', 'homeplanet_mars', 'homeplanet_nan']
    # destination indicators
    + ['destination_55_cancri_e', 'destination_pso_j318_5_22', 'destination_trappist_1e', 'destination_nan']
    # cabin deck indicators
    + ['deck_A', 'deck_B', 'deck_C', 'deck_D', 'deck_E', 'deck_F', 'deck_G', 'deck_T', 'deck_nan']
    # cabin number
    + ['num']
    # cabin side indicators
    + ['side_P', 'side_S', 'side_nan']
)

In [558]:
# Reorder the columns to the layout in `desired_order`
# (raises KeyError if any listed column is absent).
train_data_final = train_data_final.loc[:, desired_order]
train_data_final

Unnamed: 0,passengerid,transported,name,age,cryosleep,vip,roomservice,foodcourt,shoppingmall,spa,...,deck_D,deck_E,deck_F,deck_G,deck_T,deck_nan,num,side_P,side_S,side_nan
0,0001_01,0,Maham Ofracculy,39.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0.0
1,0002_01,1,Juanna Vines,24.0,0.0,0.0,109.0,9.0,25.0,549.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0,0.0,1.0,0.0
2,0003_01,0,Altark Susent,58.0,0.0,1.0,43.0,3576.0,0.0,6715.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,1.0,0.0
3,0003_02,0,Solam Susent,33.0,0.0,0.0,0.0,1283.0,371.0,3329.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,1.0,0.0
4,0004_01,1,Willy Santantines,16.0,0.0,0.0,303.0,70.0,151.0,565.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,0,Gravior Noxnuther,41.0,0.0,1.0,0.0,6819.0,0.0,1643.0,...,0.0,0.0,0.0,0.0,0.0,0.0,98,1.0,0.0,0.0
8689,9278_01,0,Kurta Mondalley,18.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1499,0.0,1.0,0.0
8690,9279_01,1,Fayey Connon,26.0,0.0,0.0,0.0,0.0,1872.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1500,0.0,1.0,0.0
8691,9280_01,0,Celeon Hontichre,32.0,0.0,0.0,0.0,1049.0,0.0,353.0,...,0.0,1.0,0.0,0.0,0.0,0.0,608,0.0,1.0,0.0


In [559]:
# Load the raw test set (same columns as the training set, minus Transported) and display it.
test_data = pd.read_csv('test.csv')
test_data

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,Jeron Peter
4273,9269_01,Earth,False,,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,Matty Scheron
4274,9271_01,Mars,True,D/296/P,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore
4275,9273_01,Europa,False,D/297/P,,,False,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale


In [560]:
# Re-display the reordered training frame (not modified since the reordering cell).
train_data_final

Unnamed: 0,passengerid,transported,name,age,cryosleep,vip,roomservice,foodcourt,shoppingmall,spa,...,deck_D,deck_E,deck_F,deck_G,deck_T,deck_nan,num,side_P,side_S,side_nan
0,0001_01,0,Maham Ofracculy,39.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0.0
1,0002_01,1,Juanna Vines,24.0,0.0,0.0,109.0,9.0,25.0,549.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0,0.0,1.0,0.0
2,0003_01,0,Altark Susent,58.0,0.0,1.0,43.0,3576.0,0.0,6715.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,1.0,0.0
3,0003_02,0,Solam Susent,33.0,0.0,0.0,0.0,1283.0,371.0,3329.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,1.0,0.0
4,0004_01,1,Willy Santantines,16.0,0.0,0.0,303.0,70.0,151.0,565.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,0,Gravior Noxnuther,41.0,0.0,1.0,0.0,6819.0,0.0,1643.0,...,0.0,0.0,0.0,0.0,0.0,0.0,98,1.0,0.0,0.0
8689,9278_01,0,Kurta Mondalley,18.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1499,0.0,1.0,0.0
8690,9279_01,1,Fayey Connon,26.0,0.0,0.0,0.0,0.0,1872.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1500,0.0,1.0,0.0
8691,9280_01,0,Celeon Hontichre,32.0,0.0,0.0,0.0,1049.0,0.0,353.0,...,0.0,1.0,0.0,0.0,0.0,0.0,608,0.0,1.0,0.0


In [561]:
# Copy of the training frame without the passenger identifier and name columns;
# the `transported` target is kept.
new_training_data = train_data_final.drop(columns=['passengerid', 'name'])
new_training_data

Unnamed: 0,transported,age,cryosleep,vip,roomservice,foodcourt,shoppingmall,spa,vrdeck,homeplanet_earth,...,deck_D,deck_E,deck_F,deck_G,deck_T,deck_nan,num,side_P,side_S,side_nan
0,0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0.0
1,1,24.0,0.0,0.0,109.0,9.0,25.0,549.0,44.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0,0.0,1.0,0.0
2,0,58.0,0.0,1.0,43.0,3576.0,0.0,6715.0,49.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,1.0,0.0
3,0,33.0,0.0,0.0,0.0,1283.0,371.0,3329.0,193.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,1.0,0.0
4,1,16.0,0.0,0.0,303.0,70.0,151.0,565.0,2.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,0,41.0,0.0,1.0,0.0,6819.0,0.0,1643.0,74.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,98,1.0,0.0,0.0
8689,0,18.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1499,0.0,1.0,0.0
8690,1,26.0,0.0,0.0,0.0,0.0,1872.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1500,0.0,1.0,0.0
8691,0,32.0,0.0,0.0,0.0,1049.0,0.0,353.0,3235.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,608,0.0,1.0,0.0


In [562]:
# Re-display the training frame; the drop above returned a new frame, so it still has `passengerid` and `name`.
train_data_final

Unnamed: 0,passengerid,transported,name,age,cryosleep,vip,roomservice,foodcourt,shoppingmall,spa,...,deck_D,deck_E,deck_F,deck_G,deck_T,deck_nan,num,side_P,side_S,side_nan
0,0001_01,0,Maham Ofracculy,39.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0.0
1,0002_01,1,Juanna Vines,24.0,0.0,0.0,109.0,9.0,25.0,549.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0,0.0,1.0,0.0
2,0003_01,0,Altark Susent,58.0,0.0,1.0,43.0,3576.0,0.0,6715.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,1.0,0.0
3,0003_02,0,Solam Susent,33.0,0.0,0.0,0.0,1283.0,371.0,3329.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,1.0,0.0
4,0004_01,1,Willy Santantines,16.0,0.0,0.0,303.0,70.0,151.0,565.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,0,Gravior Noxnuther,41.0,0.0,1.0,0.0,6819.0,0.0,1643.0,...,0.0,0.0,0.0,0.0,0.0,0.0,98,1.0,0.0,0.0
8689,9278_01,0,Kurta Mondalley,18.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1499,0.0,1.0,0.0
8690,9279_01,1,Fayey Connon,26.0,0.0,0.0,0.0,0.0,1872.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1500,0.0,1.0,0.0
8691,9280_01,0,Celeon Hontichre,32.0,0.0,0.0,0.0,1049.0,0.0,353.0,...,0.0,1.0,0.0,0.0,0.0,0.0,608,0.0,1.0,0.0


### Looking for dependencies between whether a passenger was transported and their home and destination planets.

In [563]:
# Home-planet indicator columns to test against `transported`.
categorical_features_home_planet = [
    'homeplanet_earth',
    'homeplanet_mars',
    'homeplanet_europa',
    'homeplanet_nan',
]

In [564]:
def home_planet_dependence(feature, data=None):
    """Print a chi-square independence test between ``transported`` and an indicator column.

    Parameters
    ----------
    feature : str
        Column name shaped like ``<prefix>_<label>``, e.g. ``'homeplanet_earth'``.
        Only the first underscore separates prefix from label, so labels that
        contain underscores themselves (``'destination_55_cancri_e'``) work too.
        A name without any underscore still raises ``ValueError``.
    data : pandas.DataFrame, optional
        Frame holding the ``transported`` column and ``feature``. Defaults to
        the notebook-level ``train_data_final``.

    Returns
    -------
    None
        The statistic and p-value are printed, labelled with the capitalised label.

    Notes
    -----
    With scipy's default settings, a 2x2 table gets Yates' continuity
    correction, which can shrink the statistic to exactly 0.0 (p-value 1.0)
    when observed counts are within 0.5 of the expected ones.
    """
    if data is None:
        data = train_data_final
    # maxsplit=1: an unbounded split yields more than two parts for names with
    # several underscores, and the two-name unpacking then raises ValueError.
    _, planet_name = feature.split('_', 1)
    contingency = pd.crosstab(data['transported'], data[feature])
    # Only the statistic and the p-value are reported; dof/expected are ignored.
    chi2, p, *_ = chi2_contingency(contingency)
    print(f"Chi2 = {chi2}, p-value [{planet_name.capitalize()}] = {p}")

In [565]:
# Run the independence test once per home-planet indicator column.
for planet in categorical_features_home_planet:
    home_planet_dependence(planet)

Chi2 = 247.65992650542287, p-value [Earth] = 8.406325910587839e-56
Chi2 = 3.223704671862504, p-value [Mars] = 0.07257919633197293
Chi2 = 271.2641201798859, p-value [Europa] = 6.017306168371327e-61
Chi2 = 0.03294467954367926, p-value [Nan] = 0.8559698802824356


In [566]:
# Destination indicator columns to test against `transported`.
categorical_features_destination = [
    'destination_55_cancri_e',
    'destination_pso_j318_5_22',
    'destination_trappist_1e',
    'destination_nan',
]

def destination_planet_dependence(feature):
    """Print the chi-square independence test between `transported` and `feature`.

    Both columns are read from the notebook-level `train_data_final`. The
    result is printed, labelled with the full column name; nothing is returned.
    With scipy's default continuity correction a 2x2 table can yield a
    statistic of exactly 0.0 (p-value 1.0).
    """
    observed = pd.crosstab(train_data_final['transported'], train_data_final[feature])
    # Only the statistic and the p-value are reported.
    chi2, p, *_ = chi2_contingency(observed)
    print(f"Chi2 = {chi2}, p-value [{feature}] = {p}")

# Run the test once per destination indicator column.
for planet in categorical_features_destination:
    destination_planet_dependence(planet)

Chi2 = 102.22006532700334, p-value [destination_55_cancri_e] = 4.9684099448378385e-24
Chi2 = 0.0, p-value [destination_pso_j318_5_22] = 1.0
Chi2 = 77.55359540397988, p-value [destination_trappist_1e] = 1.2916947128828519e-18
Chi2 = 0.0, p-value [destination_nan] = 1.0


### Looking for dependencies between whether a passenger was transported and their cabin on board (deck and side).

In [567]:
# Cabin-deck indicator columns to test against `transported`.
categorical_features_deck = [
    'deck_A', 'deck_B', 'deck_C', 'deck_D',
    'deck_E', 'deck_F', 'deck_G', 'deck_T',
    'deck_nan',
]

def destination_planet_deck(feature):
    """Print the chi-square independence test between `transported` and `feature`.

    Both columns are read from the notebook-level `train_data_final`. The
    result is printed, labelled with the full column name; nothing is returned.
    """
    observed = pd.crosstab(train_data_final['transported'], train_data_final[feature])
    # Only the statistic and the p-value are reported.
    chi2, p, *_ = chi2_contingency(observed)
    print(f"Chi2 = {chi2}, p-value [{feature}] = {p}")

# Run the test once per deck indicator column.
for deck in categorical_features_deck:
    destination_planet_deck(deck)

Chi2 = 0.03281433940267704, p-value [deck_A] = 0.8562519692560915
Chi2 = 181.08585382737022, p-value [deck_B] = 2.8076193512853426e-41
Chi2 = 100.98748744699768, p-value [deck_C] = 9.256634759964273e-24
Chi2 = 9.779849068871993, p-value [deck_D] = 0.00176434796434866
Chi2 = 82.77802983078504, p-value [deck_E] = 9.180471320492237e-20
Chi2 = 66.56603688635329, p-value [deck_F] = 3.3836605195670995e-16
Chi2 = 2.2299840687840553, p-value [deck_G] = 0.13535524332344628
Chi2 = 0.8297722727930931, p-value [deck_T] = 0.36233838696054377
Chi2 = 0.0, p-value [deck_nan] = 1.0


In [568]:
# Cabin-side indicator columns to test against `transported`.
categorical_features_side = ['side_P', 'side_S', 'side_nan']

def destination_planet_side(feature):
    """Print the chi-square independence test between `transported` and `feature`.

    Both columns are read from the notebook-level `train_data_final`. The
    result is printed, labelled with the full column name; nothing is returned.
    With scipy's default continuity correction a 2x2 table can yield a
    statistic of exactly 0.0 (p-value 1.0).
    """
    contingency = pd.crosstab(train_data_final['transported'], train_data_final[feature])
    # Only the statistic and the p-value are reported.
    chi2, p, *_ = chi2_contingency(contingency)
    print(f"Chi2 = {chi2}, p-value [{feature}] = {p}")

# Use this cell's own helper, so the cell does not silently depend on
# `destination_planet_deck` having been defined by an earlier cell.
for side in categorical_features_side:
    destination_planet_side(side)

Chi2 = 88.9710828350531, p-value [side_P] = 4.0062612965840844e-21
Chi2 = 89.07319622428383, p-value [side_S] = 3.804712593269947e-21
Chi2 = 0.0, p-value [side_nan] = 1.0
