In [1]:
import os
os.environ['HSA_OVERRIDE_GFX_VERSION'] = '10.3.0'

# basic modules
import pandas as pd
import numpy as np
import seaborn as sns

from scipy.stats import chi2_contingency
from sklearn.feature_selection import chi2

Read the data.

In [2]:
# Let wide frames render up to 150 columns before pandas truncates them.
pd.set_option('display.max_columns', 150)

# Directory and file name of the training split, relative to the notebook.
data_path, train = './data/', 'train.csv'
train_file = os.path.join(data_path, train)

df = pd.read_csv(train_file)

EDA.

In [3]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appends a stray "None".
df.info()

# Last expression of the cell: rendered as a rich table preview.
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB
None


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


**Engineer 'PassengerId' column**:
- The 'PassengerId' column is of the form: gggg_pp where gggg indicates the group they are traveling with and pp is their number within the group. By extracting these values we can create meaning from this column and hopefully use it to impute missing values later.

In [4]:
# 'PassengerId' looks like "gggg_pp": split it once and keep both halves as ints.
id_parts = df['PassengerId'].str.split('_')
df['GroupId'] = id_parts.str[0].astype(int)
df['PersonId'] = id_parts.str[1].astype(int)

# The raw identifier carries no further information once it has been split.
df.drop(columns=['PassengerId'], inplace=True)

**Engineer the 'Cabin' column:**
- The 'Cabin' column is of the form deck/num/side. We can engineer this feature to extract meaningful info for the column.

In [5]:
# 'Cabin' is "deck/num/side". Split it a single time instead of re-filtering
# and re-splitting the column for every derived feature. The .str accessor
# propagates NaN for missing cabins and yields NaN (rather than raising
# IndexError) when a cabin string has fewer than three parts.
cabin_parts = df['Cabin'].str.split('/')
df['Deck'] = cabin_parts.str[0]
df['Num'] = cabin_parts.str[1]   # kept as a string, exactly as it appears in 'Cabin'
df['Side'] = cabin_parts.str[2]
df.drop(columns=['Cabin'],inplace=True)

**Extract the last names**

In [6]:
# Split each name on runs of whitespace once; missing names stay NaN.
# Splitting on whitespace (instead of a single ' ') avoids empty tokens when a
# name contains repeated or surrounding spaces.
name_parts = df['Name'].str.split()
df['FirstName'] = name_parts.str[0]

# The last name is the final token. A name with fewer than two tokens has no
# separable last name, so it is left missing instead of raising IndexError.
df['LastName'] = name_parts.str[-1].where(name_parts.str.len() > 1)
df.drop(columns=['Name'], inplace = True)

***I need to determine if I should proceed with imputation of HomePlanet and Destination fields. These values might not be MCAR:***

In [7]:
# Cross-tabulate home planet against the target; dropna=False keeps the rows
# whose home planet is missing as their own category.
contingency = pd.crosstab(
    index=df['HomePlanet'],
    columns=df['Transported'],
    dropna=False,
)
print(contingency)

# Chi-squared test of independence on the observed table.
chi2_result = chi2_contingency(contingency)
c, p, dof, expected = chi2_result

print(c,p,dof)
print(expected)

Transported  False  True 
HomePlanet               
Earth         2651   1951
Europa         727   1404
Mars           839    920
NaN             98    103
324.96723663979583 3.9214919240932375e-70 3
[[2284.32416887 2317.67583113]
 [1057.77809732 1073.22190268]
 [ 873.12607845  885.87392155]
 [  99.77165535  101.22834465]]


Observation
- The observed counts for passengers with a missing home planet (98 not transported / 103 transported) are almost identical to the chi-squared expected counts for that row (99.8 / 101.2), so whether a home planet is missing does not appear to be associated with being transported. Note that the very small overall p-value is driven by the differences between the known planets (Earth, Europa, Mars), not by the missing-value row. This is consistent with the missing home planets being MCAR (missing completely at random), so we will proceed with imputation of this feature.

**Impute home planet within each `GroupId` group when:**
  - every known LastName in the group is the same
  - there is only a single distinct home planet in the group

In [8]:
# get a list of dataframes where:
# - all share the same groupId
# - all known (non-missing) last names are the same
# - all have at least 1 missing home planet
# - have only a single unique non-missing home planet
def get_groups_0(df) -> list:
  """Collect travel groups whose missing home planets can be inferred.

  Rows are grouped by 'GroupId'. A group is kept when it
    - contains at least one missing 'HomePlanet',
    - contains exactly one distinct non-missing 'HomePlanet', and
    - contains exactly one distinct non-missing 'LastName'
      (missing last names are ignored when counting).

  Args:
    df: frame with 'GroupId', 'HomePlanet' and 'LastName' columns.

  Returns:
    A list of sub-DataFrames (one per qualifying group, original index
    preserved) in order of each group's first appearance in ``df``.
  """
  groups = []

  # A single groupby pass replaces one full-frame boolean scan per group id.
  # sort=False keeps groups in order of first appearance.
  for _, group_df in df.groupby('GroupId', sort=False):
    home_planets = group_df['HomePlanet']

    # nothing to impute in this group
    if not home_planets.isna().any():
      continue

    # nunique() ignores NaN: need exactly one known planet to copy from
    if home_planets.nunique() != 1:
      continue

    # mixed (known) last names: do not assume a shared origin
    if group_df['LastName'].nunique() != 1:
      continue

    groups.append(group_df)

  return groups

group_dfs = get_groups_0(df) 
print(f"Number of groups with some missing home planet and all same last names: {len(group_dfs)}")

# Consume the work list from the end until it is empty.
while group_dfs:
  group_df = group_dfs.pop()
  home_planets = group_df['HomePlanet'].dropna().unique()
  # Exactly one known planet is required to copy from; anything else means the
  # group selection above is inconsistent with this loop.
  if len(home_planets) != 1:
    raise ValueError(home_planets)
  home_planet = home_planets[0]

  # Select only the rows whose planet is actually missing. Taking `.index` of
  # the boolean isna() Series would return every row of the group instead.
  missing_idx = group_df.index[group_df['HomePlanet'].isna()]
  df.loc[missing_idx, 'HomePlanet'] = home_planet
  
# Sanity check: the work list should be fully consumed.
print(len(group_dfs))

Number of groups with some missing home planet and all same last names: 67
0


**Impute home planets by last name when:**
  - passengers share the same LastName
  - there is only one distinct home planet among the passengers with that last name

In [9]:
def get_groups_1(df):
    """Collect last-name groups whose missing home planets can be inferred.

    Rows are grouped by 'LastName' (rows with a missing last name belong to no
    group). A group is kept when it contains at least one missing 'HomePlanet'
    and exactly one distinct non-missing 'HomePlanet'.

    Args:
        df: frame with 'LastName' and 'HomePlanet' columns.

    Returns:
        A list of sub-DataFrames (one per qualifying last name, original index
        preserved) in order of each last name's first appearance in ``df``.
    """
    groups = []

    # A single groupby pass replaces one full-frame comparison per last name.
    # groupby drops NaN keys by default; sort=False keeps first-appearance order.
    for _, df_last_name in df.groupby('LastName', sort=False):
        home_planets = df_last_name['HomePlanet']

        # nothing to impute for this last name
        if not home_planets.isna().any():
            continue

        # nunique() ignores NaN: need exactly one known planet to copy from
        if home_planets.nunique() != 1:
            continue

        groups.append(df_last_name)

    return groups

group_dfs = get_groups_1(df)

print(len(group_dfs))

# Consume the work list from the end until it is empty.
while group_dfs:
    group_df = group_dfs.pop()

    planets = group_df['HomePlanet'].dropna().unique().tolist()

    # get_groups_1 only returns groups with exactly one known planet.
    if len(planets) != 1:
        raise ValueError("HUH")

    # Select only the rows whose planet is actually missing. Taking `.index`
    # of the boolean isna() Series would return every row of the group instead.
    missing_idx = group_df.index[group_df['HomePlanet'].isna()]
    df.loc[missing_idx, 'HomePlanet'] = planets[0]

print(f'samples remaining with missing home planets: {df["HomePlanet"].isna().sum()}')

119
samples remaining with missing home planets: 12


**Check remaining missing home planets**

In [10]:
# Rows whose home planet is still missing after both imputation passes.
home_planet_missing = df['HomePlanet'].isna()
df[home_planet_missing]

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,GroupId,PersonId,Deck,Num,Side,FirstName,LastName
225,,False,TRAPPIST-1e,18.0,False,313.0,1.0,691.0,283.0,0.0,False,242,1,F,46,S,Almone,Sté
234,,True,55 Cancri e,54.0,False,0.0,0.0,0.0,0.0,0.0,True,251,1,C,11,S,Diphah,Amsive
807,,True,55 Cancri e,38.0,False,0.0,0.0,0.0,0.0,0.0,True,853,1,A,9,S,Hamelik,Ageurante
1855,,True,TRAPPIST-1e,19.0,False,0.0,0.0,0.0,0.0,0.0,True,1978,1,G,311,S,,
2274,,False,TRAPPIST-1e,31.0,False,1458.0,421.0,76.0,0.0,0.0,False,2443,1,D,72,P,,
2631,,False,TRAPPIST-1e,25.0,False,237.0,0.0,910.0,0.0,12.0,False,2817,1,F,584,P,Sealfs,Sutty
3091,,False,TRAPPIST-1e,40.0,False,666.0,4.0,83.0,0.0,50.0,True,3331,1,F,631,S,,
4548,,True,TRAPPIST-1e,36.0,False,0.0,0.0,,0.0,0.0,True,4840,1,F,915,S,,
5252,,False,TRAPPIST-1e,34.0,False,170.0,1256.0,0.0,3926.0,7121.0,False,5603,1,E,365,S,Kocha,Cluitty
5634,,False,TRAPPIST-1e,20.0,False,0.0,0.0,,703.0,0.0,False,5989,1,F,1141,S,Darrie,Holcompton


***Repeat the same process from earlier but for missing destination planets:***

We will be more strict about imputation of destination planets.

In [11]:
# Cross-tabulate destination against the target; dropna=False keeps the rows
# whose destination is missing as their own category.
contingency = pd.crosstab(
    index=df['Destination'],
    columns=df['Transported'],
    dropna=False,
)
print(contingency)

# Chi-squared test of independence on the observed table.
chi2_result = chi2_contingency(contingency)
c, p, dof, expected = chi2_result

print(c,p,dof)
print(expected)

Transported    False  True 
Destination                
55 Cancri e      702   1098
PSO J318.5-22    395    401
TRAPPIST-1e     3128   2787
NaN               90     92
106.39488238823832 6.547434028958798e-23 3
[[ 893.47751064  906.52248936]
 [ 395.11561026  400.88438974]
 [2936.06637524 2978.93362476]
 [  90.34050385   91.65949615]]


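Observation:
- Similar to the home planet feature: the observed counts for the missing destinations (90 / 92) closely match the expected counts (90.3 / 91.7), so the missing destination planets appear to be MCAR given their low association with `Transported`. We will impute the missing values.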

In [12]:
# Get groups where:
# - group ids are all the same
# - known (non-missing) last names are all the same
# - no home planet is missing
# - at least one missing destination planet
# - only one distinct non-na destination planet
# NOTE(review): the original comment said "home planets are all the same", but
# the code only requires that none are missing - confirm which is intended.
def get_group_2(df):
  """Collect travel groups whose missing destinations can be inferred.

  Rows are grouped by 'GroupId'. A group is kept when it
    - has no missing 'HomePlanet',
    - has at least one missing 'Destination',
    - has exactly one distinct non-missing 'Destination', and
    - has exactly one distinct non-missing 'LastName'.

  Args:
    df: frame with 'GroupId', 'HomePlanet', 'Destination' and 'LastName'
      columns.

  Returns:
    A list of sub-DataFrames (one per qualifying group, original index
    preserved) in order of each group's first appearance in ``df``.
  """
  groups = []

  # A single groupby pass replaces one full-frame boolean scan per group id.
  # sort=False keeps groups in order of first appearance.
  for _, group_df in df.groupby('GroupId', sort=False):
    destinations = group_df['Destination']

    # if any of the home planets are missing
    if group_df['HomePlanet'].isna().any():
      continue

    # if the group isn't missing any destination planets
    if not destinations.isna().any():
      continue

    # if the group doesn't have exactly one distinct known destination planet
    if destinations.nunique() != 1:
      continue

    # if the group doesn't have exactly one distinct known last name
    if group_df['LastName'].nunique() != 1:
      continue

    groups.append(group_df)

  return groups
    
# Groups eligible for destination imputation, and how many there are.
groups = get_group_2(df)
group_count = len(groups)
print(group_count)

SyntaxError: unterminated string literal (detected at line 14) (2562427147.py, line 14)

In [None]:
# Peek at the first eligible group. Guarded so the cell does not raise
# IndexError when no group qualifies; as the last expression of the cell the
# frame is rendered as a rich table instead of plain printed text.
groups[0] if groups else None

In [None]:
# NOTE(review): looks like a leftover check that the kernel still responds;
# it does not contribute to the analysis - candidate for removal.
print("Hwat")

In [2]:
# NOTE(review): numpy is already imported in the first cell; this cell looks
# like a leftover kernel check - candidate for removal.
import numpy as np

print("HI")

HI


In [5]:
# Tries to enable the varInspector notebook extension via the shell.
# The recorded output reports that the `jupyter-nbextension` command was not found.
# NOTE(review): presumably jupyter_contrib_nbextensions must be installed
# before this cell is run - confirm and reorder the cells accordingly.
!jupyter nbextension enable varInspector

usage: jupyter [-h] [--version] [--config-dir] [--data-dir] [--runtime-dir]
               [--paths] [--json] [--debug]
               [subcommand]

Jupyter: Interactive Computing

positional arguments:
  subcommand     the subcommand to launch

options:
  -h, --help     show this help message and exit
  --version      show the versions of core jupyter packages and exit
  --config-dir   show Jupyter config dir
  --data-dir     show Jupyter data dir
  --runtime-dir  show Jupyter runtime dir
  --paths        show all Jupyter paths. Add --json for machine-readable
                 format.
  --json         output paths as machine-readable json
  --debug        output debug information about paths

Available subcommands: console contrib dejavu events execute kernel kernelspec
lab labextension labhub migrate nbconvert nbextensions_configurator notebook
qtconsole run server theme troubleshoot trust

Jupyter command `jupyter-nbextension` not found.


In [1]:
pip install jupyter_contrib_nbextensions

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install jupyter_contrib_nbextensions

Collecting jupyter_contrib_nbextensions
  Using cached jupyter_contrib_nbextensions-0.7.0-py2.py3-none-any.whl
Collecting ipython-genutils
  Using cached ipython_genutils-0.2.0-py2.py3-none-any.whl (26 kB)
Collecting jupyter-nbextensions-configurator>=0.4.0
  Using cached jupyter_nbextensions_configurator-0.6.3-py2.py3-none-any.whl (466 kB)
Collecting jupyter-contrib-core>=0.3.3
  Using cached jupyter_contrib_core-0.4.2-py2.py3-none-any.whl
Collecting jupyter-highlight-selected-word>=0.1.1
  Using cached jupyter_highlight_selected_word-0.2.0-py2.py3-none-any.whl (11 kB)
Installing collected packages: jupyter-highlight-selected-word, ipython-genutils, jupyter-contrib-core, jupyter-nbextensions-configurator, jupyter_contrib_nbextensions
Successfully installed ipython-genutils-0.2.0 jupyter-contrib-core-0.4.2 jupyter-highlight-selected-word-0.2.0 jupyter-nbextensions-configurator-0.6.3 jupyter_contrib_nbextensions-0.7.0
Note: you may need to restart the kernel to use updated packages.


In [4]:
jupyter contrib nbextension install --user

SyntaxError: invalid syntax (1639211862.py, line 1)