In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [3]:
df.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [4]:
df.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


## Process Cabin

In [5]:
# Extract deck
df['Deck'] = df['Cabin'].apply(lambda x: x.split("/")[0] if not pd.isna(x) else None)

# Extract side
df['Side'] = df['Cabin'].apply(lambda x: x.split("/")[-1] if not pd.isna(x) else None)

In [6]:
df['Deck'].value_counts(dropna = False)

Deck
F       2794
G       2559
E        876
B        779
C        747
D        478
A        256
None     199
T          5
Name: count, dtype: int64

In [7]:
df['Side'].value_counts(dropna = False)

Side
S       4288
P       4206
None     199
Name: count, dtype: int64

## Fill Missing Values

In [8]:
# Fill na median for 'Age' column
median_age = df['Age'].median()
df['Age'].fillna(median_age, inplace=True)

# Fill na mode for 'HomePlanet' and 'Destination' columns
mode_home_planet = df['HomePlanet'].mode()[0]
df['HomePlanet'].fillna(mode_home_planet, inplace=True)

mode_destination = df['Destination'].mode()[0]
df['Destination'].fillna(mode_destination, inplace=True)

mode_deck = df['Deck'].mode()[0]
df['Deck'].fillna(mode_destination, inplace=True)

mode_side = df['Side'].mode()[0]
df['Side'].fillna(mode_destination, inplace=True)

# Fill na constant values for the remaining columns we are going to use
df['VIP'].fillna(False, inplace=True)
df['CryoSleep'].fillna(False, inplace=True)
df['VRDeck'].fillna(0, inplace=True)
df['RoomService'].fillna(0, inplace=True)
df['FoodCourt'].fillna(0, inplace=True)
df['ShoppingMall'].fillna(0, inplace=True)
df['Spa'].fillna(0, inplace=True)
df['VRDeck'].fillna(0, inplace=True)


df.isna().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(median_age, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['HomePlanet'].fillna(mode_home_planet, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting v

PassengerId       0
HomePlanet        0
CryoSleep         0
Cabin           199
Destination       0
Age               0
VIP               0
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
Name            200
Transported       0
Deck              0
Side              0
dtype: int64

## Drop unused columns

In [9]:
df.drop(["PassengerId", "Cabin", "Name"], axis=1, inplace=True)

In [10]:
df.isna().sum()

HomePlanet      0
CryoSleep       0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Transported     0
Deck            0
Side            0
dtype: int64

## Save dataset

In [11]:
df.to_csv("train_clean.csv", index=False)