In [268]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re
from sklearn.preprocessing import LabelEncoder

In [269]:
# Input CSV files; the relative paths are resolved against the current working directory.
trainPath, testPath = 'train.csv', 'test.csv'

# Read both files in one pass: dataDf comes from trainPath, testDf from testPath.
dataDf, testDf = (pd.read_csv(csvPath) for csvPath in (trainPath, testPath))

In [270]:
# Preview the first rows of the frame loaded from trainPath.
dataDf.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


# Data Cleaning, Replacing NaN values, feature extraction

Check if there are columns with NaN values

In [272]:
# List every column that holds at least one missing value.
print([colName for colName in dataDf.columns if dataDf[colName].isna().any()])

['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Name']


## Surnames extraction for families
Name - The first and last names of the passenger.

In [276]:
# Derive a family key from 'Name': the last whitespace-separated token.
# The vectorised split yields exactly one value per row, so the new column
# always matches the frame's length (a per-character loop that appends once
# per space breaks on names with zero or several spaces).
#   "Maham Ofracculy" -> "Ofracculy"
#   single-word name  -> that word
#   missing / blank   -> "unknown"
dataDf['surname'] = (
    dataDf['Name']
    .str.split()      # split on any run of whitespace
    .str[-1]          # last token; NaN when the name is missing or blank
    .fillna('unknown')
)
# The raw 'Name' column is left in place.

## PassengerId extraction for groups
PassengerId - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.

In [278]:
# Derive the travel-group key from 'PassengerId' (format gggg_pp): keep the
# part before the first underscore. The vectorised split yields exactly one
# value per row, so ids with no underscore (kept whole) or several underscores
# cannot desynchronise the column length.
# Missing ids get the same 'unknown' placeholder used for the other derived
# columns in this notebook.
dataDf['groups'] = (
    dataDf['PassengerId']
    .str.split('_', n=1)  # at most one split: [group, remainder]
    .str[0]
    .fillna('unknown')
)

## Splitting Cabin into deck, number and side
Cabin - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.

In [279]:
# Split 'Cabin' (format deck/num/side, e.g. "B/0/P") into three string columns.
# A single anchored regex (raw string, so "\d" is a real regex escape) pulls
# out all three parts at once. Rows whose cabin is missing or does not match
# the deck/num/side layout produce NaN in every group and are filled with
# 'unknown', instead of raising on a cabin without digits.
cabinParts = (
    dataDf['Cabin']
    .str.extract(r'^(?P<deck>[^/]+)/(?P<num>\d+)/(?P<side>[^/]+)$')
    .fillna('unknown')
)

# Assign in num, deck, side order so the new columns keep that order in the frame.
# 'num' stays a string column because it shares the 'unknown' placeholder.
dataDf['num'] = cabinParts['num']
dataDf['deck'] = cabinParts['deck']
dataDf['side'] = cabinParts['side']

In [280]:
# Display the frame to inspect the added surname, groups, num, deck and side columns.
dataDf

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,surname,groups,num,deck,side
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,Ofracculy,0001,0,B,P
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,Vines,0002,0,F,S
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,Susent,0003,0,A,S
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,Susent,0003,0,A,S
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,Santantines,0004,1,F,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False,Noxnuther,9276,98,A,P
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False,Mondalley,9278,1499,G,S
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True,Connon,9279,1500,G,S
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False,Hontichre,9280,608,E,S


# Label Encoding 

In [76]:
# Columns whose values are replaced in place by integer codes.
categoricalCols = ['HomePlanet', 'CryoSleep', 'Destination', 'Transported', 'VIP', 'surname']

# Copy of the raw (un-encoded) values, taken before dataDf is overwritten below.
tmpDataDf = dataDf[categoricalCols]

# Fit one LabelEncoder per column and keep it. The fitted encoders hold the
# value -> code mapping, which is needed to encode testDf with the same codes
# and to map codes back to labels (labelEncoders[col].inverse_transform).
# NOTE(review): HomePlanet, CryoSleep, Destination and VIP are listed as
# containing NaN by the missing-value check earlier in the notebook; confirm
# how the installed scikit-learn version encodes missing values.
labelEncoders = {}
for col in categoricalCols:
    labelEncoders[col] = LabelEncoder()
    dataDf[col] = labelEncoders[col].fit_transform(tmpDataDf[col])