# One-hot Encoding for categorical data

We try to apply one-hot encoding. The reason of not choosing label encoding is that the original categorical variables do not come with a natural ordering or ranking.

In [2]:
# First code block is the import libraries  
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [25]:
# dataPath = '../spaceship-titanic_rawData/spaceship_train.csv'
dataPath = '../preprocess_train_dataset/splittedData_01.csv'

df_splittedData_01 = pd.read_csv(dataPath)

df_splittedData_01

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin_deck,Cabin_num,Cabin_side,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Total_spending,First_Name,Last_Name,Transported
0,0001_01,Europa,False,B,0,P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,0.0,Maham,Ofracculy,False
1,0002_01,Earth,False,F,0,S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,736.0,Juanna,Vines,True
2,0003_01,Europa,False,A,0,S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,10383.0,Altark,Susent,False
3,0003_02,Europa,False,A,0,S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,5176.0,Solam,Susent,False
4,0004_01,Earth,False,F,1,S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,1091.0,Willy,Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A,98,P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,8536.0,Gravior,Noxnuther,False
8689,9278_01,Earth,True,G,1499,S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,0.0,Kurta,Mondalley,False
8690,9279_01,Earth,False,G,1500,S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,1873.0,Fayey,Connon,True
8691,9280_01,Europa,False,E,608,S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,4637.0,Celeon,Hontichre,False


Attributes of encoding: <b>HomePlanet</b>, <b>Cabin_deck</b>, <b>Cabin_side</b>, <b>Destination</b>

In [26]:
onehotencoder = OneHotEncoder()
ohe_HomePlanet=onehotencoder.fit_transform(df_splittedData_01).toarray()

In [27]:
ohe_HomePlanet

array([[1., 0., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 0., 1.],
       [0., 0., 1., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [28]:
ohe_HomePlanet = pd.get_dummies(df_splittedData_01['HomePlanet'])
ohe_Cabin_deck = pd.get_dummies(df_splittedData_01['Cabin_deck'])
ohe_Cabin_side = pd.get_dummies(df_splittedData_01['Cabin_side'])
ohe_Destination = pd.get_dummies(df_splittedData_01['Destination'])


pd.DataFrame(ohe_Destination)

Unnamed: 0,55 Cancri e,PSO J318.5-22,TRAPPIST-1e
0,False,False,True
1,False,False,True
2,False,False,True
3,False,False,True
4,False,False,True
...,...,...,...
8688,True,False,False
8689,False,True,False
8690,False,False,True
8691,True,False,False


In [29]:
# concatenating along columns
horizontal_concat = pd.concat([df_splittedData_01, ohe_HomePlanet, ohe_Cabin_deck, ohe_Cabin_side, ohe_Destination], axis=1)

In [30]:
horizontal_concat.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin_deck', 'Cabin_num',
       'Cabin_side', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt',
       'ShoppingMall', 'Spa', 'VRDeck', 'Total_spending', 'First_Name',
       'Last_Name', 'Transported', 'Earth', 'Europa', 'Mars', 'A', 'B', 'C',
       'D', 'E', 'F', 'G', 'T', 'P', 'S', '55 Cancri e', 'PSO J318.5-22',
       'TRAPPIST-1e'],
      dtype='object')

In [31]:
df_rename = horizontal_concat.rename(columns={
    # Cabin_deck
    'A': 'Cabin_deck_A', 
    'B': 'Cabin_deck_B', 
    'C': 'Cabin_deck_C', 
    'D': 'Cabin_deck_D', 
    'E': 'Cabin_deck_E', 
    'F': 'Cabin_deck_F', 
    'G': 'Cabin_deck_G', 
    'T': 'Cabin_deck_T',

    # Cabin_side
    'P': 'Cabin_side_P', 
    'S': 'Cabin_side_S', 

    # Destination
    # '55 Cancri e', 'PSO J318.5-22', 'TRAPPIST-1e', 

}, inplace=False)

In [32]:
df_rename

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin_deck,Cabin_num,Cabin_side,Destination,Age,VIP,RoomService,...,Cabin_deck_D,Cabin_deck_E,Cabin_deck_F,Cabin_deck_G,Cabin_deck_T,Cabin_side_P,Cabin_side_S,55 Cancri e,PSO J318.5-22,TRAPPIST-1e
0,0001_01,Europa,False,B,0,P,TRAPPIST-1e,39.0,False,0.0,...,False,False,False,False,False,True,False,False,False,True
1,0002_01,Earth,False,F,0,S,TRAPPIST-1e,24.0,False,109.0,...,False,False,True,False,False,False,True,False,False,True
2,0003_01,Europa,False,A,0,S,TRAPPIST-1e,58.0,True,43.0,...,False,False,False,False,False,False,True,False,False,True
3,0003_02,Europa,False,A,0,S,TRAPPIST-1e,33.0,False,0.0,...,False,False,False,False,False,False,True,False,False,True
4,0004_01,Earth,False,F,1,S,TRAPPIST-1e,16.0,False,303.0,...,False,False,True,False,False,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A,98,P,55 Cancri e,41.0,True,0.0,...,False,False,False,False,False,True,False,True,False,False
8689,9278_01,Earth,True,G,1499,S,PSO J318.5-22,18.0,False,0.0,...,False,False,False,True,False,False,True,False,True,False
8690,9279_01,Earth,False,G,1500,S,TRAPPIST-1e,26.0,False,0.0,...,False,False,False,True,False,False,True,False,False,True
8691,9280_01,Europa,False,E,608,S,55 Cancri e,32.0,False,0.0,...,False,True,False,False,False,False,True,True,False,False


There are too many different names. So we will apply label encoding on that to avoid too high dimensions of data

In [33]:
labelencoder = LabelEncoder()
data_le=pd.DataFrame(df_rename)
data_le['First_Name_le'] = labelencoder.fit_transform(data_le['First_Name'])
data_le['Last_Name_le'] = labelencoder.fit_transform(data_le['Last_Name'])
data_le

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin_deck,Cabin_num,Cabin_side,Destination,Age,VIP,RoomService,...,Cabin_deck_F,Cabin_deck_G,Cabin_deck_T,Cabin_side_P,Cabin_side_S,55 Cancri e,PSO J318.5-22,TRAPPIST-1e,First_Name_le,Last_Name_le
0,0001_01,Europa,False,B,0,P,TRAPPIST-1e,39.0,False,0.0,...,False,False,False,True,False,False,False,True,1614,1431
1,0002_01,Earth,False,F,0,S,TRAPPIST-1e,24.0,False,109.0,...,True,False,False,False,True,False,False,True,1407,2109
2,0003_01,Europa,False,A,0,S,TRAPPIST-1e,58.0,True,43.0,...,False,False,False,False,True,False,False,True,156,1990
3,0003_02,Europa,False,A,0,S,TRAPPIST-1e,33.0,False,0.0,...,False,False,False,False,True,False,False,True,2276,1990
4,0004_01,Earth,False,F,1,S,TRAPPIST-1e,16.0,False,303.0,...,True,False,False,False,True,False,False,True,2642,1778
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A,98,P,55 Cancri e,41.0,True,0.0,...,False,False,False,True,False,True,False,False,1135,1416
8689,9278_01,Earth,True,G,1499,S,PSO J318.5-22,18.0,False,0.0,...,False,True,False,False,True,False,True,False,1492,1341
8690,9279_01,Earth,False,G,1500,S,TRAPPIST-1e,26.0,False,0.0,...,False,True,False,False,True,False,False,True,955,470
8691,9280_01,Europa,False,E,608,S,55 Cancri e,32.0,False,0.0,...,False,False,False,False,True,True,False,False,526,996


In [37]:
df_AllData = data_le[
    [   'PassengerId', 'HomePlanet', 'Earth', 'Europa', 'Mars', 
        'CryoSleep',
        'Cabin_deck', 
            'Cabin_deck_A', 
            'Cabin_deck_B', 
            'Cabin_deck_C', 
            'Cabin_deck_D', 
            'Cabin_deck_E', 
            'Cabin_deck_F', 
            'Cabin_deck_G', 
            'Cabin_deck_T',
        'Cabin_num', 
        'Cabin_side', 
            'Cabin_side_P', 
            'Cabin_side_S',
        'Destination', '55 Cancri e', 'PSO J318.5-22', 'TRAPPIST-1e', 
        'Age',
        'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Total_spending',
        'First_Name', 'Last_Name', 'First_Name_le', 'Last_Name_le',
        'Transported'
    ]
]
df_AllData

Unnamed: 0,PassengerId,HomePlanet,Earth,Europa,Mars,CryoSleep,Cabin_deck,Cabin_deck_A,Cabin_deck_B,Cabin_deck_C,...,FoodCourt,ShoppingMall,Spa,VRDeck,Total_spending,First_Name,Last_Name,First_Name_le,Last_Name_le,Transported
0,0001_01,Europa,False,True,False,False,B,False,True,False,...,0.0,0.0,0.0,0.0,0.0,Maham,Ofracculy,1614,1431,False
1,0002_01,Earth,True,False,False,False,F,False,False,False,...,9.0,25.0,549.0,44.0,736.0,Juanna,Vines,1407,2109,True
2,0003_01,Europa,False,True,False,False,A,True,False,False,...,3576.0,0.0,6715.0,49.0,10383.0,Altark,Susent,156,1990,False
3,0003_02,Europa,False,True,False,False,A,True,False,False,...,1283.0,371.0,3329.0,193.0,5176.0,Solam,Susent,2276,1990,False
4,0004_01,Earth,True,False,False,False,F,False,False,False,...,70.0,151.0,565.0,2.0,1091.0,Willy,Santantines,2642,1778,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,True,False,False,A,True,False,False,...,6819.0,0.0,1643.0,74.0,8536.0,Gravior,Noxnuther,1135,1416,False
8689,9278_01,Earth,True,False,False,True,G,False,False,False,...,0.0,0.0,0.0,0.0,0.0,Kurta,Mondalley,1492,1341,False
8690,9279_01,Earth,True,False,False,False,G,False,False,False,...,0.0,1872.0,1.0,0.0,1873.0,Fayey,Connon,955,470,True
8691,9280_01,Europa,False,True,False,False,E,False,False,False,...,1049.0,0.0,353.0,3235.0,4637.0,Celeon,Hontichre,526,996,False


In [38]:
# Export encoded data

file_name = '../preprocess_train_dataset/encodedData_01.csv'

df_AllData.to_csv(file_name, sep=',', encoding='utf-8', index=False)

Preprocess Test Data

In [6]:
# preprocess test dataset

testdataPath = '../preprocess_test_dataset/splittedTestData_01.csv'


df_testsp = pd.read_csv(testdataPath)

df_testsp

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin_deck,Cabin_num,Cabin_side,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Total_spending,First_Name,Last_Name
0,0013_01,Earth,True,G,3,S,TRAPPIST-1e,27.000000,False,0.0,0.0,0.0,0.0,0.0,0.0,Nelly,Carsoning
1,0018_01,Earth,False,F,4,S,TRAPPIST-1e,19.000000,False,0.0,9.0,0.0,2823.0,0.0,2832.0,Lerome,Peckers
2,0019_01,Europa,True,C,0,S,55 Cancri e,31.000000,False,0.0,0.0,0.0,0.0,0.0,0.0,Sabih,Unhearfus
3,0021_01,Europa,False,C,1,S,TRAPPIST-1e,38.000000,False,0.0,6652.0,0.0,181.0,585.0,7418.0,Meratz,Caltilter
4,0023_01,Earth,False,F,5,S,TRAPPIST-1e,20.000000,False,10.0,0.0,635.0,0.0,0.0,645.0,Brence,Harperez
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,True,G,1496,S,TRAPPIST-1e,34.000000,False,0.0,0.0,0.0,0.0,0.0,0.0,Jeron,Peter
4273,9269_01,Earth,False,G,160,P,TRAPPIST-1e,42.000000,False,0.0,847.0,17.0,10.0,144.0,1018.0,Matty,Scheron
4274,9271_01,Mars,True,D,296,P,55 Cancri e,28.658146,False,0.0,0.0,0.0,0.0,0.0,0.0,Jayrin,Pore
4275,9273_01,Europa,False,D,297,P,TRAPPIST-1e,28.658146,False,0.0,2680.0,0.0,0.0,523.0,3203.0,Kitakan,Conale


In [7]:
onehotencoder = OneHotEncoder()
ohe_HomePlanet = onehotencoder.fit_transform(df_testsp).toarray()

In [8]:
ohe_HomePlanet = pd.get_dummies(df_testsp['HomePlanet'])
ohe_Cabin_deck = pd.get_dummies(df_testsp['Cabin_deck'])
ohe_Cabin_side = pd.get_dummies(df_testsp['Cabin_side'])
ohe_Destination = pd.get_dummies(df_testsp['Destination'])


pd.DataFrame(ohe_Destination)

Unnamed: 0,55 Cancri e,PSO J318.5-22,TRAPPIST-1e
0,False,False,True
1,False,False,True
2,True,False,False
3,False,False,True
4,False,False,True
...,...,...,...
4272,False,False,True
4273,False,False,True
4274,True,False,False
4275,False,False,True


In [9]:
# concatenating along columns
horizontal_concat = pd.concat([df_testsp, ohe_HomePlanet, ohe_Cabin_deck, ohe_Cabin_side, ohe_Destination], axis=1)


In [10]:
df_rename = horizontal_concat.rename(columns={
    # Cabin_deck
    'A': 'Cabin_deck_A', 
    'B': 'Cabin_deck_B', 
    'C': 'Cabin_deck_C', 
    'D': 'Cabin_deck_D', 
    'E': 'Cabin_deck_E', 
    'F': 'Cabin_deck_F', 
    'G': 'Cabin_deck_G', 
    'T': 'Cabin_deck_T',

    # Cabin_side
    'P': 'Cabin_side_P', 
    'S': 'Cabin_side_S', 

    # Destination
    # '55 Cancri e', 'PSO J318.5-22', 'TRAPPIST-1e', 

}, inplace=False)

labelencoder = LabelEncoder()
data_le=pd.DataFrame(df_rename)
data_le['First_Name_le'] = labelencoder.fit_transform(data_le['First_Name'])
data_le['Last_Name_le'] = labelencoder.fit_transform(data_le['Last_Name'])
data_le

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin_deck,Cabin_num,Cabin_side,Destination,Age,VIP,RoomService,...,Cabin_deck_F,Cabin_deck_G,Cabin_deck_T,Cabin_side_P,Cabin_side_S,55 Cancri e,PSO J318.5-22,TRAPPIST-1e,First_Name_le,Last_Name_le
0,0013_01,Earth,True,G,3,S,TRAPPIST-1e,27.000000,False,0.0,...,False,True,False,False,True,False,False,True,1465,275
1,0018_01,Earth,False,F,4,S,TRAPPIST-1e,19.000000,False,0.0,...,True,False,False,False,True,False,False,True,1219,1190
2,0019_01,Europa,True,C,0,S,55 Cancri e,31.000000,False,0.0,...,False,False,False,False,True,True,False,False,1709,1604
3,0021_01,Europa,False,C,1,S,TRAPPIST-1e,38.000000,False,0.0,...,False,False,False,False,True,False,False,True,1364,262
4,0023_01,Earth,False,F,5,S,TRAPPIST-1e,20.000000,False,10.0,...,True,False,False,False,True,False,False,True,366,736
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,True,G,1496,S,TRAPPIST-1e,34.000000,False,0.0,...,False,True,False,False,True,False,False,True,1091,1209
4273,9269_01,Earth,False,G,160,P,TRAPPIST-1e,42.000000,False,0.0,...,False,True,False,True,False,False,False,True,1335,1401
4274,9271_01,Mars,True,D,296,P,55 Cancri e,28.658146,False,0.0,...,False,False,False,True,False,True,False,False,1077,1241
4275,9273_01,Europa,False,D,297,P,TRAPPIST-1e,28.658146,False,0.0,...,False,False,False,True,False,False,False,True,1182,362


In [11]:
df_AllData = data_le[
    [   'PassengerId', 'HomePlanet', 'Earth', 'Europa', 'Mars', 
        'CryoSleep',
        'Cabin_deck', 
            'Cabin_deck_A', 
            'Cabin_deck_B', 
            'Cabin_deck_C', 
            'Cabin_deck_D', 
            'Cabin_deck_E', 
            'Cabin_deck_F', 
            'Cabin_deck_G', 
            'Cabin_deck_T',
        'Cabin_num', 
        'Cabin_side', 
            'Cabin_side_P', 
            'Cabin_side_S',
        'Destination', '55 Cancri e', 'PSO J318.5-22', 'TRAPPIST-1e', 
        'Age',
        'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Total_spending',
        'First_Name', 'Last_Name', 'First_Name_le', 'Last_Name_le'
    ]
]

In [12]:
df_AllData.head()

Unnamed: 0,PassengerId,HomePlanet,Earth,Europa,Mars,CryoSleep,Cabin_deck,Cabin_deck_A,Cabin_deck_B,Cabin_deck_C,...,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Total_spending,First_Name,Last_Name,First_Name_le,Last_Name_le
0,0013_01,Earth,True,False,False,True,G,False,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,Nelly,Carsoning,1465,275
1,0018_01,Earth,True,False,False,False,F,False,False,False,...,0.0,9.0,0.0,2823.0,0.0,2832.0,Lerome,Peckers,1219,1190
2,0019_01,Europa,False,True,False,True,C,False,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,Sabih,Unhearfus,1709,1604
3,0021_01,Europa,False,True,False,False,C,False,False,True,...,0.0,6652.0,0.0,181.0,585.0,7418.0,Meratz,Caltilter,1364,262
4,0023_01,Earth,True,False,False,False,F,False,False,False,...,10.0,0.0,635.0,0.0,0.0,645.0,Brence,Harperez,366,736


In [13]:
# Export encoded test data

file_name = '../preprocess_test_dataset/encodedData_01.csv'

df_AllData.to_csv(file_name, sep=',', encoding='utf-8', index=False)