In [406]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

  from pandas import MultiIndex, Int64Index


In [382]:
# Locations of the competition CSV files, relative to the notebook's working directory.
trainPath, testPath = 'train.csv', 'test.csv'

# The labelled training frame and the test frame are loaded into separate
# variables and are not concatenated.
dataDf = pd.read_csv(filepath_or_buffer=trainPath)
testDf = pd.read_csv(filepath_or_buffer=testPath)

In [383]:
# Display the raw training frame; the rendered output elides the middle rows.
dataDf

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


# Data Cleaning, Replacing NaN values, feature extraction

Check if there are columns with NaN values

In [384]:
# List the names of all columns that contain at least one missing value.
print(list(dataDf.columns[dataDf.isna().any(axis=0)]))

['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Name']


## Surname extraction for families
Name - The first and last names of the passenger.

In [385]:
# Derive a `surname` feature from `Name` and drop the original column.
#
# The surname is everything after the FIRST space. Splitting at most once
# guarantees exactly one value per row, so names containing several spaces
# (or none) can no longer produce a list whose length differs from the frame.
# Missing names, and names without any space, are mapped to 'unknown'.
dataDf['surname'] = (
    dataDf['Name']
    .str.split(' ', n=1)  # -> [first name, rest]; NaN stays NaN
    .str[1]               # the part after the first space, NaN if absent
    .fillna('unknown')
)

dataDf = dataDf.drop('Name', axis=1)

## PassengerId extraction for groups
PassengerId - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.

In [386]:
# Derive a `groups` feature from `PassengerId` and drop the original column.
#
# Ids have the form gggg_pp; the group is the part before the first underscore.
# Splitting at most once yields exactly one value per row, whatever the number
# of underscores in the id. A missing id is mapped to 'unknown', the same
# placeholder used for the other extracted features (previously the literal
# string 'PassengerId' was used here).
dataDf['groups'] = (
    dataDf['PassengerId']
    .str.split('_', n=1)  # -> [group, number within group]; NaN stays NaN
    .str[0]
    .fillna('unknown')
)

dataDf = dataDf.drop('PassengerId', axis=1)

## Splitting Cabin into deck, num and side
Cabin - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.

In [387]:
# Split `Cabin` (format deck/num/side) into three features and drop the original.
#
# A single anchored regular expression parses every row at once. The pattern is
# a raw string, which avoids the invalid-escape warning that '\d' triggers in a
# normal string literal. Rows where Cabin is missing or does not match the
# deck/num/side format get 'unknown' in all three columns instead of raising.
cabinParts = (
    dataDf['Cabin']
    .str.extract(r'^(?P<deck>[^/]+)/(?P<num>\d+)/(?P<side>[^/]+)$')
    .fillna('unknown')
)

# `num` is kept as a string so it can share the 'unknown' placeholder.
dataDf['num'] = cabinParts['num']
dataDf['deck'] = cabinParts['deck']
dataDf['side'] = cabinParts['side']

dataDf = dataDf.drop('Cabin', axis=1)

## Replace NaN float values

Age - The age of the passenger.
RoomService, FoodCourt, ShoppingMall, Spa, VRDeck - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.

In [388]:
# Numeric columns whose missing values are later replaced by the column mean.
floatValueNames = [
    'Age',
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
]

In [389]:
# Replace missing values in each numeric column with that column's mean
# (the mean is computed over the non-missing entries), then report which
# columns still contain NaN.
dataDf = dataDf.fillna({name: dataDf[name].mean() for name in floatValueNames})
print(list(dataDf.columns[dataDf.isna().any()]))

['HomePlanet', 'CryoSleep', 'Destination', 'VIP']


## Replacing the remaining NaN values

In [390]:
# Every NaN that is still present is replaced by the placeholder string
# 'unknown'; the printed list should then be empty.
dataDf = dataDf.fillna(value='unknown')
print(dataDf.columns[dataDf.isna().any(axis=0)].tolist())

[]


In [391]:
# Display the frame after feature extraction and NaN replacement.
dataDf

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,surname,groups,num,deck,side
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,Ofracculy,0001,0,B,P
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,Vines,0002,0,F,S
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,Susent,0003,0,A,S
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,Susent,0003,0,A,S
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,Santantines,0004,1,F,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,Europa,False,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,False,Noxnuther,9276,98,A,P
8689,Earth,True,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,False,Mondalley,9278,1499,G,S
8690,Earth,False,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,True,Connon,9279,1500,G,S
8691,Europa,False,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,False,Hontichre,9280,608,E,S


# Label Encoding 

In [392]:
# Cast the boolean-like columns to str so each column holds a single type
# before label encoding (VIP and CryoSleep may contain the 'unknown' placeholder
# alongside True/False).
dataDf = dataDf.astype({'VIP': str, 'CryoSleep': str, 'Transported': str})

In [393]:
# Integer-encode every categorical column, including the target `Transported`.
categoricalColumns = ['HomePlanet', 'Destination', 'surname', 'num', 'deck',
                      'side', 'groups', 'VIP', 'CryoSleep', 'Transported']
tmpDataDf = dataDf[categoricalColumns]

# Keep one fitted encoder per column instead of discarding them: the same
# mapping can then be reused on other data via labelEncoders[col].transform(...)
# and predictions can be decoded with labelEncoders[col].inverse_transform(...).
labelEncoders = {
    column: LabelEncoder().fit(tmpDataDf[column]) for column in categoricalColumns
}
for column, encoder in labelEncoders.items():
    dataDf[column] = encoder.transform(tmpDataDf[column])

In [394]:
# Display the frame after label encoding; categorical columns now hold integer codes.
dataDf

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,surname,groups,num,deck,side
0,1,0,2,39.0,0,0.0,0.0,0.0,0.0,0.0,0,1431,0,0,1,0
1,0,0,2,24.0,0,109.0,9.0,25.0,549.0,44.0,1,2109,1,0,5,1
2,1,0,2,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,1990,2,0,0,1
3,1,0,2,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,1990,2,0,0,1
4,0,0,2,16.0,0,303.0,70.0,151.0,565.0,2.0,1,1778,3,1,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,1,0,0,41.0,1,0.0,6819.0,0.0,1643.0,74.0,0,1416,6213,1795,0,0
8689,0,1,1,18.0,0,0.0,0.0,0.0,0.0,0.0,0,1341,6214,548,6,1
8690,0,0,2,26.0,0,0.0,0.0,1872.0,1.0,0.0,1,470,6215,551,6,1
8691,1,0,0,32.0,0,0.0,1049.0,0.0,353.0,3235.0,0,996,6216,1385,4,1


# Creating XGBoost Model for training

## Train Test Split

In [408]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the split,
# and therefore every metric reported below, reproducible across kernel restarts.
train, test = train_test_split(dataDf, test_size=0.2, random_state=42)

Data - Label Split

In [412]:
def _splitFeaturesAndLabel(frame):
    """Return (features, label) where label is the 'Transported' column as a DataFrame."""
    isLabel = frame.columns == 'Transported'
    return frame.loc[:, ~isLabel], frame.loc[:, isLabel]


trainData, trainLabel = _splitFeaturesAndLabel(train)
testData, testLabel = _splitFeaturesAndLabel(test)

Converting to XGBoost Format

In [413]:
# Wrap each features/label pair in XGBoost's DMatrix container.
dtrain, dtest = (
    xgb.DMatrix(data=features, label=labels)
    for features, labels in ((trainData, trainLabel), (testData, testLabel))
)

## Hyperparameters

In [437]:
# Booster configuration passed to xgb.train.
param = {
    'max_depth': 2,
    'eta': 1,
    'objective': 'binary:logistic',
    'num_parallel_tree': 5,
    'nthread': 4,
    'eval_metric': 'auc',
}
# Number of boosting rounds.
num_round = 50

# Datasets evaluated after every round; the second tuple element is the name
# that prefixes the metric in the training log.
evallist = [
    (dtest, 'eval'),
    (dtrain, 'train'),
]

In [438]:

# Train the booster; the metric for every entry of evallist is logged each round.
bst = xgb.train(
    params=param,
    dtrain=dtrain,
    num_boost_round=num_round,
    evals=evallist,
)

[0]	eval-auc:0.75031	train-auc:0.77870
[1]	eval-auc:0.79676	train-auc:0.83182
[2]	eval-auc:0.82160	train-auc:0.85300
[3]	eval-auc:0.83133	train-auc:0.86247
[4]	eval-auc:0.83635	train-auc:0.86989
[5]	eval-auc:0.84597	train-auc:0.87960
[6]	eval-auc:0.84715	train-auc:0.88420
[7]	eval-auc:0.85049	train-auc:0.88770
[8]	eval-auc:0.85398	train-auc:0.89055
[9]	eval-auc:0.86023	train-auc:0.89549
[10]	eval-auc:0.86253	train-auc:0.89729
[11]	eval-auc:0.86325	train-auc:0.89924
[12]	eval-auc:0.86321	train-auc:0.90037
[13]	eval-auc:0.86198	train-auc:0.90251
[14]	eval-auc:0.86366	train-auc:0.90390
[15]	eval-auc:0.86418	train-auc:0.90502
[16]	eval-auc:0.86534	train-auc:0.90596
[17]	eval-auc:0.86350	train-auc:0.90676
[18]	eval-auc:0.86333	train-auc:0.90698
[19]	eval-auc:0.86888	train-auc:0.90900
[20]	eval-auc:0.86797	train-auc:0.91005
[21]	eval-auc:0.86794	train-auc:0.91106
[22]	eval-auc:0.86807	train-auc:0.91153
[23]	eval-auc:0.86891	train-auc:0.91271
[24]	eval-auc:0.86875	train-auc:0.91351
[25]	eval-

In [428]:
# Display the raw test frame; it has the same columns as the raw training frame
# except for the Transported label.
testDf

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,Jeron Peter
4273,9269_01,Earth,False,,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,Matty Scheron
4274,9271_01,Mars,True,D/296/P,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore
4275,9273_01,Europa,False,D/297/P,,,False,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale
