In [1]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

warnings.filterwarnings('ignore')

In [2]:
# URL's for test and train data
train_url = 'https://raw.githubusercontent.com/bksat90kc/KaggleChallenge/main/train.csv'
test_url = 'https://raw.githubusercontent.com/bksat90kc/KaggleChallenge/main/test.csv'

In [3]:
# read the train data
train_df = pd.read_csv(train_url)
train_df.drop('Name', axis=1, inplace=True)

In [4]:
train_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True


In [5]:
# dataframe information
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(6)
memory usage: 823.6+ KB


In [6]:
# unique values in HomePlanet column
train_df.HomePlanet.unique()

array(['Europa', 'Earth', 'Mars', nan], dtype=object)

In [7]:
# unique values in CryoSleep column
train_df.CryoSleep.unique()

array([False, True, nan], dtype=object)

In [8]:
# unique values in Destination column
train_df.Destination.unique()

array(['TRAPPIST-1e', 'PSO J318.5-22', '55 Cancri e', nan], dtype=object)

In [9]:
# unique values in VIP column
train_df.VIP.unique()

array([False, True, nan], dtype=object)

In [10]:
# unique values in Transported column
train_df.Transported.unique()

array([False,  True])

In [11]:
# replace NaN values in dataframe
values = {'HomePlanet': 'unknown', 'CryoSleep': 'False',
          'Destination': 'unknown', 'VIP': 'False', 'RoomService': 0,
          'FoodCourt': 0, 'ShoppingMall': 0, 'Spa':0, 'VRDeck': 0, 'Age': 0}
train_df.fillna(value=values, inplace=True)

In [12]:
# convert Transported column to numeric values : 0 for False and 1 for True
train_df.loc[train_df["Transported"] == False, "Transported"] = 0
train_df.loc[train_df["Transported"] == True, "Transported"] = 1

In [13]:
# convert VIP column to numeric values : 0 for False and 1 for True
train_df.loc[train_df["VIP"] == False, "VIP"] = 0
train_df.loc[train_df["VIP"] == 'False', "VIP"] = 0
train_df.loc[train_df["VIP"] == True, "VIP"] = 1

In [14]:
# convert CryoSleep column to numeric values : 0 for False and 1 for True
train_df.loc[train_df["CryoSleep"] == False, "CryoSleep"] = 0
train_df.loc[train_df["CryoSleep"] == 'False', "CryoSleep"] = 0
train_df.loc[train_df["CryoSleep"] == True, "CryoSleep"] = 1

In [None]:
# extract the cabin details
train_df[['Deck', 'CabinNum', 'Side']] = train_df['Cabin'].str.split('/', expand=True)

In [15]:
# one hot encoding for HomePlanet
oh1 = OneHotEncoder()
oh1.fit(train_df[['HomePlanet']])

In [16]:
# categories in HomePlanet
oh1.categories_

[array(['Earth', 'Europa', 'Mars', 'unknown'], dtype=object)]

In [17]:
# transform the one hot encoder object for HomePlanet
oh_hp = oh1.transform(train_df[['HomePlanet']]).toarray()


In [18]:
# shape of the output
print('Shape of one-hot encoded HomePlanet data:', oh_hp.shape)

Shape of one-hot encoded HomePlanet data: (8693, 4)


In [19]:
oh_hp

array([[0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       ...,
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.]])

In [20]:
# new transformed data for the HomePlanet
oh_hp_df = pd.DataFrame(oh_hp, columns=['Earth', 'Europa', 'Mars', 'UnSrc'])
oh_hp_df.head()

Unnamed: 0,Earth,Europa,Mars,UnSrc
0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0
3,0.0,1.0,0.0,0.0
4,1.0,0.0,0.0,0.0


In [21]:
# concatenating two dataframes
train_df = pd.concat([train_df, oh_hp_df], axis=1, join='inner')

In [22]:
train_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Earth,Europa,Mars,UnSrc
0,0001_01,Europa,0,B/0/P,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,1.0,0.0,0.0
1,0002_01,Earth,0,F/0/S,TRAPPIST-1e,24.0,0,109.0,9.0,25.0,549.0,44.0,1,1.0,0.0,0.0,0.0
2,0003_01,Europa,0,A/0/S,TRAPPIST-1e,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,0.0,1.0,0.0,0.0
3,0003_02,Europa,0,A/0/S,TRAPPIST-1e,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,0.0,1.0,0.0,0.0
4,0004_01,Earth,0,F/1/S,TRAPPIST-1e,16.0,0,303.0,70.0,151.0,565.0,2.0,1,1.0,0.0,0.0,0.0


In [23]:
# one hot encoding for HomePlanet
oh2 = OneHotEncoder()
oh2.fit(train_df[['Destination']])

In [24]:
# categories in Destination
oh2.categories_

[array(['55 Cancri e', 'PSO J318.5-22', 'TRAPPIST-1e', 'unknown'],
       dtype=object)]

In [25]:
# transform the one hot encoder object for Destination
oh_des = oh2.transform(train_df[['Destination']]).toarray()

In [26]:
# shape of the output
oh_des.shape

(8693, 4)

In [27]:
oh_des

array([[0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       ...,
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.]])

In [28]:
# new transformed data for the HomePlanet
oh_des_df = pd.DataFrame(oh_hp, columns=['55 Cancri e', 'PSO J318.5-22', 'TRAPPIST-1e', 'UnDes'])
oh_des_df.head()

Unnamed: 0,55 Cancri e,PSO J318.5-22,TRAPPIST-1e,UnDes
0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0
3,0.0,1.0,0.0,0.0
4,1.0,0.0,0.0,0.0


In [29]:
# concatenating two dataframes
train_df = pd.concat([train_df, oh_des_df], axis=1, join='inner')

In [32]:
train_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,Europa,Mars,UnSrc,55 Cancri e,PSO J318.5-22,TRAPPIST-1e,UnDes,Deck,CabinNum,Side
0,0001_01,Europa,0,B/0/P,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,B,0,P
1,0002_01,Earth,0,F/0/S,TRAPPIST-1e,24.0,0,109.0,9.0,25.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,F,0,S
2,0003_01,Europa,0,A/0/S,TRAPPIST-1e,58.0,1,43.0,3576.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,A,0,S
3,0003_02,Europa,0,A/0/S,TRAPPIST-1e,33.0,0,0.0,1283.0,371.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,A,0,S
4,0004_01,Earth,0,F/1/S,TRAPPIST-1e,16.0,0,303.0,70.0,151.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,F,1,S


In [None]:
# convert the data type to integer
columns = ['Earth', 'Europa', 'Mars', 'UnSrc', '55 Cancri e', 'PSO J318.5-22',
           'TRAPPIST-1e', 'UnDes', 'Transported', 'CryoSleep', 'VIP']

for col in columns:
  train_df[col] = train_df[col].astype('int64')

In [None]:
# Change the index to PassengerId
train_df.index = train_df['PassengerId']

In [None]:
train_df.head()

Unnamed: 0_level_0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,VRDeck,Transported,Earth,Europa,Mars,UnSrc,55 Cancri e,PSO J318.5-22,TRAPPIST-1e,UnDes
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0001_01,0001_01,Europa,0,B/0/P,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,...,0.0,0,0,1,0,0,0,1,0,0
0002_01,0002_01,Earth,0,F/0/S,TRAPPIST-1e,24.0,0,109.0,9.0,25.0,...,44.0,1,1,0,0,0,1,0,0,0
0003_01,0003_01,Europa,0,A/0/S,TRAPPIST-1e,58.0,1,43.0,3576.0,0.0,...,49.0,0,0,1,0,0,0,1,0,0
0003_02,0003_02,Europa,0,A/0/S,TRAPPIST-1e,33.0,0,0.0,1283.0,371.0,...,193.0,0,0,1,0,0,0,1,0,0
0004_01,0004_01,Earth,0,F/1/S,TRAPPIST-1e,16.0,0,303.0,70.0,151.0,...,2.0,1,1,0,0,0,1,0,0,0


In [None]:
train_df.drop(['HomePlanet', 'Destination', 'Cabin', 'PassengerId'], axis=1, inplace=True)

In [None]:
# extract required train data
X = train_df.drop('Transported', axis=1)
y = train_df[['Transported']]

In [None]:
# split the data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.33, random_state=2)

In [None]:
X_train.head()

Unnamed: 0_level_0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Earth,Europa,Mars,UnSrc,55 Cancri e,PSO J318.5-22,TRAPPIST-1e,UnDes
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
4785_01,1,29.0,0,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0,1,0,0
7090_01,0,15.0,0,0.0,610.0,267.0,0.0,0.0,1,0,0,0,1,0,0,0
4296_01,1,35.0,0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,1,0,0,0
8826_03,0,46.0,0,1384.0,229.0,0.0,1572.0,1.0,0,1,0,0,0,1,0,0
0818_01,0,71.0,1,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0,1,0,0


In [None]:
columns = train_df.columns
for col in columns:
  check_nan = train_df[col].isnull().values.any()
  print(col, check_nan)

CryoSleep False
Age False
VIP False
RoomService False
FoodCourt False
ShoppingMall False
Spa False
VRDeck False
Transported False
Earth False
Europa False
Mars False
UnSrc False
55 Cancri e False
PSO J318.5-22 False
TRAPPIST-1e False
UnDes False


In [None]:
# logistic regression
clobj = LogisticRegression()
clobj.fit(X_train, y_train)

In [None]:
y_pred = clobj.predict(X_train)

In [None]:
y_train

Unnamed: 0_level_0,Transported
PassengerId,Unnamed: 1_level_1
4785_01,1
7090_01,0
4296_01,0
8826_03,0
0818_01,1
...,...
1165_02,0
2701_01,1
6997_02,0
2760_03,1


In [None]:
y_pred

array([1, 1, 1, ..., 0, 1, 1])

In [None]:
# read the test data
test_df = pd.read_csv(test_url)
test_df.drop('Name', axis=1, inplace=True)

In [None]:
# replace NaN values in dataframe
values = {'HomePlanet': 'unknown', 'CryoSleep': 'False',
          'Destination': 'unknown', 'VIP': 'False', 'RoomService': 0,
          'FoodCourt': 0, 'ShoppingMall': 0, 'Spa':0, 'VRDeck': 0, 'Age': 0}
test_df.fillna(value=values, inplace=True)

In [None]:
# convert VIP column to numeric values : 0 for False and 1 for True
test_df.loc[test_df["VIP"] == False, "VIP"] = 0
test_df.loc[test_df["VIP"] == 'False', "VIP"] = 0
test_df.loc[test_df["VIP"] == True, "VIP"] = 1

In [None]:
# convert CryoSleep column to numeric values : 0 for False and 1 for True
test_df.loc[test_df["CryoSleep"] == False, "CryoSleep"] = 0
test_df.loc[test_df["CryoSleep"] == 'False', "CryoSleep"] = 0
test_df.loc[test_df["CryoSleep"] == True, "CryoSleep"] = 1