In [9]:
# Import the libraries used in this notebook
import numpy as np
import pandas as pd
from sklearn import metrics
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score

In [3]:
# Relative path of the encoded-data CSV produced by the preprocessing step.
dataPath = '../../preprocess_dataset/encodedData_01.csv'
# Read the whole file into a DataFrame that the following cells build on.
df_spaceship = pd.read_csv(filepath_or_buffer=dataPath)

# Show (rows, columns) as a quick sanity check of what was loaded.
df_spaceship.shape

(8693, 36)

In [4]:
# Target vector: the 'Transported' label column.
train_y = df_spaceship['Transported']
# Feature frame: every other column (a new frame; df_spaceship is not modified).
train_x = df_spaceship.drop(columns=['Transported'])

In [5]:
# Build the model feature matrix from the loaded frame in a single,
# non-mutating step so this cell can be re-run safely. (The previous in-place
# drop raised KeyError on a second run because the columns were already gone.)
# Removed: the target, the passenger identifier, the raw categorical/text
# columns, and First_Name_le (presumably the label-encoded first name).
# The one-hot encoded and numeric columns are kept.
train_x = df_spaceship.drop(
    columns=[
        'Transported',
        'PassengerId', 'HomePlanet', 'Cabin_deck', 'Cabin_side', 'Destination',
        'First_Name', 'First_Name_le', 'Last_Name',
    ]
)



In [8]:
# Inspect which feature columns remain after the drop above.
train_x.columns

Index(['Earth', 'Europa', 'Mars', 'CryoSleep', 'Cabin_deck_A', 'Cabin_deck_B',
       'Cabin_deck_C', 'Cabin_deck_D', 'Cabin_deck_E', 'Cabin_deck_F',
       'Cabin_deck_G', 'Cabin_deck_T', 'Cabin_num', 'Cabin_side_P',
       'Cabin_side_S', '55 Cancri e', 'PSO J318.5-22', 'TRAPPIST-1e', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Total_spending', 'Last_Name_le'],
      dtype='object')

In [10]:
# 8-fold splitter. KFold does not shuffle by default, so the folds are
# consecutive blocks of rows in file order and are identical on every run.
k_folds = KFold(n_splits = 8)

# L1-regularised logistic regression with class weights balanced by frequency.
# The liblinear solver shuffles the data internally, so random_state is fixed
# to make the cross-validation scores reproducible across kernel restarts.
# This same estimator object is reused by the later cross-validation cells.
logisticReg_model = LogisticRegression(
    solver = 'liblinear',
    class_weight = 'balanced',
    max_iter = 300,
    penalty = 'l1',
    random_state = 42,  # arbitrary fixed seed, for reproducibility only
)

# Baseline: all remaining features of the encoded (un-normalized) data.
# With no `scoring` argument the estimator's default score is used
# (mean accuracy for a classifier).
scores = cross_val_score(logisticReg_model, train_x, train_y, cv = k_folds)

print("Cross Validation Scores: ", scores)
print("\nAverage CV Score: ", scores.mean())

Cross Validation Scores:  [0.78104876 0.75620975 0.79300828 0.80588776 0.79484821 0.77348066
 0.81860037 0.7771639 ]

Average CV Score:  0.7875309619291104


In [13]:
# Keep only the aggregate 'Total_spending' feature by removing the per-amenity
# spending columns. A new frame is returned; train_x itself is left untouched.
train_x1 = train_x.drop(
    columns=['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'],
)
train_x1

Unnamed: 0,Earth,Europa,Mars,CryoSleep,Cabin_deck_A,Cabin_deck_B,Cabin_deck_C,Cabin_deck_D,Cabin_deck_E,Cabin_deck_F,...,Cabin_num,Cabin_side_P,Cabin_side_S,55 Cancri e,PSO J318.5-22,TRAPPIST-1e,Age,VIP,Total_spending,Last_Name_le
0,False,True,False,False,False,True,False,False,False,False,...,0,True,False,False,False,True,39.0,False,0.0,1431
1,True,False,False,False,False,False,False,False,False,True,...,0,False,True,False,False,True,24.0,False,736.0,2109
2,False,True,False,False,True,False,False,False,False,False,...,0,False,True,False,False,True,58.0,True,10383.0,1990
3,False,True,False,False,True,False,False,False,False,False,...,0,False,True,False,False,True,33.0,False,5176.0,1990
4,True,False,False,False,False,False,False,False,False,True,...,1,False,True,False,False,True,16.0,False,1091.0,1778
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,False,True,False,False,True,False,False,False,False,False,...,98,True,False,True,False,False,41.0,True,8536.0,1416
8689,True,False,False,True,False,False,False,False,False,False,...,1499,False,True,False,True,False,18.0,False,0.0,1341
8690,True,False,False,False,False,False,False,False,False,False,...,1500,False,True,False,False,True,26.0,False,1873.0,470
8691,False,True,False,False,False,False,False,False,True,False,...,608,False,True,True,False,False,32.0,False,4637.0,996


In [14]:
# Cross-validate the same estimator and fold splitter on the reduced feature
# set (per-amenity spending columns removed, Total_spending kept).
scores = cross_val_score(
    estimator=logisticReg_model,
    X=train_x1,
    y=train_y,
    cv=k_folds,
)

# Per-fold scores first, then their mean.
print("Cross Validation Scores: ", scores)
print("\nAverage CV Score: ", scores.mean())

Cross Validation Scores:  [0.71941122 0.71573137 0.75436983 0.72033119 0.71205152 0.71086556
 0.75782689 0.72744015]

Average CV Score:  0.7272534651100144


In [15]:
# Load the normalized variant of the dataset so the same cross-validation
# can be repeated on it.
normalizedDataPath = '../../preprocess_dataset/normalizedData_01.csv'
df_normalizedSpaceship = pd.read_csv(filepath_or_buffer=normalizedDataPath)

# Show (rows, columns) as a quick sanity check of what was loaded.
df_normalizedSpaceship.shape

(8693, 36)

In [19]:
# Display the normalized frame (pandas renders a truncated preview) to inspect
# its columns and the rescaled numeric values.
df_normalizedSpaceship

Unnamed: 0,PassengerId,HomePlanet,Earth,Europa,Mars,CryoSleep,Cabin_deck,Cabin_deck_A,Cabin_deck_B,Cabin_deck_C,...,FoodCourt,ShoppingMall,Spa,VRDeck,Total_spending,First_Name,Last_Name,First_Name_le,Last_Name_le,Transported
0,0001_01,Europa,False,True,False,False,B,False,True,False,...,0.000000,0.000000,0.000000,0.000000,0.000000,Maham,Ofracculy,1614,1431,False
1,0002_01,Earth,True,False,False,False,F,False,False,False,...,0.000302,0.001064,0.024500,0.001823,0.020452,Juanna,Vines,1407,2109,True
2,0003_01,Europa,False,True,False,False,A,True,False,False,...,0.119948,0.000000,0.299670,0.002030,0.288521,Altark,Susent,156,1990,False
3,0003_02,Europa,False,True,False,False,A,True,False,False,...,0.043035,0.015793,0.148563,0.007997,0.143830,Solam,Susent,2276,1990,False
4,0004_01,Earth,True,False,False,False,F,False,False,False,...,0.002348,0.006428,0.025214,0.000083,0.030317,Willy,Santantines,2642,1778,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,True,False,False,A,True,False,False,...,0.228726,0.000000,0.073322,0.003066,0.237197,Gravior,Noxnuther,1135,1416,False
8689,9278_01,Earth,True,False,False,True,G,False,False,False,...,0.000000,0.000000,0.000000,0.000000,0.000000,Kurta,Mondalley,1492,1341,False
8690,9279_01,Earth,True,False,False,False,G,False,False,False,...,0.000000,0.079687,0.000045,0.000000,0.052047,Fayey,Connon,955,470,True
8691,9280_01,Europa,False,True,False,False,E,False,False,False,...,0.035186,0.000000,0.015753,0.134049,0.128852,Celeon,Hontichre,526,996,False


In [20]:
# Split the normalized frame into target and features.
# The target is read from the SAME frame as the features (previously it came
# from df_spaceship), so labels and features cannot drift apart if the two CSVs
# ever differ in row order or length.
train_ny = df_normalizedSpaceship['Transported']

# Remove the target, the passenger identifier, the raw categorical/text columns
# and First_Name_le in a single non-mutating drop. The encoded and numeric
# columns are kept.
train_nx = df_normalizedSpaceship.drop(
    columns=[
        'Transported',
        'PassengerId', 'HomePlanet', 'Cabin_deck', 'Cabin_side', 'Destination',
        'First_Name', 'First_Name_le', 'Last_Name',
    ]
)


In [24]:
# Cross-validate the same estimator and fold splitter on the normalized features.
scores = cross_val_score(
    estimator=logisticReg_model,
    X=train_nx,
    y=train_ny,
    cv=k_folds,
)

# Per-fold scores first, then their mean.
print("Cross Validation Scores: ", scores)
print("\nAverage CV Score: ", scores.mean())

Cross Validation Scores:  [0.77092916 0.75712971 0.79944802 0.8049678  0.79300828 0.76519337
 0.8213628  0.77532228]

Average CV Score:  0.7859201792149308


In [25]:
# Keep only the aggregate 'Total_spending' feature of the normalized data by
# removing the per-amenity spending columns. train_nx itself is left untouched.
train_nx1 = train_nx.drop(
    columns=['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'],
)
train_nx1

Unnamed: 0,Earth,Europa,Mars,CryoSleep,Cabin_deck_A,Cabin_deck_B,Cabin_deck_C,Cabin_deck_D,Cabin_deck_E,Cabin_deck_F,...,Cabin_num,Cabin_side_P,Cabin_side_S,55 Cancri e,PSO J318.5-22,TRAPPIST-1e,Age,VIP,Total_spending,Last_Name_le
0,False,True,False,False,False,True,False,False,False,False,...,0,True,False,False,False,True,0.493671,False,0.000000,1431
1,True,False,False,False,False,False,False,False,False,True,...,0,False,True,False,False,True,0.303797,False,0.020452,2109
2,False,True,False,False,True,False,False,False,False,False,...,0,False,True,False,False,True,0.734177,True,0.288521,1990
3,False,True,False,False,True,False,False,False,False,False,...,0,False,True,False,False,True,0.417722,False,0.143830,1990
4,True,False,False,False,False,False,False,False,False,True,...,1,False,True,False,False,True,0.202532,False,0.030317,1778
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,False,True,False,False,True,False,False,False,False,False,...,98,True,False,True,False,False,0.518987,True,0.237197,1416
8689,True,False,False,True,False,False,False,False,False,False,...,1499,False,True,False,True,False,0.227848,False,0.000000,1341
8690,True,False,False,False,False,False,False,False,False,False,...,1500,False,True,False,False,True,0.329114,False,0.052047,470
8691,False,True,False,False,False,False,False,False,True,False,...,608,False,True,True,False,False,0.405063,False,0.128852,996


In [26]:
# Cross-validate the same estimator and fold splitter on the reduced normalized
# feature set (per-amenity spending columns removed, Total_spending kept).
scores = cross_val_score(
    estimator=logisticReg_model,
    X=train_nx1,
    y=train_ny,
    cv=k_folds,
)

# Per-fold scores first, then their mean.
print("Cross Validation Scores: ", scores)
print("\nAverage CV Score: ", scores.mean())

Cross Validation Scores:  [0.71941122 0.71481141 0.75436983 0.71941122 0.71021159 0.70994475
 0.75690608 0.72559853]

Average CV Score:  0.7263330783527406
