# Spaceship titanic

## Importing libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt
from sklearn.impute import KNNImputer

## Importing datasets

In [2]:
train_data=pd.read_csv('train.csv')
test_data=pd.read_csv('test.csv')


## Creating a dummy column that will not be in test dataset

In [3]:
test_data['Transported']=False

### before performing any preprocessing we make sure that the datasets are in the same domain

In [4]:
data = pd.concat([train_data,test_data],sort = False)
data.drop(['Name'],axis=1,inplace=True)

In [5]:
data.head(5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True


If this is true, you have concatenated perfectly

In [6]:
data.shape[0] == train_data.shape[0] + test_data.shape[0]

True

In [7]:
data.isna().sum()

PassengerId       0
HomePlanet      288
CryoSleep       310
Cabin           299
Destination     274
Age             270
VIP             296
RoomService     263
FoodCourt       289
ShoppingMall    306
Spa             284
VRDeck          268
Transported       0
dtype: int64

In [8]:
data[['Deck', 'Num', 'Side']]=data['Cabin'].str.split('/', expand=True)

splitting cabin column into 3


In [9]:
data= data.drop(columns=["Cabin"])

In [10]:
data.head(5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Num,Side
0,0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,B,0,P
1,0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,F,0,S
2,0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,A,0,S
3,0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,A,0,S
4,0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,F,1,S


In [11]:
data['Deck']=data['Deck'].fillna('U')
data['Num']=data['Num'].fillna(-1)
data['Side']=data['Side'].fillna('U')
data.isna().sum()

PassengerId       0
HomePlanet      288
CryoSleep       310
Destination     274
Age             270
VIP             296
RoomService     263
FoodCourt       289
ShoppingMall    306
Spa             284
VRDeck          268
Transported       0
Deck              0
Num               0
Side              0
dtype: int64

In [12]:
data['Deck'].value_counts()

Deck
F    4239
G    3781
E    1323
B    1141
C    1102
D     720
A     354
U     299
T      11
Name: count, dtype: int64

In [13]:
data['Deck']=data['Deck'].map({'G':0,'F':1,'E':2,'D':3,'C':4,'B':5,'A':6,'U':7,'T':8})

In [14]:
data['Side'].value_counts()

Side
S    6381
P    6290
U     299
Name: count, dtype: int64

In [15]:
data['Side']=data['Side'].map({'S':2,'P':1,'U':-1})

In [16]:
impute_list=['PassengerId','CryoSleep','VIP','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck', 'Age','Deck','Num','Side']
rest=list(set(data.columns)-set(impute_list))
data_rest=data[rest]
imp=KNNImputer(n_neighbors=5)
data_imputed=imp.fit_transform(data[impute_list])
data_imputed=pd.DataFrame(data_imputed,columns=impute_list)
data=pd.concat([data_rest.reset_index(drop=True),data_imputed.reset_index(drop=True)],axis=1)

In [17]:
data.isna().sum()

HomePlanet      288
Destination     274
Transported       0
PassengerId       0
CryoSleep         0
VIP               0
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
Age               0
Deck              0
Num               0
Side              0
dtype: int64

In [18]:
data['HomePlanet']=data['HomePlanet'].fillna('Unknown')
data['Destination']=data['Destination'].fillna('Unknown')

category_colls = ['HomePlanet', 'Destination']

for col in category_colls:
    data = pd.concat([data, pd.get_dummies(data[col], prefix = col)], axis = 1)

In [19]:
bill_cols=['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']
data['amountspent']=data[bill_cols].sum(axis=1)
data['standardamountspent']=data[bill_cols].std(axis=1)
data['mean_amt_spent'] = data[bill_cols].mean(axis = 1)

In [20]:
data = data.drop(columns = category_colls)

In [21]:
data.head()

Unnamed: 0,Transported,PassengerId,CryoSleep,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Age,...,HomePlanet_Europa,HomePlanet_Mars,HomePlanet_Unknown,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Destination_Unknown,amountspent,standardamountspent,mean_amt_spent
0,False,101.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39.0,...,True,False,False,False,False,True,False,0.0,0.0,0.0
1,True,201.0,0.0,0.0,109.0,9.0,25.0,549.0,44.0,24.0,...,False,False,False,False,False,True,False,736.0,227.807375,147.2
2,False,301.0,0.0,1.0,43.0,3576.0,0.0,6715.0,49.0,58.0,...,True,False,False,False,False,True,False,10383.0,3013.383198,2076.6
3,False,302.0,0.0,0.0,0.0,1283.0,371.0,3329.0,193.0,33.0,...,True,False,False,False,False,True,False,5176.0,1373.410427,1035.2
4,True,401.0,0.0,0.0,303.0,70.0,151.0,565.0,2.0,16.0,...,False,False,False,False,False,True,False,1091.0,223.988169,218.2


In [22]:
# Remove duplicate columns
data = data.loc[:, ~data.columns.duplicated()]

# Ensure 'mean_amt_spent' column exists
if 'mean_amt_spent' not in data.columns:
	data['mean_amt_spent'] = data[bill_cols].mean(axis=1)

# Create new columns based on the existing ones
data['3_high_cols'] = data['CryoSleep'] + data['HomePlanet_Europa'] + data['Destination_55 Cancri e']
data['3_low_cols'] = data['mean_amt_spent'] + data['amountspent'] + data['HomePlanet_Earth']

In [23]:
data.corr()['Transported'].sort_values(ascending = False)

Transported                  1.000000
CryoSleep                    0.322575
3_high_cols                  0.283061
HomePlanet_Europa            0.131977
Destination_55 Cancri e      0.083625
Deck                         0.077959
Side                         0.059872
FoodCourt                    0.035712
PassengerId                  0.014628
HomePlanet_Unknown           0.006403
HomePlanet_Mars              0.005643
ShoppingMall                 0.005382
Destination_PSO J318.5-22    0.000760
Destination_Unknown         -0.000554
VIP                         -0.018192
Num                         -0.035240
Age                         -0.050253
Destination_TRAPPIST-1e     -0.072731
standardamountspent         -0.118870
HomePlanet_Earth            -0.119644
mean_amt_spent              -0.139246
amountspent                 -0.139246
3_low_cols                  -0.139269
VRDeck                      -0.142306
Spa                         -0.154383
RoomService                 -0.174587
Name: Transp

In [24]:
train_data, test_data = data[:train_data.shape[0]], data[train_data.shape[0]:]
test_data = test_data.drop(columns = 'Transported')
train_data.shape, test_data.shape

((8693, 26), (4277, 25))

In [28]:
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [29]:
X = train_data.drop(columns = 'Transported')
y = train_data['Transported']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

model_1 = LogisticRegression()
model_2 = DecisionTreeClassifier()
model_3 = RandomForestClassifier()
model_4 = XGBClassifier()
model_5 = LGBMClassifier()

In [30]:
model_1.fit(X_train, y_train)
pred = model_1.predict(X_test)
accuracy_score(y_test, pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7665324899367453

In [31]:
model_2.fit(X_train, y_train)
pred = model_2.predict(X_test)
accuracy_score(y_test, pred)

0.7337550316273721

In [32]:
model_3.fit(X_train, y_train)
pred = model_3.predict(X_test)
accuracy_score(y_test, pred)

0.78953421506613

In [33]:
model_4.fit(X_train, y_train)  
pred = model_4.predict(X_test)
accuracy_score(y_test, pred)

0.7998849913743531

In [34]:
model_5.fit(X_train, y_train)
pred = model_5.predict(X_test)
accuracy_score(y_test, pred)

[LightGBM] [Info] Number of positive: 3500, number of negative: 3454
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001484 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2962
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 25
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503307 -> initscore=0.013230
[LightGBM] [Info] Start training from score 0.013230


0.8039102932719954

In [37]:
df_dummy = pd.read_csv('test.csv')
pred = model_5.predict(test_data)

final = pd.DataFrame()
final['PassengerId'] = df_dummy['PassengerId']
final['Transported'] = pred

final.to_csv('submission.csv', index = False)