In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


In [2]:
original_train_data = pd.read_csv('../input/spaceship-titanic/train.csv')

In [3]:
train_data = original_train_data.copy()

In [4]:
train_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [5]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [6]:
train_data.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [7]:
# We will drop missing values
train_data.dropna(axis = 0 , inplace=True)

In [8]:
train_data.drop(['PassengerId' , 'Name'],axis = 1,inplace=True)

In [9]:
train_data.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,6606.0,6606.0,6606.0,6606.0,6606.0,6606.0
mean,28.894036,222.991674,478.958523,178.356494,313.16152,303.780048
std,14.533429,644.987936,1678.592291,576.328407,1144.016291,1127.142166
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,49.0,82.75,30.0,65.0,52.0
max,79.0,9920.0,29813.0,12253.0,22408.0,20336.0


In [10]:
corr = train_data.corr()
corr

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
Age,1.0,0.074783,0.135844,0.042314,0.12382,0.105031,-0.082553
RoomService,0.074783,1.0,-0.013614,0.060478,0.012472,-0.026002,-0.247291
FoodCourt,0.135844,-0.013614,1.0,-0.01232,0.215995,0.216997,0.055025
ShoppingMall,0.042314,0.060478,-0.01232,1.0,0.022168,0.000383,0.011602
Spa,0.12382,0.012472,0.215995,0.022168,1.0,0.149447,-0.219854
VRDeck,0.105031,-0.026002,0.216997,0.000383,0.149447,1.0,-0.20795
Transported,-0.082553,-0.247291,0.055025,0.011602,-0.219854,-0.20795,1.0


In [11]:
train_data.shape

(6606, 12)

In [12]:
# split predictors and labels
def splitData(data_set):
    x = data_set.drop('Transported' , axis = 1)
    y = data_set['Transported'].copy()
    return x , y

In [13]:
x_train, y_train = splitData(train_data)

In [14]:
x_train.head()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0
1,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0
2,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0
3,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0
4,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0


**Split 'Cabin' column into 3 new columns ('deck' , 'num' , 'side') ⬇⬇**

In [15]:
def splitCabinFeature(x):
    z = x['Cabin'].str.split('/' , expand = True)
    x['deck'] = z[0]
    x['num'] = z[1]
    x['side'] = z[2]
    x.drop('Cabin' , axis = 1 , inplace = True)
    return x

In [16]:
#test_data.drop(['Cabin' , 'Age' , 'FoodCourt' , 'ShoppingMall']  , axis = 1 , inplace=True)

In [17]:
x_train = splitCabinFeature(x_train)
x_train.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,deck,num,side
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,B,0,P
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,F,0,S
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,A,0,S
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,A,0,S
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,F,1,S


## Features Engineering

#### Encoding columns

In [18]:
# get columns name with 'object' data type

def getObjectColumns(x):
    ll = []
    for col in x.columns:
        if x[col].dtype == object:
            ll.append(col)
    return ll


In [19]:
# Encoding columns
from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder()
encoders_list = []

ll = getObjectColumns(x_train)

for col in ll:
    x_train[col] = encoder.fit_transform(np.array(x_train[col]).reshape(-1,1))
    encoders_list.append(encoder)
    

In [20]:
x_train.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,deck,num,side
0,1.0,0.0,2.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,2.0,24.0,0.0,109.0,9.0,25.0,549.0,44.0,5.0,0.0,1.0
2,1.0,0.0,2.0,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,0.0,0.0,1.0
3,1.0,0.0,2.0,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,0.0,0.0,1.0
4,0.0,0.0,2.0,16.0,0.0,303.0,70.0,151.0,565.0,2.0,5.0,1.0,1.0


#### Scaling columns

In [21]:
from sklearn.preprocessing import StandardScaler
scalar = StandardScaler()
scalers_list = []

l = ['RoomService'  , 'VRDeck']
for col in l:
    x_train[col] = scalar.fit_transform(np.array(x_train[col]).reshape(-1,1))
    scalers_list.append(scalar)
    

In [22]:
x_train.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,deck,num,side
0,1.0,0.0,2.0,39.0,0.0,-0.345756,0.0,0.0,0.0,-0.269534,1.0,0.0,0.0
1,0.0,0.0,2.0,24.0,0.0,-0.176748,9.0,25.0,549.0,-0.230494,5.0,0.0,1.0
2,1.0,0.0,2.0,58.0,1.0,-0.279083,3576.0,0.0,6715.0,-0.226058,0.0,0.0,1.0
3,1.0,0.0,2.0,33.0,0.0,-0.345756,1283.0,371.0,3329.0,-0.098291,0.0,0.0,1.0
4,0.0,0.0,2.0,16.0,0.0,0.124056,70.0,151.0,565.0,-0.267759,5.0,1.0,1.0


#### labels Encoding

In [23]:
labels_encoder = OrdinalEncoder() 
y_train = labels_encoder.fit_transform(np.array(y_train).reshape(-1,1))


In [24]:
y_train.dtype

dtype('float64')

In [25]:
y_train

array([[0.],
       [1.],
       [0.],
       ...,
       [1.],
       [0.],
       [1.]])

# Train Models

### SGD Classifier

In [26]:
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(max_iter=1000, tol=1e-3, random_state=42)
sgd_clf.fit(x_train, y_train)

  y = column_or_1d(y, warn=True)


SGDClassifier(random_state=42)

In [27]:
from sklearn.model_selection import cross_val_score
cross_val_score(sgd_clf, x_train, y_train, cv=3, scoring="accuracy")  #([0.62988193, 0.51771117, 0.63760218])

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


array([0.62988193, 0.51771117, 0.63760218])

In [28]:
from sklearn.model_selection import cross_val_predict
y_train_pred = cross_val_predict(sgd_clf, x_train, y_train, cv=3)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [29]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_train, y_train_pred)

array([[1862, 1417],
       [1258, 2069]])

### SVM Classifier

In [30]:
from sklearn.svm import SVC

svc = SVC()
svc.fit(x_train , y_train)

  y = column_or_1d(y, warn=True)


SVC()

In [31]:
cross_val_score(svc, x_train, y_train, cv=3, scoring="accuracy")  #[0.64532243, 0.65667575, 0.66121708]

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


array([0.64532243, 0.65667575, 0.66121708])

### DecisionTree Classifier

In [32]:
from sklearn.tree import DecisionTreeClassifier

dst = DecisionTreeClassifier(random_state = 42)
dst.fit(x_train , y_train)

DecisionTreeClassifier(random_state=42)

In [33]:
cross_val_score(dst, x_train, y_train, cv=3, scoring="accuracy")  #[0.71934605, 0.71707539, 0.74659401]

array([0.71934605, 0.71707539, 0.74659401])

### LogisticRegression Classifier

In [34]:
from sklearn.linear_model import LogisticRegression

lgr = LogisticRegression(max_iter = 1000 , solver = 'saga')
lgr.fit(x_train , y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(max_iter=1000, solver='saga')

In [35]:
cross_val_score(lgr , x_train , y_train  ,cv = 3 ) #[0.60990009, 0.64532243, 0.63396912]

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


array([0.60990009, 0.64532243, 0.63396912])

In [36]:
y_train_pred = cross_val_predict(lgr, x_train, y_train, cv=3)
confusion_matrix(y_train, y_train_pred)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


array([[1271, 2008],
       [ 437, 2890]])

### RandomForest

In [37]:
from sklearn.ensemble import RandomForestClassifier
rnd_clf = RandomForestClassifier( random_state=42)
rnd_clf.fit(x_train , y_train)

  This is separate from the ipykernel package so we can avoid doing imports until


RandomForestClassifier(random_state=42)

In [38]:
cross_val_score(rnd_clf , x_train , y_train  ,cv = 3 )

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


array([0.75794732, 0.79291553, 0.77838329])

In [39]:
rnd_clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

### GradientBoosting

In [40]:
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(random_state=42 , max_depth = 3).fit(x_train, y_train)

  y = column_or_1d(y, warn=True)


In [41]:
cross_val_score(clf , x_train , y_train  ,cv = 3 )

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


array([0.77247956, 0.78928247, 0.81244323])

In [42]:
clf.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'deviance',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': 42,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

### XGB

In [43]:
from xgboost import XGBClassifier

xgb_clf = XGBClassifier( learning_rate=1, objective='binary:logistic')
xgb_clf.fit(x_train , y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='', learning_rate=1,
              max_bin=256, max_cat_to_onehot=4, max_delta_step=0, max_depth=6,
              max_leaves=0, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=0,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [44]:
cross_val_score(xgb_clf, x_train, y_train, cv = 3 )

array([0.73978202, 0.75522252, 0.77384196])

### LGBMClassifier

In [45]:
from lightgbm import LGBMClassifier

lgbm_clf = LGBMClassifier(objective='binary', random_state=42)
lgbm_clf.fit(x_train, y_train)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


LGBMClassifier(objective='binary', random_state=42)

In [46]:
cross_val_score(lgbm_clf, x_train, y_train, cv = 3)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


array([0.76294278, 0.79336966, 0.78156222])

# Lazy Classifier

In [47]:
!pip install lazypredict

Collecting lazypredict
  Downloading lazypredict-0.2.12-py2.py3-none-any.whl (12 kB)
Installing collected packages: lazypredict
Successfully installed lazypredict-0.2.12
[0m

In [48]:
from sklearn.model_selection import train_test_split
x_a , x_b = train_test_split(x_train,test_size = 0.2 , random_state=42)
y_a , y_b = train_test_split(y_train,test_size = 0.2 , random_state=42)


In [49]:
from lazypredict.Supervised import LazyClassifier

lazy_clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = lazy_clf.fit(x_a, x_b, y_a, y_b)
models

100%|██████████| 29/29 [00:11<00:00,  2.43it/s]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LGBMClassifier,0.82,0.82,0.82,0.82,0.19
SVC,0.82,0.82,0.82,0.82,1.26
RandomForestClassifier,0.81,0.81,0.81,0.81,0.86
XGBClassifier,0.81,0.81,0.81,0.81,0.75
ExtraTreesClassifier,0.81,0.81,0.81,0.81,0.7
NuSVC,0.81,0.81,0.81,0.81,1.67
AdaBoostClassifier,0.8,0.8,0.8,0.8,0.33
BaggingClassifier,0.8,0.8,0.8,0.8,0.23
KNeighborsClassifier,0.79,0.79,0.79,0.79,0.14
SGDClassifier,0.79,0.79,0.79,0.79,0.12


### Ensemble

In [50]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import GammaRegressor

estimators = [
     ('SVC', svc),
     ('XGBClasifier' , xgb_clf),
     ('LGBMClassifier' , lgbm_clf),
      ('forest' , rnd_clf)
    ]

ensemble = StackingClassifier(estimators=estimators,final_estimator=lgbm_clf)
ensemble.fit(x_train, y_train)

StackingClassifier(estimators=[('SVC', SVC()),
                               ('XGBClasifier',
                                XGBClassifier(base_score=0.5, booster='gbtree',
                                              callbacks=None,
                                              colsample_bylevel=1,
                                              colsample_bynode=1,
                                              colsample_bytree=1,
                                              early_stopping_rounds=None,
                                              enable_categorical=False,
                                              eval_metric=None, gamma=0,
                                              gpu_id=-1,
                                              grow_policy='depthwise',
                                              importance_type=None,
                                              interaction_constraints='',
                                              learning_ra...
              

In [51]:
cross_val = cross_val_score(ensemble, x_train, y_train, cv=3)
cross_val

array([0.75249773, 0.77929155, 0.77565849])

# DNN

In [52]:
from sklearn.model_selection import train_test_split
x_tr, x_ts , y_tr, y_ts = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

In [53]:
x_tr, x_val , y_tr, y_val = train_test_split(x_tr, y_tr, test_size=0.2, random_state=42)

In [54]:
x_tr.shape

(4227, 13)

In [55]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [114]:
model = keras.Sequential(
    [
        layers.Flatten(input_shape=(13,), name="Input"),
        layers.Dense(120, activation="relu", name="layer1"),
        layers.Dense(100, activation="relu", name="layer2"),
        layers.Dense(10, activation="relu", name="layer3"),
        layers.Dense(1, activation="sigmoid" ,name="Output"),
    ]
)

In [57]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Input (Flatten)              (None, 13)                0         
_________________________________________________________________
layer1 (Dense)               (None, 180)               2520      
_________________________________________________________________
layer2 (Dense)               (None, 180)               32580     
_________________________________________________________________
layer3 (Dense)               (None, 180)               32580     
_________________________________________________________________
layer4 (Dense)               (None, 180)               32580     
_________________________________________________________________
layer5 (Dense)               (None, 10)                1810      
_________________________________________________________________
Output (Dense)               (None, 1)                 1

In [115]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [116]:
his = model.fit(x_tr,y_tr , epochs=50, batch_size=32, validation_data=(x_val, y_val))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [60]:
#his.history

In [117]:
results = model.evaluate(x_val, y_val, batch_size=128) #loss: 0.4864 - accuracy: 0.7796



In [62]:
results

[0.5064213871955872, 0.7436140179634094]

In [63]:
y_ts_pred = model.predict(x_ts)

In [64]:
y_ts_pred 

array([[0.8712499 ],
       [0.7805027 ],
       [0.75544775],
       ...,
       [0.8125715 ],
       [0.9214806 ],
       [0.5705855 ]], dtype=float32)

In [65]:
for i in range(len(y_ts_pred)):
    if y_ts_pred[i]<0.5:
        y_ts_pred[i]=0
    else:
        y_ts_pred[i]=1

In [66]:
y_ts_pred

array([[1.],
       [1.],
       [1.],
       ...,
       [1.],
       [1.],
       [1.]], dtype=float32)

In [67]:
from sklearn.metrics import classification_report
#classification_report(y_ts, y_ts_pred)
confusion_matrix(y_ts, y_ts_pred)        #([[517, 136],      ([[517, 136],
                                          # [142, 527]])       [142, 527]])

array([[384, 269],
       [ 66, 603]])

In [68]:
y_ts

array([[1.],
       [1.],
       [1.],
       ...,
       [1.],
       [1.],
       [1.]])

# Test model

In [69]:
test_data_original = pd.read_csv('../input/spaceship-titanic/test.csv')

In [70]:
test_data = test_data_original.copy()
test_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [71]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   4277 non-null   object 
 1   HomePlanet    4190 non-null   object 
 2   CryoSleep     4184 non-null   object 
 3   Cabin         4177 non-null   object 
 4   Destination   4185 non-null   object 
 5   Age           4186 non-null   float64
 6   VIP           4184 non-null   object 
 7   RoomService   4195 non-null   float64
 8   FoodCourt     4171 non-null   float64
 9   ShoppingMall  4179 non-null   float64
 10  Spa           4176 non-null   float64
 11  VRDeck        4197 non-null   float64
 12  Name          4183 non-null   object 
dtypes: float64(6), object(7)
memory usage: 434.5+ KB


In [72]:
test_data = splitCabinFeature(test_data)
test_data.drop(['PassengerId' , 'Name'],axis = 1,inplace=True)

In [73]:
enc_oder = OrdinalEncoder()

for col in ll:
    test_data[col] = enc_oder.fit_transform(np.array(test_data[col]).reshape(-1,1))
    

In [74]:
from sklearn.impute import SimpleImputer

imp1 = SimpleImputer(missing_values=np.nan, strategy='median')
imp2 = SimpleImputer(missing_values=np.nan, strategy='median')

In [75]:
def imputerT(x):
  for col in x.select_dtypes(include=['float64' , 'int64']).columns:
    x[col] = imp1.fit_transform(np.array(x[col]).reshape(-1,1))

  for col in x.select_dtypes(include=['object']).columns:
    x[col] = imp2.fit_transform(np.array(x[col]).reshape(-1,1))

  return x

In [76]:
test_data = imputerT(test_data)

In [77]:
test_data_edit = test_data.copy()

In [118]:
test_predicts = model.predict(test_data)

In [119]:
for i in range(len(test_predicts)):
    if test_predicts[i]<0.5:
        test_predicts[i]=0
    else:
        test_predicts[i]=1

In [120]:
final = pd.DataFrame()
final['PassengerId'] = test_data_original['PassengerId']
final['Transported'] = test_predicts

In [121]:
final['Transported'] = final['Transported'].map({0:'False' , 1:'True'})
final.head()

Unnamed: 0,PassengerId,Transported
0,0013_01,False
1,0018_01,False
2,0019_01,True
3,0021_01,False
4,0023_01,False


In [122]:
final.to_csv( r'final_8_dnn.csv', index = False)

# Best Accuracy = 0.74444 (on DNN model)