# Welcome To My SpaceShip Titanic Ai Model

In this model, the problem will be tried to be solved with the random forest model on the binary classification problem.

## Data importing

In [439]:
import pandas as pd
import numpy as np

In [440]:
df_train = pd.read_csv("../../database/spceship_titanic/train.csv")
df_test = pd.read_csv("../../database/spceship_titanic/test.csv")

## Data Review

In [441]:
df_train.shape

(8693, 14)

In [442]:
df_test.shape

(4277, 13)

Let's make a copy of the file to use the PassengerId property later.

In [443]:
df_test_original = df_test.copy()

In [444]:
df_train.dtypes

PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object

## Processing Data

### Preprocesssing PassengerId column

In [445]:
df_train['PassengerGroup'] = df_train['PassengerId'].str.split("_").str[0]
df_test['PassengerGroup'] = df_test['PassengerId'].str.split("_").str[0]

df_train['PassengerGroup'] = df_train['PassengerGroup'].astype(int)
df_test['PassengerGroup'] = df_test['PassengerGroup'].astype(int)

df_train = df_train.drop(["PassengerId","Name"],axis=1)
df_test = df_test.drop(["PassengerId","Name"],axis=1)

In [446]:
df_train.isnull().sum()

HomePlanet        201
CryoSleep         217
Cabin             199
Destination       182
Age               179
VIP               203
RoomService       181
FoodCourt         183
ShoppingMall      208
Spa               183
VRDeck            188
Transported         0
PassengerGroup      0
dtype: int64

### Preprocessing VIP and CryoSleep columns

In [447]:
most_common_CryoSleep = df_train['CryoSleep'].mode()[0]
most_common_VIP = df_train['VIP'].mode()[0]

df_train['CryoSleep'].fillna(most_common_CryoSleep, inplace=True)
df_train['VIP'].fillna(most_common_VIP, inplace=True)

In [448]:
most_common_CryoSleep_test = df_test['CryoSleep'].mode()[0]
most_common_VIP_test = df_test['VIP'].mode()[0]

df_test['CryoSleep'].fillna(most_common_CryoSleep_test, inplace=True)
df_test['VIP'].fillna(most_common_VIP_test, inplace=True)

In [449]:
df_train.isnull().sum()

HomePlanet        201
CryoSleep           0
Cabin             199
Destination       182
Age               179
VIP                 0
RoomService       181
FoodCourt         183
ShoppingMall      208
Spa               183
VRDeck            188
Transported         0
PassengerGroup      0
dtype: int64

### Preprocessing Cabin column

In [450]:
df_train['Cabin'].fillna('U/0/U', inplace=True)
df_test['Cabin'].fillna('U/0/U', inplace=True)

df_train[['Deck','Num','Side']] = df_train.Cabin.str.split("/",expand=True)
df_test[['Deck','Num','Side']] = df_test.Cabin.str.split("/",expand=True)

df_train.drop('Cabin', axis=1, inplace=True)
df_test.drop('Cabin', axis=1, inplace=True)

In [451]:
df_train['Num'] = pd.to_numeric(df_train['Num'])
df_test['Num'] = pd.to_numeric(df_test['Num'])

In [452]:
df_train.isnull().sum()

HomePlanet        201
CryoSleep           0
Destination       182
Age               179
VIP                 0
RoomService       181
FoodCourt         183
ShoppingMall      208
Spa               183
VRDeck            188
Transported         0
PassengerGroup      0
Deck                0
Num                 0
Side                0
dtype: int64

In [453]:
df_train.Deck.unique()

array(['B', 'F', 'A', 'G', 'U', 'E', 'D', 'C', 'T'], dtype=object)

In [454]:
df_train.Side.unique()

array(['P', 'S', 'U'], dtype=object)

### Preprocessing Spend Columns and Age Column

In [455]:
spend_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
for col in spend_columns:
    df_train[col].fillna(0, inplace=True)
    df_test[col].fillna(0, inplace=True)

In [456]:
df_train['TotalSpend'] = df_train[spend_columns].sum(axis=1)
df_test['TotalSpend'] = df_test[spend_columns].sum(axis=1)

In [457]:
for col in spend_columns:
    df_train[col] = df_train[col] / df_train['TotalSpend'] * 100
    df_test[col] = df_test[col] / df_test['TotalSpend'] * 100

In [458]:
df_train[spend_columns] = df_train[spend_columns].fillna(0)
df_test[spend_columns] = df_test[spend_columns].fillna(0)

In [459]:
df_train.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,PassengerGroup,Deck,Num,Side,TotalSpend
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,1,B,0,P,0.0
1,Earth,False,TRAPPIST-1e,24.0,False,14.809783,1.222826,3.396739,74.592391,5.978261,True,2,F,0,S,736.0
2,Europa,False,TRAPPIST-1e,58.0,True,0.414138,34.440913,0.0,64.673023,0.471925,False,3,A,0,S,10383.0
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,24.787481,7.167697,64.316074,3.728748,False,3,A,0,S,5176.0
4,Earth,False,TRAPPIST-1e,16.0,False,27.772686,6.416132,13.840513,51.787351,0.183318,True,4,F,1,S,1091.0


In [460]:
df_train['Age'] = df_train['Age'].fillna(df_train['Age'].mean())
df_test['Age'] = df_test['Age'].fillna(df_test['Age'].mean())

In [461]:
bins = [0, 18, 35, 60, np.inf]  
labels = ['Child', 'Young_Adult', 'Adult', 'Senior']

df_train['AgeGroup'] = pd.cut(df_train['Age'], bins=bins, labels=labels)
df_train = df_train.drop('Age', axis=1)

df_test['AgeGroup'] = pd.cut(df_test['Age'], bins=bins, labels=labels)
df_test = df_test.drop('Age', axis=1)

In [462]:
df_train.isnull().sum()

HomePlanet        201
CryoSleep           0
Destination       182
VIP                 0
RoomService         0
FoodCourt           0
ShoppingMall        0
Spa                 0
VRDeck              0
Transported         0
PassengerGroup      0
Deck                0
Num                 0
Side                0
TotalSpend          0
AgeGroup          178
dtype: int64

In [463]:
df_train.dtypes

HomePlanet          object
CryoSleep             bool
Destination         object
VIP                   bool
RoomService        float64
FoodCourt          float64
ShoppingMall       float64
Spa                float64
VRDeck             float64
Transported           bool
PassengerGroup       int32
Deck                object
Num                  int64
Side                object
TotalSpend         float64
AgeGroup          category
dtype: object

### Feature Engineering for VIP and CryoSleep

In [464]:
df_train['VIP_or_Cryo'] = df_train['VIP'] | df_train['CryoSleep']
df_test['VIP_or_Cryo'] = df_test['VIP'] | df_test['CryoSleep']

In [465]:
df_train['VIP_and_Cryo'] = df_train['VIP'] & df_train['CryoSleep']
df_test['VIP_and_Cryo'] = df_test['VIP'] & df_test['CryoSleep']

In [466]:
bool_cols = ['CryoSleep', 'VIP','VIP_and_Cryo','VIP_or_Cryo']

for col in bool_cols:
    df_train[col] = df_train[col].astype(int)
    df_test[col] = df_test[col].astype(int)

In [467]:
df_train[ 'Transported'] = df_train[ 'Transported'].astype(int)

### One-Hot Coding

In [468]:
df_train.columns

Index(['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'RoomService',
       'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Transported',
       'PassengerGroup', 'Deck', 'Num', 'Side', 'TotalSpend', 'AgeGroup',
       'VIP_or_Cryo', 'VIP_and_Cryo'],
      dtype='object')

In [469]:
categorical_cols = ['AgeGroup','Deck', 'Side', 'HomePlanet', 'Destination']

df_train = pd.get_dummies(df_train, columns=categorical_cols,dummy_na= True)
df_test = pd.get_dummies(df_test, columns=categorical_cols, dummy_na= True)

In [470]:
df_train.shape

(8693, 40)

In [471]:
df_train.columns

Index(['CryoSleep', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa',
       'VRDeck', 'Transported', 'PassengerGroup', 'Num', 'TotalSpend',
       'VIP_or_Cryo', 'VIP_and_Cryo', 'AgeGroup_Child', 'AgeGroup_Young_Adult',
       'AgeGroup_Adult', 'AgeGroup_Senior', 'AgeGroup_nan', 'Deck_A', 'Deck_B',
       'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G', 'Deck_T', 'Deck_U',
       'Deck_nan', 'Side_P', 'Side_S', 'Side_U', 'Side_nan',
       'HomePlanet_Earth', 'HomePlanet_Europa', 'HomePlanet_Mars',
       'HomePlanet_nan', 'Destination_55 Cancri e',
       'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e',
       'Destination_nan'],
      dtype='object')

In [472]:
df_test.isnull().sum()

CryoSleep                    0
VIP                          0
RoomService                  0
FoodCourt                    0
ShoppingMall                 0
Spa                          0
VRDeck                       0
PassengerGroup               0
Num                          0
TotalSpend                   0
VIP_or_Cryo                  0
VIP_and_Cryo                 0
AgeGroup_Child               0
AgeGroup_Young_Adult         0
AgeGroup_Adult               0
AgeGroup_Senior              0
AgeGroup_nan                 0
Deck_A                       0
Deck_B                       0
Deck_C                       0
Deck_D                       0
Deck_E                       0
Deck_F                       0
Deck_G                       0
Deck_T                       0
Deck_U                       0
Deck_nan                     0
Side_P                       0
Side_S                       0
Side_U                       0
Side_nan                     0
HomePlanet_Earth             0
HomePlan

### Data scaling

In [473]:
X_scale = df_train.drop('Transported', axis=1)
y_scale = df_train['Transported']

In [474]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
scaler = StandardScaler()

In [475]:
X_scale = scaler.fit_transform(X_scale)
X_last_test = scaler.transform(df_test)

In [476]:
X_train, X_val, y_train, y_val = train_test_split(X_scale, y_scale, test_size=0.2, random_state=42)

## Model Selection and Training

In [477]:
# from sklearn.model_selection import cross_val_score
# from sklearn.linear_model import LogisticRegression

In [478]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l1,l2
from tensorflow.keras.callbacks import EarlyStopping


In [479]:
#model = LogisticRegression(max_iter= 1000000,C=10000,solver="liblinear",penalty="l1",class_weight='balanced')

In [480]:
#scores = cross_val_score(model, X_train, y_train, cv=5)

In [481]:
#print("Ortalama çapraz doğrulama skoru: ", scores.mean())

Random forest Cross Val Score : 0.7746510533744576

Logistic Regresyon Cross Val Score : 0.7879908628360015


In [482]:
model = Sequential()

In [483]:
model.add(Dense(256, input_dim=X_train.shape[1], activation='relu',kernel_regularizer=l2(0.02)))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))  # Dropout layer
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu',kernel_regularizer=l2(0.02)))
model.add(Dropout(0.1))  # Dropout layer
model.add(Dense(8, activation='relu'))
model.add(Dense(4, activation='relu',kernel_regularizer=l2(0.02)))
model.add(Dropout(0.1))  # Dropout layer
model.add(Dense(2, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [484]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [485]:
early_stop = EarlyStopping(monitor='val_loss', patience=5)

In [486]:
model.fit(X_train, y_train, epochs=60, batch_size=32,shuffle=True, validation_data=(X_val,y_val)) #callbacks=[early_stop]

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60


<keras.callbacks.History at 0x26a825e7010>

In [487]:
loss, accuracy = model.evaluate(X_train, y_train)
print("Test setindeki kayıp: ", loss)
print("Test setindeki doğruluk: ", accuracy)

Test setindeki kayıp:  0.4125412404537201
Test setindeki doğruluk:  0.8202473521232605


In [488]:
loss, accuracy = model.evaluate(X_val, y_val)
print("Test setindeki kayıp: ", loss)
print("Test setindeki doğruluk: ", accuracy)

Test setindeki kayıp:  0.48168593645095825
Test setindeki doğruluk:  0.7837837934494019


In [489]:
predicts = model.predict(X_last_test)



In [490]:
predicts = predicts > 0.5

In [491]:
output = pd.DataFrame({'PassengerId': df_test_original.PassengerId, 'Transported': predicts.flatten()})
output['Transported'] = output['Transported'].map({True: 'True', False: 'False'})  
output.to_csv('neural_network_submission.csv', index=False)