# Welcome to My Spaceship Titanic AI Model

This notebook tackles the Spaceship Titanic binary classification problem with an XGBoost classifier configured through scikit-learn's `GridSearchCV`.

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the training and test splits from CSV (paths are relative to the working directory).
# NOTE(review): the folder name "spceship_titanic" looks like a typo of "spaceship" — confirm it matches the directory on disk.
df_train = pd.read_csv("../../database/spceship_titanic/train.csv")
df_test = pd.read_csv("../../database/spceship_titanic/test.csv")


## Data Review

In [4]:
# (rows, columns) of the training data
df_train.shape

(8693, 14)

In [5]:
# (rows, columns) of the test data
df_test.shape

(4277, 13)

Let's keep a copy of the test DataFrame so that its `PassengerId` column is still available later, when the submission file is built.

In [6]:
# Untouched snapshot of the test data; PassengerId is read from it when the submission frame is created.
df_test_original = df_test.copy()

In [7]:
# Number of missing values per column in the training data
df_train.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [8]:
# Number of missing values per column in the test data
df_test.isnull().sum()

PassengerId       0
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name             94
dtype: int64

In [9]:
# Column dtypes of the raw training data
df_train.dtypes

PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object

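Convert the boolean columns to numeric ones. `CryoSleep` and `VIP` contain missing values, so they cannot be cast to `int` directly; their nulls have to be preserved as NaN.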

In [10]:
# The target is a complete bool column (no missing values), so a plain int cast is safe.
df_train["Transported"] = df_train["Transported"].astype(int)

# CryoSleep and VIP hold True/False plus missing values. Casting straight to float
# maps True/False to 1.0/0.0 and leaves missing entries as NaN, so no -1 sentinel
# round-trip is needed. This also avoids `df[col].replace(..., inplace=True)`, which
# mutates a temporary Series: it warns on recent pandas and has no effect once
# Copy-on-Write is active, which would leave -1 values in the features.
for frame in (df_train, df_test):
    for flag_col in ("CryoSleep", "VIP"):
        frame[flag_col] = frame[flag_col].astype(float)

In [11]:
# PassengerId and Name are removed from both frames before modelling.
df_train = df_train.drop(columns=["PassengerId", "Name"])
df_test = df_test.drop(columns=["PassengerId", "Name"])

In [12]:
# Number of distinct Cabin values, counting the missing value as one of them
df_train["Cabin"].nunique(dropna=False)

6561

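The raw `Cabin` feature has too many distinct values to be useful as-is, so we keep only the first character of each cabin string (e.g. `B` from `B/0/P`); missing cabins are labelled `Unknown` first and therefore end up as `U`.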

In [13]:
# Preview the first rows of the training data
df_train.head()


Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,Europa,0.0,B/0/P,TRAPPIST-1e,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,Earth,0.0,F/0/S,TRAPPIST-1e,24.0,0.0,109.0,9.0,25.0,549.0,44.0,1
2,Europa,0.0,A/0/S,TRAPPIST-1e,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,0
3,Europa,0.0,A/0/S,TRAPPIST-1e,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,0
4,Earth,0.0,F/1/S,TRAPPIST-1e,16.0,0.0,303.0,70.0,151.0,565.0,2.0,1


In [14]:
# Reduce Cabin to its first character; missing cabins become 'Unknown' first and so map to 'U'.
df_train['Cabin'] = df_train['Cabin'].fillna('Unknown').str[0]
df_test['Cabin'] = df_test['Cabin'].fillna('Unknown').str[0]

In [15]:
# Distinct single-character cabin codes left after the reduction
df_train["Cabin"].unique()

array(['B', 'F', 'A', 'G', 'U', 'E', 'D', 'C', 'T'], dtype=object)

In [16]:
# Re-check missing values per column; Cabin should no longer have any
df_train.isnull().sum()

HomePlanet      201
CryoSleep       217
Cabin             0
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Transported       0
dtype: int64

In [17]:
# Column dtypes after the bool-to-numeric conversion
df_train.dtypes

HomePlanet       object
CryoSleep       float64
Cabin            object
Destination      object
Age             float64
VIP             float64
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Transported       int32
dtype: object

In [18]:
# String-valued columns that get one-hot encoded
categorical_cols = ['HomePlanet', 'Cabin', 'Destination']


In [19]:
def encode_and_drop(df, columns_to_encode):
    """One-hot encode the given columns and remove the originals.

    For each name in ``columns_to_encode`` (processed in order), indicator
    columns named ``<column>_<value>`` are appended to the right of the frame
    and the source column is removed. The input frame is not modified.

    Parameters:
        df: DataFrame containing every column listed in ``columns_to_encode``.
        columns_to_encode: iterable of column names to encode.

    Returns:
        A DataFrame with the untouched columns followed by the indicator columns.
    """
    encoded = df
    for name in columns_to_encode:
        indicator_frame = pd.get_dummies(encoded[name], prefix=name)
        # Remove the source column first, then append its indicators on the right.
        encoded = pd.concat([encoded.drop(columns=name), indicator_frame], axis=1)
    return encoded


In [20]:
# One-hot encode the categorical columns of both frames.
# NOTE(review): train and test are encoded independently, so their indicator columns
# only line up if both frames contain the same category values — verify before predicting.
df_train = encode_and_drop(df_train, categorical_cols)
df_test = encode_and_drop(df_test, categorical_cols)

In [21]:
# Column dtypes after one-hot encoding
df_train.dtypes

CryoSleep                    float64
Age                          float64
VIP                          float64
RoomService                  float64
FoodCourt                    float64
ShoppingMall                 float64
Spa                          float64
VRDeck                       float64
Transported                    int32
HomePlanet_Earth                bool
HomePlanet_Europa               bool
HomePlanet_Mars                 bool
Cabin_A                         bool
Cabin_B                         bool
Cabin_C                         bool
Cabin_D                         bool
Cabin_E                         bool
Cabin_F                         bool
Cabin_G                         bool
Cabin_T                         bool
Cabin_U                         bool
Destination_55 Cancri e         bool
Destination_PSO J318.5-22       bool
Destination_TRAPPIST-1e         bool
dtype: object

In [22]:
# Missing values per column after one-hot encoding
df_train.isnull().sum()

CryoSleep                    217
Age                          179
VIP                          203
RoomService                  181
FoodCourt                    183
ShoppingMall                 208
Spa                          183
VRDeck                       188
Transported                    0
HomePlanet_Earth               0
HomePlanet_Europa              0
HomePlanet_Mars                0
Cabin_A                        0
Cabin_B                        0
Cabin_C                        0
Cabin_D                        0
Cabin_E                        0
Cabin_F                        0
Cabin_G                        0
Cabin_T                        0
Cabin_U                        0
Destination_55 Cancri e        0
Destination_PSO J318.5-22      0
Destination_TRAPPIST-1e        0
dtype: int64

In [23]:
# Missing amounts in the five spending columns are replaced with 0 in both frames.
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
df_train[spend_cols] = df_train[spend_cols].fillna(0)
df_test[spend_cols] = df_test[spend_cols].fillna(0)


In [24]:
# TotalExpenditure is the row-wise sum of the five spending columns
# (their missing values were already replaced with 0, so no NaN is involved).
for frame in (df_train, df_test):
    frame['TotalExpenditure'] = frame[
        ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    ].sum(axis=1)

In [25]:
# The individual spending columns are dropped; only TotalExpenditure is kept.
df_train = df_train.drop(columns=['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'])
df_test = df_test.drop(columns=['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'])

In [26]:
# Remaining missing values per column in the training features
df_train.isnull().sum()

CryoSleep                    217
Age                          179
VIP                          203
Transported                    0
HomePlanet_Earth               0
HomePlanet_Europa              0
HomePlanet_Mars                0
Cabin_A                        0
Cabin_B                        0
Cabin_C                        0
Cabin_D                        0
Cabin_E                        0
Cabin_F                        0
Cabin_G                        0
Cabin_T                        0
Cabin_U                        0
Destination_55 Cancri e        0
Destination_PSO J318.5-22      0
Destination_TRAPPIST-1e        0
TotalExpenditure               0
dtype: int64

In [27]:
# Remaining missing values per column in the test features
df_test.isnull().sum()

CryoSleep                    93
Age                          91
VIP                          93
HomePlanet_Earth              0
HomePlanet_Europa             0
HomePlanet_Mars               0
Cabin_A                       0
Cabin_B                       0
Cabin_C                       0
Cabin_D                       0
Cabin_E                       0
Cabin_F                       0
Cabin_G                       0
Cabin_T                       0
Cabin_U                       0
Destination_55 Cancri e       0
Destination_PSO J318.5-22     0
Destination_TRAPPIST-1e       0
TotalExpenditure              0
dtype: int64

## Model Selection and Training

In [28]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# Separate the target column from the feature matrix.
train_y = df_train["Transported"]
train_x = df_train.drop(columns=["Transported"])

In [29]:
# Base estimator, created without any explicit hyper-parameters; it is passed to the grid search as `estimator`.
xgb = XGBClassifier()

In [30]:
# Hyper-parameter grid for the search. Every entry lists a single value,
# so the grid describes exactly one candidate configuration.
param_grid = dict(
    n_estimators=[300],
    learning_rate=[0.15],
    max_depth=[5],
    colsample_bytree=[0.8],
    gamma=[1.0],
)

In [31]:
# 3-fold cross-validated search scored on accuracy, using all available cores.
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
    verbose=2,
)

In [32]:
# Run the cross-validated search on the training features and labels
grid_search.fit(train_x,train_y)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


In [33]:
# Report the best parameter combination and its cross-validation score
# (the printed labels are Turkish for "Best parameters" and "Best score").
print("En iyi parametreler: ", grid_search.best_params_)
print("En iyi skor: ", grid_search.best_score_)

En iyi parametreler:  {'colsample_bytree': 0.8, 'gamma': 1.0, 'learning_rate': 0.15, 'max_depth': 5, 'n_estimators': 300}
En iyi skor:  0.748189606836483


In [34]:
# Keep the estimator selected by the grid search; it is used for the test predictions.
best_model= grid_search.best_estimator_

In [35]:
# The test frame was one-hot encoded separately from the training frame, so its
# columns are aligned to the training feature columns (same names, same order)
# before predicting. Indicator columns absent from the test data are added as 0,
# columns unknown to the model are dropped, and existing NaN values are left as-is.
# When the columns already match this is a no-op.
y_pred = best_model.predict(df_test.reindex(columns=train_x.columns, fill_value=0))

In [36]:
# Submission frame: original passenger ids paired with the predictions cast back to bool.
output = pd.DataFrame(
    {
        'PassengerId': df_test_original['PassengerId'],
        'Transported': y_pred.astype(bool),
    }
)


In [37]:
# Write the submission file without the DataFrame index column
output.to_csv('submission.csv', index=False)
