In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from xgboost import XGBClassifier
from xgboost import plot_importance, plot_tree

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder



In [2]:
# Load data

train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')
train_df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [3]:
#Count Nan values
train_df.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [4]:
train_df['Transported'] = train_df['Transported'].astype('int')
test_df['Transported'] = -1
df = pd.concat([train_df, test_df], axis=0, sort=False)


In [5]:
class Preprocessor:
    def __init__(self):
        pass
        # self.encoder = {}


    def fit_transform(self, df):
        df = self.group_passenger(df)
        df = self.group_cabin(df)

        df.drop('Name', axis=1, inplace=True)

        df.set_index('PassengerId', inplace=True)

        df['Age'] = df['Age'].fillna(df['Age'].mean())
        for column in df.columns:
            if df[column].dtype == 'object' or df[column].dtype == 'bool':
                pass
            else:
                df[column] = df[column].fillna(0)

        df['Total_Spend'] = df['RoomService'] + df['FoodCourt'] + df['ShoppingMall'] + df['Spa'] + df['VRDeck']
        
        # one_hot_columns = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Passenger_Group', 'Cabin_Side', 'Transported']
        # for column in one_hot_columns:
        #     df[column] = OneHotEncoder().fit_transform(df[column].values.reshape(-1, 1)).toarray()

        for column in df.columns:

            # if (df[column].dtype == 'object' or df[column].dtype == 'bool'):
            if (df[column].dtype == 'object'):
                df[column] = df[column].astype('category')
            elif (df[column].dtype == 'bool'):
                # self.encoder[column] = LabelEncoder()
                df[column] = LabelEncoder().fit_transform(df[column])
        
        return df
    
    def group_passenger(self, df):
        df['Passenger_Group'] = df['PassengerId'].apply(lambda x: x.split('_')[0])
        return df
    
    def group_cabin(self, df):
        cabin_df = df['Cabin'].str.split('/', expand=True)
        df['Cabin_Deck'] = cabin_df[0]
        df['Cabin_Number'] = cabin_df[1]
        df['Cabin_Side'] = cabin_df[2]
        
        # print(df[['Cabin', 'Cabin_Deck', 'Cabin_Number', 'Cabin_Side']])
        
        df.drop('Cabin', axis=1, inplace=True)
        
        return df

preprocessor = Preprocessor()


df = preprocessor.fit_transform(df)
train_df = df[df['Transported'] != -1]
test_df = df[df['Transported'] == -1]
test_df.drop('Transported', axis=1, inplace=True)
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df.drop('Transported', axis=1, inplace=True)


Unnamed: 0_level_0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Passenger_Group,Cabin_Deck,Cabin_Number,Cabin_Side,Total_Spend
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0001_01,Europa,False,TRAPPIST-1e,39.000000,False,0.0,0.0,0.0,0.0,0.0,0,0001,B,0,P,0.0
0002_01,Earth,False,TRAPPIST-1e,24.000000,False,109.0,9.0,25.0,549.0,44.0,1,0002,F,0,S,736.0
0003_01,Europa,False,TRAPPIST-1e,58.000000,True,43.0,3576.0,0.0,6715.0,49.0,0,0003,A,0,S,10383.0
0003_02,Europa,False,TRAPPIST-1e,33.000000,False,0.0,1283.0,371.0,3329.0,193.0,0,0003,A,0,S,5176.0
0004_01,Earth,False,TRAPPIST-1e,16.000000,False,303.0,70.0,151.0,565.0,2.0,1,0004,F,1,S,1091.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9266_02,Earth,True,TRAPPIST-1e,34.000000,False,0.0,0.0,0.0,0.0,0.0,-1,9266,G,1496,S,0.0
9269_01,Earth,False,TRAPPIST-1e,42.000000,False,0.0,847.0,17.0,10.0,144.0,-1,9269,,,,1018.0
9271_01,Mars,True,55 Cancri e,28.771969,False,0.0,0.0,0.0,0.0,0.0,-1,9271,D,296,P,0.0
9273_01,Europa,False,,28.771969,False,0.0,2680.0,0.0,0.0,523.0,-1,9273,D,297,P,3203.0


In [6]:
for column in df.columns:
    print(column, df[column].dtype)

HomePlanet category
CryoSleep category
Destination category
Age float64
VIP category
RoomService float64
FoodCourt float64
ShoppingMall float64
Spa float64
VRDeck float64
Transported int64
Passenger_Group category
Cabin_Deck category
Cabin_Number category
Cabin_Side category
Total_Spend float64


In [7]:
# Split the data into features and target
X = train_df.drop('Transported', axis=1)
y = train_df['Transported']

X_test = test_df

# Split the data into training and testing sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the XGBoost classifier
model = XGBClassifier(enable_categorical=True)

# Use grid search to find the best parameters
from sklearn.model_selection import GridSearchCV
param_grid = {
    'max_depth': [3, 4, 5, 6, 7, 8],
    'n_estimators': [50, 100, 150, 200, 500, 1000],
    'eta': [0.01, 0.05, 0.1]
}
grid = GridSearchCV(model, param_grid, cv=3, n_jobs=-1, verbose=1, scoring='accuracy')
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_score_)
model = grid.best_estimator_


# Fit the model to the training data
model.fit(X_train, y_train)

# Make predictions on the testing data
y_pred_val = grid.predict(X_val)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_val, y_pred_val)
accuracy


Fitting 3 folds for each of 108 candidates, totalling 324 fits


1 fits failed out of a total of 324.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "d:\Projects\spaceship-titanic\env\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Projects\spaceship-titanic\env\Lib\site-packages\xgboost\core.py", line 730, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "d:\Projects\spaceship-titanic\env\Lib\site-packages\xgboost\sklearn.py", line 1519, in fit
    self._Booster = train(
                    ^^^^^^
  File "d:\Projects\spaceship-titanic\env\Lib\site-packages\xgboost\core.py", line 730, in inner_f
    r

{'eta': 0.05, 'max_depth': 6, 'n_estimators': 500}
0.8081679608858211


0.8016101207590569

In [8]:
y_pred = grid.predict(X_test)
output = pd.DataFrame({'PassengerId': X_test.index, 'Transported': y_pred})
output['Transported'] = output['Transported'].astype('bool')
output.to_csv('data/submission.csv', index=False)



In [9]:
len(output)

4277