<a href="https://www.kaggle.com/code/pdenieves/feature-engineering-xgbclassifier?scriptVersionId=91790528" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Spaceship Titanic

In [None]:
import numpy as np
import pandas as pd
from datetime import datetime

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split

import warnings 
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation / FutureWarning messages
# raised by later cells - consider narrowing the filter.
warnings.simplefilter(action='ignore')
# Turn off the pandas chained-assignment (SettingWithCopy) warning.
pd.options.mode.chained_assignment = None 


# 1. Data load

In [None]:
# Read the labelled training split into memory
train_df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/spaceship-titanic/train.csv',
    encoding='UTF-8',
    sep=',',
)

print(f'Dimensions - Train {train_df.shape}')

# Preview the first rows
train_df.head(5)

In [None]:
# Read the unlabelled test split into memory
test_df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/spaceship-titanic/test.csv',
    encoding='UTF-8',
    sep=',',
)

print(f'Dimensions - Test {test_df.shape}')

# Preview the first rows
test_df.head(5)

In [None]:
# Summarise the target column and show its class balance.
# Only the last expression of a cell is rendered automatically, so the
# summary is displayed explicitly instead of being silently discarded.
display(train_df['Transported'].describe())
train_df.groupby(['Transported'])['Transported'].count().head()

# 2. Review data

In [None]:
# For the data analysis and to simplify feature engineering, both datasets
# are unified. Test rows get the placeholder target 'Unknown' so they can be
# told apart from the labelled rows later on.

test_df['Transported'] = 'Unknown'
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the train and test row labels overlap and label-based access is ambiguous.
combi_df = pd.concat([train_df, test_df], ignore_index=True)

print(f'Train dataset: {len(train_df)}')
print(f'Test dataset: {len(test_df)}')
print(f'Combined: {len(combi_df)}')

In [None]:
# Fraction of missing values per column (mean of the boolean NaN mask)

combi_df.isna().mean()

# 3. Feature engineering

In [None]:
# Work on an independent deep copy so the combined frame stays untouched
combi_2 = combi_df.copy(deep=True)

General rule for converting boolean values:
   True --> 1
   False --> 0

### Feature: *PassengerId*

In [None]:
# Summary statistics of the raw passenger identifier
combi_2.loc[:, 'PassengerId'].describe()

In [None]:
# PassengerId has the form '<group>_<number>': keep the group ID as text and
# the position within the group as an integer
combi_2['GroupID'] = combi_2['PassengerId'].str.split('_').str[0]
combi_2['GroupNum'] = combi_2['PassengerId'].str.split('_').str[1].astype(int)

In [None]:
# Number of members of each group: the highest within-group number of the group
combi_2['GroupMembers'] = (
    combi_2
    .groupby('GroupID')['GroupNum']
    .transform('max')
)

### Feature (new): *GroupMembers*

In [None]:
# Ten most frequent group sizes
(
    combi_2
    .groupby('GroupMembers')['GroupMembers']
    .count()
    .sort_values(ascending=False)
    .iloc[:10]
)

In [None]:
# GroupMembers vs. Transported, using the labelled rows only
sns.countplot(
    data=combi_2.loc[combi_2['Transported'].ne('Unknown')],
    x='GroupMembers',
    hue='Transported',
    palette='rocket',
)

### Feature: *HomePlanet*

In [None]:
# Summary statistics of HomePlanet
combi_2.loc[:, 'HomePlanet'].describe()

In [None]:
# Passengers per home planet
(
    combi_2
    .groupby('HomePlanet')['HomePlanet']
    .count()
    .iloc[:5]
)

In [None]:
# Keep the missing values as their own category ('Unknown').
# The result is assigned back to the column: calling fillna(inplace=True) on
# a selected column acts on an intermediate object and is not guaranteed to
# update combi_2 (it is a no-op under pandas Copy-on-Write).
combi_2['HomePlanet'] = combi_2['HomePlanet'].fillna('Unknown')

In [None]:
# HomePlanet vs. Transported, using the labelled rows only
sns.countplot(
    data=combi_2.loc[combi_2['Transported'].ne('Unknown')],
    x='HomePlanet',
    hue='Transported',
    palette='rocket',
)

In [None]:
# One-hot encode HomePlanet, keeping every level (no reference level dropped)
combi_2 = pd.get_dummies(data=combi_2, columns=['HomePlanet'], drop_first=False)
# Show the resulting column set
combi_2.columns

### Feature: *CryoSleep*

In [None]:
# Summary statistics of CryoSleep
combi_2.loc[:, 'CryoSleep'].describe()

In [None]:
# Passengers per CryoSleep value
(
    combi_2
    .groupby('CryoSleep')['CryoSleep']
    .count()
    .iloc[:5]
)

In [None]:
# Missing CryoSleep entries are imputed with False (per the counts above,
# the most frequent value)
combi_2.loc[combi_2['CryoSleep'].isnull(), 'CryoSleep'] = False

In [None]:
# CryoSleep vs. Transported, using the labelled rows only
sns.countplot(
    data=combi_2.loc[combi_2['Transported'].ne('Unknown')],
    x='CryoSleep',
    hue='Transported',
    palette='rocket',
)

# When CryoSleep is True, the passenger is very likely to have been transported.

In [None]:
# Convert the boolean values to integers (False -> 0, True -> 1).
# astype(int) yields an integer column directly, whereas replace() on an
# object column leaves the resulting dtype dependent on the pandas version.
combi_2['CryoSleep'] = combi_2['CryoSleep'].astype(int)

### Feature: *Cabin*

In [None]:
# Summary statistics of Cabin
combi_2.loc[:, 'Cabin'].describe()

In [None]:
# Ten most populated cabins
(
    combi_2
    .groupby('Cabin')['Cabin']
    .count()
    .sort_values(ascending=False)
    .iloc[:10]
)

In [None]:
# Impute the missing data with a fictitious cabin code ('X/0/X').
# The result is assigned back to the column: calling fillna(inplace=True) on
# a selected column acts on an intermediate object and is not guaranteed to
# update combi_2 (it is a no-op under pandas Copy-on-Write).
combi_2['Cabin'] = combi_2['Cabin'].fillna('X/0/X')

In [None]:
# The first character of the cabin code is the deck, the last one the side
cabin_text = combi_2['Cabin'].astype(str)
combi_2['Deck'] = cabin_text.str[:1]
combi_2['Side'] = cabin_text.str[-1:]

# Number of passengers sharing each cabin code
combi_2['CabinPassengers'] = (
    combi_2
    .groupby('Cabin')['PassengerId']
    .transform('count')
)

### Feature (new): *Deck*

In [None]:
# Distinct deck codes
combi_2.loc[:, 'Deck'].unique()

In [None]:
# Passengers per deck
(
    combi_2
    .groupby('Deck')['Deck']
    .count()
    .iloc[:20]
)

In [None]:
# Deck vs. Transported, using the labelled rows only
sns.countplot(
    data=combi_2.loc[combi_2['Transported'].ne('Unknown')],
    x='Deck',
    hue='Transported',
    palette='rocket',
)

In [None]:
# One-hot encode Deck, keeping every level (no reference level dropped)
combi_2 = pd.get_dummies(data=combi_2, columns=['Deck'], drop_first=False)
# Show the resulting column set
combi_2.columns

### Feature (new): *Side*

In [None]:
# Passengers per side
(
    combi_2
    .groupby('Side')['Side']
    .count()
    .iloc[:5]
)

In [None]:
# Side vs. Transported, using the labelled rows only
sns.countplot(
    data=combi_2.loc[combi_2['Transported'].ne('Unknown')],
    x='Side',
    hue='Transported',
    palette='rocket',
)

In [None]:
# One-hot encode Side, keeping every level (no reference level dropped)
combi_2 = pd.get_dummies(data=combi_2, columns=['Side'], drop_first=False)
# Show the resulting column set
combi_2.columns

### Feature (new): *CabinPassengers*

In [None]:
# Distribution of the cabin occupancy feature
(
    combi_2
    .groupby('CabinPassengers')['CabinPassengers']
    .count()
    .iloc[:10]
)

In [None]:
# Rows with the placeholder cabin 'X/0/X' have no known cabin, so their
# occupancy is set to 0. The integer 0 (not the string '0') is used so the
# column keeps a single numeric type.
combi_2.loc[combi_2['Cabin'] == 'X/0/X', 'CabinPassengers'] = 0

In [None]:
# CabinPassengers vs. Transported, using the labelled rows only
sns.countplot(
    data=combi_2.loc[combi_2['Transported'].ne('Unknown')],
    x='CabinPassengers',
    hue='Transported',
    palette='rocket',
)

### Feature: *Destination*

In [None]:
# Summary statistics of Destination
combi_2.loc[:, 'Destination'].describe()

In [None]:
# Most frequent destinations
(
    combi_2
    .groupby('Destination')['Destination']
    .count()
    .sort_values(ascending=False)
    .iloc[:5]
)

In [None]:
# Keep the missing values as their own category ('Unknown').
# The result is assigned back to the column: calling fillna(inplace=True) on
# a selected column acts on an intermediate object and is not guaranteed to
# update combi_2 (it is a no-op under pandas Copy-on-Write).
combi_2['Destination'] = combi_2['Destination'].fillna('Unknown')

In [None]:
# Destination vs. Transported, using the labelled rows only
sns.countplot(
    data=combi_2.loc[combi_2['Transported'].ne('Unknown')],
    x='Destination',
    hue='Transported',
    palette='rocket',
)

In [None]:
# One-hot encode Destination, keeping every level (no reference level dropped)
combi_2 = pd.get_dummies(data=combi_2, columns=['Destination'], drop_first=False)
# Show the resulting column set
combi_2.columns

### Feature: *Age*

In [None]:
# Summary statistics of Age
combi_2.loc[:, 'Age'].describe()

In [None]:
# Ten most frequent ages
(
    combi_2
    .groupby('Age')['Age']
    .count()
    .sort_values(ascending=False)
    .iloc[:10]
)

In [None]:
# Impute missing ages with a fictitious value (0); other rows already hold
# that value.
# The result is assigned back to the column: calling fillna(inplace=True) on
# a selected column acts on an intermediate object and is not guaranteed to
# update combi_2 (it is a no-op under pandas Copy-on-Write).
combi_2['Age'] = combi_2['Age'].fillna(0)

In [None]:
# Age vs. Transported, using the labelled rows only; the number of bins is
# the maximum age truncated to an integer
n_bins = int(combi_2.loc[:, 'Age'].max())
sns.histplot(
    data=combi_2.loc[combi_2['Transported'].ne('Unknown')],
    x='Age',
    hue='Transported',
    stat='density',
    bins=n_bins,
    element='step',
    palette='rocket',
)

# Kids are more likely to be transported

### Feature: *VIP*

In [None]:
# Summary statistics of VIP
combi_2.loc[:, 'VIP'].describe()

In [None]:
# Passengers per VIP value
(
    combi_2
    .groupby('VIP')['VIP']
    .count()
    .iloc[:5]
)

In [None]:
# Missing VIP entries are imputed with False (per the counts above, the most
# frequent value)
combi_2.loc[combi_2['VIP'].isnull(), 'VIP'] = False

In [None]:
# VIP vs. Transported, using the labelled rows only
sns.countplot(
    data=combi_2.loc[combi_2['Transported'].ne('Unknown')],
    x='VIP',
    hue='Transported',
    palette='rocket',
)

In [None]:
# Convert the boolean values to integers (False -> 0, True -> 1).
# astype(int) yields an integer column directly, whereas replace() on an
# object column leaves the resulting dtype dependent on the pandas version.
combi_2['VIP'] = combi_2['VIP'].astype(int)

### Feature: *RoomService*

In [None]:
# Summary statistics of RoomService
combi_2.loc[:, 'RoomService'].describe()

In [None]:
# Ten most frequent RoomService amounts
(
    combi_2
    .groupby('RoomService')['RoomService']
    .count()
    .sort_values(ascending=False)
    .iloc[:10]
)

In [None]:
# Impute missing data with 0 (per the counts above, the most frequent value).
# The result is assigned back to the column: calling fillna(inplace=True) on
# a selected column acts on an intermediate object and is not guaranteed to
# update combi_2 (it is a no-op under pandas Copy-on-Write).
combi_2['RoomService'] = combi_2['RoomService'].fillna(0)

In [None]:
# RoomService vs. Transported, using the labelled rows only
sns.histplot(
    data=combi_2.loc[combi_2['Transported'].ne('Unknown')],
    x='RoomService',
    hue='Transported',
    stat='density',
    bins=100,
    element='step',
    palette='rocket',
)

### Feature: *FoodCourt*

In [None]:
# Summary statistics of FoodCourt
combi_2.loc[:, 'FoodCourt'].describe()

In [None]:
# Ten most frequent FoodCourt amounts
(
    combi_2
    .groupby('FoodCourt')['FoodCourt']
    .count()
    .sort_values(ascending=False)
    .iloc[:10]
)

In [None]:
# Number of missing FoodCourt values. This section handles FoodCourt, so the
# check targets that column rather than the previously handled RoomService.
combi_2['FoodCourt'].isna().sum()

In [None]:
# Impute missing data with 0 (per the counts above, the most frequent value).
# The result is assigned back to the column: calling fillna(inplace=True) on
# a selected column acts on an intermediate object and is not guaranteed to
# update combi_2 (it is a no-op under pandas Copy-on-Write).
combi_2['FoodCourt'] = combi_2['FoodCourt'].fillna(0)

In [None]:
# FoodCourt vs. Transported, using the labelled rows only
sns.histplot(
    data=combi_2.loc[combi_2['Transported'].ne('Unknown')],
    x='FoodCourt',
    hue='Transported',
    stat='density',
    bins=100,
    element='step',
    palette='rocket',
)

### Feature: *ShoppingMall*

In [None]:
# Summary statistics of ShoppingMall
combi_2.loc[:, 'ShoppingMall'].describe()

In [None]:
# Ten most frequent ShoppingMall amounts
(
    combi_2
    .groupby('ShoppingMall')['ShoppingMall']
    .count()
    .sort_values(ascending=False)
    .iloc[:10]
)

In [None]:
# Impute missing data with 0 (per the counts above, the most frequent value).
# The result is assigned back to the column: calling fillna(inplace=True) on
# a selected column acts on an intermediate object and is not guaranteed to
# update combi_2 (it is a no-op under pandas Copy-on-Write).
combi_2['ShoppingMall'] = combi_2['ShoppingMall'].fillna(0)

In [None]:
# ShoppingMall vs. Transported, using the labelled rows only
sns.histplot(
    data=combi_2.loc[combi_2['Transported'].ne('Unknown')],
    x='ShoppingMall',
    hue='Transported',
    stat='density',
    bins=100,
    element='step',
    palette='rocket',
)

### Feature: *Spa*

In [None]:
# Summary statistics of Spa
combi_2.loc[:, 'Spa'].describe()

In [None]:
# Ten most frequent Spa amounts
(
    combi_2
    .groupby('Spa')['Spa']
    .count()
    .sort_values(ascending=False)
    .iloc[:10]
)

In [None]:
# Impute missing data with 0 (per the counts above, the most frequent value).
# The result is assigned back to the column: calling fillna(inplace=True) on
# a selected column acts on an intermediate object and is not guaranteed to
# update combi_2 (it is a no-op under pandas Copy-on-Write).
combi_2['Spa'] = combi_2['Spa'].fillna(0)

In [None]:
# Spa vs. Transported, using the labelled rows only
sns.histplot(
    data=combi_2.loc[combi_2['Transported'].ne('Unknown')],
    x='Spa',
    hue='Transported',
    stat='density',
    bins=100,
    element='step',
    palette='rocket',
)

### Feature: *VRDeck*

In [None]:
# Summary statistics of VRDeck
combi_2.loc[:, 'VRDeck'].describe()

In [None]:
# Ten most frequent VRDeck amounts
(
    combi_2
    .groupby('VRDeck')['VRDeck']
    .count()
    .sort_values(ascending=False)
    .iloc[:10]
)

In [None]:
# Impute missing data with 0 (per the counts above, the most frequent value).
# The result is assigned back to the column: calling fillna(inplace=True) on
# a selected column acts on an intermediate object and is not guaranteed to
# update combi_2 (it is a no-op under pandas Copy-on-Write).
combi_2['VRDeck'] = combi_2['VRDeck'].fillna(0)

In [None]:
# VRDeck vs. Transported, using the labelled rows only
sns.histplot(
    data=combi_2.loc[combi_2['Transported'].ne('Unknown')],
    x='VRDeck',
    hue='Transported',
    stat='density',
    bins=100,
    element='step',
    palette='rocket',
)

### Feature: *Name*

In [None]:
# Summary statistics of Name
combi_2.loc[:, 'Name'].describe()

In [None]:
# Ten most frequent full names
(
    combi_2
    .groupby('Name')['Name']
    .count()
    .sort_values(ascending=False)
    .iloc[:10]
)

In [None]:
# Missing names become ' Unknown'; the leading space makes the second token
# of the split (the last name) equal to 'Unknown'.
# The result is assigned back to the column: calling fillna(inplace=True) on
# a selected column acts on an intermediate object and is not guaranteed to
# update combi_2 (it is a no-op under pandas Copy-on-Write).
combi_2['Name'] = combi_2['Name'].fillna(' Unknown')
# The last name is the second space-separated token of the full name
combi_2['LastName'] = combi_2['Name'].apply(lambda x: x.split(" ")[1])

In [None]:
# Ten most frequent last names
(
    combi_2
    .groupby('LastName')['LastName']
    .count()
    .sort_values(ascending=False)
    .iloc[:10]
)

In [None]:
# Size of every family: people with the same last name travelling in the
# same group
combi_2['FamilyMembers'] = (
    combi_2
    .groupby(by=['LastName', 'GroupID'])['PassengerId']
    .transform('count')
)

### Feature (new): *FamilyMembers*

In [None]:
# Distribution of the family size feature
(
    combi_2
    .groupby('FamilyMembers')['FamilyMembers']
    .count()
    .iloc[:10]
)

In [None]:
# Passengers whose last name is 'Unknown' get a family size of 0 (no relatives)
combi_2.loc[combi_2['LastName'].eq('Unknown'), 'FamilyMembers'] = 0

In [None]:
# FamilyMembers vs. Transported, using the labelled rows only
sns.countplot(
    data=combi_2.loc[combi_2['Transported'].ne('Unknown')],
    x='FamilyMembers',
    hue='Transported',
    palette='rocket',
)

### Feature: *Transported*

In [None]:
# Distinct values of the target column
combi_2.loc[:, 'Transported'].unique()

In [None]:
# Encode the target as integers; 2 marks the rows whose label is 'Unknown'
combi_2['Transported'] = combi_2['Transported'].replace(
    to_replace={False: 0, True: 1, 'Unknown': 2}
)


### Last checks

In [None]:
# Drop the columns that will not be used by the model
cols_to_drop = ['Cabin', 'Name', 'LastName', 'PassengerId', 'GroupNum', 'GroupID']
combi_3 = combi_2.drop(cols_to_drop, axis='columns')

In [None]:
# Cast every remaining object-typed column to integer
object_cols = combi_3.select_dtypes('O').columns
combi_3 = combi_3.astype({col: int for col in object_cols})

In [None]:
# Fraction of missing values still left in each column
combi_3.isna().mean()

In [None]:
# Correlation heatmap.
# Only rows with a real label are used: test rows carry the sentinel value 2
# in 'Transported', which would distort every correlation with the target.
plt.figure(figsize=(15, 10))
sns.heatmap(combi_3[combi_3['Transported'] != 2].corr(), vmin=-1, vmax=1, cmap='BrBG')

In [None]:
# Preview the engineered frame (first rows only, not the whole table)
combi_3.head()

# 4. Preparation of the training and test datasets

In [None]:
combi_final = combi_3.copy()

# Labelled rows (target 0/1) form the full training data
train_final = combi_final[combi_final['Transported'] != 2]

# Features / target over the whole labelled set, used for the final refit
X_train_pred = train_final.drop(columns='Transported')
y_train_pred = train_final['Transported']

# 80/20 hold-out split for validation
train_set, validation_set = train_test_split(train_final, test_size=0.2, random_state=101)
X_train = train_set.drop(columns='Transported')
y_train = train_set['Transported']

X_validation = validation_set.drop(columns='Transported')
y_validation = validation_set['Transported']

# Rows carrying the sentinel target 2 are the test set; the target is dropped
test_final = combi_final[combi_final['Transported'] == 2].drop(columns='Transported')
X_test = test_final

print(len(combi_final), "combi_final")
print(len(X_train), "X_train")
print(len(X_validation), "X_validation")
print(len(X_test), "X_test")

# 5. Training the model

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Number of feature columns fed to the model
num_f = X_train.shape[1]
print(f'# features: {num_f}')

In [None]:
# Gradient-boosted trees with a hinge objective; predictions are class labels
classifier = XGBClassifier(
    objective='binary:hinge',
    eval_metric='error',
    n_estimators=360,
    max_depth=3,
    eta=0.1,
    subsample=0.6,
    colsample_bytree=0.5,
    use_label_encoder=False,
    random_state=101,
    n_jobs=-1,
)

# Fit on the training part of the hold-out split
model = classifier.fit(X=X_train, y=y_train)

# Predict on both parts of the split to compare the two accuracies
predicted_train = model.predict(X_train)
predicted_validation = model.predict(X_validation)

print(f"accuracy_score for train = {accuracy_score(y_train, predicted_train)}")
print(f"accuracy_score for validation = {accuracy_score(y_validation, predicted_validation)}")

# 6. Make prediction

In [None]:
# Refit the same classifier on every labelled row (train + validation)
model = classifier.fit(X=X_train_pred, y=y_train_pred)

In [None]:
# Build the submission frame: one row per test passenger, aligned on the
# index of the test data
prediction = pd.DataFrame(index=test_df.index)
prediction['PassengerId'] = test_df['PassengerId']

# Cast the predicted 0/1 labels straight to booleans; this gives a real bool
# column without depending on how replace() handles int -> bool values.
prediction['Transported'] = model.predict(X_test).astype(bool)

In [None]:
# Write the submission file without the index column, then preview it
prediction.to_csv(path_or_buf='submission.csv', index=False, sep=',')
prediction.head(5)