In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found anywhere below the Kaggle
# input directory, then print the paths one per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# # Removing current submission files
# os.remove("/kaggle/working/submission6.csv")
# os.remove("/kaggle/working/submission.csv")
def done():
    """Print the marker text "DONE"; called at the end of a cell to show it finished."""
    marker = "DONE"
    print(marker)

/kaggle/input/spaceship-titanic/sample_submission.csv
/kaggle/input/spaceship-titanic/train.csv
/kaggle/input/spaceship-titanic/test.csv


# Loading Data

First, let's load the training and testing data. The test data only has the independent variables (i.e. X), while the training data has both the dependent (y) and independent (X) variables. The training data is a table with different features (independent variables) of previous passengers and whether they were transported or not.

In [2]:
from sklearn.model_selection import train_test_split

# Columns removed from both frames: Name, plus Cabin and the on-board spending
# columns, which are treated as potential target leakage.
unused_columns = ["Cabin", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "Name"]

# Read the raw train and test files.
X_train_full = pd.read_csv("../input/spaceship-titanic/train.csv")
X_test = pd.read_csv("../input/spaceship-titanic/test.csv")

# PassengerId is dropped from the training frame only; the test frame keeps it
# because the submission file is keyed on it.
X_train_full = X_train_full.drop(columns=["PassengerId"] + unused_columns)
X_test = X_test.drop(columns=unused_columns)

# Discard training rows that have no Transported value.
X_train_full = X_train_full.dropna(axis=0, subset=["Transported"])

# Separate target and features; the target is converted from bool to int64.
y1 = X_train_full["Transported"]
y = y1.astype("int64")
X = X_train_full.drop(columns=["Transported"])

# 80/20 train/validation split with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

# Column names grouped by dtype: object columns are categorical,
# int64/float64 columns are numerical.
categorical = [name for name, dtype in X.dtypes.items() if dtype == "object"]
numerical = [name for name, dtype in X.dtypes.items() if dtype in ['int64', 'float64']]

done()

DONE


After the code above, we have two objects. `y` contains the output: for each row of data, whether this passenger was transported or not. `X` contains the inputs for each row, minus the Name and PassengerId columns, which add no value to the model. Name was removed from both the training and testing data; PassengerId was removed only from the training data, because the test data's PassengerId is needed later to build the submission file.

After careful consideration, I think the columns Cabin, RoomService, FoodCourt, ShoppingMall, Spa and VRDeck will cause target leakage. This is because these are services that the passenger will use once on board. Hence, at the point of data collection, there is no way to know that they are going to use the spa or the food court, nor how many times they will use it, nor how much they will spend on these luxuries. The exception would be if the passenger paid for the possibility of using these services as part of the voyage price, but that doesn't seem to be the case since the values are not the same.


Then a train/test split is done on the data. This is done early to avoid train-test leakage later on, when we impute missing values.

In [3]:
# Show the first five rows of the test features.
X_test.head(n=5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP
0,0013_01,Earth,True,TRAPPIST-1e,27.0,False
1,0018_01,Earth,False,TRAPPIST-1e,19.0,False
2,0019_01,Europa,True,55 Cancri e,31.0,False
3,0021_01,Europa,False,TRAPPIST-1e,38.0,False
4,0023_01,Earth,False,TRAPPIST-1e,20.0,False


# Data Preprocessing

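Now, checking the cardinality of the categorical columns to know if they are worth encoding. Since the data shrinks to 6 useful columns, the cardinality threshold can be increased to about 20 without making the dataset too large. Note that the code below does not apply a threshold explicitly: columns with exactly two distinct values are ordinal-encoded and all remaining categorical columns are one-hot encoded.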

In [4]:
# Cardinality of categorical data: (column name, number of distinct values) pairs.
# nunique() ignores missing values by default, so NaN does not count as a category.
cardinalities = [(column, X[column].nunique()) for column in categorical]

# Columns with exactly two distinct values are ordinal-encoded.
ordinal_cols = [col for (col, card) in cardinalities if card == 2]

# Every other categorical column is one-hot encoded. The list is built by
# filtering `categorical` rather than by a set difference: iteration order of a
# set of strings can change from one interpreter run to the next, which would
# reorder the one-hot features and make "Restart & Run All" non-reproducible.
onehot_cols = [col for col in categorical if col not in ordinal_cols]

Now that the data has been prepared, it is time to preprocess it with a pipeline.
This pipeline called preprocess will:
* Step1: Take care of missing data using the Simple Imputer
* Step2: Take care of categorical data by encoding

Since the only numerical column is Age, it makes sense to impute 0 for the missing values. It is possible the person is a baby and the guardians didn't know how to input their age since they are probably months old by that point.

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

def _impute_then_encode(step_name, encoder):
    """Return a Pipeline that fills gaps with the most frequent value, then applies `encoder`."""
    return Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                           (step_name, encoder)])


# Numerical columns: missing values are replaced by the constant 0.
numerical_imputer = SimpleImputer(strategy="constant", fill_value=0)

# Categorical columns: impute first, then encode (ordinal or one-hot).
ordinal_encoder = _impute_then_encode('ordinal', OrdinalEncoder())
onehot_encoder = _impute_then_encode('onehot', OneHotEncoder(handle_unknown='ignore'))

# Route each group of columns to its own transformer.
column_routes = [
    ('numerical', numerical_imputer, numerical),
    ('ordinal', ordinal_encoder, ordinal_cols),
    ('onehot', onehot_encoder, onehot_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)


# Modeling and Training the Model

*NOTE: All models used are present in the code, but are all commented out except for the one with the best performance*

Then the model is defined: an XGBoost regressor, which is trained with gradient boosting. Its continuous predictions are then converted to 0/1 labels.

In [6]:
# Creating XGBoost model
from xgboost import XGBRegressor as model
from sklearn.metrics import mean_absolute_error as mae

my_model = model(n_estimators=1000, learning_rate=0.05)

# Creating final pipeline that uses the preprocessing steps and invokes the model
pipeline = Pipeline(steps=[('processor', preprocessor),
                           ('model', my_model)])

# Fit on the training split, then predict the validation split.
pipeline.fit(X_train, y_train)
pred = pipeline.predict(X_valid)

# The regressor outputs unbounded real numbers, so rounding them could produce
# labels such as -1 or 2 that are not valid classes. Thresholding at 0.5 always
# yields 0 or 1 and gives the same label as rounding for any prediction between
# -0.5 and 1.5. It is also vectorised instead of a per-element Python loop.
# NOTE(review): a classifier (e.g. XGBClassifier) would model the binary target
# directly; the regressor is kept here to match the notebook's narrative.
predictions = (pred > 0.5).astype("int64")

# mean_absolute_error takes (y_true, y_pred). With 0/1 labels on both sides the
# MAE is the fraction of misclassified validation rows.
score = mae(y_valid, predictions)
print(score)

0.29672225416906267


Trying a different model

In [7]:
# from sklearn.linear_model import LogisticRegression
# logreg = LogisticRegression(solver='lbfgs')

# # Creating final pipeline that uses the preprocessing steps and invokes the model
# pipeline = Pipeline(steps=[('processor', preprocessor),
#                            ('model', logreg)])

# # Using the model with the Pipeline to train data and make predictions
# pipeline.fit(X_train, y_train)
# pred = pipeline.predict(X_valid)
# predictions = [round(pred[i]) for i in range(len(pred))]

# Evaluating Model

In [8]:
from sklearn.metrics import mean_absolute_error as mae

# Mean absolute error between the validation labels and the current
# `predictions`. Arguments are given in the documented (y_true, y_pred) order;
# MAE is symmetric, so the value is the same either way.
score = mae(y_valid, predictions)
print(score)

0.29672225416906267


# Generating Output Results for Submission

Training on Full data, then generating output file for submission

In [9]:
# Creating output for competition: refit the whole pipeline on all labelled data.
pipeline.fit(X, y)

# Convert the model's numeric predictions to booleans with a 0.5 threshold.
# Casting with .astype("int64") truncates toward zero, so any prediction
# strictly between -1 and 1 (for example 0.9) would become 0 and then False;
# only predictions of 1.0 or more would be submitted as True. The threshold
# matches the rounding used when scoring on the validation split.
outputs = pipeline.predict(X_test) > 0.5

# The submission pairs each test PassengerId with its predicted label.
output = pd.DataFrame({'PassengerId': X_test.PassengerId,
                       'Transported': outputs})
output.to_csv('submission.csv', index=False)