In [48]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

#Data processing
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
import datetime
import math
import statistics
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
from itertools import product
from math import sqrt
import json
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

#Models
from tensorflow import keras
from tensorflow.keras.models import Sequential, model_from_json
from tensorflow.keras.layers import Dense, LSTM, Dropout,Reshape
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import clone_model
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

#Graphs
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.signal import periodogram
import statsmodels.api as sm
from statsmodels.graphics.tsaplots import plot_pacf

In [49]:
np.random_seed=42

In [50]:
sample=pd.read_csv('/kaggle/input/spaceship-titanic/sample_submission.csv')
test=pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')
train=pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')

In [51]:
#Lets first split our train dataset into the training portions and validation portions
X = train.drop(columns=['Transported'])
y = train['Transported']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)

In [52]:
# Shows number of rows in X_train
X_train.shape[0]

6085

In [53]:
#Next,lets have a look at our data

def data_sum(dataframe):
    nulls=[]
    count=[]
    unique=[]
    null_percentage=[]
    dtype=[]

    for col in dataframe.columns:
        nulls.append(dataframe[col].isnull().sum())
        count.append(dataframe[col].count())
        unique.append(dataframe[col].nunique())
        total_rows=dataframe.shape[0]
        null_percentage.append(dataframe[col].isnull().sum()*100/total_rows)
        dtype.append(dataframe[col].dtype)
    
    summary=pd.DataFrame({
        'Column': dataframe.columns,
        'Nulls': nulls,
        'Non-Null Count': count,
        'Unique Values': unique,
        'Null Percentage': null_percentage,
        'Data Type': dtype
    })

    return(summary)

X_train_summary=data_sum(X_train)
print(X_train_summary)

          Column  Nulls  Non-Null Count  Unique Values  Null Percentage  \
0    PassengerId      0            6085           6085         0.000000   
1     HomePlanet    140            5945              3         2.300740   
2      CryoSleep    154            5931              2         2.530813   
3          Cabin    142            5943           4860         2.333607   
4    Destination    122            5963              3         2.004930   
5            Age    129            5956             80         2.119967   
6            VIP    153            5932              2         2.514380   
7    RoomService    114            5971           1026         1.873459   
8      FoodCourt    122            5963           1216         2.004930   
9   ShoppingMall    146            5939            891         2.399343   
10           Spa    122            5963           1077         2.004930   
11        VRDeck    129            5956           1050         2.119967   
12          Name    138  

In [54]:
#Lets first drop some colums we wont be using
X_train=X_train.drop(columns=['PassengerId','Name'])

In [55]:
# X_train['TotalSpending'] = X_train[['FoodCourt', 'ShoppingMall','Spa', 'VRDeck', 'RoomService']].sum()

In [56]:
X_train

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
3032,Europa,False,B/120/S,TRAPPIST-1e,43.0,False,0.0,1440.0,0.0,85.0,150.0
7757,Europa,True,C/273/P,TRAPPIST-1e,23.0,False,0.0,0.0,0.0,0.0,0.0
1795,Earth,False,G/300/S,TRAPPIST-1e,46.0,False,8.0,652.0,0.0,5.0,90.0
1702,Earth,False,F/346/S,TRAPPIST-1e,33.0,False,0.0,763.0,8.0,2.0,30.0
6634,Earth,False,F/1334/S,55 Cancri e,24.0,False,0.0,58.0,618.0,0.0,41.0
...,...,...,...,...,...,...,...,...,...,...,...
5734,Earth,,G/988/S,TRAPPIST-1e,18.0,False,14.0,2.0,144.0,610.0,0.0
5191,Mars,False,F/1063/S,TRAPPIST-1e,50.0,,690.0,0.0,30.0,762.0,428.0
5390,Earth,False,F/1194/P,PSO J318.5-22,22.0,False,158.0,0.0,476.0,0.0,26.0
860,Mars,False,F/191/P,TRAPPIST-1e,34.0,False,379.0,0.0,1626.0,0.0,0.0


In [57]:
X_train.corr()

  X_train.corr()


Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
Age,1.0,0.063455,0.13177,0.03598,0.127165,0.100089
RoomService,0.063455,1.0,-0.016009,0.042314,0.003144,-0.018663
FoodCourt,0.13177,-0.016009,1.0,-0.008833,0.20927,0.242087
ShoppingMall,0.03598,0.042314,-0.008833,1.0,0.022235,0.002684
Spa,0.127165,0.003144,0.20927,0.022235,1.0,0.173287
VRDeck,0.100089,-0.018663,0.242087,0.002684,0.173287,1.0


First lets split into deck num side 

In [58]:
X_train[["Deck","Num","Side"]]=X_train["Cabin"].str.split("/",expand = True)

In [59]:
X_train.drop(columns = ['Cabin'], inplace = True)

In [60]:
#Now im trying to impute using interpolation
# Age & Amenities has a relationship. The older, the more spending.
X_train[['Age', 'FoodCourt', 'ShoppingMall','Spa', 'VRDeck', 'RoomService']] = X_train[['Age', 'FoodCourt', 'ShoppingMall','Spa', 'VRDeck', 'RoomService']].interpolate(method='linear')

In [61]:
#lets see how the dataframe is like now
X_train_2_summary=data_sum(X_train)
X_train_2_summary

Unnamed: 0,Column,Nulls,Non-Null Count,Unique Values,Null Percentage,Data Type
0,HomePlanet,140,5945,3,2.30074,object
1,CryoSleep,154,5931,2,2.530813,object
2,Destination,122,5963,3,2.00493,object
3,Age,0,6085,114,0.0,float64
4,VIP,153,5932,2,2.51438,object
5,RoomService,0,6085,1060,0.0,float64
6,FoodCourt,0,6085,1266,0.0,float64
7,ShoppingMall,0,6085,938,0.0,float64
8,Spa,0,6085,1114,0.0,float64
9,VRDeck,0,6085,1101,0.0,float64


Looks like we have completed the imputing for numerical features now lets work on the categorical features.



<!--  --> from sklearn.impute import SimpleImputer. Constant strategy is better for one-hot encoding


In [62]:
categorical = X_train.dtypes == 'object'
categorical

HomePlanet       True
CryoSleep        True
Destination      True
Age             False
VIP              True
RoomService     False
FoodCourt       False
ShoppingMall    False
Spa             False
VRDeck          False
Deck             True
Num              True
Side             True
dtype: bool

Look at the columns with categorical data.
Starting from HomePlanet.

Aus help to put some imputation for Arden
Arden - Above
Austin - Below

In [63]:
#Created a new feature
X_train["TotalSpending"]=X_train[["RoomService","Spa","FoodCourt","VRDeck",'ShoppingMall']].sum(axis=1)

Lets impute HomePlanet

In [64]:
#For rows where ["Destination"]=="PSO J318.5-22",let the rows with missing values in ["HomePlanet"] be imputed with "Earth"
X_train.loc[X_train["Destination"]=="PSO J318.5-22","HomePlanet"] = X_train.loc[X_train["Destination"]=="PSO J318.5-22","HomePlanet"].fillna("Earth")
#For rows where ["Destination"]=="55 Cancri e",let the rows with missing values in ["HomePlanet"] be imputed with "Europa"
X_train.loc[X_train["Destination"]=="55 Cancri e","HomePlanet"]=X_train.loc[X_train["Destination"]=="55 Cancri e","HomePlanet"].fillna("Europa")
#For rows where ["Destination"]=="TRAPPIST-1e",let the rows with missing values in ["HomePlanet"] be imputed with "Earth"
X_train.loc[X_train["Destination"]=="TRAPPIST-1e","HomePlanet"]=X_train.loc[X_train["Destination"]=="TRAPPIST-1e","HomePlanet"].fillna("Earth")
X_train.loc[X_train['Deck']=='A','HomePlanet']=X_train.loc[X_train['Deck']=='A','HomePlanet'].fillna("Europa")
X_train.loc[X_train['Deck']=='B','HomePlanet']=X_train.loc[X_train['Deck']=='B','HomePlanet'].fillna("Europa")
X_train.loc[X_train['Deck']=='C','HomePlanet']=X_train.loc[X_train['Deck']=='C','HomePlanet'].fillna("Europa")
X_train.loc[X_train['Deck']=='T','HomePlanet']=X_train.loc[X_train['Deck']=='T','HomePlanet'].fillna("Europa")
X_train.loc[X_train['Deck']=='G','HomePlanet']=X_train.loc[X_train['Deck']=='G','HomePlanet'].fillna("Earth")
X_train.loc[X_train['Destination'] == 'TRAPPIST-1e', 'HomePlanet'] = X_train.loc[X_train['Destination'] == 'TRAPPIST-1e', 'HomePlanet'].fillna('Earth')
X_train['HomePlanet']=X_train['HomePlanet'].fillna('Earth')

Lets impute CryoSleep

In [65]:
X_train.loc[X_train["TotalSpending"]>=5000,"CryoSleep"]=X_train.loc[X_train["TotalSpending"]>=5000,"CryoSleep"].fillna(False)
X_train.loc[X_train["TotalSpending"]==0,"CryoSleep"]=X_train.loc[X_train["TotalSpending"]==0,"CryoSleep"].fillna(True)
X_train["CryoSleep"]=X_train["CryoSleep"].fillna(False)

Lets impute Destination

In [66]:
X_train.loc[X_train['VIP'] == 'FALSE','Destination'] = X_train.loc[X_train['VIP'] == 'FALSE','Destination'].fillna('TRAPPIST-1e')
X_train.loc[X_train['HomePlanet'] == 'Earth', 'Destination'] = X_train.loc[X_train['HomePlanet'] == 'Earth', 'Destination'].fillna(method = 'ffill')
X_train.loc[X_train['HomePlanet'] == 'Earth', 'Destination'] = X_train.loc[X_train['HomePlanet'] == 'Earth', 'Destination'].fillna(method = 'bfill')
X_train.loc[X_train['HomePlanet'] == 'Europa', 'Destination'] = X_train.loc[X_train['HomePlanet'] == 'Europa', 'Destination'].fillna('55 Cancri e')
X_train.loc[X_train['HomePlanet'] == 'Mars', 'Destination'] = X_train.loc[X_train['HomePlanet'] == 'Mars', 'Destination'].fillna('TRAPPIST-1e')
X_train.loc[X_train['HomePlanet'] == 'Earth','Destination'] = X_train.loc[X_train['HomePlanet'] == 'Earth','Destination'].fillna('TRAPPIST-1e')
X_train.loc[X_train['VIP'] == False, 'Destination'] = X_train.loc[X_train['VIP'] == False, 'Destination'].fillna('TRAPPIST-1e')
X_train.loc[X_train['HomePlanet'] == 'Earth', 'Destination'] = X_train.loc[X_train['HomePlanet'] == 'Earth', 'Destination'].fillna('TRAPPIST-1e')

Lets impute VIP

In [67]:
X_train.loc[X_train['TotalSpending'] >= 5000, 'VIP'] = X_train.loc[X_train['TotalSpending'] >= 5000, 'VIP'].fillna(True)
X_train.loc[X_train['TotalSpending'] < 5000, 'VIP'] = X_train.loc[X_train['TotalSpending'] < 5000, 'VIP'].fillna(False)

In [68]:
#Deck has a uneven distribution with deck G and F having a far larger value then the other decks. Lets just impute all nulls  to be F
X_train['Deck']=X_train['Deck'].fillna('F')

In [69]:
print(f"There are initially {X_train['Num'].isna().sum()} nulls")
#Num has too much unique values and we cant do simple imputation because each unique value has such a low count that setting all 72 nulls to that value will skew the results so we just randomly impute the nulls using the existing values 
num_values = X_train['Num'].dropna().values
# Shuffle non-missing values multiple times
num_shuffled = num_values.copy()  # Make a copy to shuffle multiple times
np.random.shuffle(num_shuffled)
num_shuffled=pd.Series(num_shuffled)
# Fill missing values with the shuffled values
X_train['Num']=X_train['Num'].fillna(num_shuffled)
print(f" Now there are {X_train['Num'].isna().sum()} nulls")
#for some reason the number of nulls dropped to 34 but wont drop further, im gonna impute the last 34 nulls with the most frequent value for now until I can solve why
X_train['Num']=X_train['Num'].fillna(82)
    



There are initially 142 nulls
 Now there are 34 nulls


In [70]:
#For Side, they are almost equal and rightfully so, lets just impute the nulls equally
for value in ['S','P']:
    null_indices = X_train[X_train['Side'].isnull()].sample(71).index
    X_train.loc[null_indices, 'Side'] = value

In [71]:
X_train_3_summary=data_sum(X_train)
X_train_3_summary

Unnamed: 0,Column,Nulls,Non-Null Count,Unique Values,Null Percentage,Data Type
0,HomePlanet,0,6085,3,0.0,object
1,CryoSleep,0,6085,2,0.0,bool
2,Destination,0,6085,3,0.0,object
3,Age,0,6085,114,0.0,float64
4,VIP,0,6085,2,0.0,object
5,RoomService,0,6085,1060,0.0,float64
6,FoodCourt,0,6085,1266,0.0,float64
7,ShoppingMall,0,6085,938,0.0,float64
8,Spa,0,6085,1114,0.0,float64
9,VRDeck,0,6085,1101,0.0,float64


Imputation complete
<br>
Now lets encode
<br>
Cardinality:
<br>
Low: 2-5
<br>
Moderate: 6-46
<br>
High: 50 and above

HomePlanet:Label Encoding
<br>
CryoSleep:Label Encoding
<br>
Destination:Label Encoding
<br>
VIP:Label Encoding
<br>
Deck:OHE
<br>
Num:Converted to float then Standardization
<br>
Side:Label Encoding
<br>
Remaining numerical features:Standardization

In [72]:
#Convert ['Num'] to numerical feature
X_train['Num'] = X_train['Num'].astype(float)

In [74]:
l_c_c_f=["HomePlanet", "CryoSleep", "Destination", "VIP", "Side"]
m_c_c_f=['Deck']
n_f=X_train.select_dtypes(include=['float64', 'int64']).columns.tolist()



# Create transformers for categorical features
l_c_categorical_transformer = Pipeline(steps=[
    ('encoder', LabelEncoder())  
])

m_c_categorical_transformer = Pipeline(steps=[
    ('encoder',OneHotEncoder(handle_unknown='ignore'))  
])


# Create transformers for numerical features
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])


# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('l_c_cat', l_c_categorical_transformer, l_c_c_f),
        ('num', numerical_transformer, n_f),
        ('m_c_cat', m_c_categorical_transformer, m_c_c_f)  
    ])

# Create the preprocessing pipeline
preprocessing_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Apply the preprocessing pipeline to your data
encoded_data = preprocessing_pipeline.fit_transform(X_train)

# Convert the transformed data back to a DataFrame
X_train_encoded = pd.DataFrame(encoded_data, columns=preprocessor.get_feature_names_out(X_train.columns))


TypeError: LabelEncoder.fit_transform() takes 2 positional arguments but 3 were given