In [23]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

#Data processing
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
import datetime
import math
from sklearn.preprocessing import LabelEncoder
import statistics
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
from itertools import product
from math import sqrt
import json
from sklearn.preprocessing import PolynomialFeatures

#Models
from tensorflow import keras
from tensorflow.keras.models import Sequential, model_from_json
from tensorflow.keras.layers import Dense, LSTM, Dropout,Reshape
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import clone_model
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

#Graphs
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.signal import periodogram
import statsmodels.api as sm
from statsmodels.graphics.tsaplots import plot_pacf

In [24]:
sample=pd.read_csv('/kaggle/input/spaceship-titanic/sample_submission.csv')
test=pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')
train=pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')

In [25]:
#Lets first split our train dataset into the training portions and validation portions
X = train.drop(columns=['Transported'])
y = train['Transported']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)

In [26]:
X_train.shape[0]

6085

In [27]:
#Next,lets have a look at our data

def data_sum(dataframe):
    nulls=[]
    count=[]
    unique=[]
    null_percentage=[]
    dtype=[]

    for col in dataframe.columns:
        nulls.append(dataframe[col].isnull().sum())
        count.append(dataframe[col].count())
        unique.append(dataframe[col].nunique())
        total_rows=dataframe.shape[0]
        null_percentage.append(dataframe[col].isnull().sum()*100/total_rows)
        dtype.append(dataframe[col].dtype)
    
    summary=pd.DataFrame({
        'Column': dataframe.columns,
        'Nulls': nulls,
        'Non-Null Count': count,
        'Unique Values': unique,
        'Null Percentage': null_percentage,
        'Data Type': dtype
    })

    return(summary)

X_train_summary=data_sum(X_train)
print(X_train_summary)

          Column  Nulls  Non-Null Count  Unique Values  Null Percentage  \
0    PassengerId      0            6085           6085         0.000000   
1     HomePlanet    140            5945              3         2.300740   
2      CryoSleep    154            5931              2         2.530813   
3          Cabin    142            5943           4860         2.333607   
4    Destination    122            5963              3         2.004930   
5            Age    129            5956             80         2.119967   
6            VIP    153            5932              2         2.514380   
7    RoomService    114            5971           1026         1.873459   
8      FoodCourt    122            5963           1216         2.004930   
9   ShoppingMall    146            5939            891         2.399343   
10           Spa    122            5963           1077         2.004930   
11        VRDeck    129            5956           1050         2.119967   
12          Name    138  

In [28]:
#Lets first drop some colums we wont be using
X_train=X_train.drop(columns=['PassengerId','Name'])

In [29]:
X_train.corr()

  X_train.corr()


Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
Age,1.0,0.063455,0.13177,0.03598,0.127165,0.100089
RoomService,0.063455,1.0,-0.016009,0.042314,0.003144,-0.018663
FoodCourt,0.13177,-0.016009,1.0,-0.008833,0.20927,0.242087
ShoppingMall,0.03598,0.042314,-0.008833,1.0,0.022235,0.002684
Spa,0.127165,0.003144,0.20927,0.022235,1.0,0.173287
VRDeck,0.100089,-0.018663,0.242087,0.002684,0.173287,1.0


First lets split into deck num side 

In [30]:
X_train[["Deck","Num","Side"]]=X_train["Cabin"].str.split("/",expand = True)

In [31]:
#Now im trying to impute using interpolation
X_train[['Age', 'FoodCourt', 'ShoppingMall','Spa', 'VRDeck', 'RoomService']] = X_train[['Age', 'FoodCourt', 'ShoppingMall','Spa', 'VRDeck', 'RoomService']].interpolate(method='linear')

In [32]:
#lets see how the dataframe is like now
X_train_2_summary=data_sum(X_train)
print(X_train_2_summary)

          Column  Nulls  Non-Null Count  Unique Values  Null Percentage  \
0     HomePlanet    140            5945              3         2.300740   
1      CryoSleep    154            5931              2         2.530813   
2          Cabin    142            5943           4860         2.333607   
3    Destination    122            5963              3         2.004930   
4            Age      0            6085            114         0.000000   
5            VIP    153            5932              2         2.514380   
6    RoomService      0            6085           1060         0.000000   
7      FoodCourt      0            6085           1266         0.000000   
8   ShoppingMall      0            6085            938         0.000000   
9            Spa      0            6085           1114         0.000000   
10        VRDeck      0            6085           1101         0.000000   
11          Deck    142            5943              8         2.333607   
12           Num    142  

Looks like we have completed the imputing for numerical features now lets work on the categorical features

Everything above is what I feel is necessary, the bottom code im not sure which u wanna keep so largely advised to keep working from here

Saw above code - 



Train & Test:

Generally, Age, FoodCourt, SPA, VRDeck shows best positive correlation with one another

For negative correlation, 






Lasso Regression, L1/L2 Regression

In [33]:
missing = test['Age'].isna()
feature = test.columns

def blank_data(file, locate):
    return test.loc[missing, :]
blank_data(test, missing)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
42,0100_01,Earth,False,G/13/P,TRAPPIST-1e,,,0.0,17.0,997.0,0.0,0.0,Dary Cochrisons
68,0156_01,Mars,,F/34/P,TRAPPIST-1e,,False,0.0,0.0,0.0,0.0,0.0,Tures Upead
146,0319_02,Mars,True,F/69/P,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,Pigars Prie
157,0339_05,Earth,True,G/56/P,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,Davisy Colleruces
211,0467_02,Mars,True,D/17/P,TRAPPIST-1e,,False,0.0,0.0,0.0,0.0,,Cings Keen
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4180,9065_01,Mars,True,F/1755/S,TRAPPIST-1e,,False,0.0,0.0,0.0,0.0,0.0,Jet Bart
4216,9147_01,Earth,True,G/1490/P,TRAPPIST-1e,,False,0.0,0.0,0.0,0.0,0.0,Gabrin Meyersones
4229,9177_02,Europa,True,C/306/P,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,Zedarga Vablug
4274,9271_01,Mars,True,D/296/P,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore


**Arden's Edit:*

Step 1.1: Define X & Y

Step 1.2: Split Cabin into 3 separate columns

Step 1.3: Splitting dataset into train and validation

Step 2.1: Seeing which columns in X_train are in need of imputation/deletion

Find out how to impute a catagorical column based on other columns

I noticed that Eur

Imputation done by Aus.
 
According to ChatGPT take, 
1. Generally impute training dataset, then apply imputation method to test dataset
2. Missing datas are considered Missing At Random (MAR). 
Multiple Imputation, Regression, KNN, EM, Hot Deck were recommended. 
Among them, Regression is recommended although you must consider other techniques at own discretion.

My take: 
1. Generally, Age, FoodCourt, SPA, VRDeck shows best positive correlation with one another.
For negative correlation, train and test differs. It is seen that RoomService, VRDeck, FoodCourt shows best and consistent negative correlation with one another.
Perhaps they are multivariate data.

2. I would go impute Age, FoodCourt, SPA, VRDeck, RoomService , ShoppingMall. Removed some columns as to explore for any surprising results.
Currently, used Regression Imputation.

Not sure why info() and other things are not updated, despite dropping, adding etc.....

In [34]:
# Trying Regression Imputation.
# Assuming X is your DataFrame
X_train[['Age', 'FoodCourt', 'ShoppingMall','Spa', 'VRDeck', 'RoomService']] = X_train[['Age', 'FoodCourt', 'ShoppingMall','Spa', 'VRDeck', 'RoomService']].interpolate(method='linear')
X_train

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Deck,Num,Side
3032,Europa,False,B/120/S,TRAPPIST-1e,43.0,False,0.0,1440.0,0.0,85.0,150.0,B,120,S
7757,Europa,True,C/273/P,TRAPPIST-1e,23.0,False,0.0,0.0,0.0,0.0,0.0,C,273,P
1795,Earth,False,G/300/S,TRAPPIST-1e,46.0,False,8.0,652.0,0.0,5.0,90.0,G,300,S
1702,Earth,False,F/346/S,TRAPPIST-1e,33.0,False,0.0,763.0,8.0,2.0,30.0,F,346,S
6634,Earth,False,F/1334/S,55 Cancri e,24.0,False,0.0,58.0,618.0,0.0,41.0,F,1334,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,Earth,,G/988/S,TRAPPIST-1e,18.0,False,14.0,2.0,144.0,610.0,0.0,G,988,S
5191,Mars,False,F/1063/S,TRAPPIST-1e,50.0,,690.0,0.0,30.0,762.0,428.0,F,1063,S
5390,Earth,False,F/1194/P,PSO J318.5-22,22.0,False,158.0,0.0,476.0,0.0,26.0,F,1194,P
860,Mars,False,F/191/P,TRAPPIST-1e,34.0,False,379.0,0.0,1626.0,0.0,0.0,F,191,P


In [35]:
# Evaluating the Correlation Metrics to a model. Feel free to put in yourthought
delete =X_train.drop(columns = ['PassengerId', 'HomePlanet','CryoSleep','Destination','VIP', 'Deck', 'Num', 'Side'])
X_train.info()
# Imputed the amenities.

KeyError: "['PassengerId'] not found in axis"

Finding causation of an event. 
I would think that Random Forest/Bagged Tree, LogisticsRegression are possible to use.

In [None]:
random_forest = RandomForestRegressor()

Logistics Regression. Goal is to look at how to encode accordingly

In [None]:
#sns.heatmap(X_test.isna())

now will proceed to impute the validation dataset using Austin's imputation method.

In [None]:

X_valid[['Age', 'FoodCourt', 'ShoppingMall','Spa', 'VRDeck', 'RoomService']] = X_valid[['Age', 'FoodCourt', 'ShoppingMall','Spa', 'VRDeck', 'RoomService']].interpolate(method='linear')
X_valid

now will do the same to X_test

In [None]:

X_test[['Age', 'FoodCourt', 'ShoppingMall','Spa', 'VRDeck', 'RoomService']] = X_test[['Age', 'FoodCourt', 'ShoppingMall','Spa', 'VRDeck', 'RoomService']].interpolate(method='linear')
X_test

i think ill remove the name column as it is unnecessary

In [None]:
X_train["VIP"].fillna(False,inplace=True)
X_valid["VIP"].fillna(False,inplace=True)
X_test["VIP"].fillna(False,inplace=True)



In [None]:
X_train.loc[X_train["Destination"]=="PSO J318.5-22","HomePlanet"]=X_train.loc[X_train["Destination"]=="PSO J318.5-22","HomePlanet"].fillna("Earth")
X_valid.loc[X_valid["Destination"]=="PSO J318.5-22","HomePlanet"]=X_valid.loc[X_valid["Destination"]=="PSO J318.5-22","HomePlanet"].fillna("Earth")
test.loc[test["Destination"]=="PSO J318.5-22","HomePlanet"]=test.loc[test["Destination"]=="PSO J318.5-22","HomePlanet"].fillna("Earth")


In [None]:
X_train["TotalSpending"]=X_train[["RoomService","Spa","FoodCourt","VRDeck",'ShoppingMall']].sum(axis=1)
X_valid["TotalSpending"]=X_valid[["RoomService","Spa","FoodCourt","VRDeck",'ShoppingMall']].sum(axis=1)
test["TotalSpending"]=test[["RoomService","Spa","FoodCourt","VRDeck",'ShoppingMall']].sum(axis=1)

In [None]:
X_train.loc[X_train["TotalSpending"]>=5000,"CryoSleep"]=X_train.loc[X_train["TotalSpending"]>=5000,"CryoSleep"].fillna(False)
X_train.loc[X_train["TotalSpending"]==0,"CryoSleep"]=X_train.loc[X_train["TotalSpending"]==0,"CryoSleep"].fillna(True)
X_train["CryoSleep"]=X_train["CryoSleep"].fillna(False)
X_valid.loc[X_valid["TotalSpending"]>=5000,"CryoSleep"]=X_valid.loc[X_valid["TotalSpending"]>=5000,"CryoSleep"].fillna(False)
X_valid.loc[X_valid["TotalSpending"]==0,"CryoSleep"]=X_valid.loc[X_valid["TotalSpending"]==0,"CryoSleep"].fillna(True)
X_valid["CryoSleep"]=X_valid["CryoSleep"].fillna(False)
test.loc[test["TotalSpending"]>=5000,"CryoSleep"]=test.loc[test["TotalSpending"]>=5000,"CryoSleep"].fillna(False)
test.loc[test["TotalSpending"]==0,"CryoSleep"]=test.loc[test["TotalSpending"]==0,"CryoSleep"].fillna(True)
test["CryoSleep"]=test["CryoSleep"].fillna(False)


In [None]:
sns.heatmap(X_train.isna())

In [None]:
Home_Dest=X_train.groupby("HomePlanet")['Destination'].value_counts()
Home_Dest


In [None]:
X_train["HomePlanet"].isna().sum()

In [None]:
X_train.loc[X_train['Deck']=='A','HomePlanet']=X_train.loc[X_train['Deck']=='A','HomePlanet'].fillna("Europa")
X_train.loc[X_train['Deck']=='B','HomePlanet']=X_train.loc[X_train['Deck']=='B','HomePlanet'].fillna("Europa")
X_train.loc[X_train['Deck']=='C','HomePlanet']=X_train.loc[X_train['Deck']=='C','HomePlanet'].fillna("Europa")
X_train.loc[X_train['Deck']=='T','HomePlanet']=X_train.loc[X_train['Deck']=='T','HomePlanet'].fillna("Europa")
X_train.loc[X_train['Deck']=='G','HomePlanet']=X_train.loc[X_train['Deck']=='G','HomePlanet'].fillna("Earth")
X_valid.loc[X_valid['Deck']=='A','HomePlanet']=X_valid.loc[X_valid['Deck']=='A','HomePlanet'].fillna("Europa")
X_valid.loc[X_valid['Deck']=='B','HomePlanet']=X_valid.loc[X_valid['Deck']=='B','HomePlanet'].fillna("Europa")
X_valid.loc[X_valid['Deck']=='C','HomePlanet']=X_valid.loc[X_valid['Deck']=='C','HomePlanet'].fillna("Europa")
X_valid.loc[X_valid['Deck']=='T','HomePlanet']=X_valid.loc[X_valid['Deck']=='T','HomePlanet'].fillna("Europa")
X_valid.loc[X_valid['Deck']=='G','HomePlanet']=X_valid.loc[X_valid['Deck']=='G','HomePlanet'].fillna("Earth")
test.loc[test['Deck']=='A','HomePlanet']=test.loc[test['Deck']=='A','HomePlanet'].fillna("Europa")
test.loc[test['Deck']=='B','HomePlanet']=test.loc[test['Deck']=='B','HomePlanet'].fillna("Europa")
test.loc[test['Deck']=='C','HomePlanet']=test.loc[test['Deck']=='C','HomePlanet'].fillna("Europa")
test.loc[test['Deck']=='T','HomePlanet']=test.loc[test['Deck']=='T','HomePlanet'].fillna("Europa")
test.loc[test['Deck']=='G','HomePlanet']=test.loc[test['Deck']=='G','HomePlanet'].fillna("Earth")

In [None]:
b=X_train.groupby("HomePlanet")['Deck'].value_counts()
b

In [None]:
sns.violinplot(x='HomePlanet',y='TotalSpending',data=X_train,hue="Destination")

In [None]:
X_train.groupby("Destination")['CryoSleep'].value_counts()

In [None]:
X_train.loc[X_train['HomePlanet']=="Mars",'Destination']=X_train.loc[X_train['HomePlanet']=="Mars",'Destination'].fillna("TRAPPIST-1e")