In [None]:
import pandas as pd

In [None]:
# Colab-only: mount Google Drive at /content/drive so the CSV paths used by the
# read_csv / to_csv calls in this notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the training CSV from the mounted Drive into `data`.
train_csv_path = '/content/drive/MyDrive/Machine Learning/Data/spaceship/train (1).csv'
data = pd.read_csv(train_csv_path)

In [None]:
# Read the test CSV from the mounted Drive into `data`.
# NOTE(review): this rebinds the same name the previous cell used for the training
# CSV, so whichever of the two load cells ran last decides what every later cell
# operates on. The model-fitting cell reads data['Transported'], while the isna()
# outputs recorded after this load list no such column — presumably the model was
# fitted while `data` still held the training file. Confirm, and consider separate
# names for the two frames so Restart & Run All works.
data = pd.read_csv('/content/drive/MyDrive/Machine Learning/Data/spaceship/test (1).csv')

In [None]:
# Count missing values per column of the freshly loaded frame.
data.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [None]:
# PassengerId is '<group>_<member>': break it into two integer columns.
group_fields = ['GroupId', 'GroupMemberNumber']
data[group_fields] = data['PassengerId'].str.split('_', expand=True)

# The split yields strings; cast both new columns to int.
data[group_fields] = data[group_fields].astype(int)

# Preview the original id next to its two parts.
data[['PassengerId'] + group_fields].head()


Unnamed: 0,PassengerId,GroupId,GroupMemberNumber
0,0013_01,13,1
1,0018_01,18,1
2,0019_01,19,1
3,0021_01,21,1
4,0023_01,23,1


In [None]:
# Display the current state of `data` (rich display of the cell's last expression).
data

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,GroupId,GroupMemberNumber
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,1,1
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,2,1
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,3,1
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,3,2
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False,9276,1
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False,9278,1
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True,9279,1
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False,9280,1


In [None]:
# Frequency of each HomePlanet value (value_counts() leaves NaN out by default).
data['HomePlanet'].value_counts()

Earth     2263
Europa    1002
Mars       925
Name: HomePlanet, dtype: int64

In [None]:
# Overall most frequent HomePlanet, used when an entire group has no known value.
homeplanet_mode = data['HomePlanet'].mode()[0]

# Within each GroupId, copy the nearest known HomePlanet onto missing rows
# (previous row first, then next row); anything still missing gets the overall mode.
# .ffill()/.bfill() are used instead of fillna(method=...), which newer pandas deprecates.
data['HomePlanet'] = data.groupby('GroupId')['HomePlanet'].transform(
    lambda group_values: group_values.ffill().bfill().fillna(homeplanet_mode)
)

# Remaining-null count (expected 0) next to a sample of the filled column.
data['HomePlanet'].isnull().sum(), data[['GroupId', 'HomePlanet']].head(10)


(0,
    GroupId HomePlanet
 0       13      Earth
 1       18      Earth
 2       19     Europa
 3       21     Europa
 4       23      Earth
 5       27      Earth
 6       29     Europa
 7       32     Europa
 8       32     Europa
 9       33      Earth)

In [None]:
# Re-count missing values per column after the HomePlanet imputation.
data.isna().sum()

PassengerId            0
HomePlanet             0
CryoSleep             93
Cabin                100
Destination           92
Age                   91
VIP                   93
RoomService           82
FoodCourt            106
ShoppingMall          98
Spa                  101
VRDeck                80
Name                  94
GroupId                0
GroupMemberNumber      0
dtype: int64

In [None]:
# Build one indicator column per HomePlanet value, named 'HomePlanet_<value>'.
homeplanet_encoded = pd.get_dummies(data['HomePlanet'], prefix='HomePlanet')

# Attach the indicators to a new frame; `data` itself is left as it was.
data_encoded = data.join(homeplanet_encoded)

# Preview the source column next to its indicator columns.
preview_columns = ['HomePlanet', 'HomePlanet_Earth', 'HomePlanet_Europa', 'HomePlanet_Mars']
data_encoded.loc[:, preview_columns].head()


Unnamed: 0,HomePlanet,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars
0,Earth,1,0,0
1,Earth,1,0,0
2,Europa,0,1,0
3,Europa,0,1,0
4,Earth,1,0,0


In [None]:
# Amenity spending columns; this list is reused by later cells.
expenditure_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Recode CryoSleep as 0/1 so it can take part in a correlation matrix.
data_encoded['CryoSleep'] = data_encoded['CryoSleep'].map({False: 0, True: 1})

# Correlation of CryoSleep with each spending column (its self-correlation is dropped).
cryosleep_corr_matrix = data_encoded[['CryoSleep', *expenditure_columns]].corr()
correlation_with_cryosleep = cryosleep_corr_matrix['CryoSleep'].drop('CryoSleep')

correlation_with_cryosleep


RoomService    -0.275992
FoodCourt      -0.219777
ShoppingMall   -0.242136
Spa            -0.211110
VRDeck         -0.189852
Name: CryoSleep, dtype: float64

In [None]:
# Step 1 - group-based fill: within each GroupId, a missing CryoSleep takes the value
# of the nearest row in the same group that has one (previous row first, then next row).
# .ffill()/.bfill() are used instead of fillna(method=...), which newer pandas deprecates.
group_cryosleep_impute = data_encoded.groupby('GroupId')['CryoSleep'].transform(
    lambda group_values: group_values.ffill().bfill()
)

# Step 2 - spending-based fill for rows still missing after step 1:
# 1 (in cryosleep) when the row's spending sums to zero, otherwise 0.
# sum(axis=1) skips NaN, so a missing amenity amount counts as zero here.
expenditures_sum = data_encoded[expenditure_columns].sum(axis=1)
# The group-filled series already keeps every originally known value, so it is
# combined directly with the spending rule (no extra merge with the raw column needed).
cryosleep_impute_expenditures = group_cryosleep_impute.combine_first((expenditures_sum == 0).astype(int))

# Remaining-null count (expected 0).
missing_after_group_and_expenditures = cryosleep_impute_expenditures.isnull().sum()

# Write the imputed values back onto the encoded frame.
data_encoded['CryoSleep'] = cryosleep_impute_expenditures

missing_after_group_and_expenditures, data_encoded[['GroupId', 'CryoSleep'] + expenditure_columns].head(10)


(0,
    GroupId  CryoSleep  RoomService  FoodCourt  ShoppingMall     Spa  VRDeck
 0       13        1.0          0.0        0.0           0.0     0.0     0.0
 1       18        0.0          0.0        9.0           0.0  2823.0     0.0
 2       19        1.0          0.0        0.0           0.0     0.0     0.0
 3       21        0.0          0.0     6652.0           0.0   181.0   585.0
 4       23        0.0         10.0        0.0         635.0     0.0     0.0
 5       27        0.0          0.0     1615.0         263.0   113.0    60.0
 6       29        1.0          0.0        NaN           0.0     0.0     0.0
 7       32        1.0          0.0        0.0           0.0     0.0     0.0
 8       32        1.0          0.0        0.0           0.0     0.0     0.0
 9       33        0.0          0.0      639.0           0.0     0.0     0.0)

In [None]:
# Continue under the name `data`. This binds the same object as `data_encoded`
# (not a copy), so in-place column assignments through either name affect both.
data = data_encoded

In [None]:
# Re-count missing values per column after the CryoSleep imputation.
data.isna().sum()

PassengerId            0
HomePlanet             0
CryoSleep              0
Cabin                100
Destination           92
Age                   91
VIP                   93
RoomService           82
FoodCourt            106
ShoppingMall          98
Spa                  101
VRDeck                80
Name                  94
GroupId                0
GroupMemberNumber      0
HomePlanet_Earth       0
HomePlanet_Europa      0
HomePlanet_Mars        0
dtype: int64

In [None]:
# Cabin is 'deck/num/side': break it into three separate columns.
cabin_fields = ['Cabin_Deck', 'Cabin_Num', 'Cabin_Side']
data[cabin_fields] = data['Cabin'].str.split('/', expand=True)

# The cabin number comes out as text; make it numeric (unparseable values become NaN).
data['Cabin_Num'] = pd.to_numeric(data['Cabin_Num'], errors='coerce')

# Preview the original column next to its three parts.
data[['Cabin', *cabin_fields]].head()


Unnamed: 0,Cabin,Cabin_Deck,Cabin_Num,Cabin_Side
0,G/3/S,G,3.0,S
1,F/4/S,F,4.0,S
2,C/0/S,C,0.0,S
3,C/1/S,C,1.0,S
4,F/5/S,F,5.0,S


In [None]:
# Overall most frequent deck / side, used when an entire group has no known value.
cabin_deck_mode = data['Cabin_Deck'].mode()[0]
cabin_side_mode = data['Cabin_Side'].mode()[0]

# Within each GroupId, copy the nearest known value onto missing rows (previous row
# first, then next row); anything still missing gets the overall mode.
# .ffill()/.bfill() are used instead of fillna(method=...), which newer pandas deprecates.
data['Cabin_Deck'] = data.groupby('GroupId')['Cabin_Deck'].transform(
    lambda group_values: group_values.ffill().bfill().fillna(cabin_deck_mode)
)
data['Cabin_Side'] = data.groupby('GroupId')['Cabin_Side'].transform(
    lambda group_values: group_values.ffill().bfill().fillna(cabin_side_mode)
)

# Remaining-null counts (both expected 0).
missing_cabin_deck = data['Cabin_Deck'].isnull().sum()
missing_cabin_side = data['Cabin_Side'].isnull().sum()

missing_cabin_deck, missing_cabin_side


(0, 0)

In [None]:
# Summary statistics of Cabin_Num, captured before any value is filled in.
cabin_num_distribution = data['Cabin_Num'].describe()

# Candidate fill values and the skewness that decides between them.
cabin_num_mean = data['Cabin_Num'].mean()
cabin_num_median = data['Cabin_Num'].median()
cabin_num_skewness = data['Cabin_Num'].skew()

# Strong skew (|skewness| > 1) -> median; otherwise the mean.
if abs(cabin_num_skewness) > 1:
    imputation_value = cabin_num_median
else:
    imputation_value = cabin_num_mean

# Fill the gaps with the chosen value.
data['Cabin_Num'] = data['Cabin_Num'].fillna(imputation_value)

# Remaining-null count (expected 0).
missing_cabin_num = data['Cabin_Num'].isnull().sum()

cabin_num_distribution, cabin_num_skewness, missing_cabin_num


(count    4177.000000
 mean      610.178836
 std       514.968131
 min         0.000000
 25%       174.000000
 50%       442.000000
 75%      1027.000000
 max      1890.000000
 Name: Cabin_Num, dtype: float64,
 0.6839588837305568,
 0)

In [None]:
# Re-count missing values per column after the Cabin_* imputations.
data.isna().sum()

PassengerId            0
HomePlanet             0
CryoSleep              0
Cabin                199
Destination          182
Age                  179
VIP                  203
RoomService          181
FoodCourt            183
ShoppingMall         208
Spa                  183
VRDeck               188
Name                 200
Transported            0
GroupId                0
GroupMemberNumber      0
HomePlanet_Earth       0
HomePlanet_Europa      0
HomePlanet_Mars        0
Cabin_Deck             0
Cabin_Num              0
Cabin_Side             0
dtype: int64

In [None]:


# Re-derive GroupId / GroupMemberNumber from PassengerId so this cell does not depend on
# the earlier split cell. The parts are cast to int, as in that earlier cell, so the
# columns are not turned into strings ('0013' instead of 13) for later cells.
data[['GroupId', 'GroupMemberNumber']] = data['PassengerId'].str.split('_', expand=True).astype(int)

# Overall most frequent Destination, used when an entire group has no known value.
destination_mode = data['Destination'].mode()[0]

# Within each GroupId, copy the nearest known Destination onto missing rows (previous row
# first, then next row); anything still missing gets the overall mode.
# .ffill()/.bfill() are used instead of fillna(method=...), which newer pandas deprecates.
data['Destination'] = data.groupby('GroupId')['Destination'].transform(
    lambda group_values: group_values.ffill().bfill().fillna(destination_mode)
)

# Remaining-null count (expected 0) next to a sample of the filled column.
data['Destination'].isnull().sum(), data[['GroupId', 'Destination']].head(10)


(0,
   GroupId  Destination
 0    0013  TRAPPIST-1e
 1    0018  TRAPPIST-1e
 2    0019  55 Cancri e
 3    0021  TRAPPIST-1e
 4    0023  TRAPPIST-1e
 5    0027  TRAPPIST-1e
 6    0029  55 Cancri e
 7    0032  TRAPPIST-1e
 8    0032  55 Cancri e
 9    0033  55 Cancri e)

In [None]:
# Correlation of Age with each spending column, measured before any age is filled in.
age_expenditure_correlation = data[['Age', *expenditure_columns]].corr()['Age'].drop('Age')

# Mean age of each group, broadcast back to the group's rows as a helper column.
group_ages = data.groupby('GroupId')['Age'].mean()
data['ImputedAge'] = data['GroupId'].map(group_ages)

# Keep known ages; where Age is missing, take the group's mean age instead.
data['Age'] = data['Age'].where(data['Age'].notna(), data['ImputedAge'])

# Groups with no known age are still missing: fall back to the overall median,
# computed after the group-based fill above.
overall_median_age = data['Age'].median()
data['Age'] = data['Age'].fillna(overall_median_age)

# Remaining-null count (expected 0).
missing_ages = data['Age'].isnull().sum()

age_expenditure_correlation, missing_ages


(RoomService     0.064097
 FoodCourt       0.111073
 ShoppingMall    0.039249
 Spa             0.108344
 VRDeck          0.102743
 Name: Age, dtype: float64,
 0)

In [None]:
# VIP value counts including NaN, captured before imputation for reference.
vip_counts = data['VIP'].value_counts(dropna=False)

# Within each GroupId, copy the nearest known VIP flag onto missing rows
# (previous row first, then next row).
# .ffill()/.bfill() are used instead of fillna(method=...), which newer pandas deprecates.
data['VIP'] = data.groupby('GroupId')['VIP'].transform(
    lambda group_values: group_values.ffill().bfill()
)

# Groups with no known VIP flag are still missing: fall back to the most frequent
# value, computed after the group-based fill above.
vip_mode = data['VIP'].mode()[0]
data['VIP'] = data['VIP'].fillna(vip_mode)

# Remaining-null count (expected 0).
missing_vips = data['VIP'].isnull().sum()

vip_counts, missing_vips


  data['VIP'] = data.groupby('GroupId')['VIP'].transform(lambda x: x.fillna(method='ffill').fillna(method='bfill'))


(False    4110
 NaN        93
 True       74
 Name: VIP, dtype: int64,
 0)

In [None]:
# Re-count missing values per column after the Destination, Age and VIP imputations.
data.isna().sum()

PassengerId            0
HomePlanet             0
CryoSleep              0
Cabin                100
Destination            0
Age                    0
VIP                    0
RoomService           82
FoodCourt            106
ShoppingMall          98
Spa                  101
VRDeck                80
Name                  94
GroupId                0
GroupMemberNumber      0
HomePlanet_Earth       0
HomePlanet_Europa      0
HomePlanet_Mars        0
Cabin_Deck             0
Cabin_Num              0
Cabin_Side             0
ImputedAge            47
dtype: int64

In [None]:
# Inspect the dtype of every column.
data.dtypes

PassengerId           object
HomePlanet            object
CryoSleep            float64
Cabin                 object
Destination           object
Age                  float64
VIP                  float64
RoomService          float64
FoodCourt            float64
ShoppingMall         float64
Spa                  float64
VRDeck               float64
Name                  object
GroupId               object
GroupMemberNumber     object
HomePlanet_Earth       uint8
HomePlanet_Europa      uint8
HomePlanet_Mars        uint8
Cabin_Deck            object
Cabin_Num            float64
Cabin_Side            object
ImputedAge           float64
dtype: object

In [None]:
# Replace missing spending amounts with the mean of their own column.
expenditure_means = data[expenditure_columns].mean()
data[expenditure_columns] = data[expenditure_columns].fillna(expenditure_means)

# Remaining-null count per spending column (all expected 0).
missing_expenditures = data[expenditure_columns].isnull().sum()

missing_expenditures


RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
dtype: int64

In [None]:
# List the column names currently in `data`.
data.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'GroupId', 'GroupMemberNumber', 'HomePlanet_Earth',
       'HomePlanet_Europa', 'HomePlanet_Mars', 'Cabin_Deck', 'Cabin_Num',
       'Cabin_Side', 'ImputedAge'],
      dtype='object')

In [None]:
# Build one indicator column per Destination value, named 'Destination_<value>'.
destination_encoded = pd.get_dummies(data['Destination'], prefix='Destination')

# Attach the indicators to `data`. Indicator columns left by an earlier run of this cell
# are dropped first, so re-running the cell does not fail with overlapping column names.
data = data.drop(columns=destination_encoded.columns, errors='ignore').join(destination_encoded)

In [None]:
# Inspect column dtypes again, now including the Destination_* indicator columns.
data.dtypes

PassengerId                   object
HomePlanet                    object
CryoSleep                    float64
Cabin                         object
Destination                   object
Age                          float64
VIP                          float64
RoomService                  float64
FoodCourt                    float64
ShoppingMall                 float64
Spa                          float64
VRDeck                       float64
Name                          object
GroupId                       object
GroupMemberNumber             object
HomePlanet_Earth               uint8
HomePlanet_Europa              uint8
HomePlanet_Mars                uint8
Cabin_Deck                    object
Cabin_Num                    float64
Cabin_Side                    object
ImputedAge                   float64
Destination_55 Cancri e        uint8
Destination_PSO J318.5-22      uint8
Destination_TRAPPIST-1e        uint8
dtype: object

In [None]:
# Display the current state of `data` (rich display of the cell's last expression).
data

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Cabin_Deck,Cabin_Num,Cabin_Side,ImputedAge,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
0,0013_01,Earth,1.0,G/3/S,TRAPPIST-1e,27.0,0.0,0.0,0.0,0.0,...,1,0,0,G,3.000000,S,27.0,0,0,1
1,0018_01,Earth,0.0,F/4/S,TRAPPIST-1e,19.0,0.0,0.0,9.0,0.0,...,1,0,0,F,4.000000,S,19.0,0,0,1
2,0019_01,Europa,1.0,C/0/S,55 Cancri e,31.0,0.0,0.0,0.0,0.0,...,0,1,0,C,0.000000,S,31.0,1,0,0
3,0021_01,Europa,0.0,C/1/S,TRAPPIST-1e,38.0,0.0,0.0,6652.0,0.0,...,0,1,0,C,1.000000,S,38.0,0,0,1
4,0023_01,Earth,0.0,F/5/S,TRAPPIST-1e,20.0,0.0,10.0,0.0,635.0,...,1,0,0,F,5.000000,S,20.0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,1.0,G/1496/S,TRAPPIST-1e,34.0,0.0,0.0,0.0,0.0,...,1,0,0,G,1496.000000,S,37.0,0,0,1
4273,9269_01,Earth,0.0,,TRAPPIST-1e,42.0,0.0,0.0,847.0,17.0,...,1,0,0,F,610.178836,S,42.0,0,0,1
4274,9271_01,Mars,1.0,D/296/P,55 Cancri e,26.0,0.0,0.0,0.0,0.0,...,0,0,1,D,296.000000,P,,1,0,0
4275,9273_01,Europa,0.0,D/297/P,TRAPPIST-1e,26.0,0.0,0.0,2680.0,0.0,...,0,1,0,D,297.000000,P,,0,0,1


In [None]:
# Build one indicator column per Cabin_Deck and per Cabin_Side value.
cabin_deck_encoded = pd.get_dummies(data['Cabin_Deck'], prefix='Cabin_Deck')
cabin_side_encoded = pd.get_dummies(data['Cabin_Side'], prefix='Cabin_Side')

# Names of all indicator columns produced above, deck columns first.
encoded_columns = list(cabin_deck_encoded.columns) + list(cabin_side_encoded.columns)

# Attach the indicators to `data`. Indicator columns left by an earlier run of this cell
# are dropped first, so re-running the cell does not fail with overlapping column names.
data = (
    data.drop(columns=encoded_columns, errors='ignore')
    .join(cabin_deck_encoded)
    .join(cabin_side_encoded)
)

# Preview the indicator columns to verify the encoding.
data[encoded_columns].head()


Unnamed: 0,Cabin_Deck_A,Cabin_Deck_B,Cabin_Deck_C,Cabin_Deck_D,Cabin_Deck_E,Cabin_Deck_F,Cabin_Deck_G,Cabin_Deck_T,Cabin_Side_P,Cabin_Side_S
0,0,0,0,0,0,0,1,0,0,1
1,0,0,0,0,0,1,0,0,0,1
2,0,0,1,0,0,0,0,0,0,1
3,0,0,1,0,0,0,0,0,0,1
4,0,0,0,0,0,1,0,0,0,1


In [None]:
# List the column names again, now including the Cabin_Deck_* / Cabin_Side_* indicators.
data.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'GroupId', 'GroupMemberNumber', 'HomePlanet_Earth',
       'HomePlanet_Europa', 'HomePlanet_Mars', 'Cabin_Deck', 'Cabin_Num',
       'Cabin_Side', 'ImputedAge', 'Destination_55 Cancri e',
       'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e', 'Cabin_Deck_A',
       'Cabin_Deck_B', 'Cabin_Deck_C', 'Cabin_Deck_D', 'Cabin_Deck_E',
       'Cabin_Deck_F', 'Cabin_Deck_G', 'Cabin_Deck_T', 'Cabin_Side_P',
       'Cabin_Side_S'],
      dtype='object')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Model inputs, grouped by where they come from. The concatenation below fixes the
# column order handed to the model.
passenger_features = [
    'CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'GroupId',
]
home_planet_features = ['HomePlanet_Earth', 'HomePlanet_Europa', 'HomePlanet_Mars']
cabin_features = [
    'Cabin_Deck_A', 'Cabin_Deck_B', 'Cabin_Deck_C', 'Cabin_Deck_D',
    'Cabin_Deck_E', 'Cabin_Deck_F', 'Cabin_Deck_G', 'Cabin_Deck_T',
    'Cabin_Side_P', 'Cabin_Side_S',
]
destination_features = ['Destination_55 Cancri e', 'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e']
feature_columns = passenger_features + home_planet_features + cabin_features + destination_features

# Placeholder preprocessing stage: no transformers are registered and
# remainder='passthrough' keeps the columns that no transformer handles.
# It is kept so future preprocessing has a place to go.
preprocessor = ColumnTransformer(transformers=[], remainder='passthrough')

# Preprocessing stage followed by a random forest with a fixed seed.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Feature matrix and target; 'Transported' is the column being predicted.
X = data[feature_columns]
y = data['Transported']

# Hold out 20% of the rows for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit on the training portion only.
pipeline.fit(X_train, y_train)

# Sanity check: predictions for the first few held-out rows.
test_predictions = pipeline.predict(X_test.head())
test_predictions


array([False,  True,  True, False,  True])

Test: predict for the test-set passengers with the fitted pipeline

In [None]:
# Rebuild the feature matrix from whatever `data` currently holds. This rebinds the
# `X` that the model cell created; X_train / X_test from that cell are not affected.
X = data[feature_columns]

In [None]:
# (rows, columns) of the current frame.
data.shape

(4277, 35)

In [None]:
# Compute the predictions here so this cell does not depend on a later cell having
# been run first (a fresh-kernel Run All would otherwise raise NameError).
test_predictions1 = pipeline.predict(X)

# One prediction per row of X is expected; compare with data.shape.
test_predictions1.shape

(4277,)

In [None]:
# Predict for every row of X with the fitted pipeline.
test_predictions1 = pipeline.predict(X)

In [None]:
# Show the prediction array.
test_predictions1

array([False,  True, False, ...,  True, False,  True])

In [None]:
from sklearn.metrics import accuracy_score

# Predict the whole held-out split produced by train_test_split.
test_predictions = pipeline.predict(X_test)

# Fraction of held-out rows whose prediction matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=test_predictions)

# Report it to four decimals.
print(f"Accuracy: {accuracy:.4f}")


Accuracy: 0.7895


In [None]:
# Second name for the current frame. This is the same object as `data`, not a copy.
data1 = data

In [None]:
# Two-column submission: each PassengerId paired with its prediction.
# Predictions are written as they are; no string conversion is applied.
submission_df = data[['PassengerId']].assign(Transported=test_predictions1)

# Write the submission to Drive without the index column.
submission_path = '/content/drive/MyDrive/Machine Learning/Data/spaceship/submission4.csv'
submission_df.to_csv(submission_path, index=False)