In [56]:
def clean_spaceship_data(df, is_test=False):
    """Clean a raw Spaceship Titanic frame and derive cabin / group features.

    Steps, in order:
      1. Drop ``Name``.
      2. Split ``Cabin`` ("deck/num/side") into ``Cabin_Deck``, ``Cabin_Num``
         and ``Cabin_Side``; split ``PassengerId`` ("group_num") into
         ``GroupId`` and ``PassengerNum``; drop ``Cabin``.
      3. Cast ``CryoSleep`` and ``VIP`` to pandas' nullable ``boolean`` dtype.
      4. Impute categorical columns with their mode and numeric spending/age
         columns with their median (statistics are computed on ``df`` itself).
      5. Training mode: drop rows that still contain NaN.
         Test mode: fill anything still missing with 0 so no row is lost.
      6. Cast ``GroupId`` to ``category`` and ``PassengerNum`` / ``Cabin_Num``
         to ``int``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing at least ``Name``, ``Cabin``, ``PassengerId``,
        ``HomePlanet``, ``CryoSleep``, ``Destination``, ``VIP``, ``Age``,
        ``RoomService``, ``FoodCourt``, ``ShoppingMall``, ``Spa``, ``VRDeck``.
        The caller's frame is not modified.
    is_test : bool, default False
        When True, remaining NaNs are filled with 0 instead of dropping rows.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame. In training mode the original index labels of the
        surviving rows are kept (the index is not reset).
    """
    # Local import: this function may be defined before the notebook's
    # `import pandas as pd` cell has run.
    import pandas as pd

    # `drop` returns a new frame, so every later assignment stays off the caller's data.
    df = df.drop(columns=['Name'])  # Drop non-informative column

    # Split Cabin into Deck / Num / Side
    df[['Cabin_Deck', 'Cabin_Num', 'Cabin_Side']] = df['Cabin'].str.split('/', expand=True)

    # Cabin_Num comes out of the split as text; make it numeric (missing -> NaN) so the
    # column never ends up mixing strings with the numeric 0 used by the test fallback.
    df['Cabin_Num'] = pd.to_numeric(df['Cabin_Num'], errors='coerce')

    # Split PassengerId into GroupId and PassengerNum
    df[['GroupId', 'PassengerNum']] = df['PassengerId'].str.split('_', expand=True)

    # Drop original Cabin column
    df = df.drop(columns=['Cabin'])

    # Convert to nullable boolean (keeps missing values as <NA> until imputed)
    df['CryoSleep'] = df['CryoSleep'].astype('boolean')
    df['VIP'] = df['VIP'].astype('boolean')

    # Fill missing categorical values with mode.
    # Assign the result back instead of `df[col].fillna(..., inplace=True)`: the
    # in-place form operates on a temporary Series and is a silent no-op under
    # pandas Copy-on-Write.
    for col in ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Cabin_Deck', 'Cabin_Side']:
        modes = df[col].mode()
        if not modes.empty:  # a column with no observed values has no mode to use
            df[col] = df[col].fillna(modes.iloc[0])

    # Fill missing numerical values with median
    for col in ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']:
        df[col] = df[col].fillna(df[col].median())

    if not is_test:
        # Training: drop rows with any remaining NaNs (e.g. rows whose Cabin was missing)
        df = df.dropna()
    else:
        # Test: every row must be kept, so fill any remaining NaNs (fallback)
        df = df.fillna(0)

    # Type conversions (single astype call returns a new frame)
    df = df.astype({'GroupId': 'category', 'PassengerNum': int, 'Cabin_Num': int})

    return df


In [57]:
import os
from pathlib import Path

import pandas as pd


# Directory holding the competition CSVs. Defaults to the original local folder;
# set the SPACESHIP_DATA_DIR environment variable to run the notebook elsewhere.
DATA_DIR = Path(os.environ.get('SPACESHIP_DATA_DIR', r'C:\Users\mages\spaceship'))

# Load raw data
train_raw = pd.read_csv(DATA_DIR / 'train.csv')
test_raw = pd.read_csv(DATA_DIR / 'test.csv')
sample_submission = pd.read_csv(DATA_DIR / 'sample_submission.csv')

# Clean both datasets (copies are passed so the raw frames stay available for re-runs)
train_df_cleaned = clean_spaceship_data(train_raw.copy())
test_df_cleaned = clean_spaceship_data(test_raw.copy(), is_test=True)


In [58]:
# Preview the first five cleaned training rows
train_df_cleaned.head(n=5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Cabin_Deck,Cabin_Num,Cabin_Side,GroupId,PassengerNum
0,0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,B,0,P,1,1
1,0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,F,0,S,2,1
2,0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,A,0,S,3,1
3,0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,A,0,S,3,2
4,0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,F,1,S,4,1


In [59]:
# Preview the first five cleaned test rows
test_df_cleaned.head(n=5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Cabin_Deck,Cabin_Num,Cabin_Side,GroupId,PassengerNum
0,0013_01,Earth,True,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,G,3,S,13,1
1,0018_01,Earth,False,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,F,4,S,18,1
2,0019_01,Europa,True,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,C,0,S,19,1
3,0021_01,Europa,False,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,C,1,S,21,1
4,0023_01,Earth,False,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,F,5,S,23,1


In [60]:
# Per-column count of missing values left in the cleaned training frame
train_missing = train_df_cleaned.isna().sum(axis=0)
print(train_missing)

PassengerId     0
HomePlanet      0
CryoSleep       0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Transported     0
Cabin_Deck      0
Cabin_Num       0
Cabin_Side      0
GroupId         0
PassengerNum    0
dtype: int64


In [61]:
# Per-column count of missing values left in the cleaned test frame
test_missing = test_df_cleaned.isna().sum(axis=0)
print(test_missing)

PassengerId     0
HomePlanet      0
CryoSleep       0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Cabin_Deck      0
Cabin_Num       0
Cabin_Side      0
GroupId         0
PassengerNum    0
dtype: int64


In [62]:
from sklearn.preprocessing import LabelEncoder

# Save PassengerId for the submission file before it is removed from the features.
# Guarded so that re-running this cell (when the column has already been dropped)
# keeps the ids captured on the first run instead of raising KeyError.
if 'PassengerId' in test_df_cleaned.columns:
    passenger_ids = test_df_cleaned['PassengerId'].values

# Drop PassengerId from feature sets (no-op when it is already gone)
train_df_cleaned = train_df_cleaned.drop(columns=['PassengerId'], errors='ignore')
test_df_cleaned = test_df_cleaned.drop(columns=['PassengerId'], errors='ignore')

# Label encode GroupId. The encoder is fitted on train + test together so that
# every group appearing in the test set has a code.
le = LabelEncoder()
all_groups = pd.concat([train_df_cleaned['GroupId'], test_df_cleaned['GroupId']])
le.fit(all_groups)
train_df_cleaned['GroupId'] = le.transform(train_df_cleaned['GroupId'])
test_df_cleaned['GroupId'] = le.transform(test_df_cleaned['GroupId'])

# One-hot encode categorical variables.
# The category levels are fixed from the training frame and applied to both frames,
# so `drop_first=True` removes the SAME reference level in train and test. Encoding
# the two frames independently would drop a different level whenever the test set
# happens to lack the first training category, silently mis-encoding test rows.
cat_cols = ['HomePlanet', 'Destination', 'Cabin_Deck', 'Cabin_Side']
cat_dtypes = {
    col: pd.CategoricalDtype(sorted(train_df_cleaned[col].unique()))
    for col in cat_cols
}
train_encoded = pd.get_dummies(
    train_df_cleaned.astype(cat_dtypes), columns=cat_cols, drop_first=True
)
test_encoded = pd.get_dummies(
    test_df_cleaned.astype(cat_dtypes), columns=cat_cols, drop_first=True
)

# Align test columns with train
X_train = train_encoded.drop('Transported', axis=1)
y_train = train_encoded['Transported']
X_test = test_encoded.reindex(columns=X_train.columns, fill_value=0)

# Check for NaNs before prediction
print("Any NaNs in X_test?", X_test.isnull().sum().sum())  # Should be 0


# Final shapes
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)



Any NaNs in X_test? 0
X_train shape: (8494, 23)
y_train shape: (8494,)
X_test shape: (4277, 23)


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [63]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Hold out 20% of the labelled rows as a validation set
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest on the training portion (fit returns the estimator)
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_tr, y_tr)

# Score the held-out rows
y_pred_val = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_pred_val)
val_report = classification_report(y_val, y_pred_val)
print("Validation Accuracy:", round(val_accuracy, 4))
print("Classification Report:\n", val_report)

# Predict labels for the competition test set
test_preds = model.predict(X_test)

# Assemble the submission (ids saved earlier + predictions) and write it to disk
submission = pd.DataFrame(dict(PassengerId=passenger_ids, Transported=test_preds))
submission.to_csv('submission.csv', index=False)
print("Submission file saved as 'submission.csv' with shape:", submission.shape)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


Validation Accuracy: 0.8146
Classification Report:
               precision    recall  f1-score   support

       False       0.79      0.85      0.82       853
        True       0.84      0.78      0.81       846

    accuracy                           0.81      1699
   macro avg       0.82      0.81      0.81      1699
weighted avg       0.82      0.81      0.81      1699

Submission file saved as 'submission.csv' with shape: (4277, 2)
