In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
# Read the training and test CSVs (in that order) from the dataset directory.
train_df, test_df = map(pd.read_csv, ('../Dataset/train.csv', '../Dataset/test.csv'))

In [3]:
# Spend columns that are summed into the 'TotalSpend' feature.
money_features = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
# Columns derived from the '/'-separated 'Cabin' string.
cabin_parts = ['Deck', 'Num', 'Side']


def _add_engineered_features(df):
    """Return a copy of ``df`` with cabin-derived and total-spend features.

    'Cabin' is split on '/' into 'Deck' (part 0), 'Num' (part 1, coerced to a
    number, unparsable values become NaN) and 'Side' (part 2); 'Cabin' itself
    is then removed. 'TotalSpend' is the row-wise sum of ``money_features``
    (``DataFrame.sum`` skips NaN by default).

    The input frame is not mutated, and a frame that has already been
    processed (no 'Cabin', derived columns present) passes through with only
    'TotalSpend' recomputed, so re-running this cell is safe.

    Raises:
        KeyError: if 'Cabin' is absent and any of the derived cabin columns
            is missing as well.
    """
    out = df.copy()
    if 'Cabin' in out.columns:
        # Split once and reuse the pieces instead of re-splitting per feature.
        parts = out['Cabin'].str.split('/')
        out['Deck'] = parts.str[0]
        out['Num'] = pd.to_numeric(parts.str[1], errors='coerce')
        out['Side'] = parts.str[2]
        out = out.drop(columns=['Cabin'])
    else:
        missing = [name for name in cabin_parts if name not in out.columns]
        if missing:
            raise KeyError(
                f"'Cabin' column not found and derived columns {missing} are missing"
            )
    out['TotalSpend'] = out[money_features].sum(axis=1)
    return out


train_df = _add_engineered_features(train_df)
test_df = _add_engineered_features(test_df)

# Columns excluded from the model inputs in both frames.
drop_cols = ['PassengerId', 'Name']

# Target cast to int; predictors are the training frame minus target and excluded columns.
y = train_df['Transported'].astype(int)
X = train_df.drop(columns=drop_cols + ['Transported'])
X_test = test_df.drop(columns=drop_cols)

# Column groups routed to the numeric and categorical preprocessing branches.
numeric_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Num', 'TotalSpend']
cat_features = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side']

# Any model column absent from the test frame is added as all-NaN so that
# column selection by name does not fail at prediction time.
absent_in_test = [
    name for name in numeric_features + cat_features
    if name not in X_test.columns
]
for name in absent_in_test:
    X_test[name] = np.nan

# Numeric branch: fill gaps with the column median, then standardise.
numeric_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group to its branch.
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_pipeline, numeric_features),
    ("cat", categorical_pipeline, cat_features),
])

# Full estimator: preprocessing followed by a seeded random forest.
forest = RandomForestClassifier(n_estimators=500, max_depth=15, random_state=42)
clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", forest),
])

# Hold out 20% of the rows for validation (seeded split).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit on the training split, then score the held-out rows.
y_val_pred = clf.fit(X_train, y_train).predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print("Validation Accuracy:", val_accuracy)

# Predict labels for the test frame with the fitted pipeline.
# NOTE(review): clf was fit on X_train (the 80% split) earlier in this cell,
# so these predictions do not use the held-out rows — confirm that is intended.
y_pred = clf.predict(X_test)

# Pair each test PassengerId with its prediction, cast back to bool.
submission_df = pd.DataFrame({
    "PassengerId": test_df["PassengerId"],
    "Transported": y_pred.astype(bool)
})

submission_path = Path('../submissions/submission_final.csv')
# to_csv does not create missing parent directories; create the folder first
# so the run does not fail at the final write after the model has been trained.
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission_df.to_csv(submission_path, index=False)
print(f"Submission file created: {submission_path.name}")


Validation Accuracy: 0.8004600345025877
Submission file created: submission_final.csv
