In [13]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from xgboost import XGBClassifier

In [14]:
# Load the raw training and test tables from the working directory.
train_df, test_df = pd.read_csv("train.csv"), pd.read_csv("test.csv")

In [15]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that prints a stray "None" after the report).
train_df.info()
# Summary statistics of the numeric columns.
print(train_df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB
None
               Age   RoomService     FoodCourt  ShoppingMall           Spa  \
count  8514.000000   8512.000000   8510.000000   8485.00000

In [17]:
def preprocess_data(df, fitted=None):
    """Impute, label-encode and standardise a feature frame.

    Steps, in order:
      1. drop the 'Name', 'Cabin' and 'PassengerId' columns when present;
      2. median-impute the float64/int64 columns;
      3. most-frequent-impute the object columns;
      4. label-encode every object column (values are cast to str first);
      5. standard-scale the float64/int64 columns.
    Columns of any other dtype (e.g. bool) are passed through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is NOT modified; a processed copy is returned.
    fitted : dict, optional
        Shared transformer state.
        * ``None`` (default) or an empty dict: every transformer is fitted on
          ``df``. An empty dict is filled in place with the keys
          'numeric_cols', 'categorical_cols', 'numeric_imputer',
          'categorical_imputer', 'label_encoders' and 'scaler'.
        * A dict previously filled by this function: the stored column lists
          and transformers are applied with ``transform`` only, so a second
          frame (e.g. the test set) gets exactly the same imputation values,
          label codes and scaling as the frame they were fitted on.

    Returns
    -------
    pandas.DataFrame
        The processed copy of ``df``.

    Raises
    ------
    KeyError
        If ``fitted`` is reused on a frame that lacks one of the stored columns.

    Notes
    -----
    When ``fitted`` is reused, the stored LabelEncoder only knows the labels it
    was fitted on; scikit-learn rejects unseen labels in ``transform``.
    """
    # drop() without inplace returns a new frame, so the caller's DataFrame is
    # never mutated and re-running the calling cell stays idempotent.
    df = df.drop(columns=['Name', 'Cabin', 'PassengerId'], errors="ignore")

    refit = not fitted  # None or {} -> learn everything from this frame
    state = {} if fitted is None else fitted

    if refit:
        state['numeric_cols'] = df.select_dtypes(include=['float64', 'int64']).columns
        state['categorical_cols'] = df.select_dtypes(include=['object']).columns
        state['numeric_imputer'] = SimpleImputer(strategy='median')
        state['categorical_imputer'] = SimpleImputer(strategy='most_frequent')
        state['label_encoders'] = {col: LabelEncoder() for col in state['categorical_cols']}
        state['scaler'] = StandardScaler()

    numeric_cols = state['numeric_cols']
    categorical_cols = state['categorical_cols']

    def _apply(transformer, values):
        # Fit on first use, otherwise reuse the parameters learned earlier.
        if refit:
            return transformer.fit_transform(values)
        return transformer.transform(values)

    # The transformers cannot be given a zero-column selection, so each stage
    # is skipped when its column list is empty.
    if len(numeric_cols) > 0:
        df[numeric_cols] = _apply(state['numeric_imputer'], df[numeric_cols])

    if len(categorical_cols) > 0:
        df[categorical_cols] = _apply(state['categorical_imputer'], df[categorical_cols])
        for col in categorical_cols:
            df[col] = _apply(state['label_encoders'][col], df[col].astype(str))

    if len(numeric_cols) > 0:
        df[numeric_cols] = _apply(state['scaler'], df[numeric_cols])

    return df

In [18]:
# Each call fits its own imputers, encoders and scaler on the frame it receives,
# so the train and test transformations are learned independently of each other.
train_df, test_df = preprocess_data(train_df), preprocess_data(test_df)

In [19]:
# Target column first, then every remaining column as the feature matrix.
y = train_df['Transported']
X = train_df.drop(columns='Transported')

In [20]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [21]:
# Search space handed to the random-forest grid search (3 * 4 * 3 * 3 * 2 = 216 combinations).
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

In [None]:
# Every parameter combination is fitted once per fold and the fits are
# independent of each other, so n_jobs=-1 runs them on all CPU cores instead of
# one after another. The forest's fixed random_state keeps the scores identical.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

In [None]:
# Report the winning grid point, then keep its estimator for later use.
print(f"Best RandomForest Parameters: {grid_search.best_params_}")
best_rf_model = grid_search.best_estimator_

In [None]:
# Gradient-boosted trees with fixed hyper-parameters, trained on the training split.
xgb_model = XGBClassifier(
    max_depth=5,
    learning_rate=0.1,
    n_estimators=200,
    random_state=42,
)
xgb_model.fit(X_train, y_train)

In [None]:
# Soft-voting ensemble over the tuned random forest and the XGBoost model.
ensemble_model = VotingClassifier(
    voting='soft',
    estimators=[('rf', best_rf_model), ('xgb', xgb_model)],
)

In [None]:
# Train the voting ensemble on the training split only.
ensemble_model.fit(X=X_train, y=y_train)

In [None]:
# Score the ensemble on the held-out validation rows.
y_predict = ensemble_model.predict(X_val)
accuracy = accuracy_score(y_true=y_val, y_pred=y_predict)
print(f"Validation Accuracy: {accuracy}")


In [None]:
# 5-fold cross-validated accuracy of the ensemble over the full training table.
cv_scores = cross_val_score(
    estimator=ensemble_model,
    X=X,
    y=y,
    scoring='accuracy',
    cv=5,
)
print(f"Cross-Validation Accuracy: {np.mean(cv_scores)}")

In [None]:
# clone() builds an unfitted copy with the same hyper-parameters. Plain
# assignment would only alias `ensemble_model`, so fitting on the full data
# would overwrite the model whose validation score was measured on held-out
# rows, and re-running the validation cell would then score on seen data.
final_model = clone(ensemble_model)
final_model.fit(X, y)

In [None]:
# Predict the target for every row of the preprocessed test frame.
test_pred = final_model.predict(X=test_df)

In [None]:
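# Submission export (disabled): uncomment the lines below to write submission.csv.
# NOTE: the sample_submission.csv path is an absolute path on the author's machine;
# point it at your local copy before enabling this cell.
# submission = pd.read_csv('D:/University/4th Sem/PAI(LAB)/Class Work/LAB 2/spaceship-titanic/sample_submission.csv')
# submission['Transported'] = test_pred.astype(bool)
# submission.to_csv('submission.csv', index=False)
# print("Submission file created: submission.csv")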