# Random Forest model


Add main directory to path to access utils folder


In [1]:
import sys
import os

parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
sys.path.append(parent_dir)

import modules

In [2]:
import numpy as np
import optuna
import pandas as pd
from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from utils.machine_learning import Rounder

In [3]:
train_data = pd.read_pickle("../../data/train_processed.pkl")
train_data

Unnamed: 0,PassengerNum,Age,HomePlanet,Destination,CabinDeck,CabinSide,CryoSleep,VIP,RoomService,FoodCourt,...,YesShoppingMall,YesSpa,YesVRDeck,YesTotalSpending,LogRoomService,LogFoodCourt,LogShoppingMall,LogSpa,LogVRDeck,LogTotalSpending
0,01,39.0,Europa,TRAPPIST-1e,B,P,False,False,0.0,0.0,...,False,False,False,False,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,01,24.0,Earth,TRAPPIST-1e,F,S,False,False,109.0,9.0,...,True,True,True,True,4.700480,2.302585,3.258097,6.309918,3.806662,6.602588
2,01,58.0,Europa,TRAPPIST-1e,A,S,False,True,43.0,3576.0,...,False,True,True,True,3.784190,8.182280,0.000000,8.812248,3.912023,9.248021
3,02,33.0,Europa,TRAPPIST-1e,A,S,False,False,0.0,1283.0,...,True,True,True,True,0.000000,7.157735,5.918894,8.110728,5.267858,8.551981
4,01,16.0,Earth,TRAPPIST-1e,F,S,False,False,303.0,70.0,...,True,True,True,True,5.717028,4.262680,5.023881,6.338594,1.098612,6.995766
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,01,41.0,Europa,55 Cancri e,A,P,False,True,0.0,6819.0,...,False,True,True,True,0.000000,8.827615,0.000000,7.404888,4.317488,9.052165
8689,01,18.0,Earth,PSO J318.5-22,G,S,True,False,0.0,0.0,...,False,False,False,False,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8690,01,26.0,Earth,TRAPPIST-1e,G,S,False,False,0.0,0.0,...,True,True,False,True,0.000000,0.000000,7.535297,0.693147,0.000000,7.535830
8691,01,32.0,Europa,55 Cancri e,E,S,False,False,0.0,1049.0,...,False,True,True,True,0.000000,6.956545,0.000000,5.869297,8.082093,8.442039


In [4]:
df = train_data.copy()
X = df.drop(
    columns=[
        "Transported",
        "TotalSpending",
        "LogRoomService",
        "LogFoodCourt",
        "LogShoppingMall",
        "LogSpa",
        "LogVRDeck",
        "LogTotalSpending",
        "LogTotalSpending",
    ]
)
y = df["Transported"]
numerical_columns = list(X.select_dtypes(include="number").drop(columns="CabinBin"))
categorical_columns = list(X.select_dtypes(include=["object"]))
set_config(transform_output="pandas")

cat_pipeline = Pipeline([("one_hot", OneHotEncoder(sparse_output=False))])
num_pipeline = Pipeline(
    [
        ("imputer", IterativeImputer(random_state=0)),
        ("scaler", StandardScaler()),
    ]
)
ord_pipeline = Pipeline(
    [
        ("oe", OrdinalEncoder()),
        ("imputer", IterativeImputer(random_state=0)),
        ("rounder", Rounder(decimals=0)),
    ]
)
feature_preprocessing = ColumnTransformer(
    [
        ("cat", cat_pipeline, categorical_columns),
        ("num", num_pipeline, numerical_columns),
        ("ord", ord_pipeline, ["CabinBin"]),
    ],
    verbose_feature_names_out=False,
)

In [5]:
X_processed = feature_preprocessing.fit_transform(X)

In [6]:
model = RandomForestClassifier()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X_processed, y, scoring="accuracy", cv=cv, n_jobs=-1)
mean_acc = np.mean(n_scores)
std_acc = np.std(n_scores)
print(f"Accuracy: {mean_acc:.3f} (std. dev. {std_acc:.3f})")

Accuracy: 0.791 (std. dev. 0.013)


In [7]:
def objective(trial, data=X_processed, target=y):

    data = data.copy()

    param = {
        "max_depth": trial.suggest_int("max_depth", 2, 32, log=True),
        "criterion": trial.suggest_categorical(
            "criterion", ["gini", "entropy", "log_loss"]
        ),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 32, log=True),
        "max_features": trial.suggest_categorical(
            "max_features", ["sqrt", "log2", None]
        ),
    }

    model = RandomForestClassifier(**param)
    return cross_val_score(
        model, data, target, scoring="accuracy", n_jobs=-1, cv=5
    ).mean()


study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

trial = study.best_trial

print("Accuracy: {}".format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

[I 2024-10-17 23:38:25,372] A new study created in memory with name: no-name-bf19101a-bde0-4175-a519-e2f2e6bf071c
[I 2024-10-17 23:38:26,079] Trial 0 finished with value: 0.7364579328489913 and parameters: {'max_depth': 2, 'criterion': 'log_loss', 'min_samples_split': 7, 'max_features': 'log2'}. Best is trial 0 with value: 0.7364579328489913.
[I 2024-10-17 23:38:31,689] Trial 1 finished with value: 0.7980005836456145 and parameters: {'max_depth': 24, 'criterion': 'entropy', 'min_samples_split': 23, 'max_features': None}. Best is trial 1 with value: 0.7980005836456145.
[I 2024-10-17 23:38:32,388] Trial 2 finished with value: 0.7368030910718765 and parameters: {'max_depth': 2, 'criterion': 'entropy', 'min_samples_split': 7, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.7980005836456145.
[I 2024-10-17 23:38:32,993] Trial 3 finished with value: 0.737493142825758 and parameters: {'max_depth': 3, 'criterion': 'gini', 'min_samples_split': 13, 'max_features': 'log2'}. Best is trial 1 

Accuracy: 0.8009915358151286
Best hyperparameters: {'max_depth': 13, 'criterion': 'entropy', 'min_samples_split': 9, 'max_features': None}


In [8]:
model = RandomForestClassifier(**study.best_params)
rf_pipeline = Pipeline(
    steps=[("preprocessor", feature_preprocessing), ("model", model)]
)

In [9]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
cross_val_score(
    rf_pipeline, X, y, scoring="accuracy", n_jobs=-1, cv=5, error_score="raise"
).mean()

0.7959291710974986

In [10]:
import pickle

model = rf_pipeline
model.fit(X, y)

with open(
    "../../ML_models_trained/rf_model.pkl",
    "wb",
) as file:
    pickle.dump(model, file)