<p style="font-family: 'Courier New', Courier, monospace; font-size: 30px; font-weight: bold; color: blue;  text-align: left;">
Random Forest Regression 
</p>

In [2]:
# Libraries for data manipulation and visualization
import numpy as np                               # For numerical operations
import pandas as pd                              # For data manipulation
import matplotlib.pyplot as plt                  # For plotting
import seaborn as sns                            # For advanced data visualization

# Libraries for model building and evaluation
from sklearn.ensemble import RandomForestRegressor  # For Random Forest Regression
from sklearn.metrics import (                   # For model evaluation metrics
    mean_squared_error, r2_score, mean_absolute_percentage_error, median_absolute_error
)

<p style="font-family: 'Courier New', Courier, monospace; font-size: 30px; font-weight: bold; color: blue;  text-align: left;">
Dataset: Load Splits 
</p>

In [4]:
# Path to the standardized database directory
base_path = '../../Comprehensive ML - Files & Plots etc.'

# Load train and test splits
df_train = pd.read_csv(f"{base_path}/train.csv")
df_test = pd.read_csv(f"{base_path}/test.csv")

feature_names = [
    'distance', 'frequency', 'c_walls', 'w_walls', 'co2', 'humidity', 
    'pm25', 'pressure', 'temperature', 'snr'
]

X_train = df_train[feature_names].values
y_train = df_train['PL'].values
X_test = df_test[feature_names].values
y_test = df_test['PL'].values

# (Optional: For plotting)
time_train = df_train['time'].values
time_test = df_test['time'].values

# Load 5-fold assignments
fold_assignments = np.load(f"{base_path}/train_folds.npy")

print(f"\nTraining samples: {X_train.shape[0]}, Test samples: {X_test.shape[0]}")
unique, counts = np.unique(fold_assignments, return_counts=True)
print(dict(zip(unique, counts)))
print('\nDataset loaded successfully!\n')


Training samples: 1132523, Test samples: 283131
{np.int64(0): np.int64(226505), np.int64(1): np.int64(226505), np.int64(2): np.int64(226505), np.int64(3): np.int64(226504), np.int64(4): np.int64(226504)}

Dataset loaded successfully!



<p style="font-family: 'Courier New', Courier, monospace; font-size: 30px; font-weight: bold; color: blue;  text-align: left;">
Dataset: Define the RF model parameters 
</p>

In [6]:
def create_rf_model():
    """Creates a Random Forest model with fixed parameters."""
    return RandomForestRegressor(
        n_estimators=300,
        criterion='squared_error',
        max_depth=100,
        max_features='log2',
        random_state=50,
        n_jobs=-1,
        min_samples_split=1000,
        min_samples_leaf=1000,
        min_impurity_decrease=0.001,
        bootstrap=True,
        oob_score=True
    )

<p style="font-family: 'Courier New', Courier, monospace; font-size: 30px; font-weight: bold; color: blue;  text-align: left;">
 5-Fold Cross-Validation Using Saved Folds 
</p>

In [8]:
cv_results = []

for fold_num in range(5):
    print(f"\nTraining fold {fold_num+1}...")
    tr_idx = np.where(fold_assignments != fold_num)[0]
    val_idx = np.where(fold_assignments == fold_num)[0]
    X_tr, y_tr = X_train[tr_idx], y_train[tr_idx]
    X_val, y_val = X_train[val_idx], y_train[val_idx]

    rf_model_cv = create_rf_model()
    rf_model_cv.fit(X_tr, y_tr)

    y_val_pred = rf_model_cv.predict(X_val)
    val_mse = mean_squared_error(y_val, y_val_pred)
    val_rmse = np.sqrt(val_mse)
    val_r2 = r2_score(y_val, y_val_pred)
    val_mape = mean_absolute_percentage_error(y_val, y_val_pred)
    val_median_ae = median_absolute_error(y_val, y_val_pred)

    cv_results.append({
        'Fold': fold_num+1,
        'Validation Loss (MSE)': round(val_mse, 4),
        'Validation RMSE': round(val_rmse, 4),
        'R² Score': round(val_r2, 4),
        'Validation MAPE (%)': round(val_mape * 100, 2),
        'Validation Median AE': round(val_median_ae, 4)
    })

    print(f"Fold {fold_num+1} - MSE: {val_mse:.4f}, RMSE: {val_rmse:.4f}, R²: {val_r2:.4f}, MAPE: {val_mape*100:.2f}%, Median AE: {val_median_ae:.4f}")

cv_results_df = pd.DataFrame(cv_results)

print("\nK-Fold Cross-Validation Results:")
display(cv_results_df)

# CV summary
cv_summary = cv_results_df.agg(['mean', 'std']).round(4).reset_index()
cv_summary.rename(columns={'index': 'Metric'}, inplace=True)
cv_summary_transposed = cv_summary.set_index('Metric').T

print("\nCross-Validation Summary:")
display(cv_summary_transposed)


Training fold 1...
Fold 1 - MSE: 23.0726, RMSE: 4.8034, R²: 0.9368, MAPE: 4.14%, Median AE: 2.8011

Training fold 2...
Fold 2 - MSE: 23.2387, RMSE: 4.8207, R²: 0.9364, MAPE: 4.14%, Median AE: 2.8028

Training fold 3...
Fold 3 - MSE: 23.1103, RMSE: 4.8073, R²: 0.9364, MAPE: 4.13%, Median AE: 2.7925

Training fold 4...
Fold 4 - MSE: 23.0270, RMSE: 4.7986, R²: 0.9368, MAPE: 4.13%, Median AE: 2.7952

Training fold 5...
Fold 5 - MSE: 23.2019, RMSE: 4.8168, R²: 0.9362, MAPE: 4.14%, Median AE: 2.8057

K-Fold Cross-Validation Results:


Unnamed: 0,Fold,Validation Loss (MSE),Validation RMSE,R² Score,Validation MAPE (%),Validation Median AE
0,1,23.0726,4.8034,0.9368,4.14,2.8011
1,2,23.2387,4.8207,0.9364,4.14,2.8028
2,3,23.1103,4.8073,0.9364,4.13,2.7925
3,4,23.027,4.7986,0.9368,4.13,2.7952
4,5,23.2019,4.8168,0.9362,4.14,2.8057



Cross-Validation Summary:


Metric,mean,std
Fold,3.0,1.5811
Validation Loss (MSE),23.1301,0.0884
Validation RMSE,4.8094,0.0092
R² Score,0.9365,0.0003
Validation MAPE (%),4.136,0.0055
Validation Median AE,2.7995,0.0055


<p style="font-family: 'Courier New', Courier, monospace; font-size: 30px; font-weight: bold; color: blue;  text-align: left;">
 Train on All Training Data and Test Evaluation 
</p>

In [10]:
# Train the RF model on all training data
rf_model = create_rf_model()
print("\nTraining Random Forest model on all training data...")
rf_model.fit(X_train, y_train)
print("\nTraining Completed!\n")

# OOB score
oob_score = rf_model.oob_score_
print(f"OOB Score (train set): {oob_score:.4f}\n")

# Predict on train and test sets
y_train_pred = rf_model.predict(X_train)
y_test_pred = rf_model.predict(X_test)

# Metrics
train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
test_mape = mean_absolute_percentage_error(y_test, y_test_pred)
test_median_ae = median_absolute_error(y_test, y_test_pred)

results = pd.DataFrame({
    'Metric': [
        'Training Loss (MSE)', 'Test Loss (MSE)', 'Test RMSE',
        'R² Score (Test)', 'Test MAPE (%)', 'Test Median AE'
    ],
    'Value': [
        train_mse, test_mse, test_rmse, test_r2,
        test_mape * 100, test_median_ae
    ]
})

print("\nModel Evaluation Metrics:")
display(results)


Training Random Forest model on all training data...

Training Completed!

OOB Score (train set): 0.9390


Model Evaluation Metrics:


Unnamed: 0,Metric,Value
0,Training Loss (MSE),22.052392
1,Test Loss (MSE),22.018479
2,Test RMSE,4.692385
3,R² Score (Test),0.939319
4,Test MAPE (%),4.033177
5,Test Median AE,2.738249
