In [1]:
import joblib
import numpy as np

from sklearn.ensemble import RandomForestRegressor

# Load the training feature matrix and target that were persisted to disk as joblib files.
# NOTE(review): joblib.load unpickles its input, which can execute arbitrary code —
# only load files from a trusted source.
X_train_clean = joblib.load("X_train_clean.joblib")
# presumably row-aligned with X_train_clean — TODO confirm against the cell/notebook that wrote it
y_train = joblib.load("y_train.joblib")



In [2]:
def RMSE_manual(actual_test_target, predicted_test_target):
    """Return the root-mean-squared error between actual and predicted targets.

    Both inputs are converted to flat float arrays before comparison, so the
    elements are always compared position by position.

    Parameters
    ----------
    actual_test_target : array-like
        Ground-truth values (list, NumPy array, pandas Series, ...).
    predicted_test_target : array-like
        Predicted values with the same number of elements (a single value is
        broadcast against every actual value).

    Returns
    -------
    numpy.float64
        sqrt(mean((actual - predicted) ** 2)).

    Raises
    ------
    ValueError
        If the two inputs hold incompatible numbers of elements.
    """
    # Flatten to 1-D so an (n, 1) column is not broadcast against (n,) predictions
    # into an (n, n) matrix, which would silently yield a wrong score.
    # Casting to float also lets plain lists work and avoids integer wrap-around.
    actual = np.asarray(actual_test_target, dtype=float).ravel()
    predicted = np.asarray(predicted_test_target, dtype=float).ravel()

    squared_error = (actual - predicted) ** 2
    mean_squared_error = np.mean(squared_error)
    return np.sqrt(mean_squared_error)

In [3]:
from sklearn.model_selection import KFold
# NOTE(review): this module-level splitter is not referenced by any code visible here;
# kfold_cross_validation_rmse builds its own KFold (same settings by default). Confirm before removing.
kfold = KFold(n_splits=5,shuffle=True,random_state=42)
def _select_rows(data, row_positions):
    """Return the rows of `data` located at the given integer positions."""
    if hasattr(data, "iloc"):
        # pandas objects: positional selection, independent of index labels.
        return data.iloc[row_positions]
    return np.asarray(data)[row_positions]


def kfold_cross_validation_rmse(model_class, feature_matrix, target_vector, num_folds = 5,
                                shuffle=True, random_state=42, verbose=True):
    """Estimate out-of-fold RMSE for a model using K-fold cross-validation.

    Parameters
    ----------
    model_class : callable
        Zero-argument callable returning a fresh, unfitted estimator that
        exposes ``fit(X, y)`` and ``predict(X)``. It is called once per fold,
        so it may be a class or a factory function.
    feature_matrix : pandas.DataFrame or array-like
        Feature rows. pandas objects are sliced with ``.iloc``; anything else
        is converted with ``np.asarray`` and indexed positionally.
    target_vector : pandas.Series or array-like
        Targets, one per row of ``feature_matrix``.
    num_folds : int, default 5
        Number of folds.
    shuffle : bool, default True
        Whether rows are shuffled before being split into folds.
    random_state : int or None, default 42
        Seed for the shuffle; ignored when ``shuffle`` is False.
    verbose : bool, default True
        Print the RMSE of every fold as it completes.

    Returns
    -------
    numpy.ndarray
        One RMSE value per fold, in fold order.

    Raises
    ------
    ValueError
        If ``feature_matrix`` and ``target_vector`` have different lengths.
    """
    if len(feature_matrix) != len(target_vector):
        raise ValueError(
            "feature_matrix and target_vector must have the same number of rows "
            f"(got {len(feature_matrix)} and {len(target_vector)})"
        )

    # KFold rejects a random_state when shuffle is disabled, so only pass the seed when shuffling.
    kf = KFold(
        n_splits=num_folds,
        shuffle=shuffle,
        random_state=random_state if shuffle else None,
    )

    rmse_scores = []

    for fold_number, (train_idx, validation_idx) in enumerate(kf.split(feature_matrix), start=1):
        X_train_fold = _select_rows(feature_matrix, train_idx)
        y_train_fold = _select_rows(target_vector, train_idx)

        X_validation_fold = _select_rows(feature_matrix, validation_idx)
        y_validation = _select_rows(target_vector, validation_idx)

        # A new estimator per fold so nothing learned on one split leaks into the next.
        model = model_class()
        model.fit(X_train_fold, y_train_fold)

        validation_predictions = model.predict(X_validation_fold)

        fold_rmse = RMSE_manual(y_validation, validation_predictions)
        rmse_scores.append(fold_rmse)

        if verbose:
            print(f"Fold {fold_number} RMSE: {fold_rmse:,.2f}")

    return np.array(rmse_scores)



In [8]:
def baseline_random_forest_maker():
    """Build a fresh, unfitted baseline forest: 300 trees, fixed seed, all CPU cores."""
    return RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1)


# Score the baseline configuration with 5-fold cross-validation.
baseline_random_forest_cv_rmse_scores = kfold_cross_validation_rmse(
    baseline_random_forest_maker,
    X_train_clean,
    y_train,
    num_folds=5,
)

# Summarise the per-fold scores.
print("Baseline RF CV RMSE scores:", baseline_random_forest_cv_rmse_scores)
print("Mean RMSE:", np.mean(baseline_random_forest_cv_rmse_scores))
print("Std  RMSE:", np.std(baseline_random_forest_cv_rmse_scores))



Fold 1 RMSE: 50,868.53
Fold 2 RMSE: 48,402.06
Fold 3 RMSE: 48,960.74
Fold 4 RMSE: 50,091.42
Fold 5 RMSE: 47,844.97
Baseline RF CV RMSE scores: [50868.52856907 48402.06131535 48960.7389745  50091.41510096
 47844.97206875]
Mean RMSE: 49233.54320572474
Std  RMSE: 1104.8108808303384


In [10]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
# Candidate tree depths to compare; None leaves max_depth unset (the same settings as the baseline).
max_depth_values = [None, 30, 20, 15, 10, 7]


def make_depth_limited_forest_maker(max_depth):
    """Return a zero-argument factory for a 300-tree forest capped at `max_depth`."""
    def build_forest():
        return RandomForestRegressor(
            n_estimators=300,
            max_depth=max_depth,
            random_state=42,
            n_jobs=-1,
        )

    return build_forest


rf_depth_results = []

for depth in max_depth_values:
    print(f"\nTesting max_depth = {depth}")

    # Bind the current depth explicitly so each factory keeps its own value.
    rf_with_depth_maker = make_depth_limited_forest_maker(depth)

    rmse_scores = kfold_cross_validation_rmse(
        rf_with_depth_maker,
        X_train_clean,
        y_train,
        num_folds=5,
    )

    rf_depth_results.append(
        dict(
            max_depth=depth,
            mean_rmse=rmse_scores.mean(),
            std_rmse=rmse_scores.std(),
        )
    )

# One row per depth setting, displayed best (lowest mean RMSE) first.
rf_depth_df = pd.DataFrame(rf_depth_results)
rf_depth_df.sort_values("mean_rmse")




Testing max_depth = None
Fold 1 RMSE: 50,868.53
Fold 2 RMSE: 48,402.06
Fold 3 RMSE: 48,960.74
Fold 4 RMSE: 50,091.42
Fold 5 RMSE: 47,844.97

Testing max_depth = 30
Fold 1 RMSE: 50,835.28
Fold 2 RMSE: 48,448.77
Fold 3 RMSE: 48,924.59
Fold 4 RMSE: 50,075.19
Fold 5 RMSE: 47,813.45

Testing max_depth = 20
Fold 1 RMSE: 50,874.10
Fold 2 RMSE: 48,503.72
Fold 3 RMSE: 48,953.49
Fold 4 RMSE: 50,140.98
Fold 5 RMSE: 47,847.47

Testing max_depth = 15
Fold 1 RMSE: 51,484.32
Fold 2 RMSE: 49,078.66
Fold 3 RMSE: 49,518.72
Fold 4 RMSE: 50,686.82
Fold 5 RMSE: 48,319.60

Testing max_depth = 10
Fold 1 RMSE: 55,043.78
Fold 2 RMSE: 53,059.17
Fold 3 RMSE: 52,815.09
Fold 4 RMSE: 54,888.00
Fold 5 RMSE: 52,268.34

Testing max_depth = 7
Fold 1 RMSE: 61,882.16
Fold 2 RMSE: 59,947.68
Fold 3 RMSE: 59,310.75
Fold 4 RMSE: 62,106.78
Fold 5 RMSE: 59,977.77


Unnamed: 0,max_depth,mean_rmse,std_rmse
1,30.0,49219.455152,1095.530134
0,,49233.543206,1104.810881
2,20.0,49263.953679,1099.317838
3,15.0,49817.622551,1132.507389
4,10.0,53614.877003,1133.512327
5,7.0,60645.028281,1129.516742
