Number of estimators: 100
R-squared: 0.7561
Mean Absolute Error: 3.6204
--------------------------------------------------
Number of estimators: 200
R-squared: 0.7564
Mean Absolute Error: 3.6182
--------------------------------------------------
Number of estimators: 300
R-squared: 0.7573
Mean Absolute Error: 3.6121
--------------------------------------------------
Number of estimators: 400
R-squared: 0.7579
Mean Absolute Error: 3.6081
--------------------------------------------------
Number of estimators: 500
R-squared: 0.7579
Mean Absolute Error: 3.6075
--------------------------------------------------
Number of estimators: 600
R-squared: 0.7581
Mean Absolute Error: 3.6070
--------------------------------------------------
Number of estimators: 700
R-squared: 0.7582
Mean Absolute Error: 3.6064
--------------------------------------------------
Number of estimators: 800
R-squared: 0.7584
Mean Absolute Error: 3.6053
--------------------------------------------------
Number of estimators: 900
R-squared: 0.7584
Mean Absolute Error: 3.6053
--------------------------------------------------
Number of estimators: 1000
R-squared: 0.7583
Mean Absolute Error: 3.6063
--------------------------------------------------

In [None]:
# RANDOM FOREST - SWEEP FROM 100 TO 1000 ESTIMATORS (STEP 100)

import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.preprocessing import LabelEncoder
import time

# Label-encode the categorical feature columns in place on `train`.
#
# One LabelEncoder is kept per column. fit_transform() refits the encoder on
# every call, so a single shared instance would only remember the classes of
# the last column it saw, and the mappings of the other columns could not be
# inspected, inverted, or re-applied to new data afterwards.

# List of categorical columns to encode
categorical_cols = ['Weatherconditions', 'Road_traffic_density',
                    'Type_of_order', 'Type_of_vehicle', 'Festival', 'City']

# column name -> LabelEncoder fitted on that column
label_encoders = {}

for col in tqdm(categorical_cols):
    # `label_encoder` stays bound after the loop (fitted on the last column),
    # so any later code that refers to that name keeps working.
    label_encoder = LabelEncoder()
    train[col] = label_encoder.fit_transform(train[col])
    label_encoders[col] = label_encoder

# Report any columns that still have object dtype after encoding; the
# estimator fitted later needs numeric input.
non_numeric_columns_after = train.select_dtypes(include=['object']).columns
print(f"Remaining non-numeric columns after encoding: {non_numeric_columns_after}")

# Separate features (X) and target variable (y)
X = train.drop(columns=['Time_taken(min)'])  # Every column except the target
y = train['Time_taken(min)']  # Target variable

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Sweep the forest size from 100 to 1000 trees in steps of 100 and report
# R-squared and mean absolute error on the held-out test set for each size.
#
# A single warm-started model is grown across the sweep: with
# warm_start=True, each fit() call keeps the trees that are already trained
# and only adds the missing ones. The whole sweep therefore trains 1,000
# trees instead of the 5,500 (100 + 200 + ... + 1000) needed when a fresh
# forest is fitted at every step.
#
# NOTE(review): scikit-learn's forest implementation advances the seeded RNG
# past the existing trees on a warm-started fit, so the metrics are expected
# to match those of a fresh fit at each size - confirm against the recorded
# output.
model = RandomForestRegressor(warm_start=True, random_state=42)

for n_estimators in range(100, 1001, 100):
    # Raise the target size; the next fit() adds only the new trees
    model.set_params(n_estimators=n_estimators)

    # Train (or extend) the model
    model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test)

    # Evaluate the model
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    # Print the results
    print(f"Number of estimators: {n_estimators}")
    print(f"R-squared: {r2:.4f}")
    print(f"Mean Absolute Error: {mae:.4f}")
    print("-" * 50)  # Separator for readability