# Using Scikit-learn's Random Forest Regressor

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

In [2]:
data = pd.read_csv("../data/processed/processed.csv")

In [3]:
data.head()

Unnamed: 0,Date,AQI,Year,Month,Day,temperature_2m_max,temperature_2m_min,precipitation_sum,weather_code,sunshine_duration,cloud_cover_mean,wind_speed_10m_mean,winddirection_10m_dominant
0,2018-01-01,75.0,2018,1,1,28.253,14.503,0.0,0.0,36062.707,0.0,5.679092,270.6138
1,2018-01-02,76.0,2018,1,2,28.453,12.903,0.0,1.0,36040.188,7.166666,4.744073,278.32556
2,2018-01-03,79.0,2018,1,3,28.903,15.303,0.0,0.0,36101.605,0.083333,5.611268,285.8839
3,2018-01-04,84.0,2018,1,4,28.353,15.303,0.0,0.0,36083.652,1.541667,5.784296,326.56494
4,2018-01-05,97.0,2018,1,5,28.453,14.003,0.0,3.0,36066.5,46.125,5.50617,287.77652


In [5]:
# Selecting Features for Training

features = ["Year", "Month", "temperature_2m_max", "temperature_2m_min",
            "precipitation_sum", "weather_code", "sunshine_duration", "cloud_cover_mean",
           "wind_speed_10m_mean", "winddirection_10m_dominant"]

X = data[features]
y = data["AQI"]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
print(f"Training data X Split Shape {X_train.shape}")
print(f"Training data Y Split Shape {y_train.shape}")

Training data X Split Shape (2045, 10)
Training data Y Split Shape (2045,)


In [8]:
# Initiating the model

model = RandomForestRegressor() # Using default n_estimators (100)

model.fit(X_train, y_train)


0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [11]:
# Checking how well model performed while learning

model.score(X_train, y_train)

0.9531085783972929

In [14]:
# Make predictions on the test set
predictions = model.predict(X_test)

In [15]:
# Evaluating performance

mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

In [16]:
print("\n--- Model Evaluation ---")
print(f"R-squared (R²): {r2:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f} AQI points")
print("------------------------")


--- Model Evaluation ---
R-squared (R²): 0.64
Mean Absolute Error (MAE): 15.71 AQI points
------------------------


### Inference
A MAE of 16 points is not bad, but we can definitely try to find a better combination of max_depth and n_esimators using cross validations