In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import warnings
import joblib

warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("final_dataset.csv", parse_dates=True, index_col='time')
df.head()

Unnamed: 0_level_0,pm2_5,temperature_2m,relativehumidity_2m,precipitation,windspeed_10m,city
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-01-01 00:00:00,17.6,22.6,74,0.0,7.4,mumbai
2022-01-01 01:00:00,17.6,21.7,78,0.0,7.1,mumbai
2022-01-01 02:00:00,17.6,21.5,77,0.0,10.5,mumbai
2022-01-01 03:00:00,17.6,21.4,73,0.0,10.6,mumbai
2022-01-01 04:00:00,17.6,20.9,73,0.0,9.8,mumbai


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 133728 entries, 2022-01-01 00:00:00 to 2025-10-24 23:00:00
Data columns (total 6 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   pm2_5                133728 non-null  float64
 1   temperature_2m       133728 non-null  float64
 2   relativehumidity_2m  133728 non-null  int64  
 3   precipitation        133728 non-null  float64
 4   windspeed_10m        133728 non-null  float64
 5   city                 133728 non-null  object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.1+ MB


In [4]:
df[df["city"] == "bangalore"]

Unnamed: 0_level_0,pm2_5,temperature_2m,relativehumidity_2m,precipitation,windspeed_10m,city
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-01-01 00:00:00,7.2,18.2,95,0.0,15.5,bangalore
2022-01-01 01:00:00,7.2,18.9,90,0.0,19.9,bangalore
2022-01-01 02:00:00,7.2,18.7,91,0.0,18.9,bangalore
2022-01-01 03:00:00,7.2,18.7,91,0.0,17.3,bangalore
2022-01-01 04:00:00,7.2,18.5,92,0.0,16.0,bangalore
...,...,...,...,...,...,...
2025-10-24 19:00:00,17.6,22.2,85,0.0,6.4,bangalore
2025-10-24 20:00:00,18.4,21.9,86,0.0,7.0,bangalore
2025-10-24 21:00:00,24.2,21.6,88,0.0,7.6,bangalore
2025-10-24 22:00:00,27.8,21.1,91,0.0,7.8,bangalore


In [5]:
df['hour'] = df.index.hour
df['day_of_week'] = df.index.dayofweek
df['month'] = df.index.month
df.head()

Unnamed: 0_level_0,pm2_5,temperature_2m,relativehumidity_2m,precipitation,windspeed_10m,city,hour,day_of_week,month
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022-01-01 00:00:00,17.6,22.6,74,0.0,7.4,mumbai,0,5,1
2022-01-01 01:00:00,17.6,21.7,78,0.0,7.1,mumbai,1,5,1
2022-01-01 02:00:00,17.6,21.5,77,0.0,10.5,mumbai,2,5,1
2022-01-01 03:00:00,17.6,21.4,73,0.0,10.6,mumbai,3,5,1
2022-01-01 04:00:00,17.6,20.9,73,0.0,9.8,mumbai,4,5,1


In [6]:
df['pm2_5_lag_1h'] = df.groupby('city')['pm2_5'].shift(1)
df['pm2_5_lag_24h'] = df.groupby('city')['pm2_5'].shift(24)
for lag in [2, 3, 6, 12, 48]:
    df[f'pm2_5_lag_{lag}h'] = df.groupby('city')['pm2_5'].shift(lag)
df.head()

Unnamed: 0_level_0,pm2_5,temperature_2m,relativehumidity_2m,precipitation,windspeed_10m,city,hour,day_of_week,month,pm2_5_lag_1h,pm2_5_lag_24h,pm2_5_lag_2h,pm2_5_lag_3h,pm2_5_lag_6h,pm2_5_lag_12h,pm2_5_lag_48h
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2022-01-01 00:00:00,17.6,22.6,74,0.0,7.4,mumbai,0,5,1,,,,,,,
2022-01-01 01:00:00,17.6,21.7,78,0.0,7.1,mumbai,1,5,1,17.6,,,,,,
2022-01-01 02:00:00,17.6,21.5,77,0.0,10.5,mumbai,2,5,1,17.6,,17.6,,,,
2022-01-01 03:00:00,17.6,21.4,73,0.0,10.6,mumbai,3,5,1,17.6,,17.6,17.6,,,
2022-01-01 04:00:00,17.6,20.9,73,0.0,9.8,mumbai,4,5,1,17.6,,17.6,17.6,,,


In [7]:
df['city'].value_counts()

city
mumbai       33432
delhi        33432
chennai      33432
bangalore    33432
Name: count, dtype: int64

In [8]:
df = pd.get_dummies(df, columns=['city'], drop_first=True)
df.tail()

Unnamed: 0_level_0,pm2_5,temperature_2m,relativehumidity_2m,precipitation,windspeed_10m,hour,day_of_week,month,pm2_5_lag_1h,pm2_5_lag_24h,pm2_5_lag_2h,pm2_5_lag_3h,pm2_5_lag_6h,pm2_5_lag_12h,pm2_5_lag_48h,city_chennai,city_delhi,city_mumbai
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2025-10-24 19:00:00,17.6,22.2,85,0.0,6.4,19,4,10,16.3,22.9,12.1,6.8,4.0,18.3,8.0,False,False,False
2025-10-24 20:00:00,18.4,21.9,86,0.0,7.0,20,4,10,17.6,27.3,16.3,12.1,3.6,12.6,8.3,False,False,False
2025-10-24 21:00:00,24.2,21.6,88,0.0,7.6,21,4,10,18.4,32.8,17.6,16.3,4.7,9.3,8.1,False,False,False
2025-10-24 22:00:00,27.8,21.1,91,0.0,7.8,22,4,10,24.2,35.6,18.4,17.6,6.8,5.4,6.0,False,False,False
2025-10-24 23:00:00,20.0,20.9,93,0.0,6.3,23,4,10,27.8,34.2,24.2,18.4,12.1,4.7,4.7,False,False,False


In [9]:
print(f"Original data size: {len(df)}")
df.dropna(inplace=True)
print(f"Data size after dropping Null values: {len(df)}")

Original data size: 133728
Data size after dropping Null values: 133536


In [10]:
X = df.drop(['pm2_5'],axis=1)
y = df['pm2_5']

In [11]:
split_date = '2025-01-01'
print(f"\nSplitting data: Training before {split_date}, Testing on/after {split_date}")


Splitting data: Training before 2025-01-01, Testing on/after 2025-01-01


In [12]:
X_train = X[X.index < split_date]
y_train = y[y.index < split_date]
    
X_test = X[X.index >= split_date]
y_test = y[y.index >= split_date]
print(f"Train set size: {X_train.shape[0]} rows")
print(f"Test set size: {X_test.shape[0]} rows")

Train set size: 105024 rows
Test set size: 28512 rows


In [13]:
model_rfg = RandomForestRegressor(
        n_estimators=100,     
        max_depth=15,         
        n_jobs=-1,            
        random_state=42,      
        min_samples_leaf=5    
    )
model_rfg.fit(X_train, y_train)

In [14]:
y_pred_rfg = model_rfg.predict(X_test)

mae = mean_absolute_error(y_test,y_pred_rfg)
rmse = mean_squared_error(y_test, y_pred_rfg)
r2 = r2_score(y_test, y_pred_rfg)

print(f"Test Set MAE (Mean Absolute Error): {mae:.2f}")
print(f"Test Set RMSE (Root Mean Squared Error): {rmse:.2f}")
print(f"Test Set R-squared (R²): {r2:.3f}")

Test Set MAE (Mean Absolute Error): 2.08
Test Set RMSE (Root Mean Squared Error): 18.37
Test Set R-squared (R²): 0.982


In [15]:
model_lg = LinearRegression(n_jobs=-1)
model_lg.fit(X_train,y_train)

In [16]:
y_pred_lg = model_lg.predict(X_test)

mae = mean_absolute_error(y_test,y_pred_lg)
rmse = mean_squared_error(y_test, y_pred_lg)
r2 = r2_score(y_test, y_pred_lg)

print(f"Test Set MAE (Mean Absolute Error): {mae:.2f}")
print(f"Test Set RMSE (Root Mean Squared Error): {rmse:.2f}")
print(f"Test Set R-squared (R²): {r2:.3f}")

Test Set MAE (Mean Absolute Error): 2.15
Test Set RMSE (Root Mean Squared Error): 18.34
Test Set R-squared (R²): 0.982


In [17]:
model_dt = DecisionTreeRegressor(max_depth=15, random_state=42, min_samples_leaf=5)
model_dt.fit(X_train,y_train)

In [18]:
y_pred_dt = model_dt.predict(X_test)

mae = mean_absolute_error(y_test,y_pred_dt)
rmse = mean_squared_error(y_test, y_pred_dt)
r2 = r2_score(y_test, y_pred_dt)

print(f"Test Set MAE (Mean Absolute Error): {mae:.2f}")
print(f"Test Set RMSE (Root Mean Squared Error): {rmse:.2f}")
print(f"Test Set R-squared (R²): {r2:.3f}")

Test Set MAE (Mean Absolute Error): 2.71
Test Set RMSE (Root Mean Squared Error): 28.92
Test Set R-squared (R²): 0.972


In [19]:
test_city = 'chennai'
test_timestamp_str = '2025-03-31 18:00:00' 
test_timestamp = pd.Timestamp(test_timestamp_str)

In [20]:
if test_timestamp in X_test.index:
    X_single = X_test.loc[[test_timestamp]] 
    city_col_name = f'city_{test_city}'

    if city_col_name in X_single.columns:
        X_single = X_single[X_single[city_col_name] == 1]
    elif test_city == 'bangalore': 
        other_city_cols = [f'city_{c}' for c in ['mumbai', 'delhi', 'chennai']]
        is_bangalore = (X_single[other_city_cols] == 0).all(axis=1)
        X_single = X_single[is_bangalore]

    if not X_single.empty:
        y_actual_single = y_test.loc[test_timestamp]
        if isinstance(y_actual_single, pd.Series):
            y_actual_single = y_actual_single.iloc[0]
            X_input = X_single

        prediction_single = model_rfg.predict(X_input)[0]

        # Print the comparison
        print("\n--- Prediction Result ---")
        print(f"Actual PM2.5:    {y_actual_single:.2f} μg/m³")
        print(f"Predicted PM2.5: {prediction_single:.2f} μg/m³")
        print(f"Difference:      {abs(y_actual_single - prediction_single):.2f} μg/m³")

        # (You would add the classification logic here if needed)

    else:
        # This will print if the specific city wasn't found at that timestamp
        print(f"Error: Could not isolate features for {test_city} at the specified time after filtering.")

else:
    # This will print if the timestamp itself wasn't in the test set index
    print(f"Error: Timestamp {test_timestamp_str} not found in the test set index.")


--- Prediction Result ---
Actual PM2.5:    31.00 μg/m³
Predicted PM2.5: 40.63 μg/m³
Difference:      9.63 μg/m³


In [21]:
output_file ="Featured_Engineered_Dataset.csv" 
df.to_csv(output_file)

In [22]:
MODEL_FILE = 'regression_model.joblib'
print(f"\nSaving model to {MODEL_FILE}...")
joblib.dump(model_rfg, MODEL_FILE)
print(f"Model saved.")


Saving model to regression_model.joblib...
Model saved.
