In [20]:
import pandas as pd

# Read the processed AQI CSV into a DataFrame and inspect it: first rows,
# then the column / dtype / non-null summary.
df = pd.read_csv(filepath_or_buffer="../data/processed/delhi_aqi_cleaned.csv")
df.head()

df.info()

# Column to predict, and the columns used as model inputs.
target = 'rspm'
features = ['so2', 'no2', 'spm', 'year', 'month']

y = df[target]
X = df[features]

# Missing-value counts: one per feature column, plus a single count for the target.
X.isna().sum(), y.isna().sum()

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Fit a linear regression on the training split.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

# Predict the held-out rows and score the predictions.
y_pred = lr.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
# RMSE is the square root of the mean squared error.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

for metric_label, metric_value in (("MAE:", mae), ("RMSE:", rmse)):
    print(metric_label, metric_value)

# Pair every fitted linear-regression coefficient with its feature name and
# rank them from largest to smallest.
# The names are taken from `features` -- the same list that selected the
# columns of X, which `lr` was fitted on -- rather than a second hard-coded
# copy, so the labels stay aligned with `lr.coef_` if the feature list is
# ever edited.
coef_df = pd.DataFrame({
    'Feature': features,
    'Coefficient': lr.coef_
}).sort_values(by='Coefficient', ascending=False)

coef_df

# Side-by-side look at the first ten held-out targets and the linear
# regression predictions for the same rows.
pd.DataFrame({
    'Actual RSPM': y_test.values,
    'Predicted RSPM': y_pred
}).head(10)

from sklearn.ensemble import RandomForestRegressor

# Forest hyperparameters: 200 trees, a fixed seed for repeatable results,
# and n_jobs=-1 to parallelise across all available processors.
rf_params = dict(n_estimators=200, random_state=42, n_jobs=-1)
rf = RandomForestRegressor(**rf_params)

rf.fit(X_train, y_train)

# Score the forest on the same held-out rows and with the same metrics as
# the linear model.
rf_pred = rf.predict(X_test)

rf_mae = mean_absolute_error(y_test, rf_pred)
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))

print("Random Forest MAE:", rf_mae)
print("Random Forest RMSE:", rf_rmse)

# One row per model, holding the test-set MAE and RMSE computed earlier.
comparison_df = pd.DataFrame(
    [
        ("Linear Regression", mae, rmse),
        ("Random Forest", rf_mae, rf_rmse),
    ],
    columns=["Model", "MAE", "RMSE"],
)

comparison_df

# Random-forest feature importances labelled by feature name, most
# important first.
feature_importance = (
    pd.DataFrame({"Feature": features, "Importance": rf.feature_importances_})
    .sort_values("Importance", ascending=False)
)

feature_importance

import os

import joblib

# Persist the fitted random forest to disk.
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists first; otherwise a fresh checkout without ../models/
# fails with FileNotFoundError after all the training work is done.
model_path = "../models/rf_rspm_model.pkl"
os.makedirs(os.path.dirname(model_path), exist_ok=True)

joblib.dump(rf, model_path)
print("Random Forest model saved")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8551 entries, 0 to 8550
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   state     8551 non-null   object 
 1   location  8551 non-null   object 
 2   so2       8551 non-null   float64
 3   no2       8551 non-null   float64
 4   rspm      8551 non-null   float64
 5   spm       8551 non-null   float64
 6   pm2_5     371 non-null    float64
 7   date      8551 non-null   object 
 8   year      8551 non-null   int64  
 9   month     8551 non-null   int64  
 10  day       8551 non-null   int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 735.0+ KB
MAE: 61.76003705916312
RMSE: 83.4975111298339
Random Forest MAE: 46.376010750562266
Random Forest RMSE: 71.16027708542578
Random Forest model saved
