In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Read the input CSV into a DataFrame.
data = pd.read_csv('./data/cleaned_city_day_final.csv')

# Nine measurement columns act as predictors; the AQI column is the label.
feature_columns = ['PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3']
features = data[feature_columns]
target = data['AQI']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): the recorded X_test_scaled / y_test outputs further down show
# repeated identical rows. If such duplicates land on both sides of the split,
# the held-out metrics may be optimistic — confirm.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same fitted transform to the held-out rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [26]:
# Inspect the column labels of the loaded frame.
data.columns

Index(['City', 'Datetime', 'PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO',
       'SO2', 'O3', 'AQI', 'AQI_Category'],
      dtype='object')

In [27]:
# Print the dimensions of the scaled training matrix and of the training
# target so their row counts can be compared.
for split in (X_train_scaled, y_train):
    print(split.shape)


(23624, 9)
(23624,)


In [28]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Base estimator; the fixed seed makes the fitted forest repeatable.
model = RandomForestRegressor(random_state=42)

# Single-candidate grid, kept deliberately small for quick testing.
param_grid = {
    'n_estimators': [100],
    'max_depth': [10],
    'min_samples_split': [2]
}

# 3-fold cross-validation scored by negated MSE (scikit-learn maximizes
# scores, so a lower MSE yields a higher score).
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error')

# Fit without a blanket try/except: a failure must surface here. Swallowing it
# would leave `best_model` undefined (or stale from an earlier kernel run), and
# the evaluation cells would then fail or silently score the wrong model.
grid_search.fit(X_train_scaled, y_train)
best_model = grid_search.best_estimator_
print("Grid search completed successfully.")


Grid search completed successfully.


In [29]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Predict on the held-out, scaled test features.
y_pred = best_model.predict(X_test_scaled)

# Evaluation metrics.
# RMSE is computed as the square root of the MSE instead of passing
# `squared=False`, which newer scikit-learn releases deprecated and then
# removed; this form works on old and new versions alike.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'RMSE: {rmse}, MAE: {mae}, R²: {r2}')

RMSE: 1.0589056898780542, MAE: 0.10823739062100828, R²: 0.9999355580952968




In [31]:
# Preview the test-set predictions. `y_pred` was already computed from
# `best_model` and `X_test_scaled` in the evaluation cell, so it is displayed
# here rather than recomputed.
y_pred

array([ 82.71746452,  82.71746452,  82.71746452, ...,  65.42869936,
       424.80183749,  73.97021666])

In [40]:
# Print the scaled test feature matrix for a visual check.
print(X_test_scaled)

[[ 0.58430354  0.51906027  0.86775849 ... -1.34876115  0.74861593
   0.6373227 ]
 [ 0.58430354  0.51906027  0.86775849 ... -1.34876115  0.74861593
   0.6373227 ]
 [ 0.23895208  0.51906027 -0.14932202 ... -0.70564897 -1.71426542
  -0.43724923]
 ...
 [ 0.14323243 -0.59030731 -0.94865761 ... -0.63419207 -0.77091294
  -0.28601821]
 [-0.4971681  -1.09480022 -1.18111745 ... -0.93192918 -1.48965769
  -0.02264555]
 [ 1.96470935 -0.04494644 -0.04685645 ... -1.32494218 -1.26130649
  -1.30554596]]


In [41]:
# Display the held-out target values for comparison with the predictions.
# `y_test` must be the last expression in the cell so that it is what gets shown.
y_test

22593     82.714287
22459     82.714287
24213     82.714287
25301     82.714287
20886     82.714287
            ...    
17614     83.785455
6198     383.737495
9648      65.506970
16507    424.986795
7395      73.966021
Name: AQI, Length: 5907, dtype: float64

In [32]:
# Check how many predictions were produced.
y_pred.shape

(5907,)

In [35]:
# Show the training-feature dimensions instead of dumping the whole frame.
X_train.shape

(23624, 9)