In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Read the input CSV into a DataFrame
data = pd.read_csv('./data/cleaned_station_day_final.csv')

# Predictors are the nine columns listed below; the response is the AQI column
feature_columns = ['PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3']
features = data[feature_columns]
target = data['AQI']

# Hold out 20% of the rows for testing (seeded so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply
# that same fitted transform to both the training and the test rows
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Model selection
model = RandomForestRegressor(random_state=42)

# Simplified hyperparameter tuning for quick testing: each list holds a
# single value, so the "search" evaluates exactly one configuration.
param_grid = {
    'n_estimators': [100],
    'max_depth': [10],
    'min_samples_split': [2]
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
)

# Fit the model. A failure is reported and then re-raised: swallowing it would
# leave `best_model` undefined (or stale from an earlier run), and the cells
# below would then fail with a NameError or silently evaluate the wrong model.
try:
    grid_search.fit(X_train_scaled, y_train)
except Exception as e:
    print(f"Error during grid search: {e}")
    raise
else:
    best_model = grid_search.best_estimator_
    print("Grid search completed successfully.")


Grid search completed successfully.


In [3]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)

# Evaluation metrics
# RMSE is taken as the square root of the MSE rather than via
# `mean_squared_error(..., squared=False)`: the `squared` keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6, so the old call raises
# a TypeError on current releases. This form works on every version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'RMSE: {rmse}, MAE: {mae}, R²: {r2}')

RMSE: 0.6375573998917158, MAE: 0.08082053914581165, R²: 0.9999578990082361




In [4]:
# Show the test-set predictions. `y_pred` already holds
# `best_model.predict(X_test_scaled)` from the evaluation cell above, so it is
# reused here instead of running the identical prediction a second time.
y_pred

array([ 59.87761586, 102.72437827,  35.6138386 , ..., 410.11031244,
       133.98626101, 184.31137264])

In [5]:
# Display the held-out target values produced by the train/test split
# (presumably for an eyeball comparison against `y_pred`).
y_test

39252     60.032828
26494    102.724358
68361     35.244523
23615    398.150860
4906     102.724358
            ...    
35755     89.223020
84310    102.724358
2185     410.095638
53008    133.943838
309      184.303477
Name: AQI, Length: 21607, dtype: float64