In [11]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import os

# --- Paths ---
DATASET_PATH = "/content/sample_data/main.csv"
MODEL_SAVE_PATH = "mlp_pollutants_model.pth"

# --- Load Dataset ---
if not os.path.exists(DATASET_PATH):
    raise FileNotFoundError(f"Dataset file not found at: {DATASET_PATH}")
df = pd.read_csv(DATASET_PATH)

# --- Features ---
weather_features = [
    "temperature_2m (°C)",
    "relative_humidity_2m (%)",
    "rain (mm)",
    "wind_speed_100m (km/h)",
    "wind_direction_100m (°)",
    "pressure_msl (hPa)",
    "surface_pressure (hPa)"
]

pollutant_features = ["co", "no", "no2", "o3", "so2", "pm2_5", "pm10", "nh3"]

X = df[weather_features].values
y = df[pollutant_features].values

# --- Scaling ---
x_scaler = MinMaxScaler()
y_scaler = MinMaxScaler()
X_scaled = x_scaler.fit_transform(X)
y_scaled = y_scaler.fit_transform(y)

# --- Dataset class ---
class WeatherPollutantDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# --- Train/Test split ---
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_scaled, test_size=0.2, random_state=42
)

train_dataset = WeatherPollutantDataset(X_train, y_train)
test_dataset = WeatherPollutantDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# --- Improved Model ---
class ImprovedMLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(ImprovedMLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.bn2 = nn.BatchNorm1d(hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, hidden_dim)
        self.bn3 = nn.BatchNorm1d(hidden_dim)
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.2)
        self.relu = nn.ReLU()

    def forward(self, x):
        x1 = self.relu(self.bn1(self.fc1(x)))
        x1 = self.dropout(x1)
        x2 = self.relu(self.bn2(self.fc2(x1))) + x1  # residual connection
        x2 = self.dropout(x2)
        x3 = self.relu(self.bn3(self.fc3(x2))) + x2  # residual connection
        x3 = self.dropout(x3)
        out = self.fc_out(x3)
        return out

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ImprovedMLP(input_dim=len(weather_features), hidden_dim=128, output_dim=len(pollutant_features)).to(device)

# --- Automatic target weighting based on variance ---
target_variance = torch.tensor(np.var(y_scaled, axis=0), dtype=torch.float32, device=device)
weights = 1.0 / (target_variance + 1e-6)  # smaller variance gets higher weight
weights = weights / weights.sum() * len(pollutant_features)  # normalize

def weighted_mse(pred, target, weights):
    return torch.mean(weights * (pred - target) ** 2)

optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

# --- Training ---
epochs = 100
patience = 15
best_loss = float("inf")
patience_counter = 0

print("Starting training...")
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = weighted_mse(outputs, y_batch, weights)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    scheduler.step(avg_loss)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.5f}")

    if avg_loss < best_loss:
        best_loss = avg_loss
        patience_counter = 0
        torch.save(model.state_dict(), MODEL_SAVE_PATH)
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered.")
            break

print("Training complete. Best model saved.")

# --- Inference ---
def predict_pollutants(model, weather_input):
    model.eval()
    weather_tensor = torch.tensor(weather_input, dtype=torch.float32).to(device)
    with torch.no_grad():
        preds = model(weather_tensor).cpu().numpy()
    return y_scaler.inverse_transform(preds)

# --- Evaluation ---
model.load_state_dict(torch.load(MODEL_SAVE_PATH))
model.eval()
all_preds, all_targets = [], []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        preds = model(X_batch).cpu().numpy()
        targets = y_batch.cpu().numpy()
        all_preds.append(preds)
        all_targets.append(targets)

all_preds = np.vstack(all_preds)
all_targets = np.vstack(all_targets)
all_preds_inv = y_scaler.inverse_transform(all_preds)
all_targets_inv = y_scaler.inverse_transform(all_targets)

mse = mean_squared_error(all_targets_inv, all_preds_inv)
mae = mean_absolute_error(all_targets_inv, all_preds_inv)
r2 = r2_score(all_targets_inv, all_preds_inv, multioutput="raw_values")

print("\n--- Test Metrics ---")
print(f"MSE: {mse:.3f}, MAE: {mae:.3f}")
for i, p in enumerate(pollutant_features):
    print(f"{p} R²: {r2[i]:.3f}")

# --- Example Predictions ---
example_weather = X_test[:5]
predicted_pollutants = predict_pollutants(model, example_weather)
for i, p in enumerate(predicted_pollutants):
    print(f"Sample {i+1} Predicted pollutants: {p}")


Starting training...
Epoch 1/100, Loss: 0.16491
Epoch 2/100, Loss: 0.02336
Epoch 3/100, Loss: 0.01140
Epoch 4/100, Loss: 0.00968
Epoch 5/100, Loss: 0.00950
Epoch 6/100, Loss: 0.00944
Epoch 7/100, Loss: 0.00934
Epoch 8/100, Loss: 0.00947
Epoch 9/100, Loss: 0.00937
Epoch 10/100, Loss: 0.00941
Epoch 11/100, Loss: 0.00948
Epoch 12/100, Loss: 0.00936
Epoch 13/100, Loss: 0.00931
Epoch 14/100, Loss: 0.00926
Epoch 15/100, Loss: 0.00925
Epoch 16/100, Loss: 0.00911
Epoch 17/100, Loss: 0.00905
Epoch 18/100, Loss: 0.00906
Epoch 19/100, Loss: 0.00904
Epoch 20/100, Loss: 0.00903
Epoch 21/100, Loss: 0.00897
Epoch 22/100, Loss: 0.00885
Epoch 23/100, Loss: 0.00887
Epoch 24/100, Loss: 0.00884
Epoch 25/100, Loss: 0.00883
Epoch 26/100, Loss: 0.00867
Epoch 27/100, Loss: 0.00863
Epoch 28/100, Loss: 0.00859
Epoch 29/100, Loss: 0.00841
Epoch 30/100, Loss: 0.00838
Epoch 31/100, Loss: 0.00837
Epoch 32/100, Loss: 0.00837
Epoch 33/100, Loss: 0.00834
Epoch 34/100, Loss: 0.00826
Epoch 35/100, Loss: 0.00827
Epoch 36

In [12]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

print("\n--- MSE and MAE per pollutant ---")
for i, pollutant in enumerate(pollutant_features):
    mse_i = mean_squared_error(all_targets_inv[:, i], all_preds_inv[:, i])
    mae_i = mean_absolute_error(all_targets_inv[:, i], all_preds_inv[:, i])
    print(f"{pollutant}: MSE = {mse_i:.3f}, MAE = {mae_i:.3f}")



--- MSE and MAE per pollutant ---
co: MSE = 4289033.500, MAE = 1363.769
no: MSE = 2127.071, MAE = 27.503
no2: MSE = 1321.070, MAE = 24.853
o3: MSE = 5219.814, MAE = 50.384
so2: MSE = 1576.203, MAE = 26.139
pm2_5: MSE = 23385.590, MAE = 100.982
pm10: MSE = 34485.324, MAE = 125.989
nh3: MSE = 431.672, MAE = 12.047


In [15]:
print("\n--- Relative Accuracy (%) per pollutant (safe) ---")
for i, pollutant in enumerate(pollutant_features):
    y_true = all_targets_inv[:, i]
    y_pred = all_preds_inv[:, i]
    # divide by mean of true values instead of individual values
    acc = 100 * (1 - np.mean(np.abs(y_true - y_pred)) / (np.mean(y_true) + 1e-6))
    print(f"{pollutant}: {acc:.2f}%")



--- Relative Accuracy (%) per pollutant (safe) ---
co: 52.78%
no: 16.20%
no2: 61.89%
o3: 17.50%
so2: 60.46%
pm2_5: 57.01%
pm10: 57.45%
nh3: 51.87%


In [17]:
import joblib

# Save scalers
joblib.dump(x_scaler, "x_scaler.pkl")
joblib.dump(y_scaler, "y_scaler.pkl")

print("Feature and target scalers saved successfully.")


Feature and target scalers saved successfully.
