# In [None]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import glob
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ---------------------------------------------------------------------------
# Load every measurement CSV, tag rows with the file-derived station name,
# add calendar features, one-hot encode the station and standardise columns.
# ---------------------------------------------------------------------------
data_dir = "data/AirPollutionSeoul"
# Sort the matches so the concatenation order does not depend on the order in
# which the filesystem happens to list the files.
measurement_files = sorted(glob.glob(os.path.join(data_dir, "Measurement*.csv")))
if not measurement_files:
    # Fail early with a clear message instead of pd.concat's generic error.
    raise FileNotFoundError(f"No 'Measurement*.csv' files found in {data_dir!r}")

all_data = []
for csv_path in measurement_files:
    station_data = pd.read_csv(csv_path)
    # "Measurement_<name>.csv" -> "<name>"
    station_name = os.path.basename(csv_path).replace("Measurement_", "").replace(".csv", "")
    station_data["Station"] = station_name
    all_data.append(station_data)

df = pd.concat(all_data, ignore_index=True)
df['Measurement date'] = pd.to_datetime(df['Measurement date'])
# A stable sort keeps the (now deterministic) file order among equal timestamps.
# NOTE(review): rows are ordered by timestamp only, so if several files share
# timestamps their rows interleave and the forward/backward fill below (and
# any row-window built later) can mix values from different stations — confirm
# this is intended.
df = df.sort_values(by='Measurement date', kind='stable')
# DataFrame.fillna(method=...) is deprecated in pandas 2.x; ffill()/bfill()
# are the direct equivalents.
df = df.ffill().bfill()

pollutants = ['PM10', 'PM2.5', 'NO2', 'CO', 'SO2', 'O3']
features = ['Measurement date', 'Station'] + pollutants
# Explicit copy: new columns are added below and must not target a slice of
# the original frame (avoids SettingWithCopyWarning).
df = df[features].copy()

# Calendar features derived from the timestamp.
timestamps = df['Measurement date'].dt
df['hour'] = timestamps.hour
df['day'] = timestamps.day
df['month'] = timestamps.month
df['year'] = timestamps.year
df['dayofweek'] = timestamps.dayofweek

seq_len = 24   # number of input rows per sample
pred_len = 12  # number of rows to predict per sample
target_cols = pollutants

df_encoded = pd.get_dummies(df, columns=['Station'])
# Everything except the timestamp is standardised, including the calendar
# features and the one-hot station columns.
cols_to_scale = [col for col in df_encoded.columns if col != 'Measurement date']
# NOTE(review): the scaler is fitted on every row, including rows that are
# later held out for validation/testing, so their statistics leak into the
# training inputs. Fitting on the training portion only would avoid this.
scaler = StandardScaler()
df_encoded[cols_to_scale] = scaler.fit_transform(df_encoded[cols_to_scale])

class TimeSeriesDataset(Dataset):
    """Sliding-window dataset over the rows of a numeric DataFrame.

    Sample ``i`` is the pair ``(x, y)`` where ``x`` contains rows
    ``[i, i + seq_len)`` of every column and ``y`` contains rows
    ``[i + seq_len, i + seq_len + pred_len)`` restricted to ``target_cols``.

    Args:
        data: DataFrame whose columns can all be converted to float32.
        seq_len: Number of input rows per sample.
        pred_len: Number of target rows per sample.
        target_cols: Column labels of ``data`` to use as prediction targets.
    """

    def __init__(self, data, seq_len, pred_len, target_cols):
        self.data = data
        self.seq_len = seq_len
        self.pred_len = pred_len
        self.target_cols = target_cols
        self.target_indices = [data.columns.get_loc(col) for col in target_cols]
        # Convert once up front so __getitem__ slices plain NumPy arrays
        # instead of going through DataFrame.iloc on every call.
        self._values = data.to_numpy(dtype=np.float32)
        self._targets = self._values[:, self.target_indices]

    def __len__(self):
        """Return the number of complete (input, target) windows, never below 0."""
        # Clamp at zero: a negative value would make len() raise ValueError
        # when the frame is shorter than one full window.
        return max(0, len(self._values) - self.seq_len - self.pred_len + 1)

    def __getitem__(self, idx):
        """Return ``(x, y)`` float32 tensors for window ``idx``.

        Returns:
            ``x`` of shape ``(seq_len, n_columns)`` and ``y`` of shape
            ``(pred_len, len(target_cols))``.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        n_windows = len(self)
        if idx < 0:
            idx += n_windows
        if not 0 <= idx < n_windows:
            # Without this check an out-of-range index silently yields a
            # truncated window, and plain iteration over the dataset never ends.
            raise IndexError(f"window index {idx} out of range for {n_windows} windows")

        x_end = idx + self.seq_len
        y_end = x_end + self.pred_len
        x = torch.tensor(self._values[idx:x_end], dtype=torch.float32)
        y = torch.tensor(self._targets[x_end:y_end], dtype=torch.float32)
        return x, y

# Model inputs: every encoded/scaled column except the raw timestamp.
data_array = df_encoded.drop(columns=['Measurement date'])

# Chronological 70 / 15 / 15 split over the full dataset (no shuffling, no
# sub-sampling): first carve off the last 30 %, then halve that remainder.
train_data, temp_data = train_test_split(data_array, test_size=0.3, shuffle=False)
val_data, test_data = train_test_split(temp_data, test_size=0.5, shuffle=False)

for split_label, split_frame in (
    ("Training", train_data),
    ("Validation", val_data),
    ("Test", test_data),
):
    print(f"{split_label} data size: {len(split_frame)}")

# One windowed dataset per split, all sharing the same window geometry.
train_dataset, val_dataset, test_dataset = (
    TimeSeriesDataset(split_frame, seq_len, pred_len, target_cols)
    for split_frame in (train_data, val_data, test_data)
)

batch_size = 256
# Options common to all three loaders; only the training loader shuffles.
shared_loader_opts = dict(batch_size=batch_size, num_workers=4, pin_memory=True)
train_loader = DataLoader(train_dataset, shuffle=True, **shared_loader_opts)
val_loader = DataLoader(val_dataset, shuffle=False, **shared_loader_opts)
test_loader = DataLoader(test_dataset, shuffle=False, **shared_loader_opts)

class TimeSeriesForecaster(nn.Module):
    """Bidirectional-LSTM encoder with an MLP head for multi-step forecasting.

    The input sequence is linearly embedded, encoded by a 2-layer
    bidirectional LSTM, summarised by the final hidden states of both
    directions, and mapped to ``pred_len * output_dim`` values.

    Args:
        input_dim: Number of features per input time step.
        hidden_dim: Width of the embedding, the LSTM and the hidden FC layer.
        output_dim: Number of target variables per predicted step.
        seq_len: Input sequence length (stored; not used in the computation).
        pred_len: Number of future steps to predict.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, seq_len, pred_len):
        super().__init__()
        self.input_embedding = nn.Linear(input_dim, hidden_dim)

        self.lstm = nn.LSTM(
            input_size=hidden_dim,
            hidden_size=hidden_dim,
            num_layers=2,
            batch_first=True,
            dropout=0.1,
            bidirectional=True,
        )

        # The bidirectional summary is 2 * hidden_dim wide.
        self.fc1 = nn.Linear(hidden_dim * 2, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, pred_len * output_dim)

        self.dropout = nn.Dropout(0.1)
        self.activation = nn.ReLU()
        self.layer_norm = nn.LayerNorm(hidden_dim)
        self.seq_len = seq_len
        self.pred_len = pred_len
        self.output_dim = output_dim

    def forward(self, x):
        """Predict the next ``pred_len`` steps.

        Args:
            x: Tensor of shape ``(batch, time, input_dim)``.

        Returns:
            Tensor of shape ``(batch, pred_len, output_dim)``.
        """
        batch_size = x.shape[0]
        x = self.input_embedding(x)

        _, (h_n, _) = self.lstm(x)

        # Summarise with the final hidden state of each direction of the top
        # layer: h_n[-2] is the forward pass after the last time step and
        # h_n[-1] is the backward pass after the first time step. Slicing the
        # output sequence at t = -1 instead would take the backward direction
        # after it had processed only a single step.
        x = torch.cat((h_n[-2], h_n[-1]), dim=1)

        x = self.fc1(x)
        x = self.layer_norm(x)
        x = self.activation(x)
        x = self.dropout(x)
        x = self.fc2(x)

        # Unflatten the head output into one row per predicted step.
        return x.reshape(batch_size, self.pred_len, self.output_dim)

# --- Model dimensions -------------------------------------------------------
input_dim = data_array.shape[1]   # one input feature per encoded column
hidden_dim = 128                  # larger hidden width for the full dataset
output_dim = len(target_cols)     # one output per target column

# --- Device -----------------------------------------------------------------
torch.backends.cudnn.benchmark = True
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Model and loss ---------------------------------------------------------
model = TimeSeriesForecaster(
    input_dim,
    hidden_dim,
    output_dim,
    seq_len,
    pred_len,
)
model = model.to(device)
criterion = nn.MSELoss()

# --- Optimiser and LR schedule ----------------------------------------------
# The learning rate is halved when the monitored value fails to improve for
# more than one epoch.
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-5)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=1,
)

# --- Run bookkeeping --------------------------------------------------------
epochs = 40  # more epochs for the full dataset
best_val_loss = float('inf')
train_losses, val_losses = [], []

# Train for a fixed number of epochs; after each one, validate, step the LR
# scheduler on the validation loss and checkpoint whenever it improves.
for epoch in range(epochs):
    epoch_tag = f"Epoch {epoch+1}/{epochs}"

    # ---- optimisation pass over the training set ----
    model.train()
    train_loss = 0
    train_iter = tqdm(train_loader, desc=f"{epoch_tag} [Train]")
    for data, target in train_iter:
        data = data.to(device, non_blocking=True)
        target = target.to(device, non_blocking=True)

        # set_to_none=True drops the gradient tensors instead of zero-filling them
        optimizer.zero_grad(set_to_none=True)
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        train_loss += batch_loss
        train_iter.set_postfix(loss=batch_loss)

    # Mean of the per-batch losses.
    train_loss = train_loss / len(train_loader)
    train_losses.append(train_loss)

    # ---- evaluation pass over the validation set (no gradients) ----
    model.eval()
    val_loss = 0
    val_iter = tqdm(val_loader, desc=f"{epoch_tag} [Valid]")
    with torch.no_grad():
        for data, target in val_iter:
            data = data.to(device, non_blocking=True)
            target = target.to(device, non_blocking=True)

            output = model(data)
            loss = criterion(output, target)

            batch_loss = loss.item()
            val_loss += batch_loss
            val_iter.set_postfix(loss=batch_loss)

    val_loss = val_loss / len(val_loader)
    val_losses.append(val_loss)

    # The plateau scheduler decides from the validation loss.
    scheduler.step(val_loss)
    current_lr = optimizer.param_groups[0]['lr']

    print(f"{epoch_tag}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, LR: {current_lr:.6f}")

    # Keep only the weights with the lowest validation loss seen so far.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), "best_seoul_model.pt")
        print(f"Saved model with validation loss: {val_loss:.4f}")

# Training vs. validation loss per epoch, saved to disk and displayed.
epoch_axis = range(1, epochs + 1)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(epoch_axis, train_losses, 'b-', label='Training Loss')
ax.plot(epoch_axis, val_losses, 'r-', label='Validation Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Training and Validation Loss')
ax.legend()
ax.grid(True)
fig.savefig('loss_curves.png')
plt.show()

# Restore the best checkpoint written during training. map_location remaps
# the stored tensors onto the active device, so a checkpoint saved from a GPU
# run still loads on a CPU-only machine.
model.load_state_dict(torch.load("best_seoul_model.pt", map_location=device))
model.eval()

test_loss = 0
all_preds = []
all_targets = []
all_inputs = []

# Single pass over the test set, collecting predictions, targets and inputs
# on the CPU for the metric computations that follow.
with torch.no_grad():
    for data, target in test_loader:
        # Same transfer style as the training/validation loops.
        data = data.to(device, non_blocking=True)
        target = target.to(device, non_blocking=True)

        output = model(data)
        test_loss += criterion(output, target).item()

        all_preds.append(output.cpu().numpy())
        all_targets.append(target.cpu().numpy())
        all_inputs.append(data.cpu().numpy())

# Mean of the per-batch losses.
test_loss /= len(test_loader)
print(f"Test Loss: {test_loss:.4f}")

# Stack the per-batch arrays along the sample axis.
all_preds = np.concatenate(all_preds, axis=0)
all_targets = np.concatenate(all_targets, axis=0)
all_inputs = np.concatenate(all_inputs, axis=0)

# One metrics record per (forecast step, pollutant) pair, ordered by step
# first and pollutant second.
metrics_per_hour = []
for hour in range(pred_len):
    for i, pollutant in enumerate(target_cols):
        actual = all_targets[:, hour, i]
        forecast = all_preds[:, hour, i]
        metrics_per_hour.append({
            'Hour': hour + 1,  # report steps 1-based
            'Pollutant': pollutant,
            'MAE': mean_absolute_error(actual, forecast),
            'RMSE': np.sqrt(mean_squared_error(actual, forecast)),
            'R2': r2_score(actual, forecast),
        })

hourly_metrics_df = pd.DataFrame(metrics_per_hour)
print("\nHourly Prediction Metrics:")
print(hourly_metrics_df.head(12))

# RMSE as a function of forecast step, one line per pollutant.
pivot_df = hourly_metrics_df.pivot_table(index='Hour', columns='Pollutant', values='RMSE')
fig, ax = plt.subplots(figsize=(14, 8))
for pollutant in pivot_df.columns:
    ax.plot(pivot_df.index, pivot_df[pollutant], marker='o', label=pollutant)
ax.set_xlabel('Prediction Hour')
ax.set_ylabel('RMSE')
ax.set_title('RMSE by Hour for Each Pollutant')
ax.legend()
ax.grid(True)
ax.set_xticks(range(1, pred_len + 1))
fig.savefig('rmse_by_hour.png')
plt.show()

# Grid of example forecasts: one row per pollutant, one column per randomly
# drawn test window (a fresh draw for every row).
num_samples = 5
n_rows = len(target_cols)
fig, axes = plt.subplots(n_rows, num_samples, figsize=(20, 15), sharex=True)
horizon = range(pred_len)

for row, pollutant in enumerate(target_cols):
    sample_indices = np.random.choice(len(all_preds), num_samples, replace=False)
    on_bottom_row = row == n_rows - 1

    for col, window_idx in enumerate(sample_indices):
        ax = axes[row, col]
        ax.plot(horizon, all_preds[window_idx, :, row], 'r-', linewidth=2, label='Predicted')
        ax.plot(horizon, all_targets[window_idx, :, row], 'b-', linewidth=2, label='Actual')
        ax.set_title(f'{pollutant} - Sample {col+1}')
        ax.grid(True)

        # The x label goes on the bottom row only (the x axis is shared).
        if on_bottom_row:
            ax.set_xlabel('Hours Ahead')

        # The y label goes on the first column only.
        if col == 0:
            ax.set_ylabel(f'{pollutant} Level')

        # A single legend in the top-left panel is enough.
        if row == 0 and col == 0:
            ax.legend()

plt.tight_layout()
plt.savefig('prediction_samples.png')
plt.show()

# Per-pollutant metrics pooled over all test windows and forecast steps.
# The windows are built from the StandardScaler-transformed frame, so MAE and
# RMSE here are in standardised units rather than physical concentrations.
overall_metrics = []
for i, pollutant in enumerate(target_cols):
    # Flatten (sample, step) into one vector per pollutant, once.
    actual = all_targets[:, :, i].ravel()
    forecast = all_preds[:, :, i].ravel()

    overall_metrics.append({
        'Pollutant': pollutant,
        'MAE': mean_absolute_error(actual, forecast),
        'RMSE': np.sqrt(mean_squared_error(actual, forecast)),
        'R2': r2_score(actual, forecast),
    })

metrics_df = pd.DataFrame(overall_metrics)
print("\nOverall Metrics by Pollutant:")
print(metrics_df)

plt.figure(figsize=(12, 8))
metrics_df.plot(x='Pollutant', y=['MAE', 'RMSE'], kind='bar', ax=plt.gca())
plt.title('MAE and RMSE by Pollutant')
plt.ylabel('Error Value')
plt.grid(axis='y')
plt.savefig('error_by_pollutant.png')
plt.show()

plt.figure(figsize=(10, 6))
plt.bar(metrics_df['Pollutant'], metrics_df['R2'])
plt.title('R² Score by Pollutant')
plt.ylabel('R² Score')
# R² is negative when a model does worse than predicting the mean; a fixed
# lower limit of 0 would hide those bars, so extend the axis downwards when
# needed and keep (0, 1) otherwise.
r2_floor = min(0.0, float(metrics_df['R2'].min()))
plt.ylim(r2_floor, 1)
plt.grid(axis='y')
plt.savefig('r2_by_pollutant.png')
plt.show()

# Persist the fitted scaler so the same transformation can be re-applied
# (or inverted) outside this script.
import joblib
joblib.dump(scaler, "seoul_air_scaler.pkl")

# Save the first test window as a batch of one. Indexing the batch returns a
# view that shares the whole batch's storage, and torch.save serialises the
# entire underlying storage; clone() first so only this sample is written.
sample_input = next(iter(test_loader))[0][0].unsqueeze(0).clone()
torch.save(sample_input, "sample_input.pt")