In [None]:
# Colab-only: mount Google Drive under /content/drive.
# NOTE(review): data_path used later in this notebook is a relative path, not
# a location under /content/drive — confirm this mount is actually required.
from google.colab import drive
drive.mount('/content/drive')

#### Dublin

In [None]:
!pip install -q torch numpy pandas scikit-learn matplotlib tqdm joblib

import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import glob
import time

# Seed the torch and NumPy generators so repeated runs start from the same state.
torch.manual_seed(42)
np.random.seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

if torch.cuda.is_available():
    # Enable cuDNN autotuning for GPU runs.
    torch.backends.cudnn.benchmark = True

data_path = "data/airview_dublincity_roaddata_ugm3.csv"

if not os.path.exists(data_path):
    # No data file: synthesise 8760 hourly rows (one non-leap year) so the rest
    # of the pipeline can still be exercised end to end.
    os.makedirs(os.path.dirname(data_path), exist_ok=True)

    print(f"Data file not found at {data_path}.")
    print("Please upload the data file or provide the correct path.")
    print("For now, creating a mock dataset for demonstration purposes...")

    from datetime import datetime, timedelta

    start_date = datetime(2021, 1, 1)
    n_samples = 8760
    dates = [start_date + timedelta(hours=i) for i in range(n_samples)]

    pollutants = ['NO2', 'PM2.5', 'PM10', 'BC', 'UFP']
    mock_data = {
        'timestamp': dates
    }

    for pollutant in pollutants:
        # Gaussian level + two slow sine cycles + linear drift + noise, clipped at 0.
        base = np.random.normal(30, 10, n_samples)
        seasonal = 15 * np.sin(np.linspace(0, 4*np.pi, n_samples))
        trend = np.linspace(0, 10, n_samples)
        noise = np.random.normal(0, 5, n_samples)
        values = base + seasonal + trend + noise
        values = np.maximum(0, values)
        mock_data[pollutant] = values

    df = pd.DataFrame(mock_data)
    df.to_csv(data_path, index=False)
    print(f"Mock dataset created and saved to {data_path}")

print(f"Loading data from {data_path}")
df = pd.read_csv(data_path)

print("\nDataset Information:")
print(f"Shape: {df.shape}")
print("\nColumns:", df.columns.tolist())
print("\nFirst few rows:")
print(df.head())
print("\nData types:")
print(df.dtypes)
print("\nSummary statistics:")
print(df.describe())

missing_values = df.isnull().sum()
print("\nMissing values per column:")
print(missing_values)

# The first column whose name mentions "time" or "date" is taken as the
# timestamp; if none matches, fall back to the first column of the file.
timestamp_col = next(
    (col for col in df.columns if 'time' in col.lower() or 'date' in col.lower()),
    None,
)

if timestamp_col is None:
    timestamp_col = df.columns[0]
    print(f"\nAssuming '{timestamp_col}' is the timestamp column")
else:
    print(f"\nIdentified '{timestamp_col}' as the timestamp column")

df[timestamp_col] = pd.to_datetime(df[timestamp_col])
df = df.sort_values(by=timestamp_col)

numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
if timestamp_col in numeric_cols:
    numeric_cols.remove(timestamp_col)

print(f"\nIdentified {len(numeric_cols)} numeric columns as potential pollutants/features:")
print(numeric_cols)

if df.isnull().sum().sum() > 0:
    print("\nFilling missing values...")
    # DataFrame.fillna(method=...) is deprecated; ffill()/bfill() are the
    # supported equivalents (forward fill first, then back fill leading gaps).
    df = df.ffill().bfill()
    print(f"Missing values after filling: {df.isnull().sum().sum()}")

print("\nCreating enhanced time-based features...")
# Encode each calendar component as a (sin, cos) pair so that the end of a
# cycle sits next to its start (e.g. hour 23 next to hour 0).
timestamp_parts = df[timestamp_col].dt
for component, component_values, period in (
    ('hour', timestamp_parts.hour, 24),
    ('day', timestamp_parts.day, 31),
    ('month', timestamp_parts.month, 12),
    ('dayofweek', timestamp_parts.dayofweek, 7),
):
    angle = 2 * np.pi * component_values / period
    df[f'{component}_sin'] = np.sin(angle)
    df[f'{component}_cos'] = np.cos(angle)

print("Adding lag features...")
for col in numeric_cols[:5]:
    for lag in [1, 3, 6, 12, 24]:
        lag_col = f"{col}_lag_{lag}"
        df[lag_col] = df[col].shift(lag)

# shift() leaves NaNs in the first rows of every lag column; fill them.
df = df.ffill().bfill()

seq_len = 24
pred_len = 12
target_cols = numeric_cols[:5]
print(f"\nUsing {target_cols} as target columns")

cols_to_scale = [col for col in df.columns if col != timestamp_col and pd.api.types.is_numeric_dtype(df[col])]

# Fit the scaler on the chronological training rows only, then apply it to the
# whole frame. Fitting on every row would leak validation/test statistics into
# the training features. The row count mirrors the unshuffled
# train_test_split(test_size=0.3) applied to this frame further down, which
# reserves ceil(0.3 * n) rows for validation + test.
n_fit_rows = len(df) - int(np.ceil(0.3 * len(df)))
scaler = StandardScaler()
scaler.fit(df.iloc[:n_fit_rows][cols_to_scale])
df[cols_to_scale] = scaler.transform(df[cols_to_scale])

joblib.dump(scaler, "dublin_air_scaler.pkl")
print("Scaler saved to dublin_air_scaler.pkl")

class AirQualityDataset(Dataset):
    """Sliding-window dataset over a multivariate time series.

    Sample ``i`` pairs the ``seq_len`` rows starting at row ``i`` (all columns)
    with the ``pred_len`` rows that follow, restricted to ``target_indices``.

    Args:
        data: Table exposing ``.values`` (e.g. a pandas DataFrame), one row per
            time step and one column per feature.
        seq_len: Number of input time steps per sample.
        pred_len: Number of future time steps to predict per sample.
        target_indices: Column positions of the target features within ``data``.
    """

    def __init__(self, data, seq_len, pred_len, target_indices):
        self.data = data.values
        self.seq_len = seq_len
        self.pred_len = pred_len
        self.target_indices = target_indices

    def __len__(self):
        """Return the number of complete windows (0 when the series is too short)."""
        # Clamp at zero: a negative length makes len() raise ValueError.
        return max(0, len(self.data) - self.seq_len - self.pred_len + 1)

    def __getitem__(self, idx):
        """Return ``(x, y)`` float32 tensors for window ``idx``.

        ``x`` has shape ``(seq_len, n_features)`` and ``y`` has shape
        ``(pred_len, len(target_indices))``.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        n_windows = len(self)
        if idx < 0:
            idx += n_windows
        # Without this check an out-of-range index silently yields truncated
        # windows, and plain iteration over the dataset would never terminate.
        if not 0 <= idx < n_windows:
            raise IndexError(f"window index out of range for {n_windows} windows")

        x_start = idx
        x_end = idx + self.seq_len
        x = self.data[x_start:x_end]

        y_start = x_end
        y_end = y_start + self.pred_len
        y = self.data[y_start:y_end][:, self.target_indices]

        return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

# Feature matrix plus the positions of the target columns inside it.
data_array = df[cols_to_scale]
target_indices = list(map(cols_to_scale.index, target_cols))

# Chronological 70 / 15 / 15 split: shuffle=False keeps the time order, and the
# held-out 30% is halved into validation and test.
train_data, temp_data = train_test_split(data_array, test_size=0.3, shuffle=False)
val_data, test_data = train_test_split(temp_data, test_size=0.5, shuffle=False)

print(f"\nTraining data size: {len(train_data)}")
print(f"Validation data size: {len(val_data)}")
print(f"Test data size: {len(test_data)}")

print(f"\nCreating datasets with seq_len={seq_len}, pred_len={pred_len}")
print(f"Total data length: {len(data_array)}")

# Each window needs seq_len input rows followed by pred_len target rows, so a
# split of n rows yields n - (seq_len + pred_len - 1) windows.
window_overhead = seq_len + pred_len - 1
expected_train_samples = len(train_data) - window_overhead
expected_val_samples = len(val_data) - window_overhead
expected_test_samples = len(test_data) - window_overhead

print(f"Expected training samples: {expected_train_samples}")
print(f"Expected validation samples: {expected_val_samples}")
print(f"Expected test samples: {expected_test_samples}")


def _make_dataset(split):
    """Wrap one split in an AirQualityDataset using the shared window settings."""
    return AirQualityDataset(split, seq_len, pred_len, target_indices)


train_dataset = _make_dataset(train_data)
val_dataset = _make_dataset(val_data)
test_dataset = _make_dataset(test_data)

print(f"Actual training samples: {len(train_dataset)}")
print(f"Actual validation samples: {len(val_dataset)}")
print(f"Actual test samples: {len(test_dataset)}")

batch_size = 32
loader_options = {"batch_size": batch_size, "num_workers": 0}
# Only the training windows are shuffled; validation and test keep their order.
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

print(f"\nNumber of training batches: {len(train_loader)}")
print(f"Number of validation batches: {len(val_loader)}")
print(f"Number of test batches: {len(test_loader)}")

# Pull one batch to confirm the loader works and to expose the tensor shapes.
print("\nChecking data loader...")
sample_x, sample_y = next(iter(train_loader))
print(f"Sample input shape: {sample_x.shape}, Sample target shape: {sample_y.shape}")

class TimeSeriesForecaster(nn.Module):
    """Bidirectional-LSTM forecaster with attention pooling over time steps.

    Inputs of shape ``(batch, steps, input_dim)`` are embedded, passed through
    a 3-layer bidirectional LSTM, pooled into one context vector by learned
    attention weights, and mapped by an MLP head to
    ``(batch, pred_len, output_dim)``.

    Args:
        input_dim: Number of features per input time step.
        hidden_dim: Embedding width and LSTM hidden size (per direction).
        output_dim: Number of predicted features per forecast step.
        seq_len: Input window length (stored; not used in the computation).
        pred_len: Number of forecast steps.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, seq_len, pred_len):
        super().__init__()
        self.seq_len = seq_len
        self.pred_len = pred_len
        self.output_dim = output_dim

        recurrent_width = hidden_dim * 2  # forward and backward states concatenated
        head_width = hidden_dim // 2

        self.input_embedding = nn.Linear(input_dim, hidden_dim)

        self.lstm = nn.LSTM(
            input_size=hidden_dim,
            hidden_size=hidden_dim,
            num_layers=3,
            batch_first=True,
            dropout=0.2,
            bidirectional=True,
        )

        # Scores every time step with a single scalar.
        self.attention = nn.Sequential(
            nn.Linear(recurrent_width, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, 1),
        )

        self.fc1 = nn.Linear(recurrent_width, hidden_dim)
        self.layer_norm1 = nn.LayerNorm(hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, head_width)
        self.layer_norm2 = nn.LayerNorm(head_width)
        self.fc3 = nn.Linear(head_width, pred_len * output_dim)

        self.dropout = nn.Dropout(0.2)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return forecasts of shape ``(batch, pred_len, output_dim)`` for ``x``."""
        n_batch = x.shape[0]

        embedded = self.input_embedding(x)
        states, _ = self.lstm(embedded)

        # Softmax over the time axis turns the per-step scores into weights.
        scores = self.attention(states).squeeze(-1)
        weights = torch.softmax(scores, dim=1).unsqueeze(-1)
        context = (states * weights).sum(dim=1)

        hidden = self.dropout(self.relu(self.layer_norm1(self.fc1(context))))
        hidden = self.dropout(self.relu(self.layer_norm2(self.fc2(hidden))))

        flat = self.fc3(hidden)
        return flat.reshape(n_batch, self.pred_len, self.output_dim)

# Network dimensions: the feature count is read off a real batch and there is
# one output channel per target column.
output_dim = len(target_cols)
hidden_dim = 512
input_dim = sample_x.shape[2]

model = TimeSeriesForecaster(
    input_dim=input_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
    seq_len=seq_len,
    pred_len=pred_len,
)
model = model.to(device)
print(f"\nModel created with input_dim={input_dim}, hidden_dim={hidden_dim}, output_dim={output_dim}")

class WeightedMSELoss(nn.Module):
    """Mean squared error with a linearly increasing weight along dimension 1.

    The first step along dimension 1 is weighted 1.0 and the last step
    ``alpha``, with evenly spaced weights in between; the weighted squared
    errors are then averaged over every element.

    Args:
        alpha: Weight applied to the final step.
    """

    def __init__(self, alpha=1.5):
        super().__init__()
        self.alpha = alpha
        self.mse = nn.MSELoss(reduction='none')

    def forward(self, pred, target):
        """Return the scalar weighted MSE for 3-D ``pred`` and ``target``."""
        squared_error = self.mse(pred, target)

        n_batch, n_steps, n_feat = pred.shape
        ramp = torch.linspace(1.0, self.alpha, n_steps).reshape(1, n_steps, 1)
        weights = ramp.to(pred.device).expand(n_batch, n_steps, n_feat)

        return torch.mean(squared_error * weights)

# Errors at the last forecast step weigh twice as much as at the first.
criterion = WeightedMSELoss(alpha=2.0)

print("\nInitializing optimizer and verifying parameters...")
trainable = [(name, param) for name, param in model.named_parameters() if param.requires_grad]
trainable_params = sum(param.numel() for _, param in trainable)
print(f"Trainable parameters: {trainable_params:,}")

# Show the first few trainable tensors as a quick sanity check.
for name, param in trainable[:5]:
    print(f"  {name}: {param.shape}")

param_count = len(trainable)
if param_count == 0:
    print("WARNING: No trainable parameters found!")

optimizer = optim.AdamW(model.parameters(), lr=0.0005, weight_decay=1e-4)
# `verbose` is deprecated on LR schedulers and therefore omitted; the training
# loop reads the current rate from optimizer.param_groups instead.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.2, patience=3, min_lr=1e-6)

# Report what is actually being counted: tensors held by the optimizer, and
# the number of parameter groups they are spread over.
optimizer_tensor_count = sum(len(group['params']) for group in optimizer.param_groups)
if optimizer_tensor_count == 0:
    print("WARNING: Optimizer has no parameters!")
else:
    print(
        f"Optimizer initialized with {optimizer_tensor_count} parameter tensors "
        f"in {len(optimizer.param_groups)} parameter group(s)"
    )

# Training schedule: at most `epochs` passes, stopping early once the
# validation loss has failed to improve for `patience` consecutive epochs.
epochs = 50
best_val_loss = float('inf')
patience = 8
counter = 0  # epochs since the last validation improvement
train_losses = []  # mean training loss per epoch
val_losses = []  # mean validation loss per epoch

print("\nStarting training with detailed monitoring...")
for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    train_loss = 0
    train_iter = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} [Train]")
    batch_count = 0

    for batch_idx, (data, target) in enumerate(train_iter):
        data, target = data.to(device), target.to(device)

        if batch_idx == 0:
            print(f"Batch data shape: {data.shape}, target shape: {target.shape}")

        # Reset gradients to None instead of zero-filling them.
        optimizer.zero_grad(set_to_none=True)

        output = model(data)

        loss = criterion(output, target)
        current_loss = loss.item()

        loss.backward()

        # Cap the total gradient norm at 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()

        train_loss += current_loss
        batch_count += 1

        train_iter.set_postfix({
            "loss": current_loss,
            "avg_loss": train_loss / (batch_idx + 1),
            "batch": f"{batch_idx+1}/{len(train_loader)}"
        })

        if batch_idx % 10 == 0:
            print(f"  Batch {batch_idx}/{len(train_loader)}: Loss = {current_loss:.6f}")

    # Epoch loss is the unweighted mean of the per-batch losses.
    train_loss /= batch_count
    train_losses.append(train_loss)

    # ---- validation pass (no gradient tracking) ----
    model.eval()
    val_loss = 0
    val_batch_count = 0
    val_iter = tqdm(val_loader, desc=f"Epoch {epoch+1}/{epochs} [Valid]")

    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(val_iter):
            data, target = data.to(device), target.to(device)

            output = model(data)
            loss = criterion(output, target)
            current_loss = loss.item()

            val_loss += current_loss
            val_batch_count += 1

            val_iter.set_postfix({
                "loss": current_loss,
                "avg_loss": val_loss / (batch_idx + 1),
                "batch": f"{batch_idx+1}/{len(val_loader)}"
            })

    val_loss /= val_batch_count
    val_losses.append(val_loss)

    # The scheduler is driven by the epoch's validation loss.
    scheduler.step(val_loss)
    current_lr = optimizer.param_groups[0]['lr']

    print(f"Epoch {epoch+1}/{epochs} Summary:")
    print(f"  Train Loss: {train_loss:.6f} ({batch_count} batches)")
    print(f"  Val Loss: {val_loss:.6f} ({val_batch_count} batches)")
    print(f"  Learning Rate: {current_lr:.6f}")

    # Checkpoint on every improvement; otherwise count towards early stopping.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), "best_dublin_model.pt")
        print(f"  Saved model with validation loss: {val_loss:.6f}")
        counter = 0
    else:
        counter += 1
        print(f"  No improvement for {counter} epochs (best val loss: {best_val_loss:.6f})")
        if counter >= patience:
            print(f"Early stopping after {epoch+1} epochs")
            break

# Plot the per-epoch loss history for both splits and keep a copy on disk.
train_epochs = range(1, len(train_losses) + 1)
val_epochs = range(1, len(val_losses) + 1)

plt.figure(figsize=(10, 6))
for epoch_axis, history, line_format, curve_label in (
    (train_epochs, train_losses, 'b-', 'Training Loss'),
    (val_epochs, val_losses, 'r-', 'Validation Loss'),
):
    plt.plot(epoch_axis, history, line_format, label=curve_label)
plt.title('Training and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid(True)
plt.legend()
plt.savefig('dublin_loss_curves.png')
plt.show()

# Restore the best checkpoint. map_location lets weights saved from a GPU run
# be loaded when this cell is re-executed on a CPU-only session.
model.load_state_dict(torch.load("best_dublin_model.pt", map_location=device))
model.eval()

print("\nEvaluating model on test data...")
test_loss = 0
all_preds = []
all_targets = []
all_inputs = []

batch_times = []  # wall time per batch: transfer + forward + loss + copy back
inference_times = []  # wall time of the forward pass alone

# CUDA kernels are launched asynchronously, so the clock must only be read
# after a synchronize; otherwise the "inference time" is just the launch cost.
sync_for_timing = device.type == "cuda"

with torch.no_grad():
    for batch_idx, (data, target) in enumerate(tqdm(test_loader, desc="Testing")):
        start_time = time.perf_counter()
        data, target = data.to(device), target.to(device)

        if sync_for_timing:
            torch.cuda.synchronize()
        forward_start = time.perf_counter()
        output = model(data)
        if sync_for_timing:
            torch.cuda.synchronize()
        inference_time = time.perf_counter() - forward_start
        inference_times.append(inference_time)

        loss = criterion(output, target)
        batch_loss = loss.item()
        test_loss += batch_loss

        all_preds.append(output.cpu().numpy())
        all_targets.append(target.cpu().numpy())
        all_inputs.append(data.cpu().numpy())

        batch_time = time.perf_counter() - start_time
        batch_times.append(batch_time)

        if batch_idx % 10 == 0:
            print(f"  Batch {batch_idx}: Loss={batch_loss:.6f}, Time={batch_time:.4f}s, Inference={inference_time:.4f}s")

# Mean of the per-batch losses.
test_loss /= len(test_loader)
print(f"Test Loss: {test_loss:.6f}")
print(f"Average batch processing time: {np.mean(batch_times):.4f}s")
print(f"Average inference time: {np.mean(inference_times):.4f}s")

# Stack the per-batch arrays into single arrays along the sample axis.
all_preds = np.concatenate(all_preds, axis=0)
all_targets = np.concatenate(all_targets, axis=0)
all_inputs = np.concatenate(all_inputs, axis=0)

# Map predictions and targets back to the original units.
# StandardScaler's inverse is x * scale_ + mean_ per column, so the statistics
# of the target columns can be applied in one broadcast over the last axis
# instead of padding each (target, horizon) slice into a full-width matrix and
# calling inverse_transform on it.
original_target_indices = [cols_to_scale.index(col) for col in target_cols]
target_mean = scaler.mean_[original_target_indices]
target_scale = scaler.scale_[original_target_indices]

# Cast back so the result keeps the dtype of the model outputs.
unscaled_preds = (all_preds * target_scale + target_mean).astype(all_preds.dtype)
unscaled_targets = (all_targets * target_scale + target_mean).astype(all_targets.dtype)

def _horizon_scores(actual, predicted):
    """Return (MAE, RMSE, R2) for one pair of 1-D series."""
    return (
        mean_absolute_error(actual, predicted),
        np.sqrt(mean_squared_error(actual, predicted)),
        r2_score(actual, predicted),
    )


# One row per (forecast step, pollutant), computed on the unscaled values.
print("\nCalculating evaluation metrics...")
metrics_per_hour = []
for step in range(pred_len):
    for column, name in enumerate(target_cols):
        step_mae, step_rmse, step_r2 = _horizon_scores(
            unscaled_targets[:, step, column],
            unscaled_preds[:, step, column],
        )
        metrics_per_hour.append({
            'Hour': step + 1,
            'Pollutant': name,
            'MAE': step_mae,
            'RMSE': step_rmse,
            'R2': step_r2
        })

hourly_metrics_df = pd.DataFrame(metrics_per_hour)
print("\nHourly Prediction Metrics (first 12 rows):")
print(hourly_metrics_df.head(12))

# Reshape to forecast step x pollutant so each pollutant becomes one RMSE curve.
pivot_df = hourly_metrics_df.pivot_table(index='Hour', columns='Pollutant', values='RMSE')
plt.figure(figsize=(14, 8))
for name in pivot_df.columns:
    rmse_curve = pivot_df[name]
    plt.plot(pivot_df.index, rmse_curve, marker='o', label=name)
plt.title('RMSE by Hour for Each Pollutant')
plt.xlabel('Prediction Hour')
plt.ylabel('RMSE')
plt.xticks(range(1, pred_len + 1))
plt.grid(True)
plt.legend()
plt.savefig('dublin_rmse_by_hour.png')
plt.show()

# Grid of example forecasts: one row per pollutant, one column per randomly
# chosen test window.
# Sampling without replacement cannot draw more windows than exist, so cap the
# column count at the number of available test windows.
num_samples = min(3, len(unscaled_preds))
# squeeze=False keeps `axes` two-dimensional even when there is a single target
# column or a single sample, so axes[i, j] is always valid.
fig, axes = plt.subplots(
    len(target_cols), num_samples, figsize=(18, 12), sharex=True, squeeze=False
)

horizon = range(pred_len)
last_row = len(target_cols) - 1

for i, pollutant in enumerate(target_cols):
    # Each pollutant row draws its own set of windows.
    sample_indices = np.random.choice(len(unscaled_preds), num_samples, replace=False)

    for j, idx in enumerate(sample_indices):
        ax = axes[i, j]
        ax.plot(horizon, unscaled_preds[idx, :, i], 'r-', linewidth=2, label='Predicted')
        ax.plot(horizon, unscaled_targets[idx, :, i], 'b-', linewidth=2, label='Actual')
        ax.set_title(f'{pollutant} - Sample {j+1}')
        ax.grid(True)

        # Label only the outer edges of the grid.
        if i == last_row:
            ax.set_xlabel('Hours Ahead')

        if j == 0:
            ax.set_ylabel(f'{pollutant} Level')

        # A single legend in the top-left panel is enough.
        if i == 0 and j == 0:
            ax.legend()

plt.tight_layout()
plt.savefig('dublin_prediction_samples.png')
plt.show()

# Per-pollutant metrics pooled over every test window and forecast step.
overall_metrics = []
for i, pollutant in enumerate(target_cols):
    actual = unscaled_targets[:, :, i].ravel()
    predicted = unscaled_preds[:, :, i].ravel()

    overall_metrics.append({
        'Pollutant': pollutant,
        'MAE': mean_absolute_error(actual, predicted),
        'RMSE': np.sqrt(mean_squared_error(actual, predicted)),
        'R2': r2_score(actual, predicted)
    })

metrics_df = pd.DataFrame(overall_metrics)
print("\nOverall Metrics by Pollutant:")
print(metrics_df)

plt.figure(figsize=(12, 8))
metrics_df.plot(x='Pollutant', y=['MAE', 'RMSE'], kind='bar', ax=plt.gca())
plt.title('MAE and RMSE by Pollutant')
plt.ylabel('Error Value')
plt.grid(axis='y')
plt.savefig('dublin_error_by_pollutant.png')
plt.show()

plt.figure(figsize=(10, 6))
plt.bar(metrics_df['Pollutant'], metrics_df['R2'])
plt.title('R² Score by Pollutant')
plt.ylabel('R² Score')
# R² is negative when a model does worse than predicting the mean; extend the
# lower limit so such bars stay visible instead of being clipped at zero.
r2_floor = min(0.0, float(metrics_df['R2'].min()))
plt.ylim(r2_floor, 1)
plt.grid(axis='y')
plt.savefig('dublin_r2_by_pollutant.png')
plt.show()

# Keep the first test window, with a leading batch axis of size 1, on disk.
first_inputs, _ = next(iter(test_loader))
sample_input = first_inputs[0].unsqueeze(0)
torch.save(sample_input, "dublin_sample_input.pt")

# Final report: test loss plus the metric means across all pollutants.
mean_mae = metrics_df['MAE'].mean()
mean_rmse = metrics_df['RMSE'].mean()
mean_r2 = metrics_df['R2'].mean()

print("\nSummary of Results:")
print(f"Test Loss: {test_loss:.4f}")
print("\nAverage Metrics Across All Pollutants:")
print(f"Average MAE: {mean_mae:.4f}")
print(f"Average RMSE: {mean_rmse:.4f}")
print(f"Average R²: {mean_r2:.4f}")

print("\nTraining and evaluation completed successfully!")