In [None]:
# Mount Google Drive (Colab-only module) so the /content/drive/... paths used
# below for the dataset and the saved models are reachable.
from google.colab import drive
drive.mount('/content/drive')

#### Seoul

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import os
import glob
# from google.colab import drive
# drive.mount('/content/drive')

# Seed PyTorch and NumPy so weight initialisation and NumPy random draws repeat
# from run to run.
torch.manual_seed(42)
np.random.seed(42)

# Directory scanned for the Seoul CSV files by load_and_preprocess_seoul_data.
DATA_PATH = "/content/drive/MyDrive/data/AirPollutionSeoul"
BATCH_SIZE = 32  # mini-batch size passed to every DataLoader
LEARNING_RATE = 0.001  # learning rate handed to each Adam optimizer
NUM_EPOCHS = 50  # default number of passes over the training loader
SEQUENCE_LENGTH = 24  # consecutive rows per input window
HIDDEN_SIZE = 64  # hidden width shared by the three models
NUM_LAYERS = 2  # stacked recurrent / encoder layers per model
DROPOUT = 0.2  # dropout probability passed to each model

class AirQualityDataset(Dataset):
    """Sliding-window dataset: ``seq_length`` consecutive rows predict the next target.

    Item ``i`` is ``(features[i : i + seq_length], targets[i + seq_length])``,
    both returned as ``float32`` tensors.

    Args:
        features: Indexable/sliceable collection of feature rows.
        targets: Indexable collection of targets aligned row-by-row with
            ``features``.
        seq_length: Number of consecutive rows in each input window.
    """

    def __init__(self, features, targets, seq_length=SEQUENCE_LENGTH):
        self.features = features
        self.targets = targets
        self.seq_length = seq_length

    def __len__(self):
        # Clamp at zero: a negative value makes len() raise ValueError when the
        # data holds fewer rows than one full window.
        return max(0, len(self.features) - self.seq_length)

    def __getitem__(self, idx):
        size = len(self)
        # Support Python-style negative indices instead of silently slicing an
        # empty/misaligned window.
        if idx < 0:
            idx += size
        if not 0 <= idx < size:
            raise IndexError(f"index out of range for dataset of size {size}")

        window_end = idx + self.seq_length
        x = self.features[idx:window_end]
        y = self.targets[window_end]
        return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

def _split_and_scale(X, y):
    """Split arrays 70/15/15 in row order and standardise with training statistics.

    The split is deliberately unshuffled: the sliding-window dataset slices
    consecutive rows, so shuffling rows before windowing would turn every
    "sequence" into unrelated samples.  The scalers are fitted on the training
    portion only so validation/test statistics do not leak into training.

    Args:
        X: 2-D array of features, one row per observation.
        y: 2-D array of targets with the same number of rows as ``X``.

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test, scalers)`` where
        ``scalers`` is ``{'X': X_scaler, 'y': y_scaler}``.
    """
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y, test_size=0.15, shuffle=False
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=0.15 / 0.85, shuffle=False
    )

    X_scaler = StandardScaler().fit(X_train)
    y_scaler = StandardScaler().fit(y_train)
    scalers = {'X': X_scaler, 'y': y_scaler}

    return (
        X_scaler.transform(X_train),
        X_scaler.transform(X_val),
        X_scaler.transform(X_test),
        y_scaler.transform(y_train),
        y_scaler.transform(y_val),
        y_scaler.transform(y_test),
        scalers,
    )


def _make_synthetic_splits():
    """Build the random demonstration splits used when real data cannot be loaded."""
    num_samples = 10000
    num_features = 10

    X = np.random.randn(num_samples, num_features)
    # Linear signal in the first three features plus a little Gaussian noise.
    y = 0.5 * X[:, 0] + 0.3 * X[:, 1] - 0.2 * X[:, 2] + 0.1 * np.random.randn(num_samples)

    return _split_and_scale(X, y.reshape(-1, 1))


def load_and_preprocess_seoul_data(data_dir):
    """Load the Seoul CSV files and return scaled train/val/test arrays.

    Steps: read every ``*.csv`` in ``data_dir``, concatenate, drop duplicate
    rows, build and sort by a ``timestamp`` column when date/time columns are
    found, median-fill numeric NaNs, pick a target column, and split the rows
    70/15/15 in order before standardising.

    Args:
        data_dir: Directory containing the CSV files.

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test, scalers,
        target_column)``.  ``scalers`` maps ``'X'``/``'y'`` to the fitted
        ``StandardScaler`` objects.  If anything goes wrong while loading, the
        error is printed and synthetic data is returned instead, with
        ``target_column == "synthetic_target"``.
    """
    try:
        print(f"Looking for CSV files in {data_dir}")
        # glob order depends on the filesystem; sort for reproducible output.
        csv_files = sorted(glob.glob(os.path.join(data_dir, '*.csv')))

        if not csv_files:
            raise FileNotFoundError(f"No CSV files found in {data_dir}")

        print(f"Found {len(csv_files)} CSV files: {[os.path.basename(f) for f in csv_files]}")

        dfs = []
        for file in csv_files:
            try:
                print(f"Loading {os.path.basename(file)}...")
                temp_df = pd.read_csv(file)
                dfs.append(temp_df)
                print(f"  Shape: {temp_df.shape}")
            except (OSError, ValueError) as e:
                # pandas parser/decoding errors derive from ValueError; skip
                # the unreadable file and carry on with the rest.
                print(f"  Error loading {file}: {e}")

        if not dfs:
            raise ValueError("No data could be loaded from CSV files")

        print("Concatenating dataframes...")
        df = pd.concat(dfs, ignore_index=True)
        print(f"Combined data shape: {df.shape}")
        print(f"Columns: {df.columns.tolist()}")
        print(df.head())

        duplicate_count = df.duplicated().sum()
        if duplicate_count > 0:
            print(f"Removing {duplicate_count} duplicate rows")
            df = df.drop_duplicates()

        print("Looking for date/time columns...")
        time_keywords = ['date', 'time', 'hour', 'day']
        datetime_cols = [
            col for col in df.columns
            if any(time_kw in col.lower() for time_kw in time_keywords)
        ]

        if datetime_cols:
            print(f"Found datetime columns: {datetime_cols}")
            if len(datetime_cols) == 1:
                df['timestamp'] = pd.to_datetime(df[datetime_cols[0]])
            else:
                date_col = next((col for col in datetime_cols if 'date' in col.lower()), None)
                time_col = next(
                    (col for col in datetime_cols
                     if 'time' in col.lower() or 'hour' in col.lower()),
                    None,
                )
                combined = None
                if date_col is not None and time_col is not None:
                    try:
                        combined = pd.to_datetime(df[date_col] + ' ' + df[time_col])
                    except (TypeError, ValueError):
                        # Non-string columns or unparseable text: fall back below.
                        combined = None
                if combined is None:
                    print("Couldn't combine date and time columns")
                    combined = pd.to_datetime(df[datetime_cols[0]])
                df['timestamp'] = combined

            print("Sorting data by timestamp...")
            df = df.sort_values('timestamp')

        print("Handling missing values...")
        missing_stats = df.isnull().sum()
        print(f"Missing values per column:\n{missing_stats[missing_stats > 0]}")

        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        categorical_cols = [col for col in df.columns if col not in numeric_cols]
        print(f"Numeric columns: {len(numeric_cols)}")
        print(f"Categorical columns: {len(categorical_cols)}")

        for col in numeric_cols:
            if df[col].isnull().sum() > 0:
                print(f"  Filling NaN in {col} with median")
                df[col] = df[col].fillna(df[col].median())

        # A column that is entirely NaN has a NaN median, so rows can still
        # carry NaNs after the fill; drop them.
        df = df.dropna(subset=numeric_cols)

        target_options = ['PM10', 'PM2.5', 'PM25', 'PM2_5', 'NO2', 'SO2', 'O3', 'CO']
        # Only numeric columns can be regressed on, so search those (column
        # order is preserved by select_dtypes).
        available_targets = [
            col for col in numeric_cols
            if any(target in col for target in target_options)
        ]

        if not available_targets:
            print("No standard air quality target found. Using last numeric column.")
            target_column = numeric_cols[-1]
        else:
            target_column = available_targets[0]

        print(f"Selected target column: {target_column}")

        exclude_cols = [target_column] + categorical_cols

        if 'timestamp' in df.columns:
            exclude_cols.append('timestamp')

        # NOTE(review): this is a substring match, so 'id' also excludes names
        # such as 'humidity' -- confirm that is acceptable for the dataset.
        for col in df.columns:
            if any(id_kw in col.lower() for id_kw in ['id', 'station', 'code']):
                exclude_cols.append(col)

        feature_columns = [col for col in numeric_cols if col not in exclude_cols]
        print(f"Using {len(feature_columns)} feature columns: {feature_columns}")

        X = df[feature_columns].values
        y = df[target_column].values.reshape(-1, 1)

        X_train, X_val, X_test, y_train, y_val, y_test, scalers = _split_and_scale(X, y)

        print(f"Training set size: {X_train.shape[0]}")
        print(f"Validation set size: {X_val.shape[0]}")
        print(f"Test set size: {X_test.shape[0]}")

        return X_train, X_val, X_test, y_train, y_val, y_test, scalers, target_column

    except Exception as e:
        # Deliberate best-effort boundary: any failure falls back to synthetic
        # data so the rest of the notebook can still run as a demonstration.
        print(f"Error loading data: {e}")

        print("Creating synthetic data for demonstration...")

        return (*_make_synthetic_splits(), "synthetic_target")

class BiLSTMModel(nn.Module):
    """Bidirectional LSTM followed by a linear regression head.

    Args:
        input_size: Number of features per time step.
        hidden_size: Hidden width of each LSTM direction.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of values predicted per sequence.
        dropout: Dropout between LSTM layers (only applied when
            ``num_layers > 1``).
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout=0.2):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout if num_layers > 1 else 0
        )

        self.fc = nn.Linear(hidden_size * 2, output_size)

    def forward(self, x):
        """Map a ``(batch, seq, input_size)`` tensor to ``(batch, output_size)``."""
        # Omitting the initial state lets nn.LSTM start from zeros.
        _, (h_n, _) = self.lstm(x)

        # h_n stacks (layer, direction) pairs; the last two entries are the top
        # layer's forward and backward final states.  Slicing the output
        # sequence at t = -1 would instead take the backward state after it has
        # processed only a single time step.
        summary = torch.cat((h_n[-2], h_n[-1]), dim=1)

        return self.fc(summary)

class RNNModel(nn.Module):
    """Stacked vanilla RNN whose last time step feeds a linear head.

    Args:
        input_size: Number of features per time step.
        hidden_size: Hidden width of the RNN.
        num_layers: Number of stacked RNN layers.
        output_size: Number of values predicted per sequence.
        dropout: Dropout between RNN layers (only applied when
            ``num_layers > 1``).
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout=0.2):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Inter-layer dropout is meaningless for a single layer.
        inter_layer_dropout = dropout if num_layers > 1 else 0
        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
        )

        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map a ``(batch, seq, input_size)`` tensor to ``(batch, output_size)``."""
        batch_size = x.size(0)
        initial_state = torch.zeros(
            self.num_layers, batch_size, self.hidden_size, device=x.device
        )

        sequence_out, _ = self.rnn(x, initial_state)
        final_step = sequence_out[:, -1, :]

        return self.fc(final_step)

class TransformerModel(nn.Module):
    """Transformer encoder over a projected input sequence with a linear head.

    Args:
        input_size: Number of features per time step.
        hidden_size: Model width (``d_model``) of the encoder.
        num_layers: Number of encoder layers.
        output_size: Number of values predicted per sequence.
        dropout: Dropout probability inside each encoder layer.
        max_seq_length: Longest sequence the learned positional encoding
            covers.  Defaults to the module-level ``SEQUENCE_LENGTH``.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout=0.2,
                 max_seq_length=None):
        super().__init__()

        if max_seq_length is None:
            max_seq_length = SEQUENCE_LENGTH

        self.input_proj = nn.Linear(input_size, hidden_size)

        # Learned positional encoding, initialised to zero.
        self.positional_encoding = nn.Parameter(
            torch.zeros(1, max_seq_length, hidden_size)
        )

        # Use the largest head count among 4, 2, 1 that divides the model width.
        nhead = next(heads for heads in (4, 2, 1) if hidden_size % heads == 0)

        encoder_layers = nn.TransformerEncoderLayer(
            d_model=hidden_size,
            nhead=nhead,
            dim_feedforward=hidden_size * 4,
            dropout=dropout,
            batch_first=True
        )

        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layers,
            num_layers=num_layers
        )

        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map a ``(batch, seq, input_size)`` tensor to ``(batch, output_size)``.

        Raises:
            ValueError: If the sequence is longer than the positional encoding.
        """
        seq_len = x.size(1)
        max_len = self.positional_encoding.size(1)
        if seq_len > max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds positional encoding length {max_len}"
            )

        # Slice the encoding so sequences shorter than the maximum also work
        # instead of failing on a broadcast mismatch.
        x = self.input_proj(x) + self.positional_encoding[:, :seq_len]

        output = self.transformer_encoder(x)

        # Use the representation of the last time step as the sequence summary.
        return self.fc(output[:, -1, :])

def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=NUM_EPOCHS):
    """Train ``model`` and record per-epoch training and validation loss.

    Args:
        model: Network to optimise; moved to CUDA when available, else CPU.
        train_loader: Loader yielding ``(features, targets)`` training batches.
        val_loader: Loader yielding ``(features, targets)`` validation batches.
        criterion: Loss function called as ``criterion(outputs, targets)``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        ``(model, history)`` where ``history`` has the keys ``'train_loss'``
        and ``'val_loss'``, each a list with one sample-weighted mean loss per
        epoch.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    model = model.to(device)

    history = {'train_loss': [], 'val_loss': []}

    for epoch_number in range(1, num_epochs + 1):
        # ---- optimisation pass ----
        model.train()
        running_train = 0.0
        for inputs, labels in train_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            loss = criterion(model(inputs), labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Weight by batch size so a short final batch is averaged correctly.
            running_train += loss.item() * inputs.size(0)

        train_loss = running_train / len(train_loader.dataset)
        history['train_loss'].append(train_loss)

        # ---- validation pass (no gradients) ----
        model.eval()
        running_val = 0.0
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs = inputs.to(device)
                labels = labels.to(device)

                loss = criterion(model(inputs), labels)
                running_val += loss.item() * inputs.size(0)

        val_loss = running_val / len(val_loader.dataset)
        history['val_loss'].append(val_loss)

        # Report every fifth epoch.
        if epoch_number % 5 == 0:
            print(f'Epoch [{epoch_number}/{num_epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

    return model, history

def evaluate_model(model, test_loader, criterion, y_scaler=None):
    """Evaluate ``model`` on ``test_loader`` and print/return regression metrics.

    Args:
        model: Trained network; moved to CUDA when available, else CPU.
        test_loader: Loader yielding ``(features, targets)`` batches.
        criterion: Loss function called as ``criterion(outputs, targets)``.
        y_scaler: Optional fitted scaler; when given, predictions and targets
            are passed through ``inverse_transform`` before the metrics are
            computed.

    Returns:
        Dict with ``'test_loss'`` (sample-weighted mean of ``criterion`` on the
        scaled values), ``'mse'``, ``'rmse'``, ``'mae'``, ``'r2'``, and the
        ``'predictions'`` / ``'actuals'`` arrays the metrics were computed on.

    Raises:
        ValueError: If ``test_loader`` yields no batches.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    model.eval()

    test_loss = 0.0
    predictions = []
    actuals = []

    with torch.no_grad():
        for batch_features, batch_targets in test_loader:
            batch_features, batch_targets = batch_features.to(device), batch_targets.to(device)

            outputs = model(batch_features)
            loss = criterion(outputs, batch_targets)

            test_loss += loss.item() * batch_features.size(0)

            predictions.append(outputs.cpu().numpy())
            actuals.append(batch_targets.cpu().numpy())

    # Fail with a clear message instead of a ZeroDivisionError or an opaque
    # np.concatenate error further down.
    if not predictions:
        raise ValueError("test_loader yielded no batches; cannot evaluate an empty test set")

    test_loss = test_loss / len(test_loader.dataset)

    predictions = np.concatenate(predictions)
    actuals = np.concatenate(actuals)

    # Identity check: a scaler object's truthiness is not a reliable signal.
    if y_scaler is not None:
        predictions = y_scaler.inverse_transform(predictions)
        actuals = y_scaler.inverse_transform(actuals)

    mse = mean_squared_error(actuals, predictions)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(actuals, predictions)
    r2 = r2_score(actuals, predictions)

    metrics = {
        'test_loss': test_loss,
        'mse': mse,
        'rmse': rmse,
        'mae': mae,
        'r2': r2,
        'predictions': predictions,
        'actuals': actuals
    }

    print(f'Test Loss: {test_loss:.4f}')
    print(f'MSE: {mse:.4f}')
    print(f'RMSE: {rmse:.4f}')
    print(f'MAE: {mae:.4f}')
    print(f'R²: {r2:.4f}')

    return metrics

def plot_loss_curves(histories, model_names):
    """Plot training and validation loss per epoch for each model, side by side.

    Args:
        histories: One dict per model with ``'train_loss'`` and ``'val_loss'``
            sequences.
        model_names: Legend labels, in the same order as ``histories``.

    The figure is written to ``loss_curves.png`` and then shown.
    """
    panels = (
        ('train_loss', 'Training Loss'),
        ('val_loss', 'Validation Loss'),
    )

    plt.figure(figsize=(12, 5))

    for position, (loss_key, panel_title) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        for idx, run in enumerate(histories):
            plt.plot(run[loss_key], label=model_names[idx])
        plt.title(panel_title)
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend()

    plt.tight_layout()
    plt.savefig('loss_curves.png')
    plt.show()

def plot_predictions(metrics_list, model_names, target_name):
    """Scatter predicted against actual values, one panel per model.

    Args:
        metrics_list: One dict per model with ``'actuals'`` and
            ``'predictions'`` arrays.
        model_names: Panel titles, in the same order as ``metrics_list``.
        target_name: Name of the predicted quantity, used in the axis labels.

    The figure is written to ``predictions.png`` and then shown.
    """
    panel_count = len(metrics_list)
    # squeeze=False always yields a 2-D axes array, so the single-model case
    # needs no special handling.
    _, axes_grid = plt.subplots(1, panel_count, figsize=(15, 5), squeeze=False)

    for column, result in enumerate(metrics_list):
        panel = axes_grid[0, column]
        observed = result['actuals']
        lowest = np.min(observed)
        highest = np.max(observed)

        panel.scatter(observed, result['predictions'], alpha=0.5)
        # Dashed diagonal marks perfect agreement.
        panel.plot([lowest, highest], [lowest, highest], 'r--')
        panel.set_title(f'{model_names[column]}')
        panel.set_xlabel(f'Actual {target_name}')
        panel.set_ylabel(f'Predicted {target_name}')

    plt.tight_layout()
    plt.savefig('predictions.png')
    plt.show()

def compare_metrics(metrics_list, model_names, target_name):
    """Tabulate and chart the evaluation metrics of several models.

    Args:
        metrics_list: One dict per model with the keys ``'test_loss'``,
            ``'mse'``, ``'rmse'``, ``'mae'`` and ``'r2'``.
        model_names: Model labels, in the same order as ``metrics_list``.
        target_name: Name of the predicted quantity, used in titles.

    Returns:
        DataFrame with one row per model and the columns ``Model``,
        ``Test Loss``, ``MSE``, ``RMSE``, ``MAE`` and ``R²``.

    Bar charts are written to ``metrics_comparison.png`` and
    ``r2_comparison.png`` and then shown.
    """
    # Display column -> key in each metrics dict (insertion order is kept).
    column_keys = {
        'Test Loss': 'test_loss',
        'MSE': 'mse',
        'RMSE': 'rmse',
        'MAE': 'mae',
        'R²': 'r2',
    }
    table = {'Model': model_names}
    for column, key in column_keys.items():
        table[column] = [result[key] for result in metrics_list]
    metrics_df = pd.DataFrame(table)

    print(f"\nModel Comparison for {target_name} Prediction:")
    print(metrics_df.to_string(index=False))

    # Error metrics (lower is better) on a 2x2 grid.
    plt.figure(figsize=(12, 6))
    for position, metric in enumerate(('Test Loss', 'MSE', 'RMSE', 'MAE'), start=1):
        plt.subplot(2, 2, position)
        plt.bar(model_names, metrics_df[metric])
        plt.title(f'{metric} for {target_name}')
        plt.xticks(rotation=45)

    plt.tight_layout()
    plt.savefig('metrics_comparison.png')
    plt.show()

    # R² gets its own figure.
    plt.figure(figsize=(8, 4))
    plt.bar(model_names, metrics_df['R²'])
    plt.title(f'R² Score for {target_name} (higher is better)')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig('r2_comparison.png')
    plt.show()

    return metrics_df

# ---------------------------------------------------------------------------
# Seoul experiment driver: load data, train three models, evaluate, plot, save.
# ---------------------------------------------------------------------------
print("Loading and preprocessing Seoul Air Quality data...")
X_train, X_val, X_test, y_train, y_val, y_test, scalers, target_name = load_and_preprocess_seoul_data(DATA_PATH)

# Model input/output widths come from the prepared arrays.
input_size = X_train.shape[1]
output_size = y_train.shape[1]

print(f"Input size: {input_size}")
print(f"Output size: {output_size}")
print(f"Target variable: {target_name}")

# Wrap each split in the sliding-window dataset (default window = SEQUENCE_LENGTH).
train_dataset = AirQualityDataset(X_train, y_train)
val_dataset = AirQualityDataset(X_val, y_val)
test_dataset = AirQualityDataset(X_test, y_test)

# Only the training loader shuffles its windows.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# All three models are built before any training starts, with identical
# hyper-parameters.
bilstm_model = BiLSTMModel(
    input_size=input_size,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    output_size=output_size,
    dropout=DROPOUT
)

rnn_model = RNNModel(
    input_size=input_size,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    output_size=output_size,
    dropout=DROPOUT
)

transformer_model = TransformerModel(
    input_size=input_size,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    output_size=output_size,
    dropout=DROPOUT
)

# One shared MSE loss; one Adam optimizer per model.
criterion = nn.MSELoss()

bilstm_optimizer = optim.Adam(bilstm_model.parameters(), lr=LEARNING_RATE)
rnn_optimizer = optim.Adam(rnn_model.parameters(), lr=LEARNING_RATE)
transformer_optimizer = optim.Adam(transformer_model.parameters(), lr=LEARNING_RATE)

# Train the models one after another on the same loaders.
print("\nTraining Bi-LSTM model...")
bilstm_model, bilstm_history = train_model(
    bilstm_model, train_loader, val_loader, criterion, bilstm_optimizer, NUM_EPOCHS
)

print("\nTraining RNN model...")
rnn_model, rnn_history = train_model(
    rnn_model, train_loader, val_loader, criterion, rnn_optimizer, NUM_EPOCHS
)

print("\nTraining Transformer model...")
transformer_model, transformer_history = train_model(
    transformer_model, train_loader, val_loader, criterion, transformer_optimizer, NUM_EPOCHS
)

# Evaluate on the test loader; passing the y scaler makes evaluate_model
# inverse-transform predictions and targets before computing metrics.
print("\nEvaluating Bi-LSTM model...")
bilstm_metrics = evaluate_model(bilstm_model, test_loader, criterion, scalers['y'])

print("\nEvaluating RNN model...")
rnn_metrics = evaluate_model(rnn_model, test_loader, criterion, scalers['y'])

print("\nEvaluating Transformer model...")
transformer_metrics = evaluate_model(transformer_model, test_loader, criterion, scalers['y'])

# Plots and the comparison table; the order of these lists must match model_names.
histories = [bilstm_history, rnn_history, transformer_history]
model_names = ['Bi-LSTM', 'RNN', 'Transformer']
plot_loss_curves(histories, model_names)

metrics_list = [bilstm_metrics, rnn_metrics, transformer_metrics]
plot_predictions(metrics_list, model_names, target_name)

metrics_df = compare_metrics(metrics_list, model_names, target_name)

# Persist the trained weights to Drive; on any failure fall back to the
# current working directory.
output_dir = "/content/drive/MyDrive/air_quality_models"
try:
    os.makedirs(output_dir, exist_ok=True)
    print(f"\nSaving models to {output_dir}")

    torch.save(bilstm_model.state_dict(), f'{output_dir}/bilstm_model_seoul.pth')
    torch.save(rnn_model.state_dict(), f'{output_dir}/rnn_model_seoul.pth')
    torch.save(transformer_model.state_dict(), f'{output_dir}/transformer_model_seoul.pth')

    print("Models saved successfully!")
except Exception as e:
    print(f"Error saving models: {e}")
    print("Saving models to current directory instead")

    torch.save(bilstm_model.state_dict(), 'bilstm_model_seoul.pth')
    torch.save(rnn_model.state_dict(), 'rnn_model_seoul.pth')
    torch.save(transformer_model.state_dict(), 'transformer_model_seoul.pth')

print("\nTraining and evaluation completed!")
print("Loss curves saved as 'loss_curves.png'")
print("Predictions plots saved as 'predictions.png'")
print("Metrics comparison saved as 'metrics_comparison.png' and 'r2_comparison.png'")

#### Beijing

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import os
import glob
# from google.colab import drive
# drive.mount('/content/drive')

# Seed PyTorch and NumPy so weight initialisation and NumPy random draws repeat
# from run to run.
torch.manual_seed(42)
np.random.seed(42)

# Directory scanned for the Beijing CSV files by load_and_preprocess_beijing_data.
DATA_PATH = "/content/drive/MyDrive/data/beijing"
BATCH_SIZE = 32  # mini-batch size
LEARNING_RATE = 0.001  # optimizer learning rate
NUM_EPOCHS = 50  # default number of training epochs
SEQUENCE_LENGTH = 24  # 24 hours sequence
HIDDEN_SIZE = 64  # hidden width of the models
NUM_LAYERS = 2  # stacked recurrent / encoder layers per model
DROPOUT = 0.2  # dropout probability

class AirQualityDataset(Dataset):
    """Sliding-window dataset: ``seq_length`` consecutive rows predict the next target.

    Item ``i`` is ``(features[i : i + seq_length], targets[i + seq_length])``,
    both returned as ``float32`` tensors.

    Args:
        features: Indexable/sliceable collection of feature rows.
        targets: Indexable collection of targets aligned row-by-row with
            ``features``.
        seq_length: Number of consecutive rows in each input window.
    """

    def __init__(self, features, targets, seq_length=SEQUENCE_LENGTH):
        self.features = features
        self.targets = targets
        self.seq_length = seq_length

    def __len__(self):
        # Clamp at zero: a negative value makes len() raise ValueError when the
        # data holds fewer rows than one full window.
        return max(0, len(self.features) - self.seq_length)

    def __getitem__(self, idx):
        size = len(self)
        # Support Python-style negative indices instead of silently slicing an
        # empty/misaligned window.
        if idx < 0:
            idx += size
        if not 0 <= idx < size:
            raise IndexError(f"index out of range for dataset of size {size}")

        window_end = idx + self.seq_length
        x = self.features[idx:window_end]
        y = self.targets[window_end]
        return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

def _split_and_scale(X, y):
    """Split arrays 70/15/15 in row order and standardise with training statistics.

    The split is deliberately unshuffled: the sliding-window dataset slices
    consecutive rows, so shuffling rows before windowing would turn every
    "sequence" into unrelated samples.  The scalers are fitted on the training
    portion only so validation/test statistics do not leak into training.

    Args:
        X: 2-D array of features, one row per observation.
        y: 2-D array of targets with the same number of rows as ``X``.

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test, scalers)`` where
        ``scalers`` is ``{'X': X_scaler, 'y': y_scaler}``.
    """
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y, test_size=0.15, shuffle=False
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=0.15 / 0.85, shuffle=False
    )

    X_scaler = StandardScaler().fit(X_train)
    y_scaler = StandardScaler().fit(y_train)
    scalers = {'X': X_scaler, 'y': y_scaler}

    return (
        X_scaler.transform(X_train),
        X_scaler.transform(X_val),
        X_scaler.transform(X_test),
        y_scaler.transform(y_train),
        y_scaler.transform(y_val),
        y_scaler.transform(y_test),
        scalers,
    )


def _make_synthetic_splits():
    """Build the random demonstration splits used when real data cannot be loaded."""
    num_samples = 10000
    num_features = 10

    X = np.random.randn(num_samples, num_features)
    # Linear signal in the first three features plus a little Gaussian noise.
    y = 0.5 * X[:, 0] + 0.3 * X[:, 1] - 0.2 * X[:, 2] + 0.1 * np.random.randn(num_samples)

    return _split_and_scale(X, y.reshape(-1, 1))


def load_and_preprocess_beijing_data(data_dir):
    """Load the Beijing CSV files and return scaled train/val/test arrays.

    Steps: read every CSV in ``data_dir`` (searching subdirectories if none are
    found directly), tag rows with a ``station`` column derived from the file
    name when missing, build and sort by a ``timestamp``, pick a target column
    (PM2.5 preferred), turn sentinel/out-of-range values into NaN, median-fill
    numeric NaNs, optionally add lag features, cap the row count, and split the
    rows 70/15/15 in order before standardising.

    Args:
        data_dir: Directory containing the CSV files.

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test, scalers,
        target_column)``.  ``scalers`` maps ``'X'``/``'y'`` to the fitted
        ``StandardScaler`` objects.  If anything goes wrong while loading, the
        error is printed and synthetic data is returned instead, with
        ``target_column == "synthetic_target"``.
    """
    try:
        print(f"Looking for CSV files in {data_dir}")
        csv_files = glob.glob(os.path.join(data_dir, '*.csv'))

        if not csv_files:
            print(f"No CSV files found directly in {data_dir}, looking in subdirectories...")
            csv_files = glob.glob(os.path.join(data_dir, '**/*.csv'), recursive=True)

        if not csv_files:
            raise FileNotFoundError(f"No CSV files found in {data_dir} or its subdirectories")

        # glob order depends on the filesystem; sort for reproducible output.
        csv_files = sorted(csv_files)
        print(f"Found {len(csv_files)} CSV files: {[os.path.basename(f) for f in csv_files]}")

        all_dfs = []
        for file in csv_files:
            try:
                print(f"Loading {os.path.basename(file)}...")

                # Try a different encoding if the default one fails.
                try:
                    station_df = pd.read_csv(file)
                except UnicodeDecodeError:
                    print(f"  Retrying with different encoding for {file}")
                    station_df = pd.read_csv(file, encoding='latin1')

                # Use the file name (up to the first dot) as the station label
                # when the file has no station column of its own.
                station_name = os.path.basename(file).split('.')[0]
                if 'station' not in station_df.columns:
                    station_df['station'] = station_name

                print(f"  Shape: {station_df.shape}")
                print(f"  Columns: {station_df.columns.tolist()[:5]}...")
                all_dfs.append(station_df)
            except (OSError, ValueError) as e:
                # pandas parser/decoding errors derive from ValueError; skip
                # the unreadable file and carry on with the rest.
                print(f"  Error loading {file}: {e}")

        print(f"Successfully loaded {len(all_dfs)} CSV files")

        if not all_dfs:
            raise ValueError("No data could be loaded from CSV files")

        print("Concatenating dataframes...")
        df = pd.concat(all_dfs, ignore_index=True)
        print(f"Combined data shape: {df.shape}")

        print("Creating timestamp from date components...")
        date_parts = ['year', 'month', 'day']
        if all(part in df.columns for part in date_parts):
            # Require year, month AND day explicitly; merely counting matching
            # columns lets e.g. (month, day, hour) through and then fails on
            # the missing 'year'.  pandas assembles datetimes from a frame with
            # these column names, which also leaves the original columns
            # untouched.
            components = df[date_parts].copy()
            # Default to midnight if hour not provided
            components['hour'] = df['hour'] if 'hour' in df.columns else 0
            df['timestamp'] = pd.to_datetime(components)
        elif 'date' in df.columns and 'time' in df.columns:
            df['timestamp'] = pd.to_datetime(df['date'] + ' ' + df['time'])
        elif 'date' in df.columns:
            df['timestamp'] = pd.to_datetime(df['date'])
        else:
            print("Could not create timestamp from columns. Looking for existing timestamp column...")
            timestamp_cols = [col for col in df.columns if 'time' in col.lower() or 'date' in col.lower()]
            if timestamp_cols:
                print(f"Using {timestamp_cols[0]} as timestamp")
                df['timestamp'] = pd.to_datetime(df[timestamp_cols[0]])
            else:
                print("No timestamp column found. Data will not be sorted chronologically.")

        if 'timestamp' in df.columns:
            print("Sorting data by timestamp...")
            # NOTE(review): with several stations this interleaves their rows
            # at every timestamp -- confirm windows spanning stations are
            # intended.
            df = df.sort_values('timestamp')

        # Identify air quality target variables
        print("Identifying potential target variables...")
        target_options = ['PM2.5', 'PM10', 'NO2', 'CO', 'O3', 'SO2', 'PM25', 'PM_25', 'PM_10']

        # Case-insensitive substring match, also trying '.' written as '_'.
        beijing_columns = df.columns.tolist()
        available_targets = []

        for target in target_options:
            matches = [col for col in beijing_columns if (
                target.lower() in col.lower() or
                target.replace('.', '_').lower() in col.lower() or
                target.replace(' ', '').lower() in col.lower()
            )]
            available_targets.extend(matches)

        if not available_targets:
            print("No standard air quality target columns found.")
            numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
            target_column = numeric_cols[-1]
        else:
            # Prioritize PM2.5 if available, otherwise use the first available target
            pm25_cols = [
                col for col in available_targets
                if 'pm2.5' in col.lower() or 'pm25' in col.lower() or 'pm_25' in col.lower()
            ]
            target_column = pm25_cols[0] if pm25_cols else available_targets[0]

        print(f"Selected target column: {target_column}")

        # Handle missing values
        print("Handling missing values...")
        missing_stats = df.isnull().sum()
        print(f"Missing values per column:\n{missing_stats[missing_stats > 0]}")

        for col in df.columns:
            if df[col].dtype in [np.float64, np.int64]:
                # Treat -999 / -9999 sentinels as missing.
                df[col] = df[col].replace([-999, -9999], np.nan)
                # Also blank out values outside [-9999, 9999], except years.
                if col != 'year':
                    df.loc[df[col] > 9999, col] = np.nan
                    df.loc[df[col] < -9999, col] = np.nan

        # Identify numeric and non-numeric columns
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        categorical_cols = [col for col in df.columns if col not in numeric_cols]
        print(f"Numeric columns: {len(numeric_cols)}")
        print(f"Categorical columns: {len(categorical_cols)}")

        # Fill missing values in numeric columns with median
        for col in numeric_cols:
            if df[col].isnull().sum() > 0:
                print(f"  Filling NaN in {col} with median")
                df[col] = df[col].fillna(df[col].median())

        # A column that is entirely NaN has a NaN median, so rows can still
        # carry NaNs after the fill; drop them.
        df = df.dropna(subset=numeric_cols)

        # Exclude non-feature columns from the feature set
        exclude_cols = [target_column] + categorical_cols

        # Also exclude date-related columns and station identifiers
        exclude_patterns = ['year', 'month', 'day', 'hour', 'date', 'time', 'timestamp',
                            'station', 'lat', 'lon', 'latitude', 'longitude']
        for col in df.columns:
            if any(pattern in col.lower() for pattern in exclude_patterns):
                if col not in exclude_cols:
                    exclude_cols.append(col)

        # Get the final feature columns
        feature_columns = [col for col in numeric_cols if col not in exclude_cols]
        print(f"Using {len(feature_columns)} feature columns: {feature_columns}")

        # Check if we have enough features
        if len(feature_columns) < 2:
            print("Warning: Very few feature columns available. Adding additional features...")
            # Fall back to lagged copies of the target as features.
            if 'timestamp' in df.columns and len(df) > 24:
                print("Creating lag features of the target variable...")
                lag_cols = []
                for lag in range(1, 13):  # Create 12 lag features
                    lag_col = f"{target_column}_lag_{lag}"
                    df[lag_col] = df[target_column].shift(lag)
                    lag_cols.append(lag_col)
                feature_columns.extend(lag_cols)

                # Drop only the leading rows made incomplete by the shift; a
                # bare dropna() would also discard rows for NaNs in unrelated
                # (e.g. categorical) columns.
                df = df.dropna(subset=lag_cols)

        # Cap very large datasets.  Keep a contiguous tail rather than a random
        # sample: sampling shuffles the rows and breaks the consecutive-row
        # windows built downstream.
        if len(df) > 100000:
            print(f"Dataset is very large ({len(df)} rows). Keeping the last 100,000 rows...")
            df = df.iloc[-100000:]

        # Extract features and target
        X = df[feature_columns].values
        y = df[target_column].values.reshape(-1, 1)

        print(f"Final data shape - X: {X.shape}, y: {y.shape}")

        X_train, X_val, X_test, y_train, y_val, y_test, scalers = _split_and_scale(X, y)

        print(f"Training set size: {X_train.shape[0]}")
        print(f"Validation set size: {X_val.shape[0]}")
        print(f"Test set size: {X_test.shape[0]}")

        return X_train, X_val, X_test, y_train, y_val, y_test, scalers, target_column

    except Exception as e:
        # Deliberate best-effort boundary: any failure falls back to synthetic
        # data so the rest of the notebook can still run as a demonstration.
        print(f"Error loading data: {e}")

        print("Creating synthetic data for demonstration...")

        return (*_make_synthetic_splits(), "synthetic_target")

class BiLSTMModel(nn.Module):
    """Bidirectional LSTM followed by a linear regression head.

    Args:
        input_size: Number of features per time step.
        hidden_size: Hidden width of each LSTM direction.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of values predicted per sequence.
        dropout: Dropout between LSTM layers (only applied when
            ``num_layers > 1``).
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout=0.2):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout if num_layers > 1 else 0
        )

        self.fc = nn.Linear(hidden_size * 2, output_size)

    def forward(self, x):
        """Map a ``(batch, seq, input_size)`` tensor to ``(batch, output_size)``."""
        # Omitting the initial state lets nn.LSTM start from zeros.
        _, (h_n, _) = self.lstm(x)

        # h_n stacks (layer, direction) pairs; the last two entries are the top
        # layer's forward and backward final states.  Slicing the output
        # sequence at t = -1 would instead take the backward state after it has
        # processed only a single time step.
        summary = torch.cat((h_n[-2], h_n[-1]), dim=1)

        return self.fc(summary)

class RNNModel(nn.Module):
    """Plain (Elman) RNN followed by a linear read-out of the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Hidden units of the recurrent layers.
        num_layers: Number of stacked RNN layers.
        output_size: Width of the prediction produced for each sequence.
        dropout: Dropout between stacked RNN layers; only passed to the
            RNN when ``num_layers`` is greater than 1.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout=0.2):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        inter_layer_dropout = dropout if num_layers > 1 else 0
        self.rnn = nn.RNN(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
        )

        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map ``x`` of shape (batch, seq, input_size) to (batch, output_size)."""
        # Zero initial hidden state, one slice per stacked layer.
        h0 = torch.zeros(
            (self.num_layers, x.size(0), self.hidden_size), device=x.device
        )

        sequence_features, _ = self.rnn(x, h0)

        # Regress from the hidden features of the final time step only.
        return self.fc(sequence_features[:, -1, :])

class TransformerModel(nn.Module):
    """Transformer-encoder regressor that reads out the last time step.

    Inputs are projected to ``hidden_size``, a learned positional encoding is
    added, the sequence passes through a stack of encoder layers and the
    features of the final time step go through a linear layer.

    Args:
        input_size: Number of features per time step.
        hidden_size: Model width (``d_model``) of the encoder.
        num_layers: Number of stacked encoder layers.
        output_size: Width of the prediction produced for each sequence.
        dropout: Dropout used inside the encoder layers.
        seq_length: Longest sequence the positional encoding covers. When
            omitted, the module-level ``SEQUENCE_LENGTH`` is used, which
            matches the previous hard-coded behaviour.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout=0.2,
                 seq_length=None):
        super().__init__()

        # Resolve the default at construction time so callers that use a
        # different window length are not tied to the global constant.
        if seq_length is None:
            seq_length = SEQUENCE_LENGTH

        self.input_proj = nn.Linear(input_size, hidden_size)

        # Learned positional encoding, initialised to zero.
        self.positional_encoding = nn.Parameter(
            torch.zeros(1, seq_length, hidden_size)
        )

        # Multi-head attention needs hidden_size divisible by the head count;
        # take the largest of 4, 2, 1 that divides it.
        nhead = next(heads for heads in (4, 2, 1) if hidden_size % heads == 0)

        encoder_layers = nn.TransformerEncoderLayer(
            d_model=hidden_size,
            nhead=nhead,
            dim_feedforward=hidden_size * 4,
            dropout=dropout,
            batch_first=True
        )

        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layers,
            num_layers=num_layers
        )

        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map ``x`` of shape (batch, seq, input_size) to (batch, output_size).

        Raises:
            ValueError: If the sequence is longer than the positional encoding.
        """
        steps = x.size(1)
        max_steps = self.positional_encoding.size(1)
        if steps > max_steps:
            raise ValueError(
                f"Sequence length {steps} exceeds the positional encoding "
                f"length {max_steps}"
            )

        x = self.input_proj(x)
        # Slice to the actual length: adding the full encoding would silently
        # broadcast a length-1 sequence and fail for any other shorter length.
        x = x + self.positional_encoding[:, :steps, :]

        output = self.transformer_encoder(x)

        # Regress from the encoded features of the final time step only.
        return self.fc(output[:, -1, :])

def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=NUM_EPOCHS):
    """Fit ``model`` and record the average loss of every epoch.

    Args:
        model: Network to train; it is moved to CUDA when available, else CPU.
        train_loader: Loader yielding (features, targets) training batches.
        val_loader: Loader yielding (features, targets) validation batches.
        criterion: Loss callable applied as ``criterion(outputs, targets)``.
        optimizer: Optimizer already bound to the model's parameters.
        num_epochs: Number of passes over the training data.

    Returns:
        ``(model, history)`` where ``history`` holds the per-epoch lists
        'train_loss' and 'val_loss'.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    model = model.to(device)
    history = {'train_loss': [], 'val_loss': []}

    for epoch in range(num_epochs):
        # ---- optimisation pass ----
        model.train()
        running_total = 0.0

        for inputs, labels in train_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            batch_loss = criterion(model(inputs), labels)

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

            # Weight each batch by its size so the epoch figure is normalised
            # by the number of samples rather than the number of batches.
            running_total += batch_loss.item() * inputs.size(0)

        train_loss = running_total / len(train_loader.dataset)
        history['train_loss'].append(train_loss)

        # ---- validation pass (no gradients) ----
        model.eval()
        running_total = 0.0

        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs = inputs.to(device)
                labels = labels.to(device)

                batch_loss = criterion(model(inputs), labels)
                running_total += batch_loss.item() * inputs.size(0)

        val_loss = running_total / len(val_loader.dataset)
        history['val_loss'].append(val_loss)

        # Report progress every fifth epoch.
        if (epoch + 1) % 5 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

    return model, history

def evaluate_model(model, test_loader, criterion, y_scaler=None):
    """Score ``model`` on the test loader and print the regression metrics.

    Args:
        model: Trained network; it is moved to CUDA when available, else CPU.
        test_loader: Loader yielding (features, targets) test batches.
        criterion: Loss callable applied as ``criterion(outputs, targets)``.
        y_scaler: Optional fitted scaler; when given, predictions and targets
            are inverse-transformed before MSE/RMSE/MAE/R² are computed.

    Returns:
        Dict with 'test_loss' (computed on the values as fed to the model),
        'mse', 'rmse', 'mae', 'r2', and the 'predictions' / 'actuals' arrays.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    model.eval()

    weighted_loss = 0.0
    prediction_batches = []
    target_batches = []

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            weighted_loss += criterion(outputs, labels).item() * inputs.size(0)

            prediction_batches.append(outputs.cpu().numpy())
            target_batches.append(labels.cpu().numpy())

    test_loss = weighted_loss / len(test_loader.dataset)

    predictions = np.concatenate(prediction_batches)
    actuals = np.concatenate(target_batches)

    # Undo the target scaling so the metrics below are in the scaler's
    # original units.
    if y_scaler:
        predictions = y_scaler.inverse_transform(predictions)
        actuals = y_scaler.inverse_transform(actuals)

    mse = mean_squared_error(actuals, predictions)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(actuals, predictions)
    r2 = r2_score(actuals, predictions)

    print(f'Test Loss: {test_loss:.4f}')
    print(f'MSE: {mse:.4f}')
    print(f'RMSE: {rmse:.4f}')
    print(f'MAE: {mae:.4f}')
    print(f'R²: {r2:.4f}')

    return {
        'test_loss': test_loss,
        'mse': mse,
        'rmse': rmse,
        'mae': mae,
        'r2': r2,
        'predictions': predictions,
        'actuals': actuals,
    }

def plot_loss_curves(histories, model_names):
    """Draw training and validation loss curves of all models side by side.

    Args:
        histories: One dict per model with 'train_loss' and 'val_loss' lists.
        model_names: Legend labels, indexed like ``histories``.

    The figure is saved as 'loss_curves.png' and then displayed.
    """
    # (subplot position, history key, panel title)
    panels = (
        (1, 'train_loss', 'Training Loss'),
        (2, 'val_loss', 'Validation Loss'),
    )

    plt.figure(figsize=(12, 5))

    for position, key, title in panels:
        plt.subplot(1, 2, position)
        for i, history in enumerate(histories):
            plt.plot(history[key], label=model_names[i])
        plt.title(title)
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend()

    plt.tight_layout()
    plt.savefig('loss_curves.png')
    plt.show()

def plot_predictions(metrics_list, model_names, target_name):
    """Scatter predicted against actual values, one panel per model.

    Args:
        metrics_list: One metrics dict per model providing 'actuals' and
            'predictions'.
        model_names: Panel titles, indexed like ``metrics_list``.
        target_name: Name of the predicted quantity, used in the axis labels.

    The figure is saved as 'predictions.png' and then displayed.
    """
    num_models = len(metrics_list)
    _, axs = plt.subplots(1, num_models, figsize=(15, 5))

    for i, metrics in enumerate(metrics_list):
        # With a single panel plt.subplots returns the Axes itself, not an array.
        if num_models > 1:
            ax = axs[i]
        else:
            ax = axs

        observed = metrics['actuals']
        ax.scatter(observed, metrics['predictions'], alpha=0.5)

        # Dashed identity line spanning the observed range (perfect prediction).
        low, high = np.min(observed), np.max(observed)
        ax.plot([low, high], [low, high], 'r--')

        ax.set_title(f'{model_names[i]}')
        ax.set_xlabel(f'Actual {target_name}')
        ax.set_ylabel(f'Predicted {target_name}')

    plt.tight_layout()
    plt.savefig('predictions.png')
    plt.show()

def compare_metrics(metrics_list, model_names, target_name):
    """Tabulate and chart the test metrics of all models.

    Args:
        metrics_list: One metrics dict per model with the keys 'test_loss',
            'mse', 'rmse', 'mae' and 'r2'.
        model_names: Model labels, aligned with ``metrics_list``.
        target_name: Name of the predicted quantity, used in titles.

    Returns:
        DataFrame with one row per model and the columns 'Model',
        'Test Loss', 'MSE', 'RMSE', 'MAE' and 'R²'.

    Two figures are saved: 'metrics_comparison.png' (error metrics) and
    'r2_comparison.png' (R² score).
    """
    # Table column -> key in each metrics dict, in display order.
    column_keys = {
        'Test Loss': 'test_loss',
        'MSE': 'mse',
        'RMSE': 'rmse',
        'MAE': 'mae',
        'R²': 'r2',
    }

    table = {'Model': model_names}
    for column, key in column_keys.items():
        table[column] = [entry[key] for entry in metrics_list]
    metrics_df = pd.DataFrame(table)

    print(f"\nModel Comparison for {target_name} Prediction:")
    print(metrics_df.to_string(index=False))

    # Error metrics: one bar chart per metric in a 2x2 grid.
    plt.figure(figsize=(12, 6))
    for i, metric in enumerate(['Test Loss', 'MSE', 'RMSE', 'MAE']):
        plt.subplot(2, 2, i+1)
        plt.bar(model_names, metrics_df[metric])
        plt.title(f'{metric} for {target_name}')
        plt.xticks(rotation=45)

    plt.tight_layout()
    plt.savefig('metrics_comparison.png')
    plt.show()

    # R² gets its own figure because, unlike the errors, higher is better.
    plt.figure(figsize=(8, 4))
    plt.bar(model_names, metrics_df['R²'])
    plt.title(f'R² Score for {target_name} (higher is better)')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig('r2_comparison.png')
    plt.show()

    return metrics_df

# ---------------------------------------------------------------------------
# Driver script: load the data, train the three models, evaluate them on the
# test split, plot the comparisons and save the trained weights.
# ---------------------------------------------------------------------------
print("Loading and preprocessing Beijing Air Quality data...")
X_train, X_val, X_test, y_train, y_val, y_test, scalers, target_name = load_and_preprocess_beijing_data(DATA_PATH)

# Model dimensions are taken from the data: feature columns in, target columns out.
input_size = X_train.shape[1]
output_size = y_train.shape[1]

print(f"Input size: {input_size}")
print(f"Output size: {output_size}")
print(f"Target variable: {target_name}")

# Wrap each split in a windowed dataset (the dataset's default window length is used).
train_dataset = AirQualityDataset(X_train, y_train)
val_dataset = AirQualityDataset(X_val, y_val)
test_dataset = AirQualityDataset(X_test, y_test)

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# All three architectures share the same size hyperparameters.
bilstm_model = BiLSTMModel(
    input_size=input_size,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    output_size=output_size,
    dropout=DROPOUT
)

rnn_model = RNNModel(
    input_size=input_size,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    output_size=output_size,
    dropout=DROPOUT
)

transformer_model = TransformerModel(
    input_size=input_size,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    output_size=output_size,
    dropout=DROPOUT
)

# One shared MSE loss; each model gets its own Adam optimizer.
criterion = nn.MSELoss()

bilstm_optimizer = optim.Adam(bilstm_model.parameters(), lr=LEARNING_RATE)
rnn_optimizer = optim.Adam(rnn_model.parameters(), lr=LEARNING_RATE)
transformer_optimizer = optim.Adam(transformer_model.parameters(), lr=LEARNING_RATE)

# Train the models one after another on the same loaders.
print("\nTraining Bi-LSTM model...")
bilstm_model, bilstm_history = train_model(
    bilstm_model, train_loader, val_loader, criterion, bilstm_optimizer, NUM_EPOCHS
)

print("\nTraining RNN model...")
rnn_model, rnn_history = train_model(
    rnn_model, train_loader, val_loader, criterion, rnn_optimizer, NUM_EPOCHS
)

print("\nTraining Transformer model...")
transformer_model, transformer_history = train_model(
    transformer_model, train_loader, val_loader, criterion, transformer_optimizer, NUM_EPOCHS
)

# Passing the target scaler makes evaluate_model inverse-transform predictions
# and targets before computing MSE/RMSE/MAE/R².
print("\nEvaluating Bi-LSTM model...")
bilstm_metrics = evaluate_model(bilstm_model, test_loader, criterion, scalers['y'])

print("\nEvaluating RNN model...")
rnn_metrics = evaluate_model(rnn_model, test_loader, criterion, scalers['y'])

print("\nEvaluating Transformer model...")
transformer_metrics = evaluate_model(transformer_model, test_loader, criterion, scalers['y'])

# The lists below must stay in the same model order as model_names.
histories = [bilstm_history, rnn_history, transformer_history]
model_names = ['Bi-LSTM', 'RNN', 'Transformer']
plot_loss_curves(histories, model_names)

metrics_list = [bilstm_metrics, rnn_metrics, transformer_metrics]
plot_predictions(metrics_list, model_names, target_name)

metrics_df = compare_metrics(metrics_list, model_names, target_name)

# Save the weights to Google Drive; if that fails, fall back to the current
# working directory.
output_dir = "/content/drive/MyDrive/air_quality_models"
try:
    os.makedirs(output_dir, exist_ok=True)
    print(f"\nSaving models to {output_dir}")

    torch.save(bilstm_model.state_dict(), f'{output_dir}/bilstm_model_beijing.pth')
    torch.save(rnn_model.state_dict(), f'{output_dir}/rnn_model_beijing.pth')
    torch.save(transformer_model.state_dict(), f'{output_dir}/transformer_model_beijing.pth')

    print("Models saved successfully!")
except Exception as e:
    print(f"Error saving models: {e}")
    print("Saving models to current directory instead")

    torch.save(bilstm_model.state_dict(), 'bilstm_model_beijing.pth')
    torch.save(rnn_model.state_dict(), 'rnn_model_beijing.pth')
    torch.save(transformer_model.state_dict(), 'transformer_model_beijing.pth')

print("\nTraining and evaluation completed!")
print("Loss curves saved as 'loss_curves.png'")
print("Predictions plots saved as 'predictions.png'")
print("Metrics comparison saved as 'metrics_comparison.png' and 'r2_comparison.png'")

#### Dublin

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import os
from google.colab import drive

# Mount Google Drive so DATA_PATH below is reachable, then seed the torch and
# numpy random number generators.
drive.mount('/content/drive')
torch.manual_seed(42)
np.random.seed(42)

# Location of the Dublin CSV file on the mounted drive.
DATA_PATH = "/content/drive/MyDrive/data/airview_dublincity_roaddata_ugm3.csv"
BATCH_SIZE = 32          # samples per DataLoader batch
LEARNING_RATE = 0.001    # Adam learning rate
NUM_EPOCHS = 50          # training epochs per model
SEQUENCE_LENGTH = 24     # rows per input window (may be reduced by the driver script)
HIDDEN_SIZE = 64         # hidden units of the recurrent layers
NUM_LAYERS = 2           # stacked recurrent layers
DROPOUT = 0.3            # dropout rate passed to the models

class AirQualityDataset(Dataset):
    """Sliding-window dataset: ``seq_length`` feature rows predict the next target row.

    Sample ``idx`` consists of ``features[idx : idx + seq_length]`` and the
    target at position ``idx + seq_length``.

    Args:
        features: Indexable of feature rows (sliced along the first axis).
        targets: Indexable of target rows, aligned with ``features``.
        seq_length: Number of consecutive rows per input window. When omitted,
            the module-level ``SEQUENCE_LENGTH`` is read at construction time,
            so a value adjusted at runtime is honoured.
    """

    def __init__(self, features, targets, seq_length=None):
        self.features = features
        self.targets = targets
        # Resolve the default late: a default bound in the signature would be
        # frozen at class-definition time and miss later changes to the global.
        self.seq_length = SEQUENCE_LENGTH if seq_length is None else seq_length

    def __len__(self):
        # __len__ must never be negative; a split shorter than the window
        # simply contains no samples.
        return max(0, len(self.features) - self.seq_length)

    def __getitem__(self, idx):
        window_end = idx + self.seq_length
        x = self.features[idx:window_end]
        y = self.targets[window_end]
        return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

def load_csv_data(filepath):
    """Load a CSV file into a DataFrame, tolerating odd delimiters and bad rows.

    Two attempts are made:

    1. Let pandas sniff the delimiter (``sep=None`` with the python engine).
    2. If that fails, pick the delimiter from the first line (comma, then
       semicolon, then tab) and re-read while skipping malformed rows.

    Args:
        filepath: Path of the CSV file.

    Returns:
        The loaded DataFrame.

    Raises:
        ValueError: If the file cannot be parsed even when bad rows are skipped.
        OSError: If the file cannot be opened for the delimiter check.
    """
    print(f"Loading CSV data from {filepath}")

    try:
        # sep=None is what triggers delimiter sniffing; delimiter=None is just
        # the default and makes pandas fall back to a comma.
        df = pd.read_csv(filepath, sep=None, engine='python')
    except Exception as e:
        # Best-effort loader: report why the first attempt failed and fall
        # through to the tolerant path instead of swallowing the reason.
        print(f"Auto-detection failed: {e}")
        print("Trying to handle special delimiters or file format issues...")
    else:
        print("Successfully loaded data with auto-detected delimiter")
        return df

    with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
        first_line = f.readline().strip()

    # First candidate present in the header line wins; None lets pandas sniff.
    sep = next((candidate for candidate in (',', ';', '\t') if candidate in first_line), None)

    print(f"Detected delimiter: {sep if sep else 'unknown'}")

    try:
        # on_bad_lines='skip' replaces the removed error_bad_lines /
        # warn_bad_lines flags (mixing them with on_bad_lines is rejected by
        # pandas), so one tolerant attempt covers both former fallback tiers.
        df = pd.read_csv(filepath, sep=sep, engine='python', on_bad_lines='skip')
    except Exception as e:
        raise ValueError(f"Failed to load CSV file: {str(e)}") from e

    print("Successfully loaded data with detected delimiter and skipping bad lines")
    return df

def process_data(df, target_col=None):
    """Turn a raw DataFrame into standardized train/validation/test arrays.

    Steps: drop geometry-like text columns, pick the target column, drop rows
    with missing values, split 70/15/15 in row order (no shuffling) and
    standardize features and target with scalers fitted on the training rows
    only, so no validation/test statistics leak into training.

    Args:
        df: Raw data as loaded from the CSV file.
        target_col: Numeric column to predict. When omitted or not numeric,
            the first numeric column whose name contains a pollution keyword
            is used, otherwise the last numeric column.

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test, scalers, target_col)``
        where ``scalers`` is ``{'X': X_scaler, 'y': y_scaler}`` and the target
        arrays have shape (rows, 1).

    Raises:
        ValueError: If there are no numeric columns, no feature columns besides
            the target, or too few complete rows to form a training split.
    """
    print("Processing data...")
    print(f"Columns: {df.columns.tolist()}")
    print(f"Sample data:\n{df.head()}")

    string_cols = df.select_dtypes(include=['object']).columns.tolist()
    print(f"String columns that will be handled: {string_cols}")

    geometry_keywords = ('geom', 'shape', 'line', 'point', 'polygon')
    wkt_markers = ('LINESTRING', 'POINT', 'POLYGON')

    # Columns whose name suggests geometry data.
    drop_cols = [col for col in string_cols
                 if any(geo_kw in str(col).lower() for geo_kw in geometry_keywords)]

    # Columns whose content looks like WKT geometry; inspect the first
    # non-missing value so a leading NaN does not hide the column.
    for col in string_cols:
        if col in drop_cols:
            continue
        non_null = df[col].dropna()
        if non_null.empty:
            continue
        sample = non_null.iloc[0]
        if isinstance(sample, str) and any(marker in sample for marker in wkt_markers):
            drop_cols.append(col)

    print(f"Dropping geometry columns: {drop_cols}")
    df = df.drop(columns=drop_cols, errors='ignore')

    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    print(f"Numeric columns: {numeric_cols}")

    if not numeric_cols:
        raise ValueError("No numeric columns found; cannot select a target or features")

    # NOTE(review): matching is by substring, so short keywords such as 'co'
    # also match names like 'count' — confirm this is acceptable for the data.
    pollution_keywords = ['no2', 'pm25', 'pm2_5', 'pm2.5', 'pm10', 'co', 'co2', 'o3', 'bc', 'ufp']

    if target_col is None or target_col not in numeric_cols:
        potential_targets = [col for col in numeric_cols if
                            any(kw in str(col).lower() for kw in pollution_keywords)]

        if potential_targets:
            target_col = potential_targets[0]
            print(f"Automatically selected target column: {target_col}")
        else:
            target_col = numeric_cols[-1]
            print(f"No pollution column found, using last numeric column as target: {target_col}")

    feature_cols = [col for col in numeric_cols if col != target_col]
    if not feature_cols:
        raise ValueError(f"No numeric feature columns remain besides the target '{target_col}'")

    print(f"Using {len(feature_cols)} feature columns and target: {target_col}")

    df = df.dropna(subset=[target_col] + feature_cols)

    X = df[feature_cols].values
    y = df[target_col].values.reshape(-1, 1)

    train_idx = int(0.7 * len(X))
    val_idx = int(0.85 * len(X))

    if train_idx == 0:
        raise ValueError("Not enough complete rows to build a training split")

    # Fit on the training rows only; fitting on all rows would leak the
    # validation/test mean and variance into the training data.
    X_scaler = StandardScaler().fit(X[:train_idx])
    y_scaler = StandardScaler().fit(y[:train_idx])

    X_scaled = X_scaler.transform(X)
    y_scaled = y_scaler.transform(y)

    X_train, y_train = X_scaled[:train_idx], y_scaled[:train_idx]
    X_val, y_val = X_scaled[train_idx:val_idx], y_scaled[train_idx:val_idx]
    X_test, y_test = X_scaled[val_idx:], y_scaled[val_idx:]

    return X_train, X_val, X_test, y_train, y_val, y_test, {'X': X_scaler, 'y': y_scaler}, target_col

class LSTMModel(nn.Module):
    """Unidirectional LSTM with a normalised, dropout-regularised linear head.

    Args:
        input_size: Number of features per time step.
        hidden_size: Hidden units of the LSTM layers.
        num_layers: Number of stacked LSTM layers.
        output_size: Width of the prediction produced for each sequence.
        dropout: Dropout rate of the head; also passed to the LSTM when
            ``num_layers`` is greater than 1.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout=0.3):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        inter_layer_dropout = dropout if num_layers > 1 else 0
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
        )

        self.layer_norm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map ``x`` of shape (batch, seq, input_size) to (batch, output_size)."""
        # Zero initial hidden and cell state, one slice per stacked layer.
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        h0 = torch.zeros(state_shape, device=x.device)
        c0 = torch.zeros(state_shape, device=x.device)

        sequence_features, _ = self.lstm(x, (h0, c0))

        # Head: last time step -> layer norm -> dropout -> linear.
        head_input = self.dropout(self.layer_norm(sequence_features[:, -1, :]))
        return self.fc(head_input)

class GRUModel(nn.Module):
    """GRU with a normalised, dropout-regularised linear head.

    Args:
        input_size: Number of features per time step.
        hidden_size: Hidden units of the GRU layers.
        num_layers: Number of stacked GRU layers.
        output_size: Width of the prediction produced for each sequence.
        dropout: Dropout rate of the head; also passed to the GRU when
            ``num_layers`` is greater than 1.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout=0.3):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        inter_layer_dropout = dropout if num_layers > 1 else 0
        self.gru = nn.GRU(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
        )

        self.layer_norm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map ``x`` of shape (batch, seq, input_size) to (batch, output_size)."""
        # Zero initial hidden state, one slice per stacked layer.
        h0 = torch.zeros(
            (self.num_layers, x.size(0), self.hidden_size), device=x.device
        )

        sequence_features, _ = self.gru(x, h0)

        # Head: last time step -> layer norm -> dropout -> linear.
        head_input = self.dropout(self.layer_norm(sequence_features[:, -1, :]))
        return self.fc(head_input)

def huber_loss(y_pred, y_true, delta=1.0):
    """Mean Huber loss between predictions and targets.

    Absolute errors below ``delta`` are penalised quadratically
    (``0.5 * err ** 2``); errors of ``delta`` or more are penalised linearly
    (``delta * (err - 0.5 * delta)``).

    Args:
        y_pred: Predicted values.
        y_true: Target values, broadcastable against ``y_pred``.
        delta: Error magnitude at which the penalty switches to linear.

    Returns:
        Scalar tensor holding the mean loss over all elements.
    """
    abs_error = (y_pred - y_true).abs()

    quadratic_branch = abs_error.pow(2) * 0.5
    linear_branch = (abs_error - delta * 0.5) * delta

    per_element = torch.where(abs_error < delta, quadratic_branch, linear_branch)
    return per_element.mean()

def train_model(model, train_loader, val_loader, optimizer, num_epochs=NUM_EPOCHS):
    """Fit ``model`` with the Huber loss and gradient clipping.

    Args:
        model: Network to train; it is moved to CUDA when available, else CPU.
        train_loader: Loader yielding (features, targets) training batches.
        val_loader: Loader yielding (features, targets) validation batches.
        optimizer: Optimizer already bound to the model's parameters.
        num_epochs: Number of passes over the training data.

    Returns:
        ``(model, history)`` where ``history`` holds the per-epoch lists
        'train_loss' and 'val_loss'.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    history = {'train_loss': [], 'val_loss': []}

    for epoch in range(num_epochs):
        # ---- optimisation pass ----
        model.train()
        running_total = 0.0

        for inputs, labels in train_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            batch_loss = huber_loss(model(inputs), labels)

            optimizer.zero_grad()
            batch_loss.backward()
            # Cap the global gradient norm at 1.0 before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            # Weight by batch size: huber_loss is a batch mean, so this yields
            # a per-sample average once divided by the dataset size.
            running_total += batch_loss.item() * inputs.size(0)

        train_loss = running_total / len(train_loader.dataset)
        history['train_loss'].append(train_loss)

        # ---- validation pass (no gradients) ----
        model.eval()
        running_total = 0.0

        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs = inputs.to(device)
                labels = labels.to(device)

                batch_loss = huber_loss(model(inputs), labels)
                running_total += batch_loss.item() * inputs.size(0)

        val_loss = running_total / len(val_loader.dataset)
        history['val_loss'].append(val_loss)

        # Report progress every tenth epoch.
        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

    return model, history

def evaluate_model(model, test_loader, y_scaler=None):
    """Score ``model`` on the test loader and print the regression metrics.

    Args:
        model: Trained network; it is moved to CUDA when available, else CPU.
        test_loader: Loader yielding (features, targets) test batches.
        y_scaler: Optional fitted scaler; when given, predictions and targets
            are inverse-transformed before MSE/RMSE/MAE/R² are computed.

    Returns:
        Dict with 'test_loss' (Huber loss on the values as fed to the model),
        'mse', 'rmse', 'mae', 'r2', and the 'predictions' / 'actuals' arrays.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    model.eval()

    weighted_loss = 0.0
    prediction_batches = []
    target_batches = []

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            weighted_loss += huber_loss(outputs, labels).item() * inputs.size(0)

            prediction_batches.append(outputs.cpu().numpy())
            target_batches.append(labels.cpu().numpy())

    test_loss = weighted_loss / len(test_loader.dataset)

    predictions = np.concatenate(prediction_batches)
    actuals = np.concatenate(target_batches)

    # Undo the target scaling so the metrics below are in the scaler's
    # original units.
    if y_scaler:
        predictions = y_scaler.inverse_transform(predictions)
        actuals = y_scaler.inverse_transform(actuals)

    mse = mean_squared_error(actuals, predictions)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(actuals, predictions)
    r2 = r2_score(actuals, predictions)

    print(f'Test Loss: {test_loss:.4f}')
    print(f'MSE: {mse:.4f}')
    print(f'RMSE: {rmse:.4f}')
    print(f'MAE: {mae:.4f}')
    print(f'R²: {r2:.4f}')

    return {
        'test_loss': test_loss,
        'mse': mse,
        'rmse': rmse,
        'mae': mae,
        'r2': r2,
        'predictions': predictions,
        'actuals': actuals,
    }

def plot_comparison(lstm_metrics, gru_metrics, target_name):
    """Compare the LSTM and GRU results visually and in a printed table.

    Args:
        lstm_metrics: Metrics dict of the LSTM model with the keys 'actuals',
            'predictions', 'mse', 'rmse', 'mae' and 'r2'.
        gru_metrics: Metrics dict of the GRU model with the same keys.
        target_name: Name of the predicted quantity, used in the axis labels.

    Saves 'model_comparison.png' (prediction scatter plots and error
    histograms) and 'metrics_comparison.png' (grouped error bars).
    """
    plt.figure(figsize=(15, 10))

    # Top row: predicted vs. actual with a dashed identity line.
    scatter_panels = (
        (1, lstm_metrics, 'LSTM Predictions'),
        (2, gru_metrics, 'GRU Predictions'),
    )
    for position, result, title in scatter_panels:
        observed = result['actuals']
        plt.subplot(2, 2, position)
        plt.scatter(observed, result['predictions'], alpha=0.5)
        low, high = np.min(observed), np.max(observed)
        plt.plot([low, high], [low, high], 'r--')
        plt.title(title)
        plt.xlabel(f'Actual {target_name}')
        plt.ylabel(f'Predicted {target_name}')

    # Bottom row: distribution of the residuals (actual - predicted).
    histogram_panels = (
        (3, lstm_metrics, 'LSTM Error Distribution'),
        (4, gru_metrics, 'GRU Error Distribution'),
    )
    for position, result, title in histogram_panels:
        plt.subplot(2, 2, position)
        plt.hist(result['actuals'] - result['predictions'], bins=50, alpha=0.7)
        plt.title(title)
        plt.xlabel('Error')
        plt.ylabel('Frequency')

    plt.tight_layout()
    plt.savefig('model_comparison.png')
    plt.show()

    # Printed summary table, one row per model.
    table = {'Model': ['LSTM', 'GRU']}
    for column, key in (('MSE', 'mse'), ('RMSE', 'rmse'), ('MAE', 'mae'), ('R²', 'r2')):
        table[column] = [lstm_metrics[key], gru_metrics[key]]
    metrics_df = pd.DataFrame(table)

    print("\nModel Comparison:")
    print(metrics_df)

    # Grouped bars: LSTM left of each tick, GRU right of it.
    plt.figure(figsize=(10, 6))
    metrics = ['MSE', 'RMSE', 'MAE']
    error_keys = ('mse', 'rmse', 'mae')
    x = np.arange(len(metrics))
    width = 0.35

    plt.bar(x - width/2, [lstm_metrics[key] for key in error_keys],
            width, label='LSTM')
    plt.bar(x + width/2, [gru_metrics[key] for key in error_keys],
            width, label='GRU')

    plt.ylabel('Value')
    plt.title('Error Metrics Comparison')
    plt.xticks(x, metrics)
    plt.legend()

    plt.tight_layout()
    plt.savefig('metrics_comparison.png')
    plt.show()

# ---------------------------------------------------------------------------
# Driver script: load the Dublin data, train LSTM and GRU models, evaluate
# them on the test split, plot the comparison and save the weights. Any
# failure is reported with a hint about the expected input file.
# ---------------------------------------------------------------------------
try:
    print("Loading and processing Dublin Air Quality data...")

    df = load_csv_data(DATA_PATH)
    X_train, X_val, X_test, y_train, y_val, y_test, scalers, target_name = process_data(df)

    # All three splits are windowed with the same sequence length, so the
    # smallest split (not just the training split) decides whether the
    # configured length fits.
    smallest_split = min(len(X_train), len(X_val), len(X_test))
    if smallest_split < 2:
        # A window needs at least one input row plus the row to predict.
        raise ValueError(
            f"Not enough data: the smallest split has only {smallest_split} row(s)"
        )

    if smallest_split < SEQUENCE_LENGTH + 1:
        print(f"WARNING: Not enough data for sequence length {SEQUENCE_LENGTH}")
        SEQUENCE_LENGTH = max(1, smallest_split // 4)
        print(f"Reducing sequence length to {SEQUENCE_LENGTH}")

    # Model dimensions are taken from the data: feature columns in, target columns out.
    input_size = X_train.shape[1]
    output_size = y_train.shape[1]

    print(f"Input size: {input_size}")
    print(f"Output size: {output_size}")
    print(f"Target variable: {target_name}")
    print(f"Training samples: {len(X_train)}")

    # The (possibly reduced) sequence length is passed explicitly.
    train_dataset = AirQualityDataset(X_train, y_train, SEQUENCE_LENGTH)
    val_dataset = AirQualityDataset(X_val, y_val, SEQUENCE_LENGTH)
    test_dataset = AirQualityDataset(X_test, y_test, SEQUENCE_LENGTH)

    # Only the training loader shuffles; validation and test keep dataset order.
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

    lstm_model = LSTMModel(
        input_size=input_size,
        hidden_size=HIDDEN_SIZE,
        num_layers=NUM_LAYERS,
        output_size=output_size,
        dropout=DROPOUT
    )

    gru_model = GRUModel(
        input_size=input_size,
        hidden_size=HIDDEN_SIZE,
        num_layers=NUM_LAYERS,
        output_size=output_size,
        dropout=DROPOUT
    )

    # Each model gets its own Adam optimizer with light weight decay.
    lstm_optimizer = optim.Adam(lstm_model.parameters(), lr=LEARNING_RATE, weight_decay=1e-5)
    gru_optimizer = optim.Adam(gru_model.parameters(), lr=LEARNING_RATE, weight_decay=1e-5)

    print("\nTraining LSTM model...")
    lstm_model, lstm_history = train_model(
        lstm_model, train_loader, val_loader, lstm_optimizer, NUM_EPOCHS
    )

    print("\nTraining GRU model...")
    gru_model, gru_history = train_model(
        gru_model, train_loader, val_loader, gru_optimizer, NUM_EPOCHS
    )

    # Passing the target scaler makes evaluate_model inverse-transform
    # predictions and targets before computing the metrics.
    print("\nEvaluating LSTM model...")
    lstm_metrics = evaluate_model(lstm_model, test_loader, scalers['y'])

    print("\nEvaluating GRU model...")
    gru_metrics = evaluate_model(gru_model, test_loader, scalers['y'])

    plot_comparison(lstm_metrics, gru_metrics, target_name)

    # Weights are written to the current working directory.
    print("\nSaving models...")
    torch.save(lstm_model.state_dict(), 'lstm_model_dublin.pth')
    torch.save(gru_model.state_dict(), 'gru_model_dublin.pth')

    print("\nTraining and evaluation completed!")

except Exception as e:
    print(f"ERROR: {str(e)}")
    print("\nPlease check that your dataset is in the correct format and location.")
    print("The code will try to handle various formats but needs a valid CSV file.")