<a href="https://colab.research.google.com/github/manyamule/WCEHackathon2025_Introspectors/blob/main/TransNAS_TSAD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# prompt: mount drive

from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os, logging, json, joblib
from datetime import datetime
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim

# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler()]
)
logger = logging.getLogger("TransformerAnomalyDetection")

# Define a simple Transformer-based autoencoder model
class AnomalyTransformer(nn.Module):
    def __init__(self, feature_size, seq_length, num_layers=2, nhead=4, dim_feedforward=128, dropout=0.1):
        super(AnomalyTransformer, self).__init__()
        self.seq_length = seq_length
        self.feature_size = feature_size

        self.input_projection = nn.Linear(feature_size, dim_feedforward)
        encoder_layer = nn.TransformerEncoderLayer(d_model=dim_feedforward, nhead=nhead, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.output_projection = nn.Linear(dim_feedforward, feature_size)

    def forward(self, x):
        # x shape: (batch_size, seq_length, feature_size)
        x = self.input_projection(x)  # -> (batch_size, seq_length, dim_feedforward)
        # Transformer expects (seq_length, batch_size, d_model)
        x = x.permute(1, 0, 2)
        x = self.transformer_encoder(x)
        x = x.permute(1, 0, 2)
        x = self.output_projection(x)
        return x

# Transformer-based anomaly detector class
class TransformerAnomalyDetector:
    def __init__(self, config):
        self.config = config
        self.data = None
        self.preprocessed_data = None
        self.train_data = None
        self.test_data = None
        self.model = None
        self.scaler = None
        self.threshold = None

        os.makedirs(config['model_dir'], exist_ok=True)
        os.makedirs(config['plot_dir'], exist_ok=True)
        os.makedirs(config['report_dir'], exist_ok=True)

    def load_data(self, csv_path):
        logger.info(f"Loading data from {csv_path}")
        try:
            self.data = pd.read_csv(csv_path)
            if 'dt_time' in self.data.columns:
                self.data['dt_time'] = pd.to_datetime(self.data['dt_time'])
                self.data.set_index('dt_time', inplace=True)
            logger.info(f"Data loaded. Shape: {self.data.shape}")
            return self.data
        except Exception as e:
            logger.error(f"Error loading data: {e}")
            raise

    def preprocess_data(self):
        logger.info("Preprocessing data")
        if self.data is None:
            logger.error("No data loaded")
            return
        df = self.data.copy()
        # Select specified parameters if provided
        params = self.config.get('params', [])
        if params and all(p in df.columns for p in params):
            logger.info(f"Selecting parameters: {params}")
            df = df[params]
        elif params:
            available = [p for p in params if p in df.columns]
            logger.warning(f"Some parameters not found. Using available: {available}")
            df = df[available]
        # Handle missing values
        if df.isnull().sum().sum() > 0:
            logger.info("Filling missing values")
            df.ffill(inplace=True)
            df.bfill(inplace=True)
        # Remove duplicate timestamps
        if df.index.duplicated().any():
            logger.info("Removing duplicate timestamps")
            df = df[~df.index.duplicated()]
        # Resample if needed
        if self.config.get('resample', False):
            freq = self.config.get('resample_freq', '1h')
            logger.info(f"Resampling data to {freq}")
            df = df.resample(freq).mean()
            df.ffill(inplace=True)
            df.bfill(inplace=True)
        self.preprocessed_data = df
        logger.info(f"Preprocessed data shape: {df.shape}")
        # Time-based train-test split
        train_ratio = self.config.get('train_ratio', 0.8)
        split_index = int(len(df) * train_ratio)
        self.train_data = df.iloc[:split_index]
        self.test_data = df.iloc[split_index:]
        logger.info(f"Train shape: {self.train_data.shape}, Test shape: {self.test_data.shape}")
        return df

    def _create_sequences(self, data, seq_length):
        X = []
        for i in range(len(data) - seq_length):
            X.append(data[i:i+seq_length])
        return np.array(X)

    def _build_transformer_model(self, input_shape):
        feature_size = input_shape[2]
        seq_length = input_shape[1]
        num_layers = self.config.get('num_layers', 2)
        nhead = self.config.get('nhead', 4)
        dim_feedforward = self.config.get('dim_feedforward', 128)
        dropout = self.config.get('dropout', 0.1)
        model = AnomalyTransformer(feature_size, seq_length, num_layers, nhead, dim_feedforward, dropout)
        logger.info("Transformer model built:")
        logger.info(model)
        return model.to(device)

    def train_transformer_autoencoder(self):
        logger.info("Training Transformer Autoencoder")
        if self.train_data is None:
            logger.error("No training data available")
            return
        df_train = self.train_data.copy()
        self.scaler = StandardScaler()
        scaled_train = self.scaler.fit_transform(df_train)
        seq_length = self.config.get('sequence_length', 24)
        X_train = self._create_sequences(scaled_train, seq_length)
        logger.info(f"Created {len(X_train)} training sequences")
        X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)

        self.model = self._build_transformer_model(X_train.shape)
        optimizer = optim.Adam(self.model.parameters(), lr=self.config.get('learning_rate', 1e-3))
        criterion = nn.MSELoss()
        epochs = self.config.get('epochs', 50)
        batch_size = self.config.get('batch_size', 32)

        train_losses = []
        self.model.train()
        for epoch in range(epochs):
            permutation = torch.randperm(X_train_tensor.size(0))
            epoch_loss = 0
            for i in range(0, X_train_tensor.size(0), batch_size):
                indices = permutation[i:i+batch_size]
                batch_x = X_train_tensor[indices]
                optimizer.zero_grad()
                outputs = self.model(batch_x)
                loss = criterion(outputs, batch_x)
                loss.backward()
                optimizer.step()
                epoch_loss += loss.item() * batch_x.size(0)
            epoch_loss /= X_train_tensor.size(0)
            train_losses.append(epoch_loss)
            logger.info(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.6f}")
        self._plot_training_history(train_losses)

        # Save model and scaler
        model_path = os.path.join(self.config['model_dir'], 'transformer_autoencoder.pth')
        scaler_path = os.path.join(self.config['model_dir'], 'transformer_scaler.joblib')
        torch.save(self.model.state_dict(), model_path)
        joblib.dump(self.scaler, scaler_path)
        logger.info(f"Model saved to {model_path}")

        # Compute reconstruction errors on training data to set dynamic threshold
        self.model.eval()
        with torch.no_grad():
            reconstructions = self.model(X_train_tensor)
            mse = torch.mean((X_train_tensor - reconstructions)**2, dim=(1,2)).cpu().numpy()
        factor = self.config.get('anomaly_threshold_factor', 3)
        self.threshold = np.mean(mse) + factor * np.std(mse)
        logger.info(f"Anomaly threshold set to {self.threshold}")
        return self.model

    def detect_anomalies(self, data=None):
        logger.info("Detecting anomalies using Transformer Autoencoder")
        if data is not None:
            df = data.copy()
        elif self.test_data is not None:
            df = self.test_data.copy()
        else:
            logger.error("No data provided for detection")
            return None

        scaled_data = self.scaler.transform(df)
        seq_length = self.config.get('sequence_length', 24)
        X = self._create_sequences(scaled_data, seq_length)
        X_tensor = torch.tensor(X, dtype=torch.float32).to(device)
        self.model.eval()
        with torch.no_grad():
            reconstructions = self.model(X_tensor)
            mse = torch.mean((X_tensor - reconstructions)**2, dim=(1,2)).cpu().numpy()
        threshold = self.threshold if self.threshold is not None else np.percentile(mse, 100 - self.config.get('anomaly_percent', 1))
        logger.info(f"Using anomaly threshold: {threshold}")

        # Build results DataFrame (accounting for sequence length)
        result = pd.DataFrame(index=df.index[seq_length:])
        result['reconstruction_error'] = mse
        result['anomaly'] = mse > threshold
        for col in df.columns:
            result[col] = df[col].values[seq_length:]
        anomaly_count = result['anomaly'].sum()
        logger.info(f"Detected {anomaly_count} anomalies out of {len(result)} points")
        self._plot_anomaly_results(result)
        result_path = os.path.join(self.config['report_dir'], 'transformer_anomalies.csv')
        result.to_csv(result_path)
        logger.info(f"Anomaly detection results saved to {result_path}")
        return result

    def _plot_training_history(self, losses):
        plt.figure(figsize=(10,6))
        plt.plot(losses, label='Training Loss')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.title('Training Loss History')
        plt.legend()
        plt.savefig(os.path.join(self.config['plot_dir'], 'transformer_training_history.png'))
        plt.close()

    def _plot_anomaly_results(self, result_df):
        plt.figure(figsize=(15,8))
        # Plot reconstruction error with threshold
        plt.subplot(2,1,1)
        plt.plot(result_df.index, result_df['reconstruction_error'], label='Reconstruction Error')
        plt.axhline(y=self.threshold, color='r', linestyle='--', label='Threshold')
        plt.title('Reconstruction Error Over Time')
        plt.xlabel('Time')
        plt.ylabel('Error')
        plt.legend()
        # Plot one parameter with anomalies highlighted
        plt.subplot(2,1,2)
        params = self.config.get('params', [col for col in result_df.columns if col not in ['reconstruction_error', 'anomaly']])
        if params:
            param = params[0]
            plt.plot(result_df.index, result_df[param], label=param)
            anomalies = result_df[result_df['anomaly']]
            plt.scatter(anomalies.index, anomalies[param], color='red', label='Anomaly', s=50)
            plt.title(f"{param} with Detected Anomalies")
            plt.xlabel("Time")
            plt.ylabel(param)
            plt.legend()
        plt.tight_layout()
        plt.savefig(os.path.join(self.config['plot_dir'], 'transformer_anomalies_overview.png'))
        plt.close()

    def analyze_anomalies(self, result_df):
        if result_df is None or result_df.empty:
            logger.error("No anomaly results to analyze")
            return
        logger.info("Analyzing anomalies")
        anomalies = result_df[result_df['anomaly']]
        summary = {
            'total_points': len(result_df),
            'anomaly_count': int(anomalies.shape[0]),
            'anomaly_percent': float((anomalies.shape[0] / len(result_df)) * 100)
        }
        summary_path = os.path.join(self.config['report_dir'], 'transformer_anomaly_summary.json')
        with open(summary_path, 'w') as f:
            json.dump(summary, f, indent=4)
        logger.info(f"Anomaly summary saved to {summary_path}")
        return summary

def main():
    # Configuration similar to your LSTM example
    config = {
        'params': ['pm2.5cnc', 'pm10cnc'],   # Adjust parameter names as per your CSV
        'resample': True,
        'resample_freq': '1h',
        'sequence_length': 24,               # Sequence length (e.g., 24 timesteps)
        'epochs': 50,
        'batch_size': 32,
        'num_layers': 2,
        'nhead': 4,
        'dim_feedforward': 128,
        'dropout': 0.1,
        'learning_rate': 1e-3,
        'anomaly_threshold_factor': 3,       # e.g., threshold = mean + 3*std on training error
        'train_ratio': 0.8,
        'model_dir': 'transformer_models',
        'plot_dir': 'transformer_plots',
        'report_dir': 'transformer_reports'
    }

    detector = TransformerAnomalyDetector(config)
    detector.load_data('/content/drive/MyDrive/WCE/air_quality_data.csv')
    detector.preprocess_data()
    detector.train_transformer_autoencoder()
    anomaly_results = detector.detect_anomalies()
    detector.analyze_anomalies(anomaly_results)
    logger.info("Transformer anomaly detection completed successfully")

if __name__ == "__main__":
    main()


Using device: cuda




In [3]:
import shutil
import os

# Define the local directories (as used in your config)
local_model_dir = 'transformer_models'
local_plot_dir = 'transformer_plots'
local_report_dir = 'transformer_reports'

# Define the destination folder in your Google Drive
drive_destination = '/content/drive/MyDrive/AnomalyDetectionResults'

# Create destination directories if they don't exist
os.makedirs(os.path.join(drive_destination, 'Models'), exist_ok=True)
os.makedirs(os.path.join(drive_destination, 'Plots'), exist_ok=True)
os.makedirs(os.path.join(drive_destination, 'Reports'), exist_ok=True)

# Copy the directories to the Drive folder
shutil.copytree(local_model_dir, os.path.join(drive_destination, 'Models'), dirs_exist_ok=True)
shutil.copytree(local_plot_dir, os.path.join(drive_destination, 'Plots'), dirs_exist_ok=True)
shutil.copytree(local_report_dir, os.path.join(drive_destination, 'Reports'), dirs_exist_ok=True)

print("Files have been copied to your Google Drive folder:", drive_destination)


Files have been copied to your Google Drive folder: /content/drive/MyDrive/AnomalyDetectionResults
