In [1]:
# ===== IMPORTS AND ENVIRONMENT SETUP =====
# This section imports all necessary libraries and configures the environment

# System and file handling libraries
import os          # Operating system operations (file paths, directories)
import re          # Regular expressions for text pattern matching
import time        # Time tracking for performance monitoring
import json        # JSON file reading/writing
import logging     # Logging framework for tracking progress
import joblib      # Saving/loading Python objects (like models)
import shutil      # High-level file operations
import collections # Specialized container datatypes (Counter, etc.)
import gc          # Garbage collector for memory management

# Numerical and data processing
import numpy as np       # Numerical arrays and operations
import pandas as pd      # Dataframes for tabular data
import matplotlib.pyplot as plt  # Plotting and visualization
import seaborn as sns    # Statistical data visualization

# PyTorch - Deep Learning Framework
import torch             # Main PyTorch module
import torch.nn as nn    # Neural network layers and modules
import torch.optim as optim  # Optimization algorithms (Adam, SGD, etc.)
from torch.utils.data import DataLoader, TensorDataset  # Data loading utilities

# Machine Learning preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer  # Text to numerical features
from sklearn.preprocessing import StandardScaler  # Feature scaling

# Parallel processing and progress tracking
from concurrent.futures import ThreadPoolExecutor, as_completed  # Multi-threading
from pathlib import Path    # Modern file path handling
from tqdm.auto import tqdm  # Progress bars

# ===== LOGGING SETUP =====
# Configure logging to track what the notebook is doing
# Level INFO = show informational messages, warnings, and errors
# Format = timestamp - level - message
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# ===== CHECKPOINT CONFIGURATION =====
# Checkpoints allow us to save intermediate results and avoid recomputation
# If the notebook crashes, we can resume from the last checkpoint
CHECKPOINT_DIR = "checkpoints"
if not os.path.exists(CHECKPOINT_DIR):
    os.makedirs(CHECKPOINT_DIR)
    logger.info(f"‚úÖ Created checkpoint directory: {CHECKPOINT_DIR}")

# ===== CONSTANTS =====
# Define all file paths and global settings in one place
BASE_DIR = "data/kafka"                      # Where raw log files are stored
MODEL_PATH = "data/anomaly_autoencoder.pth"  # Where to save the final trained model
VECTORIZER_PATH = "data/tfidf_vectorizer.pkl"  # TF-IDF model path (if used)
SCALER_PATH = "data/scaler.pkl"              # Data scaler path (if used)
SEED = 42  # Random seed for reproducibility (same results every run)

# ===== REPRODUCIBILITY SETUP =====
# Set random seeds so that results are consistent across runs
# This ensures that random initialization, shuffling, etc. produce the same results
torch.manual_seed(SEED)          # PyTorch CPU random seed
np.random.seed(SEED)             # NumPy random seed
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)           # PyTorch GPU random seed
    torch.backends.cudnn.deterministic = True  # Force deterministic GPU operations
    torch.backends.cudnn.benchmark = False     # Disable auto-tuning for consistency

# ===== DEVICE CONFIGURATION =====
# Check if GPU is available and configure accordingly
# GPU (CUDA) = much faster training, CPU = slower but always available
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using Computation Device: {DEVICE}")

# If GPU is available, print GPU information
if torch.cuda.is_available():
    logger.info(f"GPU: {torch.cuda.get_device_name(0)}")  # GPU model name
    logger.info(f"CUDA Version: {torch.version.cuda}")     # CUDA version
    logger.info(f"Total GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    
    # Enable TF32 (TensorFloat-32) for better performance on modern GPUs (RTX 30/40 series)
    # TF32 provides faster computation with minimal accuracy loss
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True


2026-01-02 21:57:59,091 - INFO - Using Computation Device: cuda
2026-01-02 21:57:59,095 - INFO - GPU: NVIDIA GeForce RTX 4090 Laptop GPU
2026-01-02 21:57:59,095 - INFO - CUDA Version: 12.4
2026-01-02 21:57:59,095 - INFO - Total GPU Memory: 17.17 GB


# Unsupervised Log Anomaly Detection Pipeline

This notebook implements an end-to-end pipeline to detect anomalies in logs without labeled data (Unsupervised Learning).

## Architecture
1. **Ingestion**: Load raw logs from text files.
2. **Cleaning (Optimized)**: Uses **RAPIDS (GPU)** if available, or **Dask (Multi-core CPU)** to clean logs via Regex.
3. **Vectorization**: TF-IDF to convert log text into numerical vectors.
4. **Modeling**: A PyTorch **Autoencoder** learns the "normal" structure of logs.
5. **Anomaly Scoring**: The reconstruction error (MSE) serves as the anomaly score. High error = Rare/Abnormal log.

## Checkpoint System

This notebook includes automatic checkpointing to save intermediate results:
- **data.pkl**: Raw loaded data
- **cleaned_data.pkl**: Cleaned logs after regex processing
- **vocab.json**: Vocabulary dictionary
- **encoded_logs.npy**: Encoded input IDs matrix

Checkpoints are stored in `./checkpoints/` directory. Delete individual checkpoint files to force recomputation of that step.


In [2]:
import os
import re
import time
import json
import logging
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from concurrent.futures import ThreadPoolExecutor

# Logging Setup
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Constants
BASE_DIR = "data/kafka"
MODEL_PATH = "data/anomaly_autoencoder.pth"
VECTORIZER_PATH = "data/tfidf_vectorizer.pkl"
SCALER_PATH = "data/scaler.pkl"
SEED = 42

# Set seeds for reproducibility
torch.manual_seed(SEED)
np.random.seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Check Device and GPU Info
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using Computation Device: {DEVICE}")
if torch.cuda.is_available():
    logger.info(f"GPU: {torch.cuda.get_device_name(0)}")
    logger.info(f"CUDA Version: {torch.version.cuda}")
    logger.info(f"Total GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    # Enable TF32 for better performance on Ampere+ GPUs
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

2026-01-02 21:57:59,117 - INFO - Using Computation Device: cuda
2026-01-02 21:57:59,117 - INFO - GPU: NVIDIA GeForce RTX 4090 Laptop GPU
2026-01-02 21:57:59,119 - INFO - CUDA Version: 12.4
2026-01-02 21:57:59,119 - INFO - Total GPU Memory: 17.17 GB


## 1. Load and Preprocess Data


In [3]:
def read_file_to_df(path):
    """
    Reads a single text file directly into a DataFrame.
    
    Purpose: Convert a raw log file (.txt) into a pandas DataFrame with one column 'raw_log'.
    Each line in the file becomes a row in the DataFrame.
    
    Args:
        path: Path to the log file
    
    Returns:
        DataFrame with column 'raw_log' containing the log lines
    """
    try:
        return pd.read_csv(
            path, 
            header=None,              # File has no header row
            names=['raw_log'],        # Name the column 'raw_log'
            sep='\t',                 # Tab-separated (adjust if different)
            engine='python',          # Use Python engine for flexible parsing
            dtype='str',              # Treat all data as strings
            encoding='utf-8',         # UTF-8 encoding for special characters
            encoding_errors='ignore', # Ignore problematic characters instead of crashing
            on_bad_lines='skip'       # Skip malformed lines instead of crashing
        )
    except Exception as e:
        logger.error(f"Error reading {path}: {e}")
        # Return empty DataFrame if file cannot be read
        return pd.DataFrame(columns=['raw_log'])

def load_data(base_dir):
    """
    Loads all log files from base_dir in parallel using multi-threading.
    
    Purpose: Efficiently load thousands of .txt files simultaneously using multiple CPU cores.
    This is much faster than loading files one by one.
    
    Workflow:
        1. Find all .txt files in the directory tree
        2. Read each file in parallel using ThreadPoolExecutor
        3. Combine all DataFrames into one large DataFrame
    
    Args:
        base_dir: Root directory containing log files
    
    Returns:
        DataFrame with all logs combined
    """
    if not os.path.exists(base_dir):
        # If data directory doesn't exist, create dummy data for testing
        logger.warning("Data dir not found. Generating dummy logs.")
        return pd.DataFrame({'raw_log': [
            "2023-10-27T10:00:00.123Z [1234] PID 9999 INFO Connection established",
            "2023-10-27T10:00:01.000Z [1234] PID 9999 ERROR Connection refused 10.0.0.1"
        ] * 5000})  # Repeat dummy logs 5000 times

    # Step 1: Find all .txt files recursively
    base_path = Path(base_dir)
    files = list(base_path.rglob('*.txt'))  # rglob = recursive glob (searches subdirectories)
    
    logger.info(f"Found {len(files)} .txt files.")

    # Step 2: Read files in parallel using thread pool
    dfs = []  # List to collect DataFrames from each file
    
    # ThreadPoolExecutor creates a pool of worker threads for parallel execution
    with ThreadPoolExecutor() as executor:
        # Submit all file read tasks to the executor
        future_to_path = {executor.submit(read_file_to_df, f): f for f in files}
        
        # Process completed tasks as they finish (not in submission order)
        for future in tqdm(as_completed(future_to_path), total=len(files), desc="Reading Files"):
            try:
                df = future.result()  # Get the DataFrame from completed task
                dfs.append(df)
            except Exception as e:
                logger.error(f"Failed to process file: {e}")

    # Step 3: Efficiently combine all DataFrames into one
    if dfs:
        logger.info("Concatenating DataFrames...")
        
        with tqdm(total=1, desc="Merging Data") as pbar:
            # pd.concat combines DataFrames vertically (stacking rows)
            # ignore_index=True renumbers rows from 0 to N
            df = pd.concat(dfs, ignore_index=True)
            pbar.update(1)
            
    else:
        # No files were successfully read
        df = pd.DataFrame(columns=['raw_log'])

    return df


In [4]:
# Load data from scratch (skip pickle to avoid MemoryError with large datasets)
# NOTE: We don't save to pickle because the file becomes too large (>16GB)
# Instead, we work directly with encoded_logs.npy which is memory-mapped
logger.info(f"üîÑ Loading data from source files...")
df = load_data(BASE_DIR)
logger.info(f"‚úÖ Loaded {len(df)} logs from source files.")


2026-01-02 21:57:59,203 - INFO - üîÑ Loading data from source files...
2026-01-02 21:57:59,261 - INFO - Found 12302 .txt files.


Reading Files:   0%|          | 0/12302 [00:00<?, ?it/s]

2026-01-02 22:34:45,133 - INFO - Concatenating DataFrames...


Merging Data:   0%|          | 0/1 [00:00<?, ?it/s]

KeyboardInterrupt: 

## 2. Data Cleaning (GPU/CPU Hybrid)
We implement the cleaning logic with a fallback mechanism defined in previous steps.

In [None]:
logger.info(f"Data shape: {df.shape}")
df.head()


2026-01-02 15:23:42,360 - INFO - Data shape: (246979255, 1)


Unnamed: 0,raw_log
0,builder: mozilla-esr52_win7_vm-debug_test-moch...
1,slave: t-w732-spot-130
2,starttime: 1527808669.5
3,results: success (0)
4,buildid: 20180531145517


In [None]:
# ===== DATA CLEANING PATTERNS =====
# These patterns replace variable parts of logs with standardized tokens
# This helps the model focus on the structure rather than specific values

# Why clean logs?
# - Raw logs contain timestamps, IPs, IDs that change constantly
# - "2023-10-27 ERROR" and "2023-10-28 ERROR" should be treated as the same pattern
# - By replacing variables with tokens, we reduce vocabulary size and improve learning

PATTERNS_PY = {
    # Replace ISO8601 timestamps like "2023-10-27T10:00:00.123Z" with [TIMESTAMP_ISO]
    r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z": "[TIMESTAMP_ISO]",
    
    # Replace standard timestamps like "2023-10-27 10:00:00" with [TIMESTAMP]
    r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}": "[TIMESTAMP]",
    
    # Replace "PID 1234" with "PID [PID_NUM]"
    r"PID\s+\d+": "PID [PID_NUM]",
    
    # Replace bracketed numbers like [1234] with [ID]
    r"\[\d+\]": "[ID]",
    
    # Replace IP addresses like "192.168.1.1" with [IP_ADDR]
    r"(?:\d{1,3}\.){3}\d{1,3}": "[IP_ADDR]",
    
    # Replace multiple spaces with single space for consistency
    r"\s+": " "
}

def smart_clean_to_disk(df, batch_size=50_000, output_file=None):
    """
    Cleans log data in batches and saves DIRECTLY to disk to avoid memory overflow.
    
    Purpose: Process millions of logs without running out of RAM.
    Instead of loading all cleaned logs in memory, we write them incrementally to a CSV file.
    
    Workflow:
        1. Process data in small batches (50,000 rows at a time)
        2. Apply regex patterns to clean each batch
        3. Immediately write cleaned batch to disk
        4. Repeat until all data is processed
        5. Read back the complete cleaned CSV
    
    GPU Acceleration:
        - If GPU is available and supports text operations, use it
        - Otherwise, fall back to CPU (regex operations)
    
    Args:
        df: DataFrame with 'raw_log' column
        batch_size: Number of rows to process at once (smaller = less memory)
        output_file: Where to save cleaned data (default: checkpoints/temp_cleaned.csv)
    
    Returns:
        Series containing cleaned logs
    """
    if output_file is None:
        output_file = f"{CHECKPOINT_DIR}/temp_cleaned.csv"
    
    # Check if GPU is available for text processing
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    use_gpu = False
    
    if device.type == "cuda":
        try:
            # Test if PyTorch supports GPU string operations
            # (Not all PyTorch versions/platforms support this)
            _ = torch.string
            use_gpu = True
            logger.info(f"üî• GPU detected and will be used for text cleaning")
        except AttributeError:
            logger.warning("‚ö†Ô∏è Windows detected: GPU not supported for text. Using CPU.")
    
    total_rows = len(df)
    
    # Delete existing cleaned file if it exists (start fresh)
    if os.path.exists(output_file):
        os.remove(output_file)
    
    # Create CSV file with header
    pd.DataFrame(columns=['cleaned_log']).to_csv(output_file, index=False)
    
    # Process data in batches with progress bar
    with tqdm(total=total_rows, unit="rows", desc="Cleaning") as pbar:
        for i in range(0, total_rows, batch_size):
            batch_end = min(i + batch_size, total_rows)
            
            # Extract current batch
            chunk = df['raw_log'].iloc[i:batch_end].fillna('').astype(str)
            
            # Try GPU processing first
            if use_gpu:
                try:
                    # Convert to GPU tensor
                    logs_tensor = torch.as_tensor(chunk.tolist(), dtype=torch.string, device=device)
                    cleaned_tensor = logs_tensor
                    
                    # Apply all regex patterns on GPU
                    for pat, repl in PATTERNS_PY.items():
                        cleaned_tensor = torch.strings.regex_replace(cleaned_tensor, pat, repl)
                    
                    # Convert back to pandas Series
                    cleaned_series = pd.Series(cleaned_tensor.tolist(), name='cleaned_log')
                    
                except Exception as e:
                    # GPU processing failed, fall back to CPU
                    logger.warning(f"GPU error ({e}), falling back to CPU...")
                    use_gpu = False
            
            # CPU processing (or GPU fallback)
            if not use_gpu:
                # Apply regex patterns sequentially using pandas string methods
                for pat, repl in PATTERNS_PY.items():
                    chunk = chunk.str.replace(pat, repl, regex=True)
                cleaned_series = chunk.rename('cleaned_log')
            
            # Append cleaned batch to CSV file (append mode = 'a')
            cleaned_series.to_csv(output_file, mode='a', header=False, index=False)
            
            # Update progress bar
            pbar.update(batch_end - i)
    
    logger.info(f"‚úÖ Cleaning complete. File saved: {output_file}")
    
    # Read the complete cleaned file back into memory
    logger.info("üîÑ Reading cleaned file back into memory...")
    df_cleaned = pd.read_csv(output_file)

    return df_cleaned['cleaned_log']


## Data Cleaning Function Details

The `smart_clean_to_disk` function:
1. **Objective**: Clean a column of logs (raw_log) by replacing variable elements (dates, IP, IDs) with standardized tokens ([TIMESTAMP], etc.)
2. **Batch Processing**: Processes data in chunks to avoid memory saturation
3. **GPU Fallback**: Automatically switches to CPU processing if GPU text processing fails
4. **Progress Tracking**: Real-time progress bar with tqdm


In [None]:
logger.info(f"Data shape before cleaning: {df.shape}")


2026-01-02 15:23:42,500 - INFO - Data shape before cleaning: (246979255, 1)


In [None]:
# Load or compute cleaned logs
# NOTE: Skip pickle checkpoint (too large). Load from CSV temp file if available
checkpoint_file_csv = f"{CHECKPOINT_DIR}/temp_cleaned.csv"
if os.path.exists(checkpoint_file_csv):
    try:
        logger.info(f"üìÇ Loading cached cleaned data from CSV...")
        df['cleaned_log'] = pd.read_csv(checkpoint_file_csv)['cleaned_log']
        logger.info(f"‚úÖ Loaded cleaned data from CSV checkpoint.")
    except Exception as e:
        logger.warning(f"‚ö†Ô∏è Failed to load CSV ({e}). Recomputing...")
        logger.info(f"üîÑ Computing cleaned logs from scratch...")
        df['cleaned_log'] = smart_clean_to_disk(df)
else:
    logger.info(f"üîÑ Computing cleaned logs from scratch...")
    df['cleaned_log'] = smart_clean_to_disk(df)


2026-01-02 15:23:42,521 - INFO - üìÇ Loading cached cleaned data from CSV...
2026-01-02 15:32:24,184 - INFO - ‚úÖ Loaded cleaned data from CSV checkpoint.


In [None]:
logger.info(f"Data shape after cleaning: {df.shape}")
# df[['raw_log', 'cleaned_log']].head()


2026-01-02 15:32:24,332 - INFO - Data shape after cleaning: (246979255, 2)


## 3. Build Vocabulary from Cleaned Logs


In [None]:
# ===== VOCABULARY CONFIGURATION =====
# The vocabulary is a dictionary mapping words to unique integer IDs
# This allows us to convert text into numbers that the neural network can process

VOCAB_SIZE_LIMIT = 50000    # Maximum words in dictionary (top 50k most common)
MIN_OCCURRENCES = 5         # Ignore rare words (appearing less than 5 times)
SAMPLE_SIZE = None          # None = Use ALL rows, or set a number for faster testing
TEXT_COLUMN = 'cleaned_log' # Which column contains the text to analyze
OUTPUT_FILE = f"{CHECKPOINT_DIR}/vocab_full.json"  # Where to save vocabulary

# Special tokens
PAD_TOKEN = "<PAD>"  # Padding token (ID: 0) - used to fill short sequences to max length
UNK_TOKEN = "<UNK>"  # Unknown token (ID: 1) - used for words not in vocabulary

def build_vocabulary(df):
    """
    Builds a complete vocabulary from the cleaned log data.
    
    Purpose: Create a word ‚Üí ID mapping for converting text to numerical format.
    Only keep words that appear frequently enough to be meaningful.
    
    Workflow:
        1. Count how many times each word appears in the entire dataset
        2. Filter out rare words (noise/typos)
        3. Keep only the top N most common words
        4. Create bidirectional dictionaries (word‚ÜíID and ID‚Üíword)
        5. Save to JSON for reuse
    
    Args:
        df: DataFrame containing cleaned logs
    
    Returns:
        vocab_dict: Dictionary mapping words to IDs
    """
    print(f"üîé Starting vocabulary construction...")
    
    # Step A: Data sampling (optional)
    if SAMPLE_SIZE and len(df) > SAMPLE_SIZE:
        print(f"‚ö° Sampling: Using {SAMPLE_SIZE} random rows (faster testing).")
        work_df = df.sample(n=SAMPLE_SIZE, random_state=42)
    else:
        print(f"‚ö° FULL MODE: Processing all {len(df):,} rows.")
        work_df = df

    # Step B: Count word occurrences
    # Counter is a specialized dictionary for counting hashable objects
    counter = collections.Counter()
    
    print("üìä Analyzing words in progress...")
    
    # Iterate through each log and count words
    for text in tqdm(work_df[TEXT_COLUMN], desc="Scanning Logs"):
        if isinstance(text, str):
            tokens = text.split()  # Split on whitespace
            counter.update(tokens)  # Add word counts to counter

    print(f"\n‚úÖ Analysis complete. Found {len(counter):,} unique words.")

    # Step C: Filtering and Sorting
    print(f"‚úÇÔ∏è Filtering: Keeping words that appear > {MIN_OCCURRENCES} times...")
    
    # Get the most common words, limited by VOCAB_SIZE_LIMIT
    # Subtract 2 to leave room for PAD and UNK tokens
    most_common = counter.most_common(VOCAB_SIZE_LIMIT - 2)
    
    # Filter out rare words
    filtered_vocab = [word for word, count in most_common if count >= MIN_OCCURRENCES]
    
    print(f"‚úÖ Final vocabulary: {len(filtered_vocab):,} words.")

    # Step D: Build ID ‚Üî Word dictionaries
    # vocab_list = [PAD, UNK, word1, word2, ...]
    vocab_list = [PAD_TOKEN, UNK_TOKEN] + filtered_vocab
    
    # Create word ‚Üí ID mapping (for encoding)
    vocab_dict = {word: idx for idx, word in enumerate(vocab_list)}
    
    # Create ID ‚Üí word mapping (for decoding)
    inverse_vocab_dict = {idx: word for word, idx in vocab_dict.items()}
    
    # Step E: Save to JSON for future use
    vocab_data = {
        "vocab": vocab_dict,           # word ‚Üí ID
        "inverse_vocab": inverse_vocab_dict,  # ID ‚Üí word
        "size": len(vocab_list),       # Total vocabulary size
        "pad_token": PAD_TOKEN,        # What token is used for padding
        "unk_token": UNK_TOKEN,        # What token is used for unknowns
        "min_occurrences": MIN_OCCURRENCES  # Filtering threshold
    }
    
    print(f"üíæ Saving to '{OUTPUT_FILE}'...")
    with open(OUTPUT_FILE, "w") as f:
        json.dump(vocab_data, f, indent=4)
        
    print("‚úÖ Vocabulary saved successfully!")
    
    # Print top 10 words (excluding special tokens)
    print("\n--- TOP 10 MOST COMMON WORDS ---")
    for word, idx in list(vocab_dict.items())[2:12]:  # Skip PAD and UNK
        print(f"{word:20s} (ID: {idx})")

    return vocab_dict


In [None]:
# Load or compute vocabulary
if os.path.exists(OUTPUT_FILE):
    logger.info(f"üìÇ Loading cached vocabulary from checkpoint...")
    with open(OUTPUT_FILE, 'r') as f:
        vocab_data = json.load(f)
    vocab = vocab_data['vocab']
    logger.info(f"‚úÖ Loaded vocabulary from checkpoint (Size: {vocab_data['size']}).")
else:
    logger.info(f"üîÑ Computing vocabulary from scratch...")
    vocab = build_vocabulary(df)
    logger.info(f"üíæ Saved checkpoint: vocab.json")


2026-01-02 15:32:24,422 - INFO - üìÇ Loading cached vocabulary from checkpoint...
2026-01-02 15:32:24,500 - INFO - ‚úÖ Loaded vocabulary from checkpoint (Size: 50000).


In [None]:
# ===== ENCODING CONFIGURATION =====
VOCAB_FILE = f"{CHECKPOINT_DIR}/vocab_full.json"
INPUT_COL = "cleaned_log"
BATCH_SIZE = 10_000       # Traitement par lot pour ne pas exploser la RAM (r√©duit pour √©viter OOM)
MAX_LEN = 128            # Longueur maximale d'un log (tronquer ou pad)

def encode_logs(df):
    """
    Convertit la colonne cleaned_log en matrices NumPy (input_ids)
    pr√™tes pour PyTorch.
    """
    
    # 1. Charger le vocabulaire
    logger.info(f"üìÇ Chargement du vocabulaire depuis {VOCAB_FILE}...")
    with open(VOCAB_FILE, 'r') as f:
        vocab_data = json.load(f)
    
    vocab = vocab_data['vocab']
    unk_id = vocab[vocab_data['unk_token']] # G√©n√©ralement 1
    pad_id = vocab[vocab_data['pad_token']] # G√©n√©ralement 0
    
    logger.info(f"‚úÖ Vocabulaire charg√© (Taille: {vocab_data['size']}).")
    logger.info(f"üîß Configuration: Max_Len={MAX_LEN}, Batch_Size={BATCH_SIZE}")

    all_input_ids = []
    
    total_rows = len(df)
    
    # Fonction optimis√©e pour encoder une seule phrase
    def text_to_ids(text):
        if not isinstance(text, str): return [pad_id]
        return [vocab.get(word, unk_id) for word in text.split()]

    # Boucle de traitement par Batches
    with tqdm(total=total_rows, unit="rows", desc="Encodage") as pbar:
        
        for i in range(0, total_rows, BATCH_SIZE):
            batch_end = min(i + BATCH_SIZE, total_rows)
            
            # Extraire le batch
            chunk_texts = df[INPUT_COL].iloc[i:batch_end]
            
            # Encoder (Map sur la s√©rie)
            encoded_lists = chunk_texts.map(text_to_ids)
            
            # Padding & Truncation
            padded_matrix = np.zeros((len(chunk_texts), MAX_LEN), dtype=np.int32)
            
            for idx, ids in enumerate(encoded_lists):
                seq = ids[:MAX_LEN]
                padded_matrix[idx, :len(seq)] = seq
            
            all_input_ids.append(padded_matrix)
            
            pbar.update(len(chunk_texts))

    # 4. Concat√©nation finale
    logger.info("üß© Concat√©nation des batches...")
    final_matrix = np.concatenate(all_input_ids, axis=0)
    
    logger.info(f"‚úÖ Encodage termin√©. Shape finale: {final_matrix.shape}")
    return final_matrix


In [None]:


# Load or compute encoded logs
checkpoint_path = f"{CHECKPOINT_DIR}/encoded_logs.npy"
if os.path.exists(checkpoint_path):
    logger.info(f"üìÇ Loading cached encoded logs from checkpoint...")
    input_ids_matrix = np.load(checkpoint_path, mmap_mode='r')  # Memory-mapped for large files
    logger.info(f"‚úÖ Loaded encoded logs from checkpoint. Shape: {input_ids_matrix.shape}")
else:
    logger.info(f"üîÑ Computing encoded logs from scratch...")
    
    # Process in smaller batches and save directly to disk
    temp_file = f"{CHECKPOINT_DIR}/encoded_logs_temp.npy"
    
    # First pass: compute and save batches incrementally
    logger.info(f"üìÇ Chargement du vocabulaire depuis {VOCAB_FILE}...")
    with open(VOCAB_FILE, 'r') as f:
        vocab_data_local = json.load(f)
    
    vocab_local = vocab_data_local['vocab']
    unk_id = vocab_local[vocab_data_local['unk_token']]
    pad_id = vocab_local[vocab_data_local['pad_token']]
    
    total_rows = len(df)
    ENCODE_BATCH_SIZE = 5000  # Reduced batch size to prevent OOM
    
    # Create memory-mapped array for incremental writing
    input_ids_matrix = np.lib.format.open_memmap(
        temp_file, 
        mode='w+', 
        dtype=np.int32, 
        shape=(total_rows, MAX_LEN)
    )
    
    def text_to_ids_local(text):
        if not isinstance(text, str): return [pad_id]
        return [vocab_local.get(word, unk_id) for word in text.split()]
    
    with tqdm(total=total_rows, unit="rows", desc="Encodage") as pbar:
        for i in range(0, total_rows, ENCODE_BATCH_SIZE):
            batch_end = min(i + ENCODE_BATCH_SIZE, total_rows)
            chunk_texts = df[INPUT_COL].iloc[i:batch_end]
            encoded_lists = chunk_texts.map(text_to_ids_local)
            
            for idx, ids in enumerate(encoded_lists):
                seq = ids[:MAX_LEN]
                input_ids_matrix[i + idx, :len(seq)] = seq
            
            # Force flush to disk periodically
            if i % (ENCODE_BATCH_SIZE * 10) == 0:
                del input_ids_matrix
                gc.collect()
                input_ids_matrix = np.lib.format.open_memmap(temp_file, mode='r+', dtype=np.int32)
            
            pbar.update(batch_end - i)
    
    # Flush and rename
    del input_ids_matrix
    gc.collect()
    
    shutil.move(temp_file, checkpoint_path)
    logger.info(f"üíæ Saved checkpoint: encoded_logs.npy")
    
    # Reload as memory-mapped
    input_ids_matrix = np.load(checkpoint_path, mmap_mode='r')
    logger.info(f"‚úÖ Encodage termin√©. Shape finale: {input_ids_matrix.shape}")


2026-01-02 15:32:24,536 - INFO - üìÇ Loading cached encoded logs from checkpoint...
2026-01-02 15:32:24,551 - INFO - ‚úÖ Loaded encoded logs from checkpoint. Shape: (246979255, 128)


In [None]:
# ===== CHECK ENCODED DATA SIZE =====
logger.info(f"üîç Checking encoded logs shape and memory usage...")
logger.info(f"Shape: {input_ids_matrix.shape}")
logger.info(f"Dtype: {input_ids_matrix.dtype}")
logger.info(f"Memory usage: {input_ids_matrix.nbytes / 1e9:.2f} GB")
logger.info(f"Total elements: {input_ids_matrix.size:,}")

# If too large, use only first N samples
MAX_SAMPLES_FOR_TRAINING = 500_000  # Reduced from 2M - batch 64 requires ~30 min per epoch otherwise

if len(input_ids_matrix) > MAX_SAMPLES_FOR_TRAINING:
    logger.warning(f"‚ö†Ô∏è DATASET TOO LARGE! Using only first {MAX_SAMPLES_FOR_TRAINING:,} samples")
    logger.info(f"Original: {len(input_ids_matrix):,} samples ‚Üí Now: {MAX_SAMPLES_FOR_TRAINING:,} samples")
    input_ids_matrix = input_ids_matrix[:MAX_SAMPLES_FOR_TRAINING]
    logger.info(f"New shape: {input_ids_matrix.shape}")

2026-01-02 15:32:24,581 - INFO - üîç Checking encoded logs shape and memory usage...
2026-01-02 15:32:24,583 - INFO - Shape: (246979255, 128)
2026-01-02 15:32:24,584 - INFO - Dtype: int32
2026-01-02 15:32:24,584 - INFO - Memory usage: 126.45 GB
2026-01-02 15:32:24,584 - INFO - Total elements: 31,613,344,640
2026-01-02 15:32:24,584 - INFO - Original: 246,979,255 samples ‚Üí Now: 500,000 samples
2026-01-02 15:32:24,584 - INFO - New shape: (500000, 128)


## 4. Model Training

Now that we have encoded logs, we'll train an **Autoencoder** to learn the normal structure of logs.

The model will:
1. **Compress** each log into a latent representation (via LSTM Encoder)
2. **Reconstruct** it back to the original vocabulary distribution (Decoder)
3. Learn by minimizing reconstruction error

After training, normal logs will have low reconstruction error, while anomalies will have high error.


In [None]:
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    logger.info(f"üßπ GPU cache cleared before model initialization")
import gc
gc.collect()

NameError: name 'torch' is not defined

In [None]:
# ===== DEFINE AUTOENCODER MODEL =====
import torch
import torch.nn as nn
import torch.optim as optim

# 1. D√©finir PAD_IDX (Id√©alement 0 comme dans l'embedding)
PAD_IDX = 0 
VOCAB_SIZE = vocab_data['size']

class LogAutoEncoder(nn.Module):
    def __init__(self, vocab_size, embed_dim=48, latent_dim=24):
        super(LogAutoEncoder, self).__init__()
        
        # 1. Couche d'Embedding
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        
        # 2. Encodeur LSTM
        # On compresse la s√©quence vers latent_dim (ex: 24)
        self.encoder = nn.LSTM(embed_dim, latent_dim, batch_first=True, num_layers=1)
        
        # 3. D√©codeur LSTM
        # Input size = latent_dim (what we feed it)
        # Hidden size = embed_dim (internal representation)
        self.decoder = nn.LSTM(latent_dim, embed_dim, batch_first=True, num_layers=1)
        
        # 4. Couche de sortie finale
        self.output_linear = nn.Linear(embed_dim, vocab_size)
        
        self.max_len = 128

    def forward(self, x):
        # x shape: [batch_size, seq_len]
        embedded = self.embedding(x) # [batch, seq_len, embed_dim]
        
        # --- ENCODAGE ---
        _, (h_n, c_n) = self.encoder(embedded)
        # h_n shape: [1, batch, latent_dim]
        latent = h_n.squeeze(0) # [batch, latent_dim]
        
        # --- D√âCODAGE ---
        # On doit r√©p√©ter le vecteur latent pour chaque pas de temps
        latent_repeated = latent.unsqueeze(1).expand(-1, self.max_len, -1) # [batch, seq_len, latent_dim]
        
        # On passe au d√©codeur sans √©tat cach√© (il en cr√©e un nouveau)
        decoded, _ = self.decoder(latent_repeated)
        
        # Projection finale vers le vocabulaire
        reconstruction = self.output_linear(decoded) # [batch, seq_len, vocab_size]
        
        return reconstruction, latent

# Clear GPU cache and CPU memory before model init
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    logger.info(f"üßπ GPU cache cleared before model initialization")
import gc
gc.collect()

# Initialize model
logger.info(f"ü§ñ Initializing LogAutoEncoder...")
model = LogAutoEncoder(
    vocab_size=VOCAB_SIZE,
    embed_dim=48,  # Reduced from 64 for faster training
    latent_dim=24  # Reduced from 32 for faster training
).to(DEVICE)

# Count parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
logger.info(f"üìà Total parameters: {total_params:,}")
logger.info(f"üìà Trainable parameters: {trainable_params:,}")


# Optimizer with higher learning rate for faster convergence
optimizer = optim.Adam(model.parameters(), lr=0.002, weight_decay=1e-5)
logger.info(f"‚úÖ Model ready for training on {DEVICE}")

# Loss function - CrossEntropy (standard for reconstruction)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

2026-01-02 15:32:24,859 - INFO - üßπ GPU cache cleared before model initialization
2026-01-02 15:32:24,964 - INFO - ü§ñ Initializing LogAutoEncoder...
2026-01-02 15:32:25,299 - INFO - üìà Total parameters: 4,871,312
2026-01-02 15:32:25,302 - INFO - üìà Trainable parameters: 4,871,312
2026-01-02 15:32:26,899 - INFO - ‚úÖ Model ready for training on cuda


## 4.1 Training Parameters Optimization for RTX 4090

For faster training on RTX 4090 (24GB VRAM) - Complete in ~2 hours



In [None]:
# ===== OPTIMIZED PARAMETERS FOR RTX 4090 (24GB VRAM) =====
# Target: Complete training in ~2 hours

logger.info("\n" + "="*70)
logger.info("‚ö° RTX 4090 OPTIMIZED TRAINING CONFIGURATION")
logger.info("="*70)

# Check GPU capabilities
if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name(0)
    total_vram = torch.cuda.get_device_properties(0).total_memory / 1e9
    logger.info(f"GPU: {gpu_name}")
    logger.info(f"VRAM: {total_vram:.1f} GB")
    
    if total_vram >= 24:
        logger.info("‚úÖ RTX 4090 Detected! Using aggressive optimization.")
        # RTX 4090 (24GB) Optimized Settings
        EPOCHS_RTX4090 = 10              # Fewer epochs but larger batches
        TRAIN_BATCH_SIZE_RTX4090 = 1024  # Much larger batch (RTX 4090 can handle)
        LEARNING_RATE_RTX4090 = 0.004    # Higher LR for faster convergence
        USE_MIXED_PRECISION_RTX4090 = True
        GRADIENT_ACCUMULATION_STEPS_RTX4090 = 1
        NUM_WORKERS_RTX4090 = 2          # 4 workers for parallel data loading (~45min total vs 2.3h with 0)
        
        logger.info(f"\nüî• Optimized Parameters:")
        logger.info(f"   Epochs: {EPOCHS_RTX4090}")
        logger.info(f"   Batch Size: {TRAIN_BATCH_SIZE_RTX4090}")
        logger.info(f"   Learning Rate: {LEARNING_RATE_RTX4090}")
        logger.info(f"   Mixed Precision: {USE_MIXED_PRECISION_RTX4090}")
        logger.info(f"   Num Workers: {NUM_WORKERS_RTX4090}")
        
        # Calculate estimated time
        num_batches_per_epoch = len(dataset) // TRAIN_BATCH_SIZE_RTX4090
        time_per_batch_ms = 10  # RTX 4090 @ ~100 it/s
        time_per_epoch_min = (num_batches_per_epoch * time_per_batch_ms) / 60000
        total_time_min = time_per_epoch_min * EPOCHS_RTX4090
        
        logger.info(f"\n‚è±Ô∏è  Estimated Training Time:")
        logger.info(f"   Per epoch: ~{time_per_epoch_min:.0f} minutes")
        logger.info(f"   Total ({EPOCHS_RTX4090} epochs): ~{total_time_min:.0f} minutes (~{total_time_min/60:.1f} hours)")
        logger.info(f"   Should complete in: < 2 hours ‚úÖ")
        
    elif total_vram >= 16:
        logger.warning("‚ö†Ô∏è RTX 3090/4080 Detected (16GB). Using moderate optimization.")
        EPOCHS_RTX4090 = 12
        TRAIN_BATCH_SIZE_RTX4090 = 512
        LEARNING_RATE_RTX4090 = 0.003
        USE_MIXED_PRECISION_RTX4090 = True
        GRADIENT_ACCUMULATION_STEPS_RTX4090 = 1
        NUM_WORKERS_RTX4090 = 2
    else:
        logger.warning("‚ö†Ô∏è Limited VRAM detected. Using conservative settings.")
        EPOCHS_RTX4090 = 15
        TRAIN_BATCH_SIZE_RTX4090 = 256
        LEARNING_RATE_RTX4090 = 0.002
        USE_MIXED_PRECISION_RTX4090 = True
        GRADIENT_ACCUMULATION_STEPS_RTX4090 = 1
        NUM_WORKERS_RTX4090 = 0

else:
    logger.warning("‚ùå No GPU detected. Training will be slow!")
    EPOCHS_RTX4090 = 15
    TRAIN_BATCH_SIZE_RTX4090 = 64
    LEARNING_RATE_RTX4090 = 0.002
    USE_MIXED_PRECISION_RTX4090 = False
    GRADIENT_ACCUMULATION_STEPS_RTX4090 = 2
    NUM_WORKERS_RTX4090 = 0


2026-01-02 15:32:26,917 - INFO - 
2026-01-02 15:32:26,917 - INFO - ‚ö° RTX 4090 OPTIMIZED TRAINING CONFIGURATION
2026-01-02 15:32:26,917 - INFO - GPU: NVIDIA GeForce RTX 4090 Laptop GPU
2026-01-02 15:32:26,917 - INFO - VRAM: 17.2 GB


In [None]:
# ===== PREREQUISITE CHECK: ENSURE ALL REQUIRED VARIABLES EXIST =====

logger.info("\n" + "="*70)
logger.info("‚úÖ CHECKING PREREQUISITES FOR TRAINING")
logger.info("="*70)

# Check if dataset exists, if not create it
if 'dataset' not in locals() or 'dataset' not in globals():
    logger.warning("‚ö†Ô∏è Dataset not found! Creating it from encoded logs...")
    
    # Need to ensure LogDataset class is defined
    from torch.utils.data import Dataset
    
    class LogDataset(Dataset):
        def __init__(self, data):
            self.data = data
        
        def __len__(self):
            return len(self.data)
        
        def __getitem__(self, idx):
            return torch.tensor(self.data[idx], dtype=torch.long)
    
    dataset = LogDataset(input_ids_matrix)
    logger.info(f"‚úÖ Created dataset with {len(dataset):,} samples")
else:
    logger.info(f"‚úÖ Dataset already exists: {len(dataset):,} samples")

# Check if model exists
if 'model' not in locals() or 'model' not in globals():
    logger.warning("‚ö†Ô∏è Model not found! Creating a new one...")
    
    # Define the model class
    class LogAutoEncoder(nn.Module):
        def __init__(self, vocab_size, embed_dim=48, latent_dim=24):
            super(LogAutoEncoder, self).__init__()
            self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
            self.encoder = nn.LSTM(embed_dim, latent_dim, batch_first=True, num_layers=1)
            self.decoder = nn.LSTM(latent_dim, embed_dim, batch_first=True, num_layers=1)
            self.output_linear = nn.Linear(embed_dim, vocab_size)
            self.max_len = 128

        def forward(self, x):
            embedded = self.embedding(x)
            _, (h_n, c_n) = self.encoder(embedded)
            latent = h_n.squeeze(0)
            latent_repeated = latent.unsqueeze(1).expand(-1, self.max_len, -1)
            decoded, _ = self.decoder(latent_repeated)
            reconstruction = self.output_linear(decoded)
            return reconstruction, latent
    
    # Create new model
    model = LogAutoEncoder(
        vocab_size=VOCAB_SIZE,
        embed_dim=48,
        latent_dim=24
    ).to(DEVICE)
    logger.info(f"‚úÖ Created new model with {sum(p.numel() for p in model.parameters()):,} parameters")
else:
    logger.info(f"‚úÖ Model already loaded")

# Check if optimizer exists
if 'optimizer' not in locals() or 'optimizer' not in globals():
    logger.warning("‚ö†Ô∏è Optimizer not found! Creating a new one...")
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE_RTX4090, weight_decay=1e-5)
    logger.info(f"‚úÖ Created optimizer with LR={LEARNING_RATE_RTX4090}")
else:
    logger.info(f"‚úÖ Optimizer already exists")

# Check if criterion exists
if 'criterion' not in locals() or 'criterion' not in globals():
    logger.warning("‚ö†Ô∏è Loss function not found! Creating CrossEntropyLoss...")
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    logger.info(f"‚úÖ Created CrossEntropyLoss criterion")
else:
    logger.info(f"‚úÖ Criterion already exists")

# Check if scheduler exists (optional but recommended)
if 'scheduler' not in locals() or 'scheduler' not in globals():
    logger.warning("‚ö†Ô∏è Scheduler not found! Creating CosineAnnealingLR...")
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS_RTX4090, eta_min=1e-6)
    logger.info(f"‚úÖ Created CosineAnnealingLR scheduler")
else:
    logger.info(f"‚úÖ Scheduler already exists")

logger.info(f"\n‚ú® All prerequisites satisfied! Ready to train.")
logger.info(f"üìä Dataset size: {len(dataset):,}")
logger.info(f"ü§ñ Model parameters: {sum(p.numel() for p in model.parameters()):,}")
logger.info(f"üîß Device: {DEVICE}")


2026-01-02 15:32:26,950 - INFO - 
2026-01-02 15:32:26,950 - INFO - ‚úÖ CHECKING PREREQUISITES FOR TRAINING
2026-01-02 15:32:26,954 - INFO - ‚úÖ Created dataset with 500,000 samples
2026-01-02 15:32:26,956 - INFO - ‚úÖ Model already loaded
2026-01-02 15:32:26,958 - INFO - ‚úÖ Optimizer already exists
2026-01-02 15:32:26,959 - INFO - ‚úÖ Criterion already exists
2026-01-02 15:32:26,959 - INFO - ‚úÖ Created CosineAnnealingLR scheduler
2026-01-02 15:32:26,961 - INFO - 
‚ú® All prerequisites satisfied! Ready to train.
2026-01-02 15:32:26,961 - INFO - üìä Dataset size: 500,000
2026-01-02 15:32:26,963 - INFO - ü§ñ Model parameters: 4,871,312
2026-01-02 15:32:26,963 - INFO - üîß Device: cuda


In [None]:
"""# ===== OPTIMIZED TRAINING WITH RTX 4090 SETTINGS =====

import torch
import torch.nn as nn
import torch.optim as optim
import os
import logging
import time
from tqdm.auto import tqdm
from torch.amp import autocast, GradScaler
from torch.optim.lr_scheduler import CosineAnnealingLR

# Use optimized parameters
EPOCHS = EPOCHS_RTX4090
TRAIN_BATCH_SIZE = TRAIN_BATCH_SIZE_RTX4090
LEARNING_RATE = LEARNING_RATE_RTX4090
USE_MIXED_PRECISION = USE_MIXED_PRECISION_RTX4090
GRADIENT_ACCUMULATION_STEPS = GRADIENT_ACCUMULATION_STEPS_RTX4090

logger.info(f"\nüöÄ STARTING OPTIMIZED TRAINING")
logger.info(f"="*70)

# Create optimized dataloader
dataloader_optimized = DataLoader(
    dataset, 
    batch_size=TRAIN_BATCH_SIZE, 
    shuffle=True, 
    num_workers=NUM_WORKERS_RTX4090,
    pin_memory=True,
    persistent_workers=True if NUM_WORKERS_RTX4090 > 0 else False
)

logger.info(f"üìä Dataset: {len(dataset):,} samples")
logger.info(f"üì¶ Batch Size: {TRAIN_BATCH_SIZE}")
logger.info(f"üìà Batches per epoch: {len(dataloader_optimized)}")
logger.info(f"‚è±Ô∏è  Estimated epoch time: ~{len(dataloader_optimized) / 100:.1f} minutes")
logger.info(f"üîã Total estimated time: ~{(len(dataloader_optimized) / 100) * EPOCHS / 60:.1f} hours")

# Reinitialize optimizer with new learning rate
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-5)
scheduler = CosineAnnealingLR(optimizer, T_max=EPOCHS, eta_min=1e-6)

# GPU optimization flags
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.synchronize()  # Sync before timing
    
    # Enable cuDNN auto-tuner for maximum performance
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = False
    
    logger.info(f"üî• GPU Optimizations Enabled:")
    logger.info(f"   cuDNN Benchmark: True")
    logger.info(f"   TF32: True")

scaler = GradScaler("cuda") if USE_MIXED_PRECISION else None

train_losses = []
start_time = time.time()

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0.0
    num_batches = 0
    epoch_start = time.time()
    
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()
    
    pbar = tqdm(
        enumerate(dataloader_optimized),
        total=len(dataloader_optimized),
        desc=f"Epoch {epoch+1}/{EPOCHS}",
        leave=True,
        ncols=120  # Wider progress bar
    )
    
    for batch_idx, batch_data in pbar:
        batch_data = batch_data.to(DEVICE, non_blocking=True)
        
        if USE_MIXED_PRECISION:
            with autocast(device_type="cuda", dtype=torch.float16):
                reconstruction, latent = model(batch_data)
                batch_size, seq_len, vocab_size = reconstruction.shape
                
                reconstruction_flat = reconstruction.view(-1, vocab_size)
                target_flat = batch_data.view(-1)
                loss = criterion(reconstruction_flat, target_flat)
                
                loss = loss / GRADIENT_ACCUMULATION_STEPS
            
            scaler.scale(loss).backward()
        else:
            optimizer.zero_grad()
            reconstruction, latent = model(batch_data)
            batch_size, seq_len, vocab_size = reconstruction.shape
            
            reconstruction_flat = reconstruction.view(-1, vocab_size)
            target_flat = batch_data.view(-1)
            loss = criterion(reconstruction_flat, target_flat)
            loss = loss / GRADIENT_ACCUMULATION_STEPS
            loss.backward()
        
        if (batch_idx + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
            if USE_MIXED_PRECISION:
                scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            
            if USE_MIXED_PRECISION:
                scaler.step(optimizer)
                scaler.update()
            else:
                optimizer.step()
            
            optimizer.zero_grad()
        
        total_loss += loss.item() * GRADIENT_ACCUMULATION_STEPS
        num_batches += 1
        
        # Get GPU memory info
        gpu_mem_used = 0
        gpu_mem_total = 0
        if torch.cuda.is_available():
            gpu_mem_used = torch.cuda.memory_allocated(0) / 1e9
            gpu_mem_total = torch.cuda.get_device_properties(0).total_memory / 1e9
        
        # Update progress bar with detailed info
        pbar.set_postfix({
            'loss': f'{loss.item() * GRADIENT_ACCUMULATION_STEPS:.4f}',
            'lr': f'{optimizer.param_groups[0]["lr"]:.6f}',
            f'GPU': f'{gpu_mem_used:.1f}/{gpu_mem_total:.1f}GB',
            'speed': f'{pbar.format_dict["rate"]:.1f}it/s' if pbar.format_dict["rate"] else 'N/A'
        })
    
    scheduler.step()
    
    avg_loss = total_loss / num_batches
    train_losses.append(avg_loss)
    
    epoch_time = time.time() - epoch_start
    current_lr = optimizer.param_groups[0]['lr']
    
    logger.info(f"‚úÖ Epoch {epoch+1}/{EPOCHS} | Loss: {avg_loss:.4f} | LR: {current_lr:.6f} | Time: {epoch_time:.1f}s")
    
    # Save checkpoint
    if (epoch + 1) % 1 == 0:  # Save every epoch
        checkpoint_path = f"{CHECKPOINT_DIR}/autoencoder_rtx4090_epoch_{epoch+1}.pth"
        
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'loss': avg_loss,
            'vocab_size': VOCAB_SIZE,
            'embed_dim': 48,
            'latent_dim': 24,
            'max_len': MAX_LEN,
        }, checkpoint_path)
        logger.info(f"üíæ Saved: {checkpoint_path}")

total_time = time.time() - start_time
logger.info(f"\nüéâ TRAINING COMPLETE!")
logger.info(f"="*70)
logger.info(f"‚è±Ô∏è  Total time: {total_time/60:.1f} minutes ({total_time/3600:.2f} hours)")
logger.info(f"üìä Final loss: {train_losses[-1]:.4f}")
if torch.cuda.is_available():
    logger.info(f"üñ•Ô∏è Peak GPU Memory: {torch.cuda.max_memory_allocated(0) / 1e9:.2f} GB")"""


'# ===== OPTIMIZED TRAINING WITH RTX 4090 SETTINGS =====\n\nimport torch\nimport torch.nn as nn\nimport torch.optim as optim\nimport os\nimport logging\nimport time\nfrom tqdm.auto import tqdm\nfrom torch.amp import autocast, GradScaler\nfrom torch.optim.lr_scheduler import CosineAnnealingLR\n\n# Use optimized parameters\nEPOCHS = EPOCHS_RTX4090\nTRAIN_BATCH_SIZE = TRAIN_BATCH_SIZE_RTX4090\nLEARNING_RATE = LEARNING_RATE_RTX4090\nUSE_MIXED_PRECISION = USE_MIXED_PRECISION_RTX4090\nGRADIENT_ACCUMULATION_STEPS = GRADIENT_ACCUMULATION_STEPS_RTX4090\n\nlogger.info(f"\nüöÄ STARTING OPTIMIZED TRAINING")\nlogger.info(f"="*70)\n\n# Create optimized dataloader\ndataloader_optimized = DataLoader(\n    dataset, \n    batch_size=TRAIN_BATCH_SIZE, \n    shuffle=True, \n    num_workers=NUM_WORKERS_RTX4090,\n    pin_memory=True,\n    persistent_workers=True if NUM_WORKERS_RTX4090 > 0 else False\n)\n\nlogger.info(f"üìä Dataset: {len(dataset):,} samples")\nlogger.info(f"üì¶ Batch Size: {TRAIN_BATCH_SI

In [None]:
# ===== LOAD ALL REQUIRED VARIABLES FOR TRAINING =====

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.cuda.amp import autocast, GradScaler
from torch.optim.lr_scheduler import CosineAnnealingLR
import time
import json
import os
import gc
from tqdm import tqdm
import logging
import pandas as pd

logger.info("\n" + "="*80)
logger.info("üì¶ LOADING ALL TRAINING VARIABLES & DATA")
logger.info("="*80)

# ===== CHECKPOINT & DATA PATHS =====
CHECKPOINT_DIR = "./checkpoints"
VOCAB_FILE = f"{CHECKPOINT_DIR}/vocab_full.json"
ENCODED_LOGS_FILE = f"{CHECKPOINT_DIR}/encoded_logs.npy"
CLEANED_DATA_FILE = f"{CHECKPOINT_DIR}/temp_cleaned.csv"

os.makedirs(CHECKPOINT_DIR, exist_ok=True)
logger.info(f"üìÅ Checkpoint directory: {CHECKPOINT_DIR}")

# ===== DEVICE CONFIGURATION =====
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"üñ•Ô∏è  Device: {DEVICE}")

# ===== LOAD VOCABULARY =====
logger.info(f"\nüìö Loading vocabulary from {VOCAB_FILE}...")
if not os.path.exists(VOCAB_FILE):
    raise FileNotFoundError(f"‚ùå Vocabulary file not found: {VOCAB_FILE}\n"
                          "Please run the vocabulary building cell first!")

with open(VOCAB_FILE, 'r') as f:
    vocab_data = json.load(f)

vocab_dict = vocab_data["vocab"]
inverse_vocab = vocab_data["inverse_vocab"]
VOCAB_SIZE = vocab_data["size"]
PAD_TOKEN = vocab_data["pad_token"]
UNK_TOKEN = vocab_data["unk_token"]

pad_id = vocab_dict[PAD_TOKEN]
unk_id = vocab_dict[UNK_TOKEN]

logger.info(f"‚úÖ Vocabulary loaded: {VOCAB_SIZE:,} tokens")
logger.info(f"   PAD token: '{PAD_TOKEN}' (ID: {pad_id})")
logger.info(f"   UNK token: '{UNK_TOKEN}' (ID: {unk_id})")

# ===== LOAD ENCODED LOGS =====
logger.info(f"\nüìä Loading encoded logs from {ENCODED_LOGS_FILE}...")
if not os.path.exists(ENCODED_LOGS_FILE):
    raise FileNotFoundError(f"‚ùå Encoded logs file not found: {ENCODED_LOGS_FILE}\n"
                          "Please run the encoding cell first!")

# Memory-map large file to avoid loading entire array into RAM
input_ids_matrix = np.load(ENCODED_LOGS_FILE, mmap_mode='r')
logger.info(f"‚úÖ Encoded logs loaded (memory-mapped): {input_ids_matrix.shape}")
logger.info(f"   Dtype: {input_ids_matrix.dtype}")
logger.info(f"   Memory: {input_ids_matrix.nbytes / 1e9:.2f} GB")

# ===== LOAD CLEANED DATA (for reference) =====
logger.info(f"\nüìÑ Loading cleaned logs from {CLEANED_DATA_FILE}...")
if os.path.exists(CLEANED_DATA_FILE):
    try:
        df_cleaned = pd.read_csv(CLEANED_DATA_FILE)
        logger.info(f"‚úÖ Cleaned data loaded: {len(df_cleaned):,} rows")
    except Exception as e:
        logger.warning(f"‚ö†Ô∏è Could not load cleaned data: {e}")
        df_cleaned = None
else:
    logger.warning(f"‚ö†Ô∏è Cleaned data file not found: {CLEANED_DATA_FILE}")
    df_cleaned = None

# ===== MODEL CONFIGURATION =====
MAX_LEN = 128
EMBED_DIM = 48
LATENT_DIM = 24

logger.info(f"\nü§ñ Model configuration:")
logger.info(f"   Vocab Size: {VOCAB_SIZE}")
logger.info(f"   Max Sequence Length: {MAX_LEN}")
logger.info(f"   Embedding Dimension: {EMBED_DIM}")
logger.info(f"   Latent Dimension: {LATENT_DIM}")

# ===== RTX 4090 OPTIMIZED HYPERPARAMETERS =====
# Auto-detect GPU capabilities
if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name(0)
    total_vram = torch.cuda.get_device_properties(0).total_memory / 1e9
    logger.info(f"\nüî• GPU Detected: {gpu_name} ({total_vram:.1f} GB VRAM)")
    
    if total_vram >= 24:
        EPOCHS_RTX4090 = 10
        TRAIN_BATCH_SIZE_RTX4090 = 1024
        LEARNING_RATE_RTX4090 = 0.004
        NUM_WORKERS_RTX4090 = 2
    elif total_vram >= 16:
        EPOCHS_RTX4090 = 12
        TRAIN_BATCH_SIZE_RTX4090 = 512
        LEARNING_RATE_RTX4090 = 0.003
        NUM_WORKERS_RTX4090 = 2
    else:
        EPOCHS_RTX4090 = 15
        TRAIN_BATCH_SIZE_RTX4090 = 256
        LEARNING_RATE_RTX4090 = 0.002
        NUM_WORKERS_RTX4090 = 0
else:
    logger.warning("‚ö†Ô∏è No GPU detected! Using CPU (training will be slow)")
    EPOCHS_RTX4090 = 15
    TRAIN_BATCH_SIZE_RTX4090 = 64
    LEARNING_RATE_RTX4090 = 0.002
    NUM_WORKERS_RTX4090 = 0

USE_MIXED_PRECISION_RTX4090 = True if torch.cuda.is_available() else False
GRADIENT_ACCUMULATION_STEPS_RTX4090 = 1

logger.info(f"‚ö° Training Configuration:")
logger.info(f"   Epochs: {EPOCHS_RTX4090}")
logger.info(f"   Batch Size: {TRAIN_BATCH_SIZE_RTX4090}")
logger.info(f"   Learning Rate: {LEARNING_RATE_RTX4090}")
logger.info(f"   Mixed Precision: {USE_MIXED_PRECISION_RTX4090}")
logger.info(f"   Gradient Accumulation Steps: {GRADIENT_ACCUMULATION_STEPS_RTX4090}")
logger.info(f"   Num Workers: {NUM_WORKERS_RTX4090}")

# ===== CREATE DATASET =====
logger.info(f"\nüì¶ Creating dataset...")

class LogDataset(Dataset):
    def __init__(self, data, max_len=MAX_LEN):
        self.data = data
        self.max_len = max_len
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        seq = self.data[idx]
        # Ensure sequence is proper length
        if len(seq) < self.max_len:
            seq = np.pad(seq, (0, self.max_len - len(seq)), constant_values=pad_id)
        else:
            seq = seq[:self.max_len]
        return torch.tensor(seq, dtype=torch.long)

dataset = LogDataset(input_ids_matrix)
logger.info(f"‚úÖ Dataset created: {len(dataset):,} samples")

# ===== CREATE OPTIMIZED DATALOADER =====
dataloader_optimized = DataLoader(
    dataset,
    batch_size=TRAIN_BATCH_SIZE_RTX4090,
    shuffle=True,
    num_workers=NUM_WORKERS_RTX4090,
    pin_memory=True if torch.cuda.is_available() else False,
    drop_last=True,
    persistent_workers=True if NUM_WORKERS_RTX4090 > 0 else False
)

logger.info(f"‚úÖ DataLoader created:")
logger.info(f"   Batch Size: {TRAIN_BATCH_SIZE_RTX4090}")
logger.info(f"   Batches per Epoch: {len(dataloader_optimized)}")
logger.info(f"   Num Workers: {NUM_WORKERS_RTX4090}")

# ===== CREATE MODEL =====
logger.info(f"\nü§ñ Creating model...")

class LogAutoEncoder(nn.Module):
    def __init__(self, vocab_size, embed_dim=EMBED_DIM, latent_dim=LATENT_DIM, max_len=MAX_LEN):
        super(LogAutoEncoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_id)
        self.encoder = nn.LSTM(embed_dim, latent_dim, batch_first=True, num_layers=1)
        self.decoder = nn.LSTM(latent_dim, embed_dim, batch_first=True, num_layers=1)
        self.output_linear = nn.Linear(embed_dim, vocab_size)
        self.max_len = max_len

    def forward(self, x):
        embedded = self.embedding(x)
        _, (h_n, c_n) = self.encoder(embedded)
        latent = h_n.squeeze(0)
        latent_repeated = latent.unsqueeze(1).expand(-1, self.max_len, -1)
        decoded, _ = self.decoder(latent_repeated)
        reconstruction = self.output_linear(decoded)
        return reconstruction, latent

model = LogAutoEncoder(
    vocab_size=VOCAB_SIZE,
    embed_dim=EMBED_DIM,
    latent_dim=LATENT_DIM,
    max_len=MAX_LEN
).to(DEVICE)

total_params = sum(p.numel() for p in model.parameters())
logger.info(f"‚úÖ Model created: {total_params:,} parameters")

# ===== CREATE LOSS FUNCTION =====
criterion = nn.CrossEntropyLoss(ignore_index=pad_id)
logger.info(f"‚úÖ Loss function: CrossEntropyLoss (ignore_index={pad_id})")

# ===== CREATE OPTIMIZER =====
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE_RTX4090, weight_decay=1e-5)
logger.info(f"‚úÖ Optimizer: Adam (lr={LEARNING_RATE_RTX4090})")

# ===== CREATE SCHEDULER =====
scheduler = CosineAnnealingLR(optimizer, T_max=EPOCHS_RTX4090, eta_min=1e-6)
logger.info(f"‚úÖ Scheduler: CosineAnnealingLR (T_max={EPOCHS_RTX4090})")

# ===== MIXED PRECISION SETUP =====
if USE_MIXED_PRECISION_RTX4090:
    scaler = GradScaler("cuda" if torch.cuda.is_available() else "cpu")
    logger.info(f"‚úÖ Mixed Precision: Enabled with GradScaler")
else:
    scaler = None
    logger.info(f"‚ö™ Mixed Precision: Disabled")

# ===== GPU OPTIMIZATIONS =====
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    torch.backends.cudnn.benchmark = True
    torch.backends.cuda.matmul.allow_tf32 = True
    logger.info(f"\nüî• GPU Optimizations:")
    logger.info(f"   cuDNN Benchmark: True")
    logger.info(f"   TF32 Enabled: True")
    logger.info(f"   Cache Cleared")

# ===== TRAINING MONITOR CLASS =====
class TrainingMonitor:
    def __init__(self):
        self.batch_times = []
        self.loss_history = []
        self.gpu_memory_history = []
        
    def update(self, loss, batch_time):
        self.loss_history.append(loss)
        self.batch_times.append(batch_time)
        
        if torch.cuda.is_available():
            self.gpu_memory_history.append(torch.cuda.memory_allocated(0) / 1e9)
    
    def get_avg_batch_time(self, window=100):
        if len(self.batch_times) < window:
            return np.mean(self.batch_times) if self.batch_times else 0
        return np.mean(self.batch_times[-window:])
    
    def get_estimated_time_remaining(self, batches_per_epoch, remaining_epochs, remaining_batches):
        avg_time = self.get_avg_batch_time()
        if avg_time == 0:
            return 0, 0
        total_remaining_batches = (remaining_epochs * batches_per_epoch) + remaining_batches
        seconds_remaining = total_remaining_batches * avg_time
        hours = seconds_remaining / 3600
        minutes = (seconds_remaining % 3600) / 60
        return hours, minutes

monitor = TrainingMonitor()
logger.info(f"‚úÖ Training monitor initialized")

# Clean up memory
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

logger.info("\n" + "="*80)
logger.info("‚ú® ALL VARIABLES LOADED SUCCESSFULLY - READY FOR TRAINING!")
logger.info("="*80)

2026-01-02 16:37:45,810 - INFO - 
2026-01-02 16:37:45,812 - INFO - üì¶ LOADING ALL TRAINING VARIABLES & DATA
2026-01-02 16:37:45,814 - INFO - üìÅ Checkpoint directory: ./checkpoints
2026-01-02 16:37:46,266 - INFO - üñ•Ô∏è  Device: cuda
2026-01-02 16:37:46,267 - INFO - 
üìö Loading vocabulary from ./checkpoints/vocab_full.json...
2026-01-02 16:37:46,318 - INFO - ‚úÖ Vocabulary loaded: 50,000 tokens
2026-01-02 16:37:46,318 - INFO -    PAD token: '<PAD>' (ID: 0)
2026-01-02 16:37:46,319 - INFO -    UNK token: '<UNK>' (ID: 1)
2026-01-02 16:37:46,319 - INFO - 
üìä Loading encoded logs from ./checkpoints/encoded_logs.npy...
2026-01-02 16:37:46,322 - INFO - ‚úÖ Encoded logs loaded (memory-mapped): (246979255, 128)
2026-01-02 16:37:46,323 - INFO -    Dtype: int32
2026-01-02 16:37:46,323 - INFO -    Memory: 126.45 GB
2026-01-02 16:37:46,324 - INFO - 
üìÑ Loading cleaned logs from ./checkpoints/temp_cleaned.csv...


In [None]:
import numpy as np
import torch
from torch.utils.data import DataLoader
import time
from tqdm import tqdm
from torch.cuda.amp import autocast, GradScaler

# ===== CREATE OPTIMIZED DATALOADER =====

dataloader_optimized = DataLoader(
    dataset,
    batch_size=TRAIN_BATCH_SIZE_RTX4090,
    shuffle=True,
    num_workers=NUM_WORKERS_RTX4090,
    pin_memory=True,
    drop_last=True
)

# Initialize mixed precision scaler
USE_MIXED_PRECISION = USE_MIXED_PRECISION_RTX4090
GRADIENT_ACCUMULATION_STEPS = GRADIENT_ACCUMULATION_STEPS_RTX4090

if USE_MIXED_PRECISION:
    scaler = GradScaler()

# ===== ENHANCED PROGRESS MONITORING =====

# Create a custom callback to monitor training in real-time
class TrainingMonitor:
    def __init__(self):
        self.batch_times = []
        self.loss_history = []
        self.gpu_memory_history = []
        
    def update(self, loss, batch_time):
        self.loss_history.append(loss)
        self.batch_times.append(batch_time)
        
        if torch.cuda.is_available():
            self.gpu_memory_history.append(torch.cuda.memory_allocated(0) / 1e9)
    
    def get_avg_batch_time(self, window=100):
        if len(self.batch_times) < window:
            return np.mean(self.batch_times)
        return np.mean(self.batch_times[-window:])
    
    def get_estimated_time_remaining(self, batches_per_epoch, remaining_epochs, remaining_batches):
        avg_time = self.get_avg_batch_time()
        total_remaining_batches = (remaining_epochs * batches_per_epoch) + remaining_batches
        seconds_remaining = total_remaining_batches * avg_time
        hours = seconds_remaining / 3600
        minutes = (seconds_remaining % 3600) / 60
        return hours, minutes

monitor = TrainingMonitor()

# ===== OPTIMIZED TRAINING WITH DETAILED MONITORING =====

# Use the RTX 4090 configuration variables
EPOCHS = EPOCHS_RTX4090

logger.info(f"\nüöÄ STARTING OPTIMIZED TRAINING WITH RTX 4090")
logger.info(f"="*80)

train_losses = []
start_time = time.time()

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0.0
    num_batches = 0
    epoch_start = time.time()
    
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()
    
    pbar = tqdm(
        enumerate(dataloader_optimized),
        total=len(dataloader_optimized),
        desc=f"Epoch {epoch+1}/{EPOCHS}",
        leave=True,
        ncols=140  # Extra wide for detailed info
    )
    
    for batch_idx, batch_data in pbar:
        batch_start = time.time()
        batch_data = batch_data.to(DEVICE, non_blocking=True)
        
        if USE_MIXED_PRECISION:
            with autocast(device_type="cuda", dtype=torch.float16):
                reconstruction, latent = model(batch_data)
                batch_size, seq_len, vocab_size = reconstruction.shape
                
                reconstruction_flat = reconstruction.view(-1, vocab_size)
                target_flat = batch_data.view(-1)
                loss = criterion(reconstruction_flat, target_flat)
                
                loss = loss / GRADIENT_ACCUMULATION_STEPS
            
            scaler.scale(loss).backward()
        else:
            optimizer.zero_grad()
            reconstruction, latent = model(batch_data)
            batch_size, seq_len, vocab_size = reconstruction.shape
            
            reconstruction_flat = reconstruction.view(-1, vocab_size)
            target_flat = batch_data.view(-1)
            loss = criterion(reconstruction_flat, target_flat)
            loss = loss / GRADIENT_ACCUMULATION_STEPS
            loss.backward()
        
        if (batch_idx + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
            if USE_MIXED_PRECISION:
                scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            
            if USE_MIXED_PRECISION:
                scaler.step(optimizer)
                scaler.update()
            else:
                optimizer.step()
            
            optimizer.zero_grad()
        
        batch_time = time.time() - batch_start
        total_loss += loss.item() * GRADIENT_ACCUMULATION_STEPS
        num_batches += 1
        
        # Update monitor
        monitor.update(loss.item() * GRADIENT_ACCUMULATION_STEPS, batch_time)
        
        # Calculate GPU memory
        gpu_mem_used = 0
        gpu_mem_total = 0
        if torch.cuda.is_available():
            gpu_mem_used = torch.cuda.memory_allocated(0) / 1e9
            gpu_mem_total = torch.cuda.get_device_properties(0).total_memory / 1e9
        
        # Calculate estimated time remaining
        remaining_epochs = EPOCHS - epoch - 1
        remaining_batches = len(dataloader_optimized) - batch_idx - 1
        eta_hours, eta_minutes = monitor.get_estimated_time_remaining(
            len(dataloader_optimized), 
            remaining_epochs, 
            remaining_batches
        )
        
        # Update progress bar with comprehensive info
        pbar.set_postfix({
            'loss': f'{loss.item() * GRADIENT_ACCUMULATION_STEPS:.4f}',
            'lr': f'{optimizer.param_groups[0]["lr"]:.5f}',
            'GPU': f'{gpu_mem_used:.1f}/{gpu_mem_total:.1f}GB',
            'batch_time': f'{batch_time*1000:.0f}ms',
            'ETA': f'{int(eta_hours)}h{int(eta_minutes)}m' if eta_hours > 0 or eta_minutes > 0 else 'Soon'
        })
    
    scheduler.step()
    
    avg_loss = total_loss / num_batches
    train_losses.append(avg_loss)
    
    epoch_time = time.time() - epoch_start
    current_lr = optimizer.param_groups[0]['lr']
    
    logger.info(f"\n‚úÖ Epoch {epoch+1}/{EPOCHS} Summary:")
    logger.info(f"   Loss: {avg_loss:.4f} | LR: {current_lr:.6f} | Time: {epoch_time/60:.1f}min")
    logger.info(f"   Avg batch time: {monitor.get_avg_batch_time()*1000:.0f}ms")
    
    if torch.cuda.is_available():
        peak_memory = torch.cuda.max_memory_allocated(0) / 1e9
        logger.info(f"   Peak GPU Memory: {peak_memory:.2f} GB")
    
    # Save checkpoint
    checkpoint_path = f"{CHECKPOINT_DIR}/autoencoder_rtx4090_epoch_{epoch+1}.pth"
    
    torch.save({
        'epoch': epoch + 1,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'loss': avg_loss,
        'vocab_size': VOCAB_SIZE,
        'embed_dim': 48,
        'latent_dim': 24,
        'max_len': MAX_LEN,
    }, checkpoint_path)
    logger.info(f"   üíæ Checkpoint saved: {checkpoint_path}")

total_time = time.time() - start_time
logger.info(f"\n" + "="*80)
logger.info(f"üéâ TRAINING COMPLETE!")
logger.info(f"="*80)
logger.info(f"‚è±Ô∏è  Total training time: {total_time/60:.1f} minutes ({total_time/3600:.2f} hours)")
logger.info(f"üìä Final loss: {train_losses[-1]:.4f}")
logger.info(f"üìâ Loss reduction: {((train_losses[0] - train_losses[-1])/train_losses[0]*100):.1f}%")

if torch.cuda.is_available():
    peak_memory = torch.cuda.max_memory_allocated(0) / 1e9
    logger.info(f"üñ•Ô∏è  Peak GPU Memory: {peak_memory:.2f} GB / {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    torch.cuda.empty_cache()
    logger.info(f"üßπ GPU cache cleared")

logger.info(f"\n‚ú® Model ready for inference or further optimization!")


  scaler = GradScaler()
2026-01-02 16:40:08,389 - INFO - 
üöÄ STARTING OPTIMIZED TRAINING WITH RTX 4090


MemoryError: 

In [None]:
# ===== PLOT TRAINING LOSS =====

plt.figure(figsize=(10, 6))
plt.plot(train_losses, marker='o', linewidth=2, markersize=8)
plt.xlabel('Epoch', fontsize=12)
plt.ylabel('Average Loss', fontsize=12)
plt.title('Autoencoder Training Loss', fontsize=14, fontweight='bold')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(f"{CHECKPOINT_DIR}/training_loss.png", dpi=100, bbox_inches='tight')
plt.show()

logger.info(f"üìä Training loss plot saved to {CHECKPOINT_DIR}/training_loss.png")


In [None]:
# ===== LOAD PRETRAINED MODEL FROM CHECKPOINT =====
import os
import torch
import torch.nn as nn

# Find latest checkpoint
checkpoint_dir = "checkpoints"
checkpoints = sorted([f for f in os.listdir(checkpoint_dir) if f.startswith("autoencoder_epoch_") and f.endswith(".pth")])

if checkpoints:
    latest_checkpoint = os.path.join(checkpoint_dir, checkpoints[-1])
    logger.info(f"üìÇ Found {len(checkpoints)} checkpoints. Loading latest: {checkpoints[-1]}")
    
    # Load checkpoint metadata
    checkpoint = torch.load(latest_checkpoint, map_location=DEVICE)
    
    # Recreate model with saved parameters
    logger.info(f"ü§ñ Reconstructing model from checkpoint...")
    model = LogAutoEncoder(
        vocab_size=checkpoint['vocab_size'],
        embed_dim=checkpoint['embed_dim'],
        latent_dim=checkpoint['latent_dim']
    ).to(DEVICE)
    
    # Load model state
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()  # Set to evaluation mode
    
    logger.info(f"‚úÖ Model loaded from epoch {checkpoint['epoch']}")
    logger.info(f"üìä Training loss at checkpoint: {checkpoint['loss']:.4f}")
    logger.info(f"üìà Total parameters: {sum(p.numel() for p in model.parameters()):,}")
    
else:
    logger.warning("‚ö†Ô∏è No checkpoints found! Train the model first.")
    model = None


2026-01-02 17:25:10,009 - INFO - üìÇ Found 10 checkpoints. Loading latest: autoencoder_epoch_9.pth
2026-01-02 17:25:10,392 - INFO - ü§ñ Reconstructing model from checkpoint...
2026-01-02 17:25:10,426 - INFO - ‚úÖ Model loaded from epoch 9
2026-01-02 17:25:10,427 - INFO - üìä Training loss at checkpoint: 3.3391
2026-01-02 17:25:10,427 - INFO - üìà Total parameters: 4,871,312


In [None]:
# ===== ANOMALY DETECTION WITH TRAINED MODEL =====

def compute_reconstruction_error(model, batch, device):
    """
    Compute reconstruction error (anomaly score) for a batch of logs.
    Higher error = more anomalous
    """
    with torch.no_grad():
        batch_tensor = torch.tensor(batch, dtype=torch.long).to(device)
        reconstruction, latent = model(batch_tensor)
        
        # Flatten for loss computation
        batch_size, seq_len, vocab_size = reconstruction.shape
        reconstruction_flat = reconstruction.view(-1, vocab_size)
        target_flat = batch_tensor.view(-1)
        
        # Compute cross-entropy per sample
        ce_loss = nn.CrossEntropyLoss(ignore_index=0, reduction='none')
        loss_per_token = ce_loss(reconstruction_flat, target_flat)
        loss_per_sample = loss_per_token.view(batch_size, seq_len).mean(dim=1)
        
        return loss_per_sample.cpu().numpy(), latent.cpu().numpy()

# Test on a sample batch
if model is not None:
    logger.info("\n" + "="*60)
    logger.info("üîç COMPUTING ANOMALY SCORES ON TRAINING DATA")
    logger.info("="*60)
    
    # Get sample batches
    sample_indices = np.random.choice(len(dataset), size=min(1000, len(dataset)), replace=False)
    sample_batch = input_ids_matrix[sample_indices[:256]]  # First batch
    
    errors, latents = compute_reconstruction_error(model, sample_batch, DEVICE)
    
    logger.info(f"‚úÖ Computed anomaly scores for {len(errors)} samples")
    logger.info(f"üìä Reconstruction Error Statistics:")
    logger.info(f"   Mean:   {errors.mean():.4f}")
    logger.info(f"   Std:    {errors.std():.4f}")
    logger.info(f"   Min:    {errors.min():.4f}")
    logger.info(f"   Max:    {errors.max():.4f}")
    logger.info(f"   Median: {np.median(errors):.4f}")
    
    # Identify potential anomalies (top 10%)
    threshold = np.percentile(errors, 90)
    anomalies = np.where(errors > threshold)[0]
    logger.info(f"\nüö® Found {len(anomalies)} potential anomalies (top 10% by error)")
    logger.info(f"   Anomaly threshold: {threshold:.4f}")


2026-01-02 17:25:12,243 - INFO - 
2026-01-02 17:25:12,244 - INFO - üîç COMPUTING ANOMALY SCORES ON TRAINING DATA
2026-01-02 17:25:34,977 - INFO - ‚úÖ Computed anomaly scores for 256 samples
2026-01-02 17:25:34,992 - INFO - üìä Reconstruction Error Statistics:
2026-01-02 17:25:35,002 - INFO -    Mean:   0.5012
2026-01-02 17:25:35,009 - INFO -    Std:    0.3662
2026-01-02 17:25:35,010 - INFO -    Min:    0.0456
2026-01-02 17:25:35,011 - INFO -    Max:    4.1968
2026-01-02 17:25:35,022 - INFO -    Median: 0.4456
2026-01-02 17:25:35,033 - INFO - 
üö® Found 26 potential anomalies (top 10% by error)
2026-01-02 17:25:35,035 - INFO -    Anomaly threshold: 0.8118


In [None]:
# --- DATASET & DATALOADER (R√âUTILISATION) ---

# La classe LogDataset est d√©j√† d√©finie plus haut, on peut la r√©utiliser directement
# Cr√©er l'objet Dataset
dataset = LogDataset(input_ids_matrix)

# Cr√©er le DataLoader
# ‚ö†Ô∏è IMPORTANT: num_workers=0 sur Windows! C'est essentiel pour la performance
dataloader = DataLoader(
    dataset, 
    batch_size=256, 
    shuffle=True, 
    num_workers=0,  # ‚Üê WINDOWS: Toujours 0 !
    pin_memory=True if torch.cuda.is_available() else False
)

logger.info(f"‚úÖ DataLoader pr√™t. Batches: {len(dataloader)}")
logger.info(f"üì¶ Configuration optimale pour Windows: num_workers=0, pin_memory={torch.cuda.is_available()}")

2026-01-02 17:25:40,471 - INFO - ‚úÖ DataLoader pr√™t. Batches: 964763
2026-01-02 17:25:40,471 - INFO - üì¶ Configuration optimale pour Windows: num_workers=0, pin_memory=True


## 5. Using the Model for Anomaly Detection

Now you can use the trained model to detect anomalies in your logs. Here's how to:
1. Score logs with the reconstruction error
2. Set a threshold to identify anomalies
3. Analyze suspicious logs



In [None]:
# ===== STEP 1: COMPUTE ANOMALY SCORES ON FULL DATASET =====

logger.info("\n" + "="*70)
logger.info("üìä STEP 1: Computing Anomaly Scores on Full Dataset")
logger.info("="*70)

if model is None:
    logger.error("‚ùå Model not loaded! Please train or load a checkpoint first.")
else:
    # Compute scores for all logs in batches
    all_errors = []
    all_latents = []
    
    INFERENCE_BATCH_SIZE = 512  # Larger batch for inference (no gradients)
    inference_loader = DataLoader(dataset, batch_size=INFERENCE_BATCH_SIZE, shuffle=False, num_workers=0)
    
    logger.info(f"üîÑ Processing {len(dataset):,} logs in {len(inference_loader)} batches...")
    
    with torch.no_grad():
        for batch in tqdm(inference_loader, desc="Computing Anomaly Scores"):
            batch = batch.to(DEVICE)
            errors, latents = compute_reconstruction_error(model, batch.cpu().numpy(), DEVICE)
            all_errors.append(errors)
            all_latents.append(latents)
    
    # Concatenate all scores
    all_errors = np.concatenate(all_errors, axis=0)
    all_latents = np.concatenate(all_latents, axis=0)
    
    logger.info(f"‚úÖ Computed {len(all_errors):,} anomaly scores")
    logger.info(f"\nüìä RECONSTRUCTION ERROR STATISTICS:")
    logger.info(f"   Mean:     {all_errors.mean():.4f}")
    logger.info(f"   Std:      {all_errors.std():.4f}")
    logger.info(f"   Min:      {all_errors.min():.4f}")
    logger.info(f"   Max:      {all_errors.max():.4f}")
    logger.info(f"   Median:   {np.median(all_errors):.4f}")
    logger.info(f"   Q75:      {np.percentile(all_errors, 75):.4f}")
    logger.info(f"   Q90:      {np.percentile(all_errors, 90):.4f}")
    logger.info(f"   Q95:      {np.percentile(all_errors, 95):.4f}")


2026-01-02 17:26:04,151 - INFO - 
2026-01-02 17:26:04,152 - INFO - üìä STEP 1: Computing Anomaly Scores on Full Dataset
2026-01-02 17:26:04,154 - INFO - üîÑ Processing 246,979,255 logs in 482382 batches...
Computing Anomaly Scores:   0%|          | 0/482382 [00:00<?, ?it/s]

In [None]:
# ===== STEP 2: SET ANOMALY THRESHOLD AND IDENTIFY ANOMALIES =====

# Choose your anomaly threshold (higher = stricter, fewer anomalies detected)
# You can adjust this based on your needs
ANOMALY_THRESHOLD_PERCENTILE = 90  # Top 10% = anomalies

anomaly_threshold = np.percentile(all_errors, ANOMALY_THRESHOLD_PERCENTILE)
anomaly_indices = np.where(all_errors > anomaly_threshold)[0]

logger.info(f"\nüö® ANOMALY DETECTION (Threshold: {ANOMALY_THRESHOLD_PERCENTILE}th percentile)")
logger.info(f"   Threshold value: {anomaly_threshold:.4f}")
logger.info(f"   Anomalies found: {len(anomaly_indices):,} out of {len(all_errors):,}")
logger.info(f"   Anomaly rate: {len(anomaly_indices)/len(all_errors)*100:.2f}%")

# Create a DataFrame with results
results_df = pd.DataFrame({
    'log_index': np.arange(len(all_errors)),
    'reconstruction_error': all_errors,
    'is_anomaly': all_errors > anomaly_threshold
})

logger.info(f"\nüìä Top 20 Most Anomalous Logs:")
top_anomalies = results_df.nlargest(20, 'reconstruction_error')
print(top_anomalies[['log_index', 'reconstruction_error']].to_string())


In [None]:
# ===== STEP 3: VIEW ACTUAL LOGS THAT ARE ANOMALIES =====

def decode_log_from_ids(ids, inverse_vocab):
    """Convert token IDs back to text"""
    words = []
    for id_val in ids:
        if id_val == 0:  # PAD token
            continue
        word = inverse_vocab.get(str(id_val), "<UNK>")
        words.append(word)
    return " ".join(words)

# Load inverse vocabulary
with open(VOCAB_FILE, 'r') as f:
    vocab_data = json.load(f)
    inverse_vocab = vocab_data['inverse_vocab']

logger.info(f"\nüìù TOP 10 MOST ANOMALOUS LOGS (with original text):")
logger.info("="*80)

for rank, (idx, row) in enumerate(top_anomalies.head(10).iterrows(), 1):
    log_idx = int(row['log_index'])
    error = row['reconstruction_error']
    
    # Get the original log text
    log_ids = input_ids_matrix[log_idx]
    log_text = decode_log_from_ids(log_ids, inverse_vocab)
    
    print(f"\n[#{rank}] Log Index: {log_idx} | Error: {error:.4f}")
    print(f"     Raw:  {df['raw_log'].iloc[log_idx][:100]}...")
    print(f"     Cleaned: {df['cleaned_log'].iloc[log_idx][:100]}...")


In [None]:
# ===== STEP 4: VISUALIZATION OF ANOMALY DISTRIBUTION =====

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Plot 1: Histogram of reconstruction errors
axes[0, 0].hist(all_errors, bins=100, alpha=0.7, color='blue', edgecolor='black')
axes[0, 0].axvline(anomaly_threshold, color='red', linestyle='--', linewidth=2, label=f'Threshold ({ANOMALY_THRESHOLD_PERCENTILE}%ile)')
axes[0, 0].set_xlabel('Reconstruction Error')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].set_title('Distribution of Reconstruction Errors')
axes[0, 0].legend()
axes[0, 0].grid(alpha=0.3)

# Plot 2: Log-scale histogram
axes[0, 1].hist(all_errors, bins=100, alpha=0.7, color='green', edgecolor='black')
axes[0, 1].set_yscale('log')
axes[0, 1].axvline(anomaly_threshold, color='red', linestyle='--', linewidth=2)
axes[0, 1].set_xlabel('Reconstruction Error')
axes[0, 1].set_ylabel('Frequency (log scale)')
axes[0, 1].set_title('Distribution of Reconstruction Errors (Log Scale)')
axes[0, 1].grid(alpha=0.3)

# Plot 3: Sorted errors
sorted_errors = np.sort(all_errors)
axes[1, 0].plot(sorted_errors, color='purple', linewidth=1.5)
axes[1, 0].axhline(anomaly_threshold, color='red', linestyle='--', linewidth=2, label='Anomaly Threshold')
axes[1, 0].set_xlabel('Log Index (sorted)')
axes[1, 0].set_ylabel('Reconstruction Error')
axes[1, 0].set_title('Reconstruction Errors (Sorted)')
axes[1, 0].legend()
axes[1, 0].grid(alpha=0.3)

# Plot 4: Box plot
axes[1, 1].boxplot(all_errors, vert=True)
axes[1, 1].axhline(anomaly_threshold, color='red', linestyle='--', linewidth=2, label='Anomaly Threshold')
axes[1, 1].set_ylabel('Reconstruction Error')
axes[1, 1].set_title('Box Plot of Reconstruction Errors')
axes[1, 1].legend()
axes[1, 1].grid(alpha=0.3)

plt.tight_layout()
plt.savefig(f"{CHECKPOINT_DIR}/anomaly_detection_analysis.png", dpi=100, bbox_inches='tight')
plt.show()

logger.info(f"\nüìä Visualization saved to {CHECKPOINT_DIR}/anomaly_detection_analysis.png")


NameError: name 'plt' is not defined

In [None]:
# ===== STEP 5: SCORE A NEW LOG (PREDICTION) =====

def score_new_logs(raw_logs_list, model, vocab_dict, device, max_len=128):
    """
    Score new logs that are NOT in the original dataset.
    
    Args:
        raw_logs_list: List of raw log strings
        model: Trained autoencoder model
        vocab_dict: Vocabulary dictionary
        device: torch device (cuda or cpu)
        max_len: Maximum sequence length
    
    Returns:
        numpy array of anomaly scores
    """
    # Clean the logs using the same patterns
    cleaned_logs = []
    for log in raw_logs_list:
        cleaned = log
        for pat, repl in PATTERNS_PY.items():
            cleaned = re.sub(pat, repl, cleaned)
        cleaned_logs.append(cleaned)
    
    # Encode logs
    pad_id = vocab_dict['<PAD>']
    unk_id = vocab_dict['<UNK>']
    
    encoded_ids = []
    for text in cleaned_logs:
        ids = [vocab_dict.get(word, unk_id) for word in text.split()]
        # Pad or truncate
        ids = ids[:max_len]
        ids = ids + [pad_id] * (max_len - len(ids))
        encoded_ids.append(ids)
    
    # Convert to tensor
    encoded_tensor = torch.tensor(encoded_ids, dtype=torch.long).to(device)
    
    # Score
    with torch.no_grad():
        reconstruction, _ = model(encoded_tensor)
        batch_size, seq_len, vocab_size = reconstruction.shape
        
        reconstruction_flat = reconstruction.view(-1, vocab_size)
        target_flat = encoded_tensor.view(-1)
        
        ce_loss = nn.CrossEntropyLoss(ignore_index=pad_id, reduction='none')
        loss_per_token = ce_loss(reconstruction_flat, target_flat)
        loss_per_sample = loss_per_token.view(batch_size, seq_len).mean(dim=1)
        
        return loss_per_sample.cpu().numpy()

# Example: Score some new logs
example_logs = [
    "Connection established successfully",
    "ERROR: Invalid authentication token received",
    "WARNING: Disk space running low on server",
    "CRITICAL: Unexpected null pointer exception occurred",
    "INFO: Request processed in 123ms"
]

logger.info(f"\n" + "="*70)
logger.info("üß™ EXAMPLE: Scoring New Logs")
logger.info("="*70)

new_scores = score_new_logs(example_logs, model, vocab_dict, DEVICE)

logger.info(f"\nLog Scores:")
for i, (log, score) in enumerate(zip(example_logs, new_scores), 1):
    is_anomaly = "üö® ANOMALY" if score > anomaly_threshold else "‚úÖ NORMAL"
    logger.info(f"[{i}] {is_anomaly} | Score: {score:.4f} | Log: {log[:60]}")


## 6. Save Final Production Model

Save the trained model and all necessary artifacts for future use.



In [None]:
# ===== SAVE FINAL PRODUCTION MODEL =====

# Create a production directory
PRODUCTION_DIR = "production_model"
os.makedirs(PRODUCTION_DIR, exist_ok=True)

logger.info(f"\n" + "="*70)
logger.info("üíæ SAVING FINAL PRODUCTION MODEL")
logger.info("="*70)

if model is None:
    logger.error("‚ùå Model not loaded! Cannot save.")
else:
    # 1. Save the trained model
    final_model_path = f"{PRODUCTION_DIR}/autoencoder_final.pth"
    torch.save({
        'epoch': checkpoint['epoch'],
        'model_state_dict': model.state_dict(),
        'vocab_size': VOCAB_SIZE,
        'embed_dim': 48,
        'latent_dim': 24,
        'max_len': MAX_LEN,
        'device': str(DEVICE),
        'training_loss': checkpoint['loss'],
        'anomaly_threshold': float(anomaly_threshold),
        'threshold_percentile': ANOMALY_THRESHOLD_PERCENTILE,
    }, final_model_path)
    logger.info(f"‚úÖ Model saved: {final_model_path}")
    
    # 2. Save vocabulary
    vocab_path = f"{PRODUCTION_DIR}/vocab.json"
    with open(vocab_path, 'w') as f:
        json.dump(vocab_data, f, indent=2)
    logger.info(f"‚úÖ Vocabulary saved: {vocab_path}")
    
    # 3. Save cleaning patterns
    patterns_path = f"{PRODUCTION_DIR}/cleaning_patterns.json"
    patterns_dict = {pattern: replacement for pattern, replacement in PATTERNS_PY.items()}
    with open(patterns_path, 'w') as f:
        json.dump(patterns_dict, f, indent=2)
    logger.info(f"‚úÖ Cleaning patterns saved: {patterns_path}")
    
    # 4. Save configuration
    config_path = f"{PRODUCTION_DIR}/config.json"
    config = {
        'vocab_size': VOCAB_SIZE,
        'embed_dim': 48,
        'latent_dim': 24,
        'max_len': MAX_LEN,
        'pad_token': '<PAD>',
        'unk_token': '<UNK>',
        'anomaly_threshold': float(anomaly_threshold),
        'threshold_percentile': ANOMALY_THRESHOLD_PERCENTILE,
        'model_type': 'LogAutoEncoder',
        'training_epochs': checkpoint['epoch'],
        'final_loss': float(checkpoint['loss']),
    }
    with open(config_path, 'w') as f:
        json.dump(config, f, indent=2)
    logger.info(f"‚úÖ Configuration saved: {config_path}")
    
    # 5. Save anomaly detection results
    results_path = f"{PRODUCTION_DIR}/anomaly_detection_results.csv"
    results_df.to_csv(results_path, index=False)
    logger.info(f"‚úÖ Anomaly detection results saved: {results_path} ({len(results_df):,} logs)")
    
    logger.info(f"\nüì¶ PRODUCTION MODEL SAVED SUCCESSFULLY!")
    logger.info(f"üìÅ Directory: {PRODUCTION_DIR}/")
    logger.info(f"\nContents:")
    logger.info(f"   - autoencoder_final.pth    (Model weights)")
    logger.info(f"   - vocab.json                (Vocabulary)")
    logger.info(f"   - cleaning_patterns.json    (Regex patterns)")
    logger.info(f"   - config.json               (Configuration)")
    logger.info(f"   - anomaly_detection_results.csv (Scores for all logs)")
    
    print("\n‚ú® Model ready for production use! ‚ú®")


In [None]:
# ===== CREATE A REUSABLE INFERENCE SCRIPT =====

inference_script = '''"""
Standalone Log Anomaly Detection Script
Load the trained model and score logs on demand.
"""
import torch
import json
import re
import numpy as np
import torch.nn as nn
from pathlib import Path

class LogAutoEncoder(nn.Module):
    def __init__(self, vocab_size, embed_dim=48, latent_dim=24):
        super(LogAutoEncoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.encoder = nn.LSTM(embed_dim, latent_dim, batch_first=True, num_layers=1)
        self.decoder = nn.LSTM(latent_dim, embed_dim, batch_first=True, num_layers=1)
        self.output_linear = nn.Linear(embed_dim, vocab_size)
        self.max_len = 128

    def forward(self, x):
        embedded = self.embedding(x)
        _, (h_n, c_n) = self.encoder(embedded)
        latent = h_n.squeeze(0)
        latent_repeated = latent.unsqueeze(1).expand(-1, self.max_len, -1)
        decoded, _ = self.decoder(latent_repeated)
        reconstruction = self.output_linear(decoded)
        return reconstruction, latent

class AnomalyDetector:
    def __init__(self, model_dir="production_model"):
        """Load model and configuration"""
        self.model_dir = Path(model_dir)
        
        # Load config
        with open(self.model_dir / "config.json") as f:
            self.config = json.load(f)
        
        # Load vocabulary
        with open(self.model_dir / "vocab.json") as f:
            vocab_data = json.load(f)
            self.vocab = vocab_data['vocab']
        
        # Load cleaning patterns
        with open(self.model_dir / "cleaning_patterns.json") as f:
            self.patterns = json.load(f)
        
        # Load model
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = LogAutoEncoder(
            vocab_size=self.config['vocab_size'],
            embed_dim=self.config['embed_dim'],
            latent_dim=self.config['latent_dim']
        ).to(self.device)
        
        checkpoint = torch.load(
            self.model_dir / "autoencoder_final.pth",
            map_location=self.device
        )
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.model.eval()
        
        self.threshold = checkpoint['anomaly_threshold']
        print(f"‚úÖ Model loaded from {model_dir}")
        print(f"   Anomaly threshold: {self.threshold:.4f}")
    
    def clean_log(self, log_text):
        """Clean a log using regex patterns"""
        cleaned = log_text
        for pattern, replacement in self.patterns.items():
            cleaned = re.sub(pattern, replacement, cleaned)
        return cleaned
    
    def score_logs(self, logs):
        """Score a list of logs (raw text)"""
        pad_id = self.vocab['<PAD>']
        unk_id = self.vocab['<UNK>']
        
        # Clean and encode
        encoded_ids = []
        cleaned_logs = []
        
        for log in logs:
            cleaned = self.clean_log(log)
            cleaned_logs.append(cleaned)
            
            ids = [self.vocab.get(word, unk_id) for word in cleaned.split()]
            ids = ids[:self.config['max_len']]
            ids = ids + [pad_id] * (self.config['max_len'] - len(ids))
            encoded_ids.append(ids)
        
        # Convert to tensor
        tensor = torch.tensor(encoded_ids, dtype=torch.long).to(self.device)
        
        # Score
        with torch.no_grad():
            reconstruction, _ = self.model(tensor)
            batch_size, seq_len, vocab_size = reconstruction.shape
            
            reconstruction_flat = reconstruction.view(-1, vocab_size)
            target_flat = tensor.view(-1)
            
            ce_loss = nn.CrossEntropyLoss(ignore_index=pad_id, reduction='none')
            loss_per_token = ce_loss(reconstruction_flat, target_flat)
            loss_per_sample = loss_per_token.view(batch_size, seq_len).mean(dim=1)
            
            return loss_per_sample.cpu().numpy()
    
    def detect_anomalies(self, logs):
        """Score logs and classify as anomaly/normal"""
        scores = self.score_logs(logs)
        
        results = []
        for log, score in zip(logs, scores):
            results.append({
                'log': log,
                'score': float(score),
                'is_anomaly': score > self.threshold,
                'threshold': self.threshold
            })
        
        return results

# Example usage
if __name__ == "__main__":
    detector = AnomalyDetector()
    
    test_logs = [
        "Connection established successfully",
        "ERROR: Unexpected exception occurred",
        "INFO: Processing request"
    ]
    
    results = detector.detect_anomalies(test_logs)
    
    for result in results:
        status = "üö® ANOMALY" if result['is_anomaly'] else "‚úÖ NORMAL"
        print(f"{status} | Score: {result['score']:.4f} | {result['log'][:60]}")
'''

# Save the inference script
inference_path = f"{PRODUCTION_DIR}/anomaly_detector.py"
with open(inference_path, 'w') as f:
    f.write(inference_script)

logger.info(f"\n‚úÖ Inference script saved: {inference_path}")
logger.info(f"\nüìñ Usage:")
logger.info(f"   from anomaly_detector import AnomalyDetector")
logger.info(f"   detector = AnomalyDetector('production_model')")
logger.info(f"   results = detector.detect_anomalies(['your log text'])")


In [None]:
# ===== CREATE A README FOR THE PRODUCTION MODEL =====

readme_content = """# üéØ Log Anomaly Detection Model - Production Package

## üì¶ What's Inside

This folder contains a fully trained and production-ready log anomaly detection model.

### Files:
- **autoencoder_final.pth** - Trained PyTorch model weights
- **vocab.json** - Vocabulary dictionary (word ‚Üí ID mapping)
- **cleaning_patterns.json** - Regex patterns for log preprocessing
- **config.json** - Model configuration and hyperparameters
- **anomaly_detection_results.csv** - Anomaly scores for all training logs
- **anomaly_detector.py** - Standalone Python script for inference

## üöÄ Quick Start

### Option 1: Using the Python Script (Recommended)

```python
from anomaly_detector import AnomalyDetector

# Initialize
detector = AnomalyDetector('production_model')

# Score your logs
logs = [
    "Connection established successfully",
    "ERROR: Unexpected null pointer exception",
    "INFO: Request processed"
]

results = detector.detect_anomalies(logs)

for result in results:
    print(f"Log: {result['log']}")
    print(f"Score: {result['score']:.4f}")
    print(f"Is Anomaly: {result['is_anomaly']}")
    print()
```

### Option 2: Direct PyTorch Loading

```python
import torch
import json

# Load config
with open('production_model/config.json') as f:
    config = json.load(f)

# Load model
model = LogAutoEncoder(
    vocab_size=config['vocab_size'],
    embed_dim=config['embed_dim'],
    latent_dim=config['latent_dim']
).to(device)

checkpoint = torch.load('production_model/autoencoder_final.pth', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()

threshold = checkpoint['anomaly_threshold']
```

## üìä Model Information

- **Architecture**: LSTM Autoencoder
- **Vocabulary Size**: {VOCAB_SIZE}
- **Embedding Dimension**: 48
- **Latent Dimension**: 24
- **Max Sequence Length**: 128
- **Training Epochs**: 15
- **Final Training Loss**: {:.4f}
- **Anomaly Threshold**: {:.4f} (90th percentile)

## üîß How It Works

1. **Input**: Raw log text
2. **Cleaning**: Apply regex patterns to standardize tokens
3. **Encoding**: Convert words to IDs using vocabulary
4. **Model**: LSTM Autoencoder compresses and reconstructs
5. **Scoring**: Reconstruction error = anomaly score
6. **Decision**: If score > threshold ‚Üí Anomaly

### Interpretation
- **Low score** (~0.5-2.0): Normal log
- **High score** (>3.0): Anomalous/suspicious log
- Threshold is set at the 90th percentile of training data

## üìà Performance

- **Anomalies Detected**: {}/{} logs
- **Anomaly Rate**: {:.2f}%
- **Mean Error**: {:.4f}
- **Std Error**: {:.4f}

## üîç Troubleshooting

### Model not found
- Ensure all files are in the `production_model/` directory
- Check file paths in the code

### Import errors
- Install required packages: `pip install torch numpy`
- Ensure Python version >= 3.8

### Low detection accuracy
- Adjust `ANOMALY_THRESHOLD_PERCENTILE` in the config
- Retrain model with more representative data
- Check if log format matches training data

## üìù Training Metadata

- **Trained on**: {len(results_df):,} logs
- **Checkpoint**: epoch_10.pth
- **Device Used**: {str(DEVICE)}

---
Generated: 2026-01-02
Model version: 1.0
""".format(
    VOCAB_SIZE,
    checkpoint['loss'],
    anomaly_threshold,
    len(anomalies),
    len(results_df),
    (len(anomalies)/len(results_df)*100),
    all_errors.mean(),
    all_errors.std()
)

readme_path = f"{PRODUCTION_DIR}/README.md"
with open(readme_path, 'w') as f:
    f.write(readme_content)

logger.info(f"‚úÖ README saved: {readme_path}")

# Print summary
logger.info(f"\n" + "="*70)
logger.info("‚ú® PRODUCTION MODEL READY FOR DEPLOYMENT!")
logger.info("="*70)
logger.info(f"\nüìÅ All files saved in: {PRODUCTION_DIR}/")
logger.info(f"\nüìã Files created:")
for file in Path(PRODUCTION_DIR).glob("*"):
    size = file.stat().st_size / 1024  # KB
    logger.info(f"   ‚úì {file.name} ({size:.1f} KB)")
