In [1]:
import torch
import re
import math
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, Subset
from transformers import AutoTokenizer
from weather_gru_models import BasicWeatherGRU
from weather_datasets import StandardWeatherDataset, prepare_weather_dataset, validate_and_clean_data_multithreaded, reduce_vocabulary
from tqdm.notebook import tqdm
import os
import json
import random
from collections import Counter

# Seed PyTorch's RNG up front so repeated runs draw the same random numbers
torch.manual_seed(42)

# Placeholder tokens used for text substitution in the reports
special_tokens = {
    'additional_special_tokens': [
        '<city>',
        '<temp>',
        '<date>',
        '<velocity>',
        '<percentile>',
        '<rainfall>',
        '<ne>',
    ]
}

# German BERT tokenizer, extended with the placeholder tokens defined above
tokenizer = AutoTokenizer.from_pretrained('bert-base-german-cased')
tokenizer.add_special_tokens(special_tokens)

7

In [None]:
if __name__ == '__main__':

    # Step up one directory level unless the working directory already ends in 'LLamas'
    if not str(os.getcwd()).endswith('LLamas'):
        os.chdir('..')
    
    # Define file validation function
    def check_file(path):
        """
        Decide whether a JSON file is usable as a StandardWeatherDataset record.

        A file qualifies when it can be read as UTF-8, parses to a JSON object,
        and holds non-blank string values under both 'report_long' and 'city'.
        Anything else is rejected instead of raising, so one bad directory
        entry cannot abort the scan of the whole folder.

        Args:
            path (str): Path to the JSON file

        Returns:
            bool: True if the file should be loaded, False otherwise
        """
        try:
            with open(path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except json.JSONDecodeError:
            print(f"Invalid JSON in file: {path}")
            return False
        except (OSError, UnicodeDecodeError) as exc:
            # Sub-directories, unreadable files and non-UTF-8 content land here
            print(f"Could not read file: {path} ({exc})")
            return False

        # Valid JSON may still be a list, string or number; only objects have keys
        if not isinstance(data, dict):
            return False

        # Both fields must be present and contain more than whitespace
        for key in ('report_long', 'city'):
            value = data.get(key)
            if not isinstance(value, str) or not value.strip():
                return False

        return True
                
    # Load data from file
    def load_data(path):
        """
        Read one JSON file and hand back its parsed content.

        Args:
            path (str): Path to the JSON file

        Returns:
            dict: Parsed JSON content (expected to be a dict)
        """
        with open(path, 'r', encoding='utf-8') as handle:
            return json.load(handle)
    
    # Folder holding the JSON files, resolved once relative to the working directory
    data_dir = os.path.join(os.getcwd(), 'data', 'files_for_chatGPT', '2024-12-12')
    file_names = os.listdir(data_dir)

    # Keep only files that pass validation, keyed by a fragment of the file name
    # (text after the last '-' and before the first '_'); a repeated key
    # overwrites the earlier entry
    files = {}
    for file_name in tqdm(file_names):
        file_path = os.path.join(data_dir, file_name)
        if not check_file(file_path):
            continue
        file_key = file_name.split('-')[-1].split('_')[0]
        files[file_key] = load_data(file_path)

    # Hand the loaded records to the dataset as a plain list
    weather_data = list(files.values())

    # Build the dataset from the validated records
    dataset = StandardWeatherDataset(weather_data, max_length=100)

    # Optional: scan a sample of the dataset for named entities and show the counts
    entity_counts = dataset.scan_dataset_for_named_entities(sample_size=1000)
    print(entity_counts)

    # Run the shared validation/cleaning routine
    # NOTE(review): the second argument (11) is presumably a worker/thread count — confirm
    clean_dataset, valid_indices = validate_and_clean_data_multithreaded(dataset, 11)

    # Report how many samples survived cleaning
    print(f"\nDataset validation complete. Kept {len(valid_indices)} valid samples out of {len(dataset)}")

In [5]:
def count_model_parameters(model):
    """
    Tally how many scalar weights of a model receive gradients.

    Args:
        model (nn.Module): PyTorch model

    Returns:
        int: Total element count over all parameters with requires_grad set
    """
    trainable_total = 0
    for param in model.parameters():
        # Frozen parameters are left out of the tally
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

def _estimate_gru_param_count(feature_dim, vocab_size, embedding_dim, hidden_dim):
    """Estimate the weight count of the GRU model for the given dimensions (bias terms are not included)."""
    return (
        feature_dim * embedding_dim * 2    # Feature encoder (two linear layers)
        + vocab_size * embedding_dim       # Embedding layer
        + embedding_dim * hidden_dim * 3   # GRU input projection (3 gates)
        + hidden_dim * hidden_dim * 3      # GRU recurrent connections (3 gates)
        + hidden_dim * vocab_size          # Output layer
    )


def adjust_model_size(feature_dim, vocab_size, embedding_dim, hidden_dim, max_params=30_000_000):
    """
    Adjust model dimensions to stay under a parameter limit.

    If the estimated size for the requested dimensions already fits, they are
    returned unchanged. Otherwise both dimensions are scaled down by the same
    factor, repeatedly, until the estimate fits or both dimensions have
    reached their minimum (32 for the embedding, 64 for the hidden state, or
    the requested value if that is already smaller).

    Args:
        feature_dim (int): Dimension of input features
        vocab_size (int): Size of vocabulary
        embedding_dim (int): Initial embedding dimension
        hidden_dim (int): Initial hidden dimension
        max_params (int): Maximum parameter limit

    Returns:
        tuple: (adjusted_embedding_dim, adjusted_hidden_dim)
    """
    # The floor must never push a dimension above what the caller asked for
    min_embedding_dim = min(embedding_dim, 32)
    min_hidden_dim = min(hidden_dim, 64)

    param_count = _estimate_gru_param_count(feature_dim, vocab_size, embedding_dim, hidden_dim)
    print(f"Initial parameter count: {param_count:,}")

    if param_count <= max_params:
        return embedding_dim, hidden_dim

    new_embedding_dim, new_hidden_dim = embedding_dim, hidden_dim
    new_param_count = param_count

    # One sqrt-scaling step is exact only for the quadratic terms; the terms that
    # are linear in a dimension (embedding table, output layer, feature encoder)
    # shrink more slowly, so a single step always overshoots the limit. Repeat
    # the step until the estimate fits or nothing can shrink any further.
    while new_param_count > max_params and (
        new_embedding_dim > min_embedding_dim or new_hidden_dim > min_hidden_dim
    ):
        reduction_factor = math.sqrt(max_params / new_param_count)
        # "- 1" guarantees progress even when truncation alone would not shrink the value
        new_embedding_dim = max(
            min_embedding_dim,
            min(new_embedding_dim - 1, int(new_embedding_dim * reduction_factor)),
        )
        new_hidden_dim = max(
            min_hidden_dim,
            min(new_hidden_dim - 1, int(new_hidden_dim * reduction_factor)),
        )
        new_param_count = _estimate_gru_param_count(
            feature_dim, vocab_size, new_embedding_dim, new_hidden_dim
        )

    print(f"Adjusted dimensions: embedding_dim={new_embedding_dim}, hidden_dim={new_hidden_dim}")
    print(f"Adjusted parameter count: {new_param_count:,}")

    if new_param_count > max_params:
        # Both dimensions are at their minimum; the vocabulary/feature sizes alone exceed the budget
        print(f"Warning: parameter limit of {max_params:,} cannot be met at the minimum dimensions")

    return new_embedding_dim, new_hidden_dim

def save_model_with_metadata(model, path, token_mappings=None, config=None):
    """
    Save a model's state dict together with optional metadata in one file.

    The file written by torch.save holds a dict with 'model_state_dict' and,
    when supplied, 'token_mappings' and 'config'. Missing parent directories
    of `path` are created first.

    Args:
        model (nn.Module): Model to save
        path (str): Path to save model
        token_mappings (dict, optional): Token ID mappings for reduced vocabulary
        config (dict, optional): Model configuration parameters

    Returns:
        str: Path where model was saved
    """
    # A bare file name has an empty directory part, and os.makedirs('') raises
    target_dir = os.path.dirname(path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    save_data = {'model_state_dict': model.state_dict()}

    # Optional entries are only stored when the caller provided them
    if token_mappings is not None:
        save_data['token_mappings'] = token_mappings
    if config is not None:
        save_data['config'] = config

    torch.save(save_data, path)
    print(f"Model saved to {path}")

    return path

def load_model_with_metadata(path, device='cpu'):
    """
    Read back a checkpoint file and return its content as stored.

    Args:
        path (str): Path to the saved model file
        device (str): Device the stored tensors are mapped to while loading

    Returns:
        dict: The stored checkpoint; for files written by
            save_model_with_metadata this holds 'model_state_dict' plus
            'token_mappings' and 'config' when they were supplied at save time
    """
    # torch.load unpickles the file — only use it on checkpoints from a trusted source
    return torch.load(path, map_location=device)

def create_dataloader_with_mapping(dataset, batch_size, tokenizer, token_id_map=None):
    """
    Create a shuffling DataLoader with optional token mapping for a reduced vocabulary.

    Each batch is a dict with 'features' (stacked and standardised feature
    tensors) and 'text' (padded token IDs, remapped through `token_id_map`
    when one is given).

    Args:
        dataset: Weather dataset whose items are dicts with 'features' and 'text'
        batch_size (int): Batch size
        tokenizer: Tokenizer for text encoding
        token_id_map (dict, optional): Mapping for token IDs in reduced vocabulary

    Returns:
        DataLoader: Configured data loader
    """
    def prepare_batch_with_mapping(batch_list, tokenizer, token_id_map=None):
        """
        Collate a list of samples into one batch dict.

        Args:
            batch_list (list): List of sample dictionaries
            tokenizer: Tokenizer for text encoding
            token_id_map (dict, optional): Token ID mapping for reduced vocabulary

        Returns:
            dict: Prepared batch with 'features' and 'text' tensors
        """
        features = torch.stack([item['features'] for item in batch_list])
        texts = [item['text'] for item in batch_list]

        # Standardise with one mean/std taken over the whole batch tensor
        # (all samples and all feature columns pooled together).
        # NOTE(review): these statistics change from batch to batch, and code that
        # feeds dataset features straight to the model does not get this scaling —
        # confirm that is intended.
        features = (features - features.mean()) / (features.std() + 1e-8)

        encoded = tokenizer(
            texts,
            padding=True,
            truncation=True,
            return_tensors='pt'
        )
        token_ids = encoded['input_ids']

        if token_id_map is not None:
            # Remap all IDs in one pass; IDs missing from the map fall back to 0
            remapped = [token_id_map.get(token, 0) for token in token_ids.flatten().tolist()]
            token_ids = torch.tensor(
                remapped, dtype=token_ids.dtype, device=token_ids.device
            ).reshape(token_ids.shape)

        return {
            'features': features,
            'text': token_ids
        }

    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=lambda batch: prepare_batch_with_mapping(batch, tokenizer, token_id_map)
    )

def train_epoch(model, dataloader, criterion, optimizer, device, teacher_forcing_ratio=1.0, epoch=0, total_epochs=1, detect_anomaly=True):
    """
    Train the model for one epoch.

    Batches with non-finite features, a non-finite loss or a non-finite
    gradient norm are reported and skipped without an optimizer step.

    Args:
        model (nn.Module): Model to train; called as model(features, text, teacher_forcing_ratio)
        dataloader (DataLoader): Iterable of batches with 'features' and 'text' tensors
        criterion: Loss function
        optimizer: Optimizer
        device: Device to train on
        teacher_forcing_ratio (float): Probability of using teacher forcing
        epoch (int): Current epoch number (zero-based, used for the progress bar label)
        total_epochs (int): Total number of epochs
        detect_anomaly (bool): Run forward/backward under autograd anomaly detection.
            Defaults to True (the previous behaviour). Anomaly mode is a debugging
            aid that slows training noticeably; pass False for regular runs.

    Returns:
        float: Average loss over the batches that were actually trained on,
            or inf when no batch completed
    """
    model.train()
    total_loss = 0
    num_batches = 0
    pbar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{total_epochs}")

    for batch_idx, batch in enumerate(pbar):
        try:
            features = batch['features'].to(device)
            text = batch['text'].to(device)

            # Skip batches whose inputs already contain NaN/inf
            if not torch.isfinite(features).all():
                print(f"Warning: Invalid values in features at batch {batch_idx}")
                continue

            optimizer.zero_grad()

            with torch.autograd.set_detect_anomaly(detect_anomaly):
                outputs = model(features, text, teacher_forcing_ratio)
                # reshape (not view) so non-contiguous model outputs are accepted too
                outputs = outputs.reshape(-1, outputs.size(-1))
                # Targets are the input tokens shifted left by one position
                targets = text[:, 1:].reshape(-1)

                loss = criterion(outputs, targets)

                if not torch.isfinite(loss):
                    print(f"Warning: Invalid loss value {loss.item()} at batch {batch_idx}")
                    print("Last output values:", outputs[-5:])
                    print("Last target values:", targets[-5:])
                    raise ValueError("Invalid loss detected")

                loss.backward()

            # Inspect gradients BEFORE clipping: clipping rescales every gradient by a
            # factor derived from the total norm, so one bad value would spread to all
            # parameters and hide which one caused it.
            for name, param in model.named_parameters():
                if param.grad is not None:
                    grad_norm = param.grad.norm()
                    if not torch.isfinite(grad_norm):
                        print(f"Warning: Invalid gradient for {name}")
                        raise ValueError(f"Invalid gradient detected in {name}")

            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1
            avg_loss = total_loss / num_batches
            pbar.set_postfix({"Loss": f"{avg_loss:.4f}"})

        except ValueError as e:
            # Raised by the checks above; the batch is dropped and training continues
            print(f"Error in batch {batch_idx}: {str(e)}")
            continue

    return total_loss / num_batches if num_batches > 0 else float('inf')

def train_model(dataset, tokenizer, max_params=3_000_000, num_epochs=10, batch_size=64, learning_rate=1e-3):
    """
    Train a GRU model on the weather dataset.

    Steps: reduce the vocabulary, size the model to the parameter budget,
    train for `num_epochs`, print a few generated samples after every epoch
    and save a checkpoint to models/best_gru_model.pt whenever the epoch loss
    improves. An error inside an epoch is printed and training moves on to
    the next epoch.

    Args:
        dataset: Weather dataset
        tokenizer: Tokenizer for text encoding
        max_params (int): Maximum number of parameters for the model
        num_epochs (int): Number of training epochs
        batch_size (int): Batch size for training
        learning_rate (float): Learning rate for optimizer

    Returns:
        tuple: (trained_model, token_mappings, training_losses)

    Raises:
        ValueError: If the dataset is empty
    """
    if len(dataset) == 0:
        raise ValueError("Cannot train on an empty dataset")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Set seed for reproducibility
    torch.manual_seed(42)
    random.seed(42)
    np.random.seed(42)

    # Fixed architecture settings, named once so model and checkpoint config cannot drift apart
    n_layers = 1
    dropout = 0.2

    # First, reduce vocabulary
    token_mappings, reduced_vocab_size = reduce_vocabulary(tokenizer, dataset, batch_size)
    token_id_map = token_mappings['token_id_map']

    # Get feature dimension
    feature_dim = dataset.feature_dim

    # Determine model dimensions based on parameter constraints
    embedding_dim, hidden_dim = adjust_model_size(
        feature_dim=feature_dim,
        vocab_size=reduced_vocab_size,
        embedding_dim=256,  # Starting point
        hidden_dim=512,     # Starting point
        max_params=max_params
    )

    # Create model with adjusted dimensions
    model = BasicWeatherGRU(
        feature_dim=feature_dim,
        vocab_size=reduced_vocab_size,
        embedding_dim=embedding_dim,
        hidden_dim=hidden_dim,
        n_layers=n_layers,
        dropout=dropout
    ).to(device)

    # Count parameters
    param_count = count_model_parameters(model)
    print(f"Model has {param_count:,} trainable parameters")

    # Create dataloader with token mapping
    dataloader = create_dataloader_with_mapping(
        dataset,
        batch_size=batch_size,
        tokenizer=tokenizer,
        token_id_map=token_id_map
    )

    # Targets are in the reduced ID space: the collate function remaps every token with
    # token_id_map.get(id, 0), so padding ends up at the mapped pad ID (or 0 when the
    # pad token is not in the map). The loss must ignore that ID, not the tokenizer's
    # original pad ID.
    pad_target_id = token_id_map.get(tokenizer.pad_token_id, 0)
    criterion = nn.CrossEntropyLoss(ignore_index=pad_target_id, reduction='mean')
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)

    # Learning rate scheduler. The `verbose` argument is deprecated since PyTorch 2.2
    # and may be rejected by newer releases, so rate changes are reported manually below.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=2
    )

    losses = []
    best_loss = float('inf')

    # Select a few examples for generation testing: beginning, middle, and end
    # NOTE(review): these features go to generate() without the per-batch
    # standardisation applied in the training collate function — confirm intended
    test_indices = [0, len(dataset)//2, len(dataset)-1]
    test_samples = [dataset[i] for i in test_indices]
    test_features = torch.stack([sample['features'] for sample in test_samples]).to(device)

    print("\nOriginal texts for test samples:")
    for idx, sample in zip(test_indices, test_samples):
        print(f"Sample {idx}: {sample['text']}")

    # Create directory for model checkpoints
    os.makedirs('models', exist_ok=True)

    for epoch in range(num_epochs):
        try:
            loss = train_epoch(
                model, dataloader, criterion, optimizer, device,
                teacher_forcing_ratio=0.9,
                epoch=epoch, total_epochs=num_epochs
            )
            losses.append(loss)

            lr_before = optimizer.param_groups[0]['lr']
            scheduler.step(loss)
            lr_after = optimizer.param_groups[0]['lr']
            if lr_after != lr_before:
                print(f"Learning rate reduced from {lr_before:.2e} to {lr_after:.2e}")
            print(f"\nEpoch {epoch + 1}, Loss: {loss:.4f}")

            # Generate examples. This preview is informational only, so a failure here
            # must not prevent the checkpoint below from being written.
            print("\nGenerated examples:")
            model.eval()
            try:
                with torch.no_grad():
                    generated_tokens = model.generate(
                        test_features,
                        token_mappings=token_mappings
                    )
                for idx, tokens in enumerate(generated_tokens):
                    # Map tokens back to original vocabulary
                    original_tokens = [token_mappings['reverse_token_id_map'][t.item()] for t in tokens]
                    generated_text = tokenizer.decode(original_tokens, skip_special_tokens=False)
                    print(f"Sample {test_indices[idx]}: {generated_text}")
            except Exception as gen_error:
                print(f"Warning: sample generation failed: {str(gen_error)}")
            finally:
                model.train()

            # Save checkpoint if it's the best model so far
            if loss < best_loss:
                best_loss = loss
                config = {
                    'feature_dim': feature_dim,
                    'vocab_size': reduced_vocab_size,
                    'embedding_dim': embedding_dim,
                    'hidden_dim': hidden_dim,
                    'n_layers': n_layers,
                    'dropout': dropout,
                    'epoch': epoch,
                    'loss': loss
                }

                save_model_with_metadata(
                    model=model,
                    path=os.path.join('models', 'best_gru_model.pt'),
                    token_mappings=token_mappings,
                    config=config
                )

        except Exception as e:
            # Best-effort training: report the failed epoch and carry on with the next one
            print(f"Error during epoch {epoch + 1}: {str(e)}")
            continue

    return model, token_mappings, losses

In [None]:
# Usage example: train on the cleaned dataset.
# train_model receives the dataset and tokenizer directly.
TRAIN_SETTINGS = {
    'max_params': 30_000_000,  # 30 million parameter limit
    'num_epochs': 1,           # author's note: tends to overfit after 1 epoch
    'batch_size': 128,
    'learning_rate': 1e-3,
}
model, token_mappings, losses = train_model(clean_dataset, tokenizer, **TRAIN_SETTINGS)

In [None]:
def load_and_run_weather_model(model_path, dataset, sample_indices=None, num_samples=3, tokenizer=None, max_length=100):
    """
    Load the trained GRU weather model and generate text based on features from the provided dataset.

    Args:
        model_path (str): Path to the saved model file
        dataset: The cleaned dataset containing features
        sample_indices (iterable of int, optional): Specific indices to use from the dataset.
            If None, random samples are selected.
        num_samples (int): Number of random samples to generate if sample_indices is None
        tokenizer (AutoTokenizer, optional): Tokenizer to use for decoding. If None, will load BERT German tokenizer.
        max_length (int): Maximum length of the generated sequence

    Returns:
        dict: Dictionary mapping sample indices to {'original': ..., 'generated': ...};
            empty when no samples were selected

    Raises:
        KeyError: If the checkpoint lacks 'model_state_dict', 'token_mappings' or 'config'
    """
    # Load tokenizer if not provided
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained('bert-base-german-cased')
        # Add special tokens that were used during training
        special_tokens = {
            'additional_special_tokens': ['<city>','<temp>','<date>','<velocity>','<percentile>','<rainfall>', '<ne>']
        }
        tokenizer.add_special_tokens(special_tokens)

    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Load model and metadata through the shared checkpoint loader
    save_data = load_model_with_metadata(model_path, device=device)

    # save_model_with_metadata stores the mappings and config only when they were
    # supplied, so report exactly which entries are missing
    missing = [key for key in ('model_state_dict', 'token_mappings', 'config') if key not in save_data]
    if missing:
        raise KeyError(f"Checkpoint {model_path} is missing required entries: {missing}")

    token_mappings = save_data['token_mappings']
    config = save_data['config']

    # Create model with saved configuration
    model = BasicWeatherGRU(
        feature_dim=config['feature_dim'],
        vocab_size=config['vocab_size'],
        embedding_dim=config['embedding_dim'],
        hidden_dim=config['hidden_dim'],
        n_layers=config['n_layers'],
        dropout=config['dropout']
    ).to(device)

    # Load state dict
    model.load_state_dict(save_data['model_state_dict'])
    model.eval()

    # Select samples from the dataset
    if sample_indices is None:
        # Choose random indices (never more than the dataset holds)
        sample_indices = random.sample(range(len(dataset)), min(num_samples, len(dataset)))
    else:
        # Materialise so the indices can be iterated once here and indexed again below
        sample_indices = list(sample_indices)

    # Nothing selected: torch.stack would fail on an empty list
    if not sample_indices:
        return {}

    # Extract features and original texts
    # NOTE(review): features are passed to generate() as stored in the dataset, without the
    # per-batch standardisation the training collate function applies — confirm intended
    samples = [dataset[i] for i in sample_indices]
    features = torch.stack([sample['features'] for sample in samples]).to(device)
    original_texts = [sample['text'] for sample in samples]

    # Generate text
    with torch.no_grad():
        generated_tokens = model.generate(
            features,
            max_length=max_length,
            token_mappings=token_mappings
        )

    # Convert tokens back to text
    results = {}
    for idx, (tokens, original) in enumerate(zip(generated_tokens, original_texts)):
        # Map tokens back to original vocabulary
        original_tokens = [token_mappings['reverse_token_id_map'][t.item()] for t in tokens]
        text = tokenizer.decode(original_tokens, skip_special_tokens=False)
        # Strip the BERT sentence markers from the decoded text
        text = re.sub(r'\[CLS\]|\[SEP\]', '', text).strip()

        results[sample_indices[idx]] = {
            'original': original,
            'generated': text
        }

    return results

# Example usage:
# results = load_and_run_weather_model("models/best_gru_model.pt", clean_dataset)
# for idx, data in results.items():
#     print(f"Sample {idx}:")
#     print(f"Original: {data['original']}")
#     print(f"Generated: {data['generated']}")
#     print()