In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from transformers import AutoformerConfig, AutoformerForPrediction
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import os
import logging
import time
from tqdm import tqdm
import sys
sys.path.insert(1, '../src/')
from config import raw_data_path, univariate_data_path, processed_data_path, models_path
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoformerConfig, AutoformerForPrediction
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split


In [13]:
# Load the object array stored in 'merged_univariate.npy' under `univariate_data_path`.
# Later cells treat each element as a dict with (at least) a 'signal' entry.
# NOTE(review): allow_pickle=True makes np.load unpickle arbitrary Python objects, which
# can execute code embedded in the file -- only load this file from a trusted source.
data_file = os.path.join(univariate_data_path, 'merged_univariate.npy')
data = np.load(data_file, allow_pickle=True)

In [14]:
# Drop every recording shorter than 12000 samples (10 minutes at 20 Hz).
# NOTE(review): the record printed below reports 'fs': 200 in its metadata, so the 20 Hz
# figure presumably refers to an already-resampled signal -- confirm.
filtered_data = list(filter(lambda record: len(record['signal']) >= 12000, data))
print(f"Filtered dataset size: {len(filtered_data)} (out of {len(data)})")

# 80% of the records go to training; the remaining 20% is halved into validation and test.
# NOTE(review): the split is made per record. Record names such as 'ice008_p_4of4' suggest
# several recordings per subject, so one subject may end up in more than one split --
# confirm whether a grouped split is needed.
train_data, temp_data = train_test_split(
    filtered_data,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)
val_data, test_data = train_test_split(
    temp_data,
    test_size=0.5,
    shuffle=True,
    random_state=42,
)

print(train_data[0])

Filtered dataset size: 659 (out of 666)
{'record_name': 'ice008_p_4of4', 'signal': array([ 40.91904   ,  40.44264156,  39.96763369, ..., -13.16801657,
       -13.16801289, -13.16801   ]), 'metadata': {'fs': 200, 'sig_len': 786000, 'n_sig': 16, 'base_date': None, 'base_time': None, 'units': ['mV', 'mV', 'mV', 'mV', 'mV', 'mV', 'mV', 'mV', 'mV', 'mV', 'mV', 'mV', 'mV', 'mV', 'mV', 'mV'], 'comments': ['Info:', 'ID:ice008', 'Record type:pregnancy', 'Record number:4/4', 'Age(years):26', 'BMI before pregnancy:20.9', 'BMI at recording:25.3', 'Gravidity:5', 'Parity:2', 'Previous caesarean:No', 'Placental position:Anterior', 'Gestational age at recording(w/d):40/1', 'Gestational age at delivery:40/2', 'Mode of delivery:Vaginal', 'Synthetic oxytocin use in labour:No', 'Epidural during labour:No', 'Comments for recording:', 'A contraction had just started at the beginning of this recording.', 'Baby was born 17 hours and 20 minutes after the end of this recording.', 'Comments for delivery:']}}


In [15]:

# Fix the NumPy and PyTorch global RNG seeds so repeated runs draw the same numbers.
np.random.seed(42)
torch.manual_seed(42)

# Emit INFO-and-above records with a "timestamp - level - message" layout.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

In [16]:
def prepare_dataloaders(train_data, val_data, test_data, batch_size=16, window_size=12000,
                        prediction_length=1200, stride=400, num_workers=2, pin_memory=True):
    """Build sliding-window datasets and DataLoaders for the three splits.

    The 'signal' arrays of each split are concatenated into one long series, which is
    then cut into (context, target) windows by ``TimeSeriesDataset``. Because of the
    concatenation, some windows straddle the boundary between two recordings.

    The validation and test series are standardised with the scaler fitted on the
    training series, so all splits share one scale and no evaluation statistics are
    used for normalisation.

    Args:
        train_data, val_data, test_data: Sequences of dicts, each with a 'signal' array.
        batch_size: Number of windows per batch.
        window_size: Length of the context window, in samples.
        prediction_length: Number of samples to forecast after the context window.
        stride: Step, in samples, between consecutive window starts.
        num_workers: Worker processes per DataLoader (0 loads in the main process).
        pin_memory: Forwarded to every DataLoader.

    Returns:
        (train_loader, val_loader, test_loader, train_dataset, val_dataset, test_dataset)

    Raises:
        ValueError: If any split contains no records.
    """
    for split_name, records in (('train', train_data), ('val', val_data), ('test', test_data)):
        if len(records) == 0:
            raise ValueError(f"The {split_name} split is empty; cannot build a DataLoader for it")

    # Extract signal data from the dictionaries
    train_signals = [item['signal'] for item in train_data]
    val_signals = [item['signal'] for item in val_data]
    test_signals = [item['signal'] for item in test_data]

    print(f"Number of training signals: {len(train_signals)}")
    print(f"Sample signal length: {len(train_signals[0])}")

    # Concatenate all signals into single arrays for each split
    # Another approach would be to create separate datasets for each signal
    train_signal_concat = np.concatenate(train_signals)
    val_signal_concat = np.concatenate(val_signals)
    test_signal_concat = np.concatenate(test_signals)

    print(f"Concatenated training signal length: {len(train_signal_concat)}")

    train_dataset = TimeSeriesDataset(train_signal_concat, window_size, prediction_length, stride)
    val_dataset = TimeSeriesDataset(val_signal_concat, window_size, prediction_length, stride)
    test_dataset = TimeSeriesDataset(test_signal_concat, window_size, prediction_length, stride)

    # Each dataset fits its own scaler on construction. Replace the evaluation scalers with
    # the training one and recompute the scaled series, so validation/test statistics never
    # influence normalisation and inverse_transform stays consistent with scaled_data.
    for eval_dataset in (val_dataset, test_dataset):
        eval_dataset.scaler = train_dataset.scaler
        eval_dataset.scaled_data = train_dataset.scaler.transform(
            eval_dataset.data.reshape(-1, 1)
        ).flatten()

    def _make_loader(dataset, shuffle):
        # One place for the options shared by all three loaders.
        return DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            collate_fn=collate_fn,
            num_workers=num_workers,
            pin_memory=pin_memory,
        )

    # Only the training windows are shuffled; evaluation order stays deterministic.
    train_loader = _make_loader(train_dataset, shuffle=True)
    val_loader = _make_loader(val_dataset, shuffle=False)
    test_loader = _make_loader(test_dataset, shuffle=False)

    return train_loader, val_loader, test_loader, train_dataset, val_dataset, test_dataset

In [17]:
class TimeSeriesDataset(Dataset):
    """Sliding-window forecasting dataset over a single 1-D series.

    The series is standardised once at construction. Item ``i`` is the pair
    (context, target), where context holds ``window_size`` consecutive scaled samples
    and target holds the ``prediction_length`` samples that immediately follow.
    """

    def __init__(self, data, window_size=12000, prediction_length=1200, stride=400, scaler=None):
        """
        Args:
            data: Time series data (numpy array or array-like); arrays with more than
                one dimension are flattened.
            window_size: Size of input context window (in data points)
            prediction_length: Number of future points to predict
            stride: Step size between windows (must be positive)
            scaler: Optional already-fitted scaler exposing ``transform`` and
                ``inverse_transform``. When given it is used as is (e.g. to apply
                training-set statistics to a validation/test series); when omitted a
                new ``StandardScaler`` is fitted on ``data``.

        Raises:
            ValueError: If ``stride`` is not positive, or if ``data`` holds fewer than
                ``window_size + prediction_length`` points.
        """
        if stride <= 0:
            raise ValueError(f"stride must be a positive integer, got {stride}")

        # Ensure data is a numpy array
        self.data = data if isinstance(data, np.ndarray) else np.array(data, dtype=np.float32)

        # Ensure data is 1D
        if self.data.ndim > 1:
            self.data = self.data.flatten()

        self.window_size = window_size
        self.prediction_length = prediction_length
        self.stride = stride

        # A window needs its context plus the target that follows it.
        required_points = window_size + prediction_length
        if len(self.data) < required_points:
            raise ValueError(f"Not enough data points. Have {len(self.data)}, need at least {required_points}")

        # Start offsets of every window that fits entirely inside the series. With a
        # positive stride and enough data this always contains at least offset 0.
        self.indices = list(range(0, len(self.data) - required_points + 1, stride))

        # Standardize the data (scalers operate on 2-D column arrays)
        column = self.data.reshape(-1, 1)
        if scaler is None:
            self.scaler = StandardScaler()
            self.scaled_data = self.scaler.fit_transform(column).flatten()
        else:
            self.scaler = scaler
            self.scaled_data = np.asarray(scaler.transform(column)).flatten()

        print(f"Created dataset with {len(self.indices)} windows")

    def __len__(self):
        """Return the number of windows."""
        return len(self.indices)

    def __getitem__(self, idx):
        """Return the ``idx``-th window as ``{'context', 'target'}`` float32 tensors."""
        start_idx = self.indices[idx]
        end_idx = start_idx + self.window_size
        future_idx = end_idx + self.prediction_length

        # Get context window and prediction target
        context = self.scaled_data[start_idx:end_idx]
        target = self.scaled_data[end_idx:future_idx]

        # torch.tensor copies, so the returned tensors never alias self.scaled_data.
        return {
            'context': torch.tensor(context, dtype=torch.float32),
            'target': torch.tensor(target, dtype=torch.float32)
        }

    def inverse_transform(self, scaled_data):
        """Convert scaled data back to original scale.

        Accepts a tensor (on any device, with or without gradient tracking) or an
        array-like, and returns a flat numpy array.
        """
        if isinstance(scaled_data, torch.Tensor):
            # .numpy() alone fails for CUDA tensors and for tensors that require grad.
            scaled_data = scaled_data.detach().cpu().numpy()
        scaled_data = np.asarray(scaled_data)
        return self.scaler.inverse_transform(scaled_data.reshape(-1, 1)).flatten()

In [18]:

def collate_fn(batch):
    """Merge per-window samples into batched tensors with a trailing feature axis.

    Each sample is a dict holding 1-D 'context' and 'target' tensors. They are stacked
    along a new leading batch axis, and a size-1 last axis is appended so both outputs
    are laid out as (batch_size, seq_len, features).
    """
    return {
        key: torch.stack([sample[key] for sample in batch]).unsqueeze(-1)
        for key in ('context', 'target')
    }


def _forecast_loss(model, batch, criterion, device):
    """Run one forward pass on `batch` and return the criterion loss against its target."""
    context = batch['context'].to(device)
    target = batch['target'].to(device)

    # All past values are treated as observed. ones_like allocates on context's device,
    # so no extra transfer is needed.
    past_observed_mask = torch.ones_like(context, dtype=torch.bool)

    # NOTE(review): this relies on the model accepting past_time_features=None and on its
    # output exposing `prediction_outputs` when no future_values are passed. Confirm both
    # against the model class actually used (the Hugging Face Autoformer docs describe
    # time-feature inputs and a distribution-parameter output) -- TODO verify.
    outputs = model(
        past_values=context,
        past_observed_mask=past_observed_mask,
        past_time_features=None,
        future_values=None,
        future_time_features=None,
    )
    return criterion(outputs.prediction_outputs, target)


def train_model(model, train_loader, val_loader, device, num_epochs=10, learning_rate=1e-4,
                best_model_path='best_autoformer_model.pth'):
    """Train the Autoformer model and return it with the best validation weights loaded.

    Each epoch runs one pass over `train_loader` (AdamW, MSE loss, gradient-norm clipping
    at 1.0) followed by one pass over `val_loader`. The learning rate is halved after the
    validation loss has failed to improve for more than two consecutive epochs. Whenever
    the validation loss improves, the weights are written to `best_model_path`.

    Args:
        model: Module called with `past_values`, `past_observed_mask`,
            `past_time_features`, `future_values` and `future_time_features`.
        train_loader: Sized iterable of dicts with 'context' and 'target' tensors.
        val_loader: Sized iterable of the same kind, used for model selection.
        device: Device the batches are moved to.
        num_epochs: Number of passes over `train_loader`.
        learning_rate: Initial AdamW learning rate.
        best_model_path: Where the best checkpoint (a state dict) is written.

    Returns:
        `model`, with the best checkpoint of THIS run loaded. If no epoch produced an
        improvement (e.g. `num_epochs=0` or a NaN validation loss every epoch), the model
        is returned in its current state rather than loading a file left by an earlier run.

    Raises:
        ValueError: If `train_loader` or `val_loader` is empty.
    """
    # Fail before training instead of dividing by zero at the end of the first epoch.
    if len(train_loader) == 0:
        raise ValueError("train_loader is empty; nothing to train on")
    if len(val_loader) == 0:
        raise ValueError("val_loader is empty; cannot compute a validation loss")

    optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()
    # `verbose` is deprecated in recent PyTorch releases; LR changes are logged below instead.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)

    best_val_loss = float('inf')
    checkpoint_saved = False  # True once this run has written best_model_path

    # Training loop
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        start_time = time.time()

        # Training step
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")
        for batch in progress_bar:
            optimizer.zero_grad()
            loss = _forecast_loss(model, batch, criterion, device)

            # Backward pass and optimize
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            total_loss += loss.item()
            progress_bar.set_postfix({'loss': loss.item()})

        # Calculate average training loss
        avg_train_loss = total_loss / len(train_loader)
        logger.info(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss:.6f}")

        # Validation step
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch in val_loader:
                val_loss += _forecast_loss(model, batch, criterion, device).item()

        avg_val_loss = val_loss / len(val_loader)
        logger.info(f"Epoch {epoch+1}/{num_epochs}, Val Loss: {avg_val_loss:.6f}, Time: {time.time() - start_time:.2f}s")

        # Update learning rate based on validation loss
        previous_lr = optimizer.param_groups[0]['lr']
        scheduler.step(avg_val_loss)
        current_lr = optimizer.param_groups[0]['lr']
        if current_lr != previous_lr:
            logger.info(f"Reducing learning rate from {previous_lr:.2e} to {current_lr:.2e}")

        # Save the best model
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), best_model_path)
            checkpoint_saved = True
            logger.info(f"Saved best model with validation loss: {best_val_loss:.6f}")

    # Load the best model, but only a checkpoint written by this run: a file already
    # present at best_model_path may belong to a different model or experiment.
    if checkpoint_saved:
        model.load_state_dict(torch.load(best_model_path, map_location=device))
    else:
        logger.warning("No checkpoint was saved during this run; returning the model as is")
    return model

def evaluate_model(model, test_loader, test_dataset, device, num_plots=3,
                   plot_path='forecast_predictions.png'):
    """Evaluate the trained model on test data and save a figure of sample forecasts.

    Args:
        model: Module called with `past_values`, `past_observed_mask`,
            `past_time_features`, `future_values` and `future_time_features`.
        test_loader: Iterable of dicts with 'context' and 'target' tensors.
        test_dataset: Object whose `inverse_transform` maps scaled values back to the
            original scale; used for plotting only.
        device: Device the batches are moved to.
        num_plots: Number of randomly chosen forecast windows to plot.
        plot_path: File the figure is written to.

    Returns:
        (avg_test_loss, all_predictions, all_targets). The loss is the mean per-batch
        MSE. Both arrays have shape (total_points, 1) and are still in the scaled
        space -- they are NOT inverse-transformed.

    Raises:
        ValueError: If `test_loader` yields no batches.
    """
    model.eval()
    criterion = nn.MSELoss()
    total_loss = 0.0

    batch_predictions = []
    batch_targets = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            context = batch['context'].to(device)
            target = batch['target'].to(device)

            # All past values are treated as observed; ones_like keeps context's device.
            past_observed_mask = torch.ones_like(context, dtype=torch.bool)

            # NOTE(review): relies on the model accepting past_time_features=None and on its
            # output exposing `prediction_outputs` -- confirm against the model class used.
            outputs = model(
                past_values=context,
                past_observed_mask=past_observed_mask,
                past_time_features=None,
                future_values=None,
                future_time_features=None,
            )

            predictions = outputs.prediction_outputs
            total_loss += criterion(predictions, target).item()

            # Store predictions and targets for visualization
            batch_predictions.append(predictions.cpu().numpy())
            batch_targets.append(target.cpu().numpy())

    if not batch_predictions:
        raise ValueError("test_loader yielded no batches; nothing to evaluate")

    avg_test_loss = total_loss / len(batch_predictions)
    logger.info(f"Test Loss: {avg_test_loss:.6f}")

    # One row per forecast window. Sampling rows (rather than individual points of the
    # flattened array) is what makes each subplot show a whole forecast horizon.
    prediction_windows = np.concatenate(batch_predictions, axis=0)
    prediction_windows = prediction_windows.reshape(len(prediction_windows), -1)
    target_windows = np.concatenate(batch_targets, axis=0)
    target_windows = target_windows.reshape(len(target_windows), -1)

    # Visualize some predictions, converted back to the original scale
    sample_idx = np.random.randint(0, len(prediction_windows), num_plots)

    fig = plt.figure(figsize=(15, 10))
    for row, idx in enumerate(sample_idx):
        pred = test_dataset.inverse_transform(prediction_windows[idx])
        true = test_dataset.inverse_transform(target_windows[idx])

        ax = fig.add_subplot(num_plots, 1, row + 1)
        ax.plot(true, label='Ground Truth', color='blue')
        ax.plot(pred, label='Prediction', color='red', linestyle='--')
        ax.legend()
        ax.set_title(f'Sample {idx}')

    fig.tight_layout()
    fig.savefig(plot_path)
    plt.close(fig)

    # Callers receive the same flattened (total_points, 1) layout as before.
    all_predictions = prediction_windows.reshape(-1, 1)
    all_targets = target_windows.reshape(-1, 1)

    return avg_test_loss, all_predictions, all_targets

# Script-style entry point: calls main() when this code runs as the top-level module.
# NOTE(review): no `main` function is defined in any of the cells shown here, so running
# this cell raises the NameError recorded in the output below. A `main()` that builds the
# model and wires together prepare_dataloaders / train_model / evaluate_model still has
# to be written (or this guard removed) -- TODO.
if __name__ == "__main__":
    main()

NameError: name 'main' is not defined