In [1]:
import numpy as np
import os
import sys
sys.path.append(r'C:\Users\Max Tost\Desktop\Notebooks\SPC Neural Network Project')

import matplotlib.pyplot as plt # for plotting
%matplotlib widget
import pandas as pd # for data manipulation
from Models.load_data import *

%load_ext autoreload
%autoreload 2

import torch

In [None]:

from pathlib import Path

path = 
features_list = 
features_sequence = features_sequence = ['SSXcore', 'IPLA', 'DAO_EDG7', 'RNT', 'DAI_EDG7', 'ECE_PF']

# Define directories using pathlib
    features_dir = Path(path) / "features"
    targets_dir  = Path(path) / "targets"
    

In [41]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F  # Import for one-hot encoding

class IndependentCSVDataset(Dataset):
    def __init__(self, data_path, features_list, features_sequence=None, transform=None, seq_length=6000, num_classes=2):
        """
        Loads each CSV file and stores individual samples (row-wise) as tuples of (x, y), 
        while ensuring targets are one-hot encoded.

        Parameters:
        - data_path (str or Path): Base directory containing 'features' and 'targets' subdirectories.
        - features_list (list of str): List of CSV file names to process.
        - features_sequence (list of str): List of feature names to extract from each CSV.
          Defaults to ['SSXcore', 'IPLA', 'DAO_EDG7', 'RNT', 'DAI_EDG7', 'ECE_PF'] if not provided.
        - transform (callable, optional): Optional transform to be applied on the feature data.
        - seq_length (int): Ensures all sequences have the same length.
        - num_classes (int): Number of classes for one-hot encoding.
        """
        if features_sequence is None:
            features_sequence = ['SSXcore', 'IPLA', 'DAO_EDG7', 'RNT', 'DAI_EDG7', 'ECE_PF']
        
        self.samples = []  # List to hold individual (x, y) tuples
        self.transform = transform
        self.num_classes = num_classes  # Store number of classes for one-hot encoding
        
        # Define directories using pathlib
        features_dir = Path(data_path) / "features"
        targets_dir  = Path(data_path) / "targets"
        
        # Process each CSV file in the provided list
        for feature_id in features_list:
            feature_file = features_dir / feature_id
            target_file  = targets_dir / feature_id
            
            # Load the features CSV
            df_features = pd.read_csv(feature_file)
            time_length = len(df_features['time'])

            # Drop sequences with a length different from the desired one
            if time_length != seq_length:
                print(f'Skipping {feature_id}: sequence length {time_length} is unexpected.')
                continue
            
            # Build the feature matrix for the file.
            # For each key in features_sequence, use the column if available, otherwise use zeros.
            x_list = []
            for key in features_sequence:
                if key in df_features.columns:
                    x_list.append(df_features[key].to_numpy())
                else:
                    x_list.append(np.zeros(time_length))
            
            # x_file is a 2D array with shape (time_length, number_of_features)
            x_file = np.column_stack(x_list)
            
            # Load the targets CSV and extract the 'target' column
            y_file = pd.read_csv(target_file)['target'].to_numpy()  # shape: (time_length,)
            
            # Append each shot to the samples list.
            self.samples.append((x_file, y_file))
    
    def __len__(self):
        """Return the total number of samples."""
        return len(self.samples)
    
    def __getitem__(self, idx):
        """Return a single sample as a tuple (x, y) with one-hot encoded target."""
        sample, target = self.samples[idx]
        
        if self.transform:
            sample = self.transform(sample)
        
        # Convert the sample to PyTorch tensor
        sample = torch.tensor(sample, dtype=torch.float32)

        # Convert target to tensor and apply one-hot encoding
        target = torch.tensor(target, dtype=torch.long)  # Ensure integer values
        target = F.one_hot(target, num_classes=self.num_classes).float()  # One-hot encode to (seq_length, num_classes)
        
        return sample, target


In [25]:
# Load the shot data and replace nans with zeros to get arrays of equal length
path = r'C:\Users\Max Tost\Desktop\Notebooks\SPC Neural Network Project\Training_data'
features_list = os.listdir(os.path.join(path, r'features'))


In [4]:
import numpy as np
import torch

def compute_global_minmax(dataset):
    """
    Computes global min and max for each feature across all time series in the dataset.
    
    Assumes that each sample from the dataset is a tuple (sample, target) where
    sample is a torch.Tensor of shape [sequence_length, num_features].
    
    Returns:
    - feature_min: NumPy array of shape (num_features,)
    - feature_max: NumPy array of shape (num_features,)
    """
    all_samples = []
    for sample, _ in dataset:
        if isinstance(sample, torch.Tensor):
            sample = sample.numpy()
        all_samples.append(sample)
    
    all_data = np.concatenate(all_samples, axis=0)
    
    # Compute min and max along axis 0 (for each feature)
    feature_min = np.min(all_data, axis=0)
    feature_max = np.max(all_data, axis=0)
    
    # Avoid division by zero by ensuring feature_max is strictly greater than feature_min
    zero_variance_mask = (feature_max == feature_min)
    feature_max[zero_variance_mask] = feature_min[zero_variance_mask] + 1e-6
    
    return feature_min, feature_max


class GlobalMinMaxNormalize:
    def __init__(self, min_vals, max_vals):
        """
        min_vals: array-like of shape (num_features,)
        max_vals: array-like of shape (num_features,)
        """
        # Convert min and max values to torch tensors for compatibility.
        self.min_vals = torch.tensor(min_vals, dtype=torch.float32)
        self.max_vals = torch.tensor(max_vals, dtype=torch.float32)
    
    def __call__(self, sample):
        """
        Normalizes the input sample using the global min and max values.
        sample: torch.Tensor of shape [sequence_length, num_features]
        Returns the normalized sample.
        """
        if not isinstance(sample, torch.Tensor):
            sample = torch.tensor(sample, dtype=torch.float32)
        return (sample - self.min_vals) / (self.max_vals - self.min_vals)


In [42]:
# This cell takes 15s
# Instantiate the dataset without transformations
dataset = IndependentCSVDataset(path, features_list)

# Split dataset into training and testing sets
dataset_size = len(dataset)
train_size = int(0.8 * dataset_size)
test_size = dataset_size - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

feature_min, feature_max = compute_global_minmax(train_dataset)
print("Global feature min (train set only):", feature_min)
print("Global feature max (train set only):", feature_max)

global_transform = GlobalMinMaxNormalize(feature_min, feature_max)
train_dataset.dataset.transform = global_transform
test_dataset.dataset.transform = global_transform  # Same transform to prevent data leakage

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=4)



Skipping JETno87125.csv: sequence length 6001 is unexpected.
Skipping JETno91071.csv: sequence length 6001 is unexpected.
Skipping JETno97612.csv: sequence length 6001 is unexpected.
Skipping JETno99471.csv: sequence length 6001 is unexpected.
Global feature min (train set only): [ 0.0000000e+00 -3.5154782e+06 -6.1130988e+14  0.0000000e+00
 -3.1516222e+14  0.0000000e+00]
Global feature max (train set only): [7.9738760e+00 7.2445923e+03 6.6521422e+17 4.9767345e+16 2.2450910e+18
 1.6388709e+04]


In [8]:
# Check the min and max values across the dataset
normalized_samples = []
for sample, _ in test_dataset:
    normalized_samples.append(sample.numpy())  # Convert to NumPy array

normalized_data = np.concatenate(normalized_samples, axis=0)

# Check min and max for each feature
print("Min values after normalization:", np.min(normalized_data, axis=0))
print("Max values after normalization:", np.max(normalized_data, axis=0))


  sample = torch.tensor(sample, dtype=torch.float32)


Min values after normalization: [0.         0.21418148 0.00084819 0.         0.00022082 0.        ]
Max values after normalization: [7.0778477e-01 9.9990690e-01 1.0003488e+00 2.5629673e-02 6.4141440e-01
 3.1320488e+01]


In [14]:
import torch
import torch.nn as nn

class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout=0.0):
        """
        LSTM model where the same fully connected layer is applied to each cell state.
        
        Parameters:
        - input_size (int): Number of input features per time step.
        - hidden_size (int): Number of LSTM hidden units.
        - num_layers (int): Number of LSTM layers.
        - output_size (int): Number of output units (1 for binary classification).
        - dropout (float): Dropout probability (only for intermediate LSTM layers).
        """
        super(LSTMModel, self).__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Define LSTM layer
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout)

        # Fully connected layer (shared across all timesteps)
        self.fc = nn.Linear(hidden_size, output_size)

        # Sigmoid activation for binary classification
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """
        Forward pass of the model.

        Parameters:
        - x (torch.Tensor): Input tensor of shape [batch_size, sequence_length, input_size]

        Returns:
        - out (torch.Tensor): Output tensor of shape [batch_size, sequence_length, output_size]
        """

        # Forward propagate LSTM
        # out: shape (batch_size, seq_length, hidden_size)
        out, (_, _) = self.lstm(x)

        # Apply the same fully connected layer to each timestep's cell state
        out = self.fc(out)  # Shape: (batch_size, seq_length, output_size)

        # Apply Sigmoid activation for binary classification
        out = self.sigmoid(out)  # Shape: (batch_size, seq_length, output_size)

        # Remove last dimension to match target size
        out = out.squeeze(-1)

        return out  # Binary values per timestep




In [15]:
# Create an iterator from the DataLoader
train_iterator = iter(train_loader)

# Get a single batch
batch = next(train_iterator)

# Unpack the batch (assuming your dataset returns (inputs, targets))
inputs, targets = batch

print("Inputs shape:", inputs.shape)
print("Targets shape:", targets.shape)


  sample = torch.tensor(sample, dtype=torch.float32)


Inputs shape: torch.Size([32, 6000, 6])
Targets shape: torch.Size([32, 6000])


In [16]:

input_size = 6       # Number of features per time step
hidden_size = 64     # Number of LSTM hidden units
num_layers = 2       # Number of LSTM layers
output_size = 2      # Binary classification
dropout = 0.2

# Instantiate the model
model = LSTMModel(input_size, hidden_size, num_layers, output_size, dropout)

# Print model architecture
print(model)

# Example dummy input: batch_size=32, sequence_length=6000, input_size=6
dummy_input = inputs
dummy_output = model(dummy_input)

print("Output shape:", dummy_output.shape)  # Expected: (32, 6000,) -> one binary value per timestep

LSTMModel(
  (lstm): LSTM(6, 64, num_layers=2, batch_first=True, dropout=0.2)
  (fc): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)
Output shape: torch.Size([32, 6000])


In [43]:
import torch

def compute_class_weights(train_loader, num_classes=2):
    """
    Iterates through the train_loader, counts the occurrences of each class, 
    and computes class weights for imbalanced classification.

    Args:
        train_loader (DataLoader): PyTorch DataLoader with training data.
        num_classes (int): Number of unique classes. Default is 2 (binary classification).

    Returns:
        class_weights (torch.Tensor): Computed class weights (to be used in BCEWithLogitsLoss).
    """
    class_counts = torch.zeros(num_classes)  # Initialize count array

    # Iterate through the DataLoader
    for _, targets in train_loader:
        targets = targets.view(-1, num_classes)  # Flatten batch and sequence length, keep num_classes

        # Sum occurrences across batch & sequence dimension
        class_counts += targets.sum(dim=0)  # Count occurrences for each class

    # Compute inverse class frequency
    class_weights = 1.0 / class_counts
    class_weights = class_weights / class_weights.sum()  # Normalize (optional)

    print(f"Class counts: {class_counts.tolist()}")
    print(f"Class weights: {class_weights.tolist()}")

    return class_weights


In [None]:
# With 10 rounds of optimisation, this tkes roughly 30 min on my laptop

import torch.optim as optim

# Define loss and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class_weights = compute_class_weights(train_loader)
criterion = torch.nn.CrossEntropyLoss(weight=class_weights.to(device))  # For regression; use CrossEntropyLoss for classification
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Assume `train_loader` is a DataLoader yielding batches of (input, target)
num_epochs = 1

model.train()
for epoch in range(num_epochs):
    epoch_loss = 0.0
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        # inputs should have shape: [batch_size, seq_length, input_size]
        # targets should have shape: [batch_size, output_size] or [batch_size] (depending on your problem)
        
        # Zero the parameter gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
    
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss/len(train_loader):.4f}")


In [35]:
import torch
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score



def evaluate_test_metrics(model, test_loader, criterion, device, threshold=0.5):
    """
    Evaluates the test loss and F1-score of a trained model.

    Parameters:
    - model: The trained PyTorch model.
    - test_loader: DataLoader for the test dataset.
    - criterion: Loss function (e.g., nn.CrossEntropyLoss, nn.BCELoss).
    - device: The device ('cuda' or 'cpu').
    - threshold: Decision threshold for binary classification (default=0.5).

    Returns:
    - Average test loss
    - F1-score
    """
    model.eval()  # Set model to evaluation mode
    test_loss = 0.0
    total_samples = 0
    all_targets = []
    all_predictions = []

    with torch.no_grad():  # Disable gradient calculation for efficiency
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)  # Move data to device

            # Forward pass
            outputs = model(inputs)

            # Compute loss
            loss = criterion(outputs, targets)
            test_loss += loss.item() * inputs.size(0)  # Multiply by batch size
            total_samples += inputs.size(0)

            # Convert model outputs to predictions
            if outputs.shape[-1] == 1:  # Binary classification case
                preds = (outputs.squeeze(-1) > threshold).int()
            else:  # Multi-class classification (if one-hot, convert targets)
                preds = torch.argmax(outputs, dim=-1)

                # Convert one-hot encoded targets to class indices
                if targets.ndim > 1 and targets.shape[-1] > 1:  # One-hot case
                    targets = torch.argmax(targets, dim=-1)

            all_targets.extend(targets.cpu().numpy())  # Convert to numpy
            all_predictions.extend(preds.cpu().numpy())  # Convert to numpy

    # Compute average loss
    avg_test_loss = test_loss / total_samples

    # Compute F1-score
    f1 = f1_score(all_targets, all_predictions, average='macro')  # Use 'macro' for class balance
    auc = roc_auc_score(all_targets, all_predictions)


    return avg_test_loss, f1, auc

# Example Usage:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
test_loss, test_f1, auc = evaluate_test_metrics(model, test_loader, criterion, device)

print(f"Test Loss: {test_loss:.4f}")
print(f"F1 Score: {test_f1:.4f}")
print(f"AUC-ROC: {auc:.4f}")


  sample = torch.tensor(sample, dtype=torch.float32)


Test Loss: 0.0963
F1 Score: 0.4286
AUC-ROC: 0.5000


In [18]:
len(dataset)

300

In [31]:
for batch_idx, (inputs, targets) in enumerate(train_loader):
    if len(targets[0])!=6000:
        print('Still an array that has wrong length', )

  sample = torch.tensor(sample, dtype=torch.float32)
