In [1]:
import torch
import numpy as np
import pandas as pd
from torch_geometric.data import Data
from torch_geometric.nn import GATConv, global_mean_pool
from torch_geometric.loader import DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, roc_curve, precision_recall_curve
from torch.nn.utils import clip_grad_norm_
import matplotlib.pyplot as plt
from torch_geometric.nn import BatchNorm
from tqdm import tqdm
from sklearn.metrics import confusion_matrix
import seaborn as sns
from scipy.stats import skew, kurtosis, linregress
from scipy.signal import find_peaks
import einops

import lovely_tensors as lt
lt.monkey_patch()



In [2]:
#%% Data Loading and Preprocessing
print("Loading and preprocessing data...")
# The file holds a plain dict of tensors (see the summary printed by the next
# cell), so restrict unpickling to tensor payloads: this avoids running arbitrary
# pickled code and addresses the torch.load warning recorded in this cell's output.
graph_data = torch.load('data/graph_data_torch.pt', weights_only=True)

Loading and preprocessing data...


  graph_data = torch.load('data/graph_data_torch.pt')


In [3]:
# Display the loaded dict: one summary line per entry (shape, dtype, value range).
graph_data

{'scenario': tensor[17520000, 1] i32 67Mb x∈[1, 1000] μ=500.500 σ=288.675,
 'node_features': tensor[17520000, 32] n=560640000 (2.1Gb) x∈[0., 69.985] μ=61.383 σ=11.861,
 'edge_attr': tensor[17520000, 34] n=595680000 (2.2Gb) x∈[-3.726e+03, 1.344e+04] μ=835.518 σ=1.509e+03,
 'edge_index': tensor[34, 2] i64 n=68 x∈[0, 31] μ=15.471 σ=9.229,
 'y': tensor[17520000, 32] i64 n=560640000 (4.2Gb) x∈[0, 1] μ=0.008 σ=0.087}

In [4]:
# Make the graph bidirectional: append a reversed copy of every edge and reuse the
# forward edge attributes for the reversed edges.
# NOTE(review): attributes are copied unchanged for the reverse direction; confirm
# that is intended for signed quantities.
#
# This step mutates graph_data in place, so it is guarded by a flag stored next to
# the data: re-running the cell (or running cells out of order) must not append the
# reversed edges a second time. Reloading graph_data drops the flag again.
if not graph_data.get("edges_symmetrized", False):
    # flip(1) swaps the two endpoints of every [edge, 2] row in one call.
    edges_reversed = graph_data["edge_index"].reshape(-1, 2).flip(1)
    graph_data["edge_attr"] = torch.cat([graph_data["edge_attr"], graph_data["edge_attr"]], dim=1)
    graph_data["edge_index"] = torch.cat([graph_data["edge_index"], edges_reversed], dim=0)
    graph_data["edges_symmetrized"] = True
edge_index = graph_data["edge_index"]

In [28]:
# Windowing configuration for the pooled-statistics features.
WINDOW_SIZE, SUBSAMPLE_FACTOR = (
    12,  # time steps grouped into one window
    10,  # keep every 10th window (reduced subsampling for more windows)
)
# The node feature tensor has one column per node.
num_nodes = graph_data['node_features'].size(1)

def preprocess_data(features, window_size, subsample):
    """Pool a [time, channels] tensor into per-window summary statistics.

    The time axis (dim 0) is cut into consecutive, non-overlapping windows of
    ``window_size`` steps (a trailing partial window is dropped), every
    ``subsample``-th window is kept, and each kept window is reduced to its
    mean, max, min and unbiased standard deviation per channel.

    NOTE: later cells of this notebook define other functions with the same
    name; re-run this cell before using the pooled variant.

    Args:
        features: 2-D tensor with time along dim 0 and channels along dim 1.
        window_size: number of time steps per window.
        subsample: stride over windows; 1 keeps every window.

    Returns:
        Float tensor of shape [kept_windows, 4 * channels], laid out as four
        consecutive blocks [mean | max | min | std], each ``channels`` wide.
        The std block is NaN when ``window_size`` is 1.
    """
    # step == size, so the windows do not overlap; unfold returns a
    # [windows, channels, window_size] view without copying.
    windows = features.unfold(0, window_size, window_size)

    # Drop the windows that will not be returned *before* pooling: every output
    # row depends only on its own window, so the result is unchanged while the
    # float conversion and the four reductions touch 1/subsample of the data.
    windows = windows[::subsample].float()

    pooled = (
        windows.mean(dim=2),        # mean pooling
        windows.max(dim=2).values,  # max pooling
        windows.min(dim=2).values,  # min pooling
        windows.std(dim=2),         # standard deviation pooling
    )
    # Concatenate along the feature axis.
    return torch.cat(pooled, dim=1)

In [29]:
# Pooled node/edge features, plus one label row and one scenario id per kept window.
node_features = preprocess_data(graph_data['node_features'], WINDOW_SIZE, SUBSAMPLE_FACTOR)
edge_features = preprocess_data(graph_data['edge_attr'], WINDOW_SIZE, SUBSAMPLE_FACTOR)

# Window label per node: the maximum of the per-step labels inside the window.
window_labels = (
    graph_data['y']
    .unfold(0, WINDOW_SIZE, WINDOW_SIZE)
    .max(dim=2)
    .values[::SUBSAMPLE_FACTOR, :]
)
# Window scenario: the most frequent scenario id among the window's time steps.
window_scenarios = (
    graph_data['scenario']
    .unfold(0, WINDOW_SIZE, WINDOW_SIZE)
    .mode(dim=2)
    .values[::SUBSAMPLE_FACTOR, :]
)

# One summary line per tensor, in the same order as before.
print(node_features, edge_features, window_labels, window_scenarios, sep="\n")

tensor[146000, 128] n=18688000 (71Mb) x∈[0., 69.984] μ=46.181 σ=27.628
tensor[146000, 272] n=39712000 (0.1Gb) x∈[-3.676e+03, 1.337e+04] μ=687.584 σ=1.380e+03
tensor[146000, 32] i64 n=4672000 (36Mb) x∈[0, 1] μ=0.008 σ=0.088
tensor[146000, 1] i32 0.6Mb x∈[1, 1000] μ=500.502 σ=288.676


In [13]:
# Persist the pooled features. The file name is derived from the config constants
# so it cannot drift from the window/subsample settings that actually produced the
# tensors (with WINDOW_SIZE=12 and SUBSAMPLE_FACTOR=10 this is the same path as before).
processed_data = {
    'node_features': node_features,
    'window_labels': window_labels,
    "edge_features": edge_features,
    'window_scenarios': window_scenarios,
    # Stored transposed ([2, num_edges]) and contiguous.
    "edge_index": graph_data['edge_index'].long().t().contiguous(),
}
torch.save(processed_data, f"data/processed_data_W{WINDOW_SIZE}_S{SUBSAMPLE_FACTOR}.pt")

## More Features

In [None]:
# Windowing configuration for the extended per-window statistics.
WINDOW_SIZE, SUBSAMPLE_FACTOR = (
    12,  # time steps grouped into one window
    10,  # keep every 10th window (reduced subsampling for more windows)
)
# Columns of the node feature tensor / rows of the edge list.
# Assumes edge_index is still laid out as [num_edges, 2] at this point.
num_nodes = graph_data['node_features'].size(1)
num_edges = graph_data['edge_index'].size(0)

def preprocess_data(features, window_size, subsample):
    """Summarise a 1-D time series with twelve statistics per window.

    The series is cut into consecutive, non-overlapping windows of
    ``window_size`` samples (a trailing partial window is dropped) and every
    ``subsample``-th window is kept. All statistics are computed for all kept
    windows at once instead of with one Python/SciPy call per window.

    NOTE: other cells of this notebook define functions with the same name;
    re-run this cell before using the extended-statistics variant.

    Args:
        features: 1-D CPU tensor holding one time series.
        window_size: samples per window; at least 2 (std and slope need it).
        subsample: stride over windows; 1 keeps every window.

    Returns:
        float32 tensor of shape [kept_windows, 12] with the columns, in order:
        mean, max, min, std (unbiased), median, variance (unbiased), skewness,
        kurtosis (Fisher), lag-0 autocorrelation (mean of squares),
        linear-trend slope, zero-crossing count, peak count.

    Raises:
        ValueError: if ``features`` is not 1-D or ``window_size`` is below 2.
    """
    if features.dim() != 1:
        raise ValueError(f"expected a 1-D series, got shape {tuple(features.shape)}")
    if window_size < 2:
        raise ValueError("window_size must be at least 2 to compute std and trend")

    print("Subsampling the windows...")
    # step == size, so windows do not overlap. Windows that will not be returned
    # are dropped first: every output row depends only on its own window.
    windows = features.unfold(0, window_size, window_size)[::subsample].float().contiguous()

    print("Extracting mean, max, min, and std pooling within each window...")
    mean_values = windows.mean(dim=1)
    max_values = windows.max(dim=1).values
    min_values = windows.min(dim=1).values
    std_values = windows.std(dim=1)

    print("Extracting median, variance from each window...")
    median_values = windows.median(dim=1).values  # lower median for even window sizes
    variance_values = windows.var(dim=1)

    print("Extracting skewness and kurtosis from each window...")
    # scipy reduces along axis=1, i.e. one value per window.
    windows_np = windows.numpy()
    skewness_values = torch.as_tensor(skew(windows_np, axis=1), dtype=torch.float32)
    kurtosis_values = torch.as_tensor(kurtosis(windows_np, axis=1), dtype=torch.float32)

    print("Extracting temporal features (autocorrelation, linear trend, zero-crossings, peaks)...")
    # Autocorrelation at lag 0 divided by the window length is the mean of squares.
    autocorrelation_values = windows.pow(2).mean(dim=1)

    # Least-squares slope against t = 0..window_size-1 in closed form:
    # sum((t - mean(t)) * (x - mean(x))) / sum((t - mean(t))^2).
    time_axis = torch.arange(window_size, dtype=windows.dtype)
    centered_time = time_axis - time_axis.mean()
    centered_windows = windows - mean_values.unsqueeze(1)
    linear_trend_values = (centered_windows @ centered_time) / centered_time.pow(2).sum()

    # A zero-crossing is a pair of neighbouring samples with opposite signs.
    zero_crossings_values = ((windows[:, :-1] * windows[:, 1:]) < 0).sum(dim=1).float()

    # Peak = a rise, optionally followed by a flat stretch, followed by a fall
    # (window borders never count). Walk the window_size - 1 steps once while
    # remembering the direction of the last non-flat step of every window.
    step_sign = torch.sign(windows[:, 1:] - windows[:, :-1])
    last_move = torch.zeros_like(mean_values)
    peaks_values = torch.zeros_like(mean_values)
    for k in range(window_size - 1):
        move = step_sign[:, k]
        peaks_values += ((move < 0) & (last_move > 0)).to(peaks_values.dtype)
        last_move = torch.where(move != 0, move, last_move)

    print("Combining all features into a single representation...")
    # One column per statistic; every tensor above has shape [kept_windows].
    return torch.stack(
        (
            mean_values, max_values, min_values, std_values,
            median_values, variance_values, skewness_values,
            kurtosis_values, autocorrelation_values,
            linear_trend_values, zero_crossings_values,
            peaks_values,
        ),
        dim=1,
    )

In [None]:
# Process features and labels
num_steps = graph_data['node_features'].shape[0]
assert graph_data['edge_attr'].shape[0] == num_steps, "node and edge series must have the same length"
# Each series is flattened back to back below, so windows stay inside one series
# only if the series length is a multiple of the window size, and the global window
# subsampling stays aligned across series only if the per-series window count is a
# multiple of the subsample factor.
assert num_steps % WINDOW_SIZE == 0, "series length must be a multiple of WINDOW_SIZE"
assert (num_steps // WINDOW_SIZE) % SUBSAMPLE_FACTOR == 0, "windows per series must be a multiple of SUBSAMPLE_FACTOR"

print("Extracting Node Fetures")
# Series-major flattening "(n l)": the full time series of one node is contiguous,
# so every window holds WINDOW_SIZE consecutive time steps of a single node.
# (Flattening as "(l n)" would put different nodes of the same time step in a window.)
node_features = einops.rearrange(graph_data['node_features'], "l n -> (n l)")
node_features = preprocess_data(node_features, WINDOW_SIZE, SUBSAMPLE_FACTOR)
node_features = einops.rearrange(node_features, "(n l) f -> l n f", n=num_nodes)
print("Extracting Edge Fetures")
edge_features = einops.rearrange(graph_data['edge_attr'], "l e -> (e l)")
# Pass the flattened series, not the raw [time, edges] tensor.
edge_features = preprocess_data(edge_features, WINDOW_SIZE, SUBSAMPLE_FACTOR)
edge_features = einops.rearrange(edge_features, "(e l) f -> l e f", e=num_edges)
# Window label per node: maximum of the per-step labels inside the window.
window_labels = graph_data['y'].unfold(0, WINDOW_SIZE, WINDOW_SIZE).max(dim=2).values[::SUBSAMPLE_FACTOR, :]
# Window scenario: most frequent scenario id among the window's time steps.
window_scenarios = graph_data['scenario'].unfold(0, WINDOW_SIZE, WINDOW_SIZE).mode(dim=2).values[::SUBSAMPLE_FACTOR, :]
print(node_features)
print(edge_features)
print(window_labels)
print(window_scenarios)

## Windowing

In [44]:
# Windowing configuration for the raw (unpooled) window features.
WINDOW_SIZE, SUBSAMPLE_FACTOR = (
    12,  # time steps grouped into one window
    1,   # keep every window (reduced subsampling for more windows)
)
# The node feature tensor has one column per node.
num_nodes = graph_data['node_features'].size(1)

def preprocess_data(features, window_size, subsample):
    """Cut a [time, channels] tensor into flattened raw windows.

    The time axis (dim 0) is split into consecutive, non-overlapping windows of
    ``window_size`` steps (a trailing partial window is dropped) and every
    ``subsample``-th window is kept.

    NOTE: earlier cells of this notebook define other functions with the same
    name; re-run this cell before using the raw-window variant.

    Args:
        features: 2-D tensor with time along dim 0 and channels along dim 1.
        window_size: number of time steps per window.
        subsample: stride over windows; 1 keeps every window.

    Returns:
        Float tensor of shape [kept_windows, channels * window_size]. Each row
        is channel-major: the ``window_size`` steps of channel 0, then those of
        channel 1, and so on.

    Raises:
        ValueError: if ``features`` is not 2-D.
    """
    if features.dim() != 2:
        raise ValueError(f"expected a [time, channels] tensor, got shape {tuple(features.shape)}")

    # step == size, so windows do not overlap; the result is a
    # [windows, channels, window_size] view. Subsample it before the float
    # conversion and the flattening copy so only the kept windows are materialised.
    windows = features.unfold(0, window_size, window_size)[::subsample]

    # Merge the channel and in-window axes: "l n w -> l (n w)".
    return windows.float().flatten(start_dim=1)

In [45]:
# Raw windowed node/edge features, plus one label row and one scenario id per kept window.
node_features = preprocess_data(graph_data['node_features'], WINDOW_SIZE, SUBSAMPLE_FACTOR)
edge_features = preprocess_data(graph_data['edge_attr'], WINDOW_SIZE, SUBSAMPLE_FACTOR)

# Window label per node: the maximum of the per-step labels inside the window.
window_labels = (
    graph_data['y']
    .unfold(0, WINDOW_SIZE, WINDOW_SIZE)
    .max(dim=2)
    .values[::SUBSAMPLE_FACTOR, :]
)
# Window scenario: the most frequent scenario id among the window's time steps.
window_scenarios = (
    graph_data['scenario']
    .unfold(0, WINDOW_SIZE, WINDOW_SIZE)
    .mode(dim=2)
    .values[::SUBSAMPLE_FACTOR, :]
)

# One summary line per tensor, in the same order as before.
print(node_features, edge_features, window_labels, window_scenarios, sep="\n")

tensor[292000, 384] n=112128000 (0.4Gb) x∈[0., 69.985] μ=61.388 σ=11.861
tensor[292000, 816] n=238272000 (0.9Gb) x∈[-3.694e+03, 1.338e+04] μ=835.184 σ=1.508e+03
tensor[292000, 32] i64 n=9344000 (71Mb) x∈[0, 1] μ=0.008 σ=0.088
tensor[292000, 1] i32 1.1Mb x∈[1, 1000] μ=500.502 σ=288.675


In [43]:
# Persist the raw windowed features. The file name is derived from the config
# constants so it cannot drift from the window/subsample settings that actually
# produced the tensors (with WINDOW_SIZE=12 and SUBSAMPLE_FACTOR=1 this is the same
# path as before).
processed_data = {
    'node_features': node_features,
    'window_labels': window_labels,
    "edge_features": edge_features,
    'window_scenarios': window_scenarios,
    # Stored transposed ([2, num_edges]) and contiguous.
    "edge_index": graph_data['edge_index'].long().t().contiguous(),
}
torch.save(processed_data, f"data/processed_windowed_data_W{WINDOW_SIZE}_S{SUBSAMPLE_FACTOR}.pt")