In [11]:
import torch
import os
import einops
import torch.nn.functional as F
import numpy as np
from torch_geometric.data import Data
from tqdm import tqdm
import pandas as pd

import lovely_tensors as lt
lt.monkey_patch()

In [3]:
# Load the full replicated measurement table; the bare name on the last line
# renders it as the cell output.
parquet_path = 'data/data_replicated_updated_5.parquet'
dataframe = pd.read_parquet(parquet_path, engine='pyarrow')
dataframe

Unnamed: 0,Index,Timestamp,scenario,PNode_2,PNode_9,PNode_10,PNode_11,PNode_12,PNode_13,PNode_21,...,dNode_10,dNode_11,dNode_12,dNode_13,dNode_21,dNode_22,dNode_23,dNode_31,dNode_32,Label
0,1,1/1/2017 0:00,1,36.576,0.0,92.944,89.658,82.317,83.756,84.905,...,0.0,18.0,21.6,14.4,18.0,28.8,18.0,18.0,10.8,0.0
1,2,1/1/2017 0:30,1,37.265,0.0,93.525,90.288,83.008,84.478,85.796,...,0.0,18.0,18.0,10.8,18.0,28.8,14.4,14.4,10.8,0.0
2,3,1/1/2017 1:00,1,37.985,0.0,94.188,91.007,83.731,85.228,86.611,...,0.0,14.4,14.4,10.8,18.0,25.2,14.4,14.4,10.8,0.0
3,4,1/1/2017 1:30,1,38.741,0.0,94.841,91.715,84.487,85.989,87.385,...,0.0,10.8,14.4,10.8,14.4,25.2,10.8,14.4,7.2,0.0
4,5,1/1/2017 2:00,1,39.505,0.0,95.488,92.417,85.252,86.766,88.313,...,0.0,10.8,10.8,7.2,14.4,21.6,10.8,10.8,7.2,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17519995,17516,12/31/2017 21:30,1000,45.720,0.0,114.965,111.537,113.573,114.675,112.573,...,0.0,32.4,36.0,25.2,32.4,46.8,36.0,25.2,25.2,0.0
17519996,17517,12/31/2017 22:00,1000,45.720,0.0,116.194,113.044,115.184,116.285,114.204,...,0.0,32.4,28.8,25.2,28.8,46.8,32.4,25.2,25.2,0.0
17519997,17518,12/31/2017 22:30,1000,45.720,0.0,118.481,115.855,118.146,119.333,117.406,...,0.0,28.8,28.8,21.6,25.2,39.6,28.8,21.6,21.6,0.0
17519998,17519,12/31/2017 23:00,1000,45.720,0.0,118.980,116.469,118.782,119.996,118.032,...,0.0,28.8,28.8,21.6,25.2,43.2,25.2,21.6,18.0,0.0


In [40]:
# Keep only the columns whose name starts with "PNode"; these form the
# node feature matrix.
pnode_columns = [column for column in dataframe.columns if column.startswith("PNode")]
df_node_features = dataframe[pnode_columns]
print(df_node_features)

# Same values as a tensor: one row per dataframe row, one column per PNode column.
node_features = torch.tensor(df_node_features.to_numpy())
print(node_features)

          PNode_2  PNode_9  PNode_10  PNode_11  PNode_12  PNode_13  PNode_21  \
0          36.576      0.0    92.944    89.658    82.317    83.756    84.905   
1          37.265      0.0    93.525    90.288    83.008    84.478    85.796   
2          37.985      0.0    94.188    91.007    83.731    85.228    86.611   
3          38.741      0.0    94.841    91.715    84.487    85.989    87.385   
4          39.505      0.0    95.488    92.417    85.252    86.766    88.313   
...           ...      ...       ...       ...       ...       ...       ...   
17519995   45.720      0.0   114.965   111.537   113.573   114.675   112.573   
17519996   45.720      0.0   116.194   113.044   115.184   116.285   114.204   
17519997   45.720      0.0   118.481   115.855   118.146   119.333   117.406   
17519998   45.720      0.0   118.980   116.469   118.782   119.996   118.032   
17519999   45.720      0.0   121.005   118.966   121.407   122.657   120.800   

          PNode_22  PNode_23  PNode_31 

In [46]:
# Map each raw node id (the integer after the "_" in a feature column name)
# to that column's position in the feature matrix.
node_map = {
    int(column.split("_")[1]): position
    for position, column in enumerate(df_node_features.columns)
}

In [47]:
# Display the raw-node-id -> feature-column-index lookup for inspection.
node_map

{2: 0, 9: 1, 10: 2, 11: 3, 12: 4, 13: 5, 21: 6, 22: 7, 23: 8, 31: 9, 32: 10}

In [80]:
# Scenario id of every dataframe row. The double brackets keep a 2-D
# (rows, 1) layout instead of a flat vector.
scenario_frame = dataframe[["scenario"]]
scenarios = torch.tensor(scenario_frame.to_numpy())

In [81]:
# Dataset dimensions derived from the tensors loaded above.
NUM_SCENARIOS = len(scenarios.unique())
NUM_NODES = node_features.shape[1]

# The per-scenario timestep count is obtained by integer division, which would
# silently truncate if the rows do not split evenly across scenarios. Fail
# loudly instead, because downstream label tensors are sized from this value
# and would otherwise end up with a different row count than node_features.
total_rows = node_features.shape[0]
assert total_rows % NUM_SCENARIOS == 0, (
    f"{total_rows} rows cannot be split evenly across {NUM_SCENARIOS} scenarios"
)
NUM_TIMESAMPS = total_rows // NUM_SCENARIOS

print(NUM_SCENARIOS, NUM_NODES, NUM_TIMESAMPS)

1000 11 17520


In [82]:
# Root directory that holds one sub-folder per simulated scenario.
data_path = "data/Net1_CMH"

# Keep only the entries whose name contains "Scenario", then point at the
# "Leaks" sub-directory inside each of them.
folders = os.listdir(data_path)
scenario_folders = [name for name in folders if "Scenario" in name]
leaks = [os.path.join(data_path, name, "Leaks") for name in scenario_folders]

In [83]:
# Binary leak labels indexed as (scenario, node, timestep); entries stay 0
# unless a leak demand file reports a positive value for that node and step.
y = torch.zeros(NUM_SCENARIOS, NUM_NODES, NUM_TIMESAMPS)

for leak in tqdm(leaks):
    # List the directory once and reuse the result for both the emptiness
    # check and the file selection (the listing used to be done twice).
    leak_files = os.listdir(leak)
    if not leak_files:
        continue  # nothing recorded for this scenario: labels remain all zero

    # The scenario number is the text after the last "-" of the folder that
    # contains "Leaks"; it is 1-based on disk and 0-based as a row of y.
    scenario_folder = os.path.basename(os.path.dirname(leak))
    scenario = int(scenario_folder.split("-")[-1]) - 1

    for file_name in leak_files:
        if "demand" not in file_name:
            continue
        # The raw node id is the second "_"-separated token of the file name;
        # node_map turns it into the matching feature column index.
        node = node_map[int(file_name.split("_")[1])]
        leak_demand = pd.read_csv(os.path.join(leak, file_name))
        # A timestep is labelled as leaking whenever the recorded value is > 0.
        y[scenario, node, :] = torch.tensor(leak_demand["Value"].to_numpy()) > 0.0

print(y)
# Flatten to one row per (scenario, timestep) pair with scenarios as the outer
# index.
# NOTE(review): this only lines up with node_features if the dataframe rows
# are ordered by scenario first and time second — confirm against the source.
y = einops.rearrange(y, "s n t -> (s t) n")
print(y)

  5%|▍         | 48/1000 [00:00<00:02, 472.15it/s]

100%|██████████| 1000/1000 [00:01<00:00, 539.83it/s]


tensor[1000, 11, 17520] n=192720000 (0.7Gb) x∈[0., 1.000] μ=0.014 σ=0.115
tensor[17520000, 11] n=192720000 (0.7Gb) x∈[0., 1.000] μ=0.014 σ=0.115


In [84]:
# Read the network's link table; each row names the two nodes a link connects.
edge_table = pd.read_csv("data/Net1_CMH.csv")

# Select the two endpoint columns *before* remapping so that any other column
# present in the CSV is never pushed through node_map (which would raise on
# values that are not node ids). The lookup converts raw node ids into
# feature column indices.
edge_endpoints = edge_table[["Node1", "Node2"]].map(lambda raw_id: node_map[int(raw_id)])
edge_index = torch.tensor(edge_endpoints.to_numpy())

# NOTE(review): the tensor is laid out as (num_edges, 2), whereas
# torch_geometric's convention for edge_index is (2, num_edges) — confirm that
# consumers of the saved dataset transpose it.
# `.p` is the plain-repr accessor added by lovely_tensors' monkey patch.
edge_index.p

tensor([[ 2,  3],
        [ 3,  4],
        [ 4,  5],
        [ 6,  7],
        [ 7,  8],
        [ 9, 10],
        [ 0,  4],
        [ 3,  6],
        [ 4,  7],
        [ 5,  8],
        [ 6,  9],
        [ 7, 10]])

In [89]:
# Bundle every tensor produced above under a single dictionary so the whole
# dataset can be written out in one call.
torch_dataset = dict(
    scenario=scenarios,
    node_features=node_features,
    edge_index=edge_index,
    y=y,
)

In [90]:
# Display the assembled dataset dictionary as a final check before saving.
torch_dataset

{'scenario': tensor[17520000, 1] i64 0.1Gb x∈[1, 1000] μ=500.500 σ=288.675,
 'node_features': tensor[17520000, 11] f64 n=192720000 (1.4Gb) x∈[-2.779, 133.341] μ=88.706 σ=40.939,
 'edge_index': tensor[12, 2] i64 n=24 x∈[0, 10] μ=5.708 σ=2.612,
 'y': tensor[17520000, 11] n=192720000 (0.7Gb) x∈[0., 1.000] μ=0.014 σ=0.115}

In [91]:
# Write the assembled dataset dictionary to disk.
output_path = "data/graph_olddata_torch.pt"
torch.save(torch_dataset, output_path)