In [42]:
import pandas as pd
import h5py
import numpy as np
import os.path

In [43]:
# Relative paths to the two raw HDF5 traffic datasets.
filename1 = '../data/metr-la.h5'
filename2 = '../data/pems-bay.h5'

# Open both files read-only. The handles stay open after this cell; a later
# cell reads metrla_dataset.keys().
# NOTE(review): nothing visible in this notebook closes these handles.
metrla_dataset = h5py.File(filename1, mode='r')
pemsbay_dataset = h5py.File(filename2, mode='r')

In [44]:
# List the top-level keys stored in the METR-LA HDF5 file; the output below
# shows a single key, 'df'.
print(metrla_dataset.keys())  # prints <KeysViewHDF5 ['df']>

<KeysViewHDF5 ['df']>


In [45]:
# Read the object stored under the 'df' key of the METR-LA HDF5 file and
# write it out as 'metr-la.csv' in the current working directory.
with pd.HDFStore(filename1, mode='r') as store:
    metrla_df = store['df']
    metrla_df.to_csv('metr-la.csv')

#### Description of METR-LA dataset
##### Url: https://towardsdatascience.com/build-your-first-graph-neural-network-model-to-predict-traffic-speed-in-20-minutes-b593f8f838e5

The file `metr-la.h5` contains an array of shape [34272, 207], where 34272 is the total number of time steps and 207 is the number of sensors. The array contains only speed data, meaning that the GNN model uses historical speed to predict future speed; no other features (road type, weather, holidays) are involved. The 207 sensors are distributed on roads within the area, and each sensor records the speed every 5 minutes, so one day should have $24 \times (60/5) = 288$ records. The data for one day is therefore simply an array of shape [288, 207], where 288 is the number of time steps in a day and 207 is the number of sensors. Since the data was collected across 4 months, there are a total of 34272 time steps after optional data cleaning. Below are the first 5 rows; the column headers are the sensor IDs and the cell values are speeds.

In [46]:
# Load the METR-LA training archive (.npz). The path is a single plain
# string, so it is passed to np.load directly; os.path.join with only one
# argument would return it unchanged.
metrla_train_npz = np.load('../data/METR-LA/train.npz')

In [47]:
# List the names of the arrays stored in the npz archive
# (the output below shows 'x', 'y', 'x_offsets' and 'y_offsets').
metrla_train_npz.files

['x', 'y', 'x_offsets', 'y_offsets']

In [48]:
# Pull the four arrays out of the npz archive in a single unpacking step.
x_train, y_train, x_offsets, y_offsets = (
    metrla_train_npz[key] for key in ("x", "y", "x_offsets", "y_offsets")
)

# Report the shape of every array, one per line.
# Layout of x: (num_samples, input_length, num_nodes, input_dim)
# Layout of y: (num_samples, output_length, num_nodes, output_dim)
print("x_train shape:", x_train.shape)
print("y_train shape:", y_train.shape)
print("x_offsets shape:", x_offsets.shape)
print("y_offsets shape:", y_offsets.shape)

x_train shape: (23974, 12, 207, 2)
y_train shape: (23974, 12, 207, 2)
x_offsets shape: (12, 1)
y_offsets shape: (12, 1)


In [49]:
# First five nodes of sample 0 at input step 0; the two columns are the two
# input features (the second is 0 for every node shown in the output below).
x_train[0, 0, :5]

array([[64.375,  0.   ],
       [67.625,  0.   ],
       [67.125,  0.   ],
       [61.5  ,  0.   ],
       [66.875,  0.   ]])

In [50]:
# First five nodes of sample 0 at input step 1.
# NOTE(review): the second feature is 3.47e-03 here, which is about 1/288;
# it looks like time of day expressed as a fraction of a day -- TODO confirm.
x_train[0, 1, :5]

array([[6.26666667e+01, 3.47222222e-03],
       [6.85555556e+01, 3.47222222e-03],
       [6.54444444e+01, 3.47222222e-03],
       [6.24444444e+01, 3.47222222e-03],
       [6.44444444e+01, 3.47222222e-03]])

In [51]:
# Slice (rather than index) the first sample so the leading axis is kept:
# with x_train of shape (23974, 12, 207, 2) the result has shape (1, 12, 207, 2).
# NOTE(review): this displays a large array; a smaller slice would keep the
# notebook output readable.
x_train[:1]

array([[[[6.43750000e+01, 0.00000000e+00],
         [6.76250000e+01, 0.00000000e+00],
         [6.71250000e+01, 0.00000000e+00],
         ...,
         [5.92500000e+01, 0.00000000e+00],
         [6.90000000e+01, 0.00000000e+00],
         [6.18750000e+01, 0.00000000e+00]],

        [[6.26666667e+01, 3.47222222e-03],
         [6.85555556e+01, 3.47222222e-03],
         [6.54444444e+01, 3.47222222e-03],
         ...,
         [5.58888889e+01, 3.47222222e-03],
         [6.84444444e+01, 3.47222222e-03],
         [6.28750000e+01, 3.47222222e-03]],

        [[6.40000000e+01, 6.94444444e-03],
         [6.37500000e+01, 6.94444444e-03],
         [6.00000000e+01, 6.94444444e-03],
         ...,
         [6.13750000e+01, 6.94444444e-03],
         [6.98571429e+01, 6.94444444e-03],
         [6.20000000e+01, 6.94444444e-03]],

        ...,

        [[6.35000000e+01, 3.12500000e-02],
         [6.15000000e+01, 3.12500000e-02],
         [6.25000000e+01, 3.12500000e-02],
         ...,
         [5.07500000e

In [52]:
# Show the y_offsets array loaded from train.npz: a (12, 1) column whose
# values run from 1 through 12 (see the output below).
y_offsets

array([[ 1],
       [ 2],
       [ 3],
       [ 4],
       [ 5],
       [ 6],
       [ 7],
       [ 8],
       [ 9],
       [10],
       [11],
       [12]])

In [53]:
# Relative input offsets for a 12-step history window: -11, ..., -1, 0.
# np.arange already yields a single ascending 1-D array, so no concatenate or
# sort step is required.
# NOTE(review): this rebinds x_offsets; the array loaded from train.npz in an
# earlier cell had shape (12, 1), whereas this one has shape (12,).
x_offsets = np.arange(-(12 - 1), 1)

In [54]:
# Show the current value of x_offsets (the output below lists -11 through 0).
x_offsets

array([-11, -10,  -9,  -8,  -7,  -6,  -5,  -4,  -3,  -2,  -1,   0])

In [55]:
# Relative target offsets for a 12-step horizon: 1, 2, ..., 12.
# np.arange output is already ascending, so no sort is required.
y = np.arange(1, 12 + 1)

In [56]:
# Show the current value of y (the output below lists 1 through 12).
y

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])

In [57]:
import torch

# Small demo of where unsqueeze inserts the new size-1 axis.
x = torch.tensor([1, 2, 3])  # shape: torch.Size([3])
y = x.unsqueeze(0)           # axis added in front -> torch.Size([1, 3])
z = x.unsqueeze(1)           # axis added at the end -> torch.Size([3, 1])

# Print the three shapes, one per line, in the order x, y, z.
print(x.shape, y.shape, z.shape, sep="\n")

torch.Size([3])
torch.Size([1, 3])
torch.Size([3, 1])


In [58]:
# A dim of -1 appends the new axis after the last existing one, so the 1-D
# tensor x becomes a column of shape (3, 1).
p = x.unsqueeze(-1)
print(p.shape)

torch.Size([3, 1])


In [59]:
# Display p: a (3, 1) column tensor holding 1, 2, 3 (see the output below).
p

tensor([[1],
        [2],
        [3]])

In [64]:
import sys

# Make the parent directory importable so that `util` (expected to live
# there) can be found. The entry is relative, so it is resolved against the
# current working directory. The membership guard keeps re-runs of this cell
# from appending the same entry to sys.path again.
if '../' not in sys.path:
    sys.path.append('../')

import util

In [65]:
# Load the sensor graph through the project's util.load_adj helper, which is
# unpacked here into three values: the sensor ids, an id -> index mapping
# (judging by the name -- TODO confirm), and the adjacency data.
# NOTE(review): the input is a .pkl file, presumably unpickled inside
# load_adj; only load pickle files that come from a trusted source.
# NOTE(review): 'doubletransition' presumably selects how the adjacency matrix
# is normalised -- confirm against util.load_adj.
sensor_ids, sensor_id_to_ind, adj_mx = util.load_adj('../data/sensor_graph/adj_mx.pkl', 'doubletransition')

In [82]:
# Length of the first element of adj_mx; the output below (207) matches the
# number of sensors in the dataset.
len(adj_mx[0])

207

In [86]:
# Peek at the first five sensor ids (strings, per the output below).
sensor_ids[:5]

['773869', '767541', '767542', '717447', '717446']