In [1]:
import os
import numpy as np
import pandas as pd

### View sensor graph information

In [2]:
# Folder that holds the sensor-graph files inspected in this section.
local_path = "./sensor_graph"

# List the folder so the available files show up as the cell output.
sensor_graph_files = os.listdir(local_path)
sensor_graph_files

['adj_mx.pkl',
 'adj_mx_bay.pkl',
 'distances_bay_2017.csv',
 'distances_la_2012.csv',
 'graph_sensor_ids.txt',
 'graph_sensor_locations.csv',
 'graph_sensor_locations_bay.csv']

In [3]:
# Sensor-to-sensor distance table for LA (columns: from, to, cost).
csv_file = "distances_la_2012.csv"
csv_path = os.path.join(local_path, csv_file)

# Read the two id columns as strings so they compare equal to the string ids
# parsed from the sensor id text file.
id_dtypes = {'from': 'str', 'to': 'str'}
distance_df = pd.read_csv(csv_path, dtype=id_dtypes)

# Preview the first ten rows.
distance_df.head(10)

Unnamed: 0,from,to,cost
0,1201054,1201054,0.0
1,1201054,1201066,2610.9
2,1201054,1201076,2822.7
3,1201054,1201087,2911.5
4,1201054,1201100,7160.1
5,1201054,1201112,2656.5
6,1201054,1201118,3417.2
7,1201054,1201125,3659.9
8,1201054,1201145,6168.4
9,1201054,1201159,2651.8


In [4]:
# Sanity check: count the missing entries in the distance ("cost") column.
missing_cost_count = distance_df['cost'].isna().sum()
missing_cost_count

0

In [5]:
sensor_id_file = 'graph_sensor_ids.txt'
sensor_id_path = os.path.join(local_path, sensor_id_file)

# Parse the comma-separated sensor ids from the text file
with open(sensor_id_path) as id_handle:
    sensor_ids = id_handle.read().strip().split(',')

# Number of sensors = side length of the square matrices built below
num_sensors = len(sensor_ids) # 207 sensors

# Start from an all-infinite distance matrix: pairs that never show up in the
# distance table stay at inf and therefore end up with a weight of exactly 0
dist_mat = np.full((num_sensors, num_sensors), np.inf, dtype=np.float32)

# Position of every sensor id along the matrix axes
sensor_id_to_idx = {sensor_id: idx for idx, sensor_id in enumerate(sensor_ids)}

# Copy each listed distance into its (from, to) cell; rows that mention a
# sensor outside the id list are ignored
for record in distance_df.values:
    src_id, dst_id = record[0], record[1]
    if src_id in sensor_id_to_idx and dst_id in sensor_id_to_idx:
        dist_mat[sensor_id_to_idx[src_id], sensor_id_to_idx[dst_id]] = record[2]

# The standard deviation of the finite distances serves as the kernel width (theta)
finite_mask = ~np.isinf(dist_mat)
distances = dist_mat[finite_mask].flatten()
std = distances.std()

# Gaussian kernel weight: exp(-(distance / std)^2)
scaled_dist = dist_mat / std
adj_mat = np.exp(-np.square(scaled_dist))

# Sparsify: every weight below the threshold is set to zero
normalized_k = 0.1
adj_mat[adj_mat < normalized_k] = 0
adj_mat

array([[1.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 1.       , 0.3909554, ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.7174379, 1.       , ..., 0.       , 0.       ,
        0.       ],
       ...,
       [0.       , 0.       , 0.       , ..., 1.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 1.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        1.       ]], dtype=float32)

### View sequence data METR-LA

In [7]:
# Point at the working directory, where the sequence data files live.
# NOTE: this rebinds local_path, which pointed at "./sensor_graph" in the
# sensor-graph section above.
local_path = "./"

# List the directory so its contents show up as the cell output.
workdir_entries = os.listdir(local_path)
workdir_entries

['gen_adj_mx.py',
 'glimsp_data.ipynb',
 'METR-LA',
 'metr-la.h5',
 'PEMS-BAY',
 'pems-bay.h5',
 'sensor_graph']

In [9]:
# HDF5 file with the METR-LA readings (one row per timestamp, one column per sensor).
input_data_file = "metr-la.h5"
input_data_path = os.path.join(local_path, input_data_file)

# Read METR-LA data from 2012-03-01 00:00:00 - 2012-06-27 23:55:00
df = pd.read_hdf(input_data_path)

# Preview the first five rows.
df.head(5)

Unnamed: 0,773869,767541,767542,717447,717446,717445,773062,767620,737529,717816,...,772167,769372,774204,769806,717590,717592,717595,772168,718141,769373
2012-03-01 00:00:00,64.375,67.625,67.125,61.5,66.875,68.75,65.125,67.125,59.625,62.75,...,45.625,65.5,64.5,66.428571,66.875,59.375,69.0,59.25,69.0,61.875
2012-03-01 00:05:00,62.666667,68.555556,65.444444,62.444444,64.444444,68.111111,65.0,65.0,57.444444,63.333333,...,50.666667,69.875,66.666667,58.555556,62.0,61.111111,64.444444,55.888889,68.444444,62.875
2012-03-01 00:10:00,64.0,63.75,60.0,59.0,66.5,66.25,64.5,64.25,63.875,65.375,...,44.125,69.0,56.5,59.25,68.125,62.5,65.625,61.375,69.857143,62.0
2012-03-01 00:15:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2012-03-01 00:20:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# Rows of the frame are timestamps, columns are sensors.
num_samples = df.shape[0]
num_nodes = df.shape[1]
(num_samples, num_nodes)

(34272, 207)

In [34]:
# Give the readings a trailing feature axis: (time, sensor) -> (time, sensor, 1).
data = df.values[..., np.newaxis]
feature_list = [data]

# Time of day as a fraction of a day: elapsed time since midnight of the same
# date, divided by one day.
timestamps = df.index.values
day_start = timestamps.astype("datetime64[D]")
time_idx = (timestamps - day_start) / np.timedelta64(1, "D")

In [40]:
# Both arrays should have the same length along the time axis.
(time_idx.shape, data.shape)

((34272,), (34272, 207, 1))

In [41]:
# Broadcast the per-timestamp time-of-day value to every sensor:
# tile yields (1, num_nodes, num_timestamps); the transpose reorders it to
# (num_timestamps, num_nodes, 1) so it lines up with the readings array.
time_in_day = np.tile(time_idx, [1, num_nodes, 1]).transpose((2, 1, 0))

# Rebuild the list from its first entry (the raw readings) instead of appending:
# a bare append would add one more copy of this feature every time the cell is
# re-run, silently growing the feature dimension of the concatenated array.
feature_list = [feature_list[0], time_in_day]

In [43]:
# Join all features along the last axis, one channel per entry of feature_list.
# NOTE: `data` is rebound here from the single-feature readings array to the
# combined multi-feature array.
data = np.concatenate(tuple(feature_list), axis=-1)
data.shape

(34272, 207, 2)

In [49]:
# Window lengths: 12 input steps and 12 target steps per sample.
seq_length_x, seq_length_y = 12, 12

# Offsets relative to the anchor step t: inputs cover t-11 .. t, targets cover
# t+1 .. t+12. np.arange already returns them in ascending order.
x_offsets = np.arange(-(seq_length_x - 1), 1)
y_offsets = np.arange(1, seq_length_y + 1)

# Take the series length from `data` itself rather than from `num_samples`:
# the train/val/test split cell rebinds `num_samples` to the number of windows,
# so re-running this cell afterwards would otherwise silently drop windows.
num_steps = data.shape[0]

# Valid anchors must leave room for the earliest input offset at the start and
# for the latest target offset at the end. If the series is too short the range
# below is empty and np.stack raises ValueError.
min_t = -x_offsets.min()
max_t = num_steps - y_offsets.max() # Exclusive

# One (window, sensor, feature) slice per anchor, stacked along a new first axis.
x = np.stack([data[t + x_offsets, ...] for t in range(min_t, max_t)], axis=0)
y = np.stack([data[t + y_offsets, ...] for t in range(min_t, max_t)], axis=0)

In [51]:
x.shape, y.shape

((34249, 12, 207, 2), (34249, 12, 207, 2))

In [52]:
# Split the windows in order: ~70% train, ~20% test, and the remainder
# (~10%) as validation between them.
# NOTE: this rebinds num_samples from the number of timestamps to the number
# of windows.
num_samples = x.shape[0]
num_test = round(num_samples * 0.2)
num_train = round(num_samples * 0.7)
num_val = num_samples - num_train - num_test

# Slice with explicit boundaries. The test split starts where validation ends
# rather than using x[-num_test:], because a num_test of 0 would turn that
# negative slice into x[0:] and return the whole array.
val_end = num_train + num_val
x_train, y_train = x[:num_train], y[:num_train]
x_val, y_val     = x[num_train:val_end], y[num_train:val_end]
x_test, y_test   = x[val_end:], y[val_end:]

x_train.shape, y_train.shape, x_val.shape, y_val.shape, x_test.shape, y_test.shape

((23974, 12, 207, 2),
 (23974, 12, 207, 2),
 (3425, 12, 207, 2),
 (3425, 12, 207, 2),
 (6850, 12, 207, 2),
 (6850, 12, 207, 2))