In [1]:
#  Copyright 2022 Institute of Advanced Research in Artificial Intelligence (IARAI) GmbH.
#  IARAI licenses this file to You under the Apache License, Version 2.0
#  (the "License"); you may not use this file except in compliance with
#  the License. You may obtain a copy of the License at
#  http://www.apache.org/licenses/LICENSE-2.0
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

In [2]:
import os
import sys

In [3]:
# Make the module imports work by prepending the parent of the current working directory to sys.path.
# Alternatively, set PYTHONPATH=$PWD before launching the notebook server from the repo root folder.
sys.path.insert(0, os.path.abspath("../"))  # noqa:E402

![t4c20logo](../t4c20logo.png)

In [4]:
import re

import pyarrow as pa
import pyarrow.parquet as pq
from IPython.core.display import HTML
from IPython.display import display
from pyarrow.parquet import ParquetFile

import t4c22
from t4c22.t4c22_config import load_basedir

In [5]:
%matplotlib inline
%load_ext autoreload
%load_ext time
%autoreload 2
%autosave 60

display(HTML("<style>.container { width:80% !important; }</style>"))

The time module is not an IPython extension.


Autosaving every 60 seconds


In [6]:
# Load BASEDIR via `load_basedir` using the config file `t4c22_config.json`; change it to your data root.
BASEDIR = load_basedir(fn="t4c22_config.json", pkg=t4c22)

## Data Inventory for Data Specification

In [7]:
def print_inventory(fn):
    """Print a markdown attribute table describing one parquet file.

    The output consists of a level-2 heading with the path of ``fn`` relative
    to ``BASEDIR``, followed by a markdown table with one row per column:
    column name, the value found in the first row of the file, the Arrow data
    type and an empty description cell to be filled in by hand.

    Columns whose name starts with ``__`` are skipped (presumably internal
    columns such as a serialized pandas index -- TODO confirm).

    If the file contains no rows, the table is still printed with empty
    example cells instead of failing.

    Args:
        fn: path of the parquet file; must provide ``relative_to`` (e.g. a
            ``pathlib.Path``) and be located below ``BASEDIR``.
    """
    print("")
    print("")
    print(f"## `{fn.relative_to(BASEDIR)}`")
    print("")
    schema = pq.read_schema(fn, memory_map=True)
    columns = [(name, str(pa_dtype)) for name, pa_dtype in zip(schema.names, schema.types)]

    # Only the first record is needed for the example column, so read a single-row batch
    # instead of loading the whole file.
    pf = ParquetFile(fn)
    first_batch = next(pf.iter_batches(batch_size=1), None)
    if first_batch is None or first_batch.num_rows == 0:
        # Empty file: no example values available.
        first_row = None
    else:
        first_row = pa.Table.from_batches([first_batch]).to_pandas().iloc[0]

    print("| Attribute     | Example      | Data Type | Description |")
    print("|---------------|--------------|-----------|-------------|")
    for name, dtype in columns:
        if name.startswith("__"):
            continue
        example = "" if first_row is None else first_row[name]
        print(f"| {name} | {example} | {dtype} |    |")

In [8]:
# Print the inventory of every parquet file whose path mentions "london".
# Files whose path contains a date (YYYY-MM-DD) are only included for the sample day 2019-07-01.
for fn in BASEDIR.rglob("*.parquet"):
    if "london" in str(fn) and (
        "2019-07-01" in str(fn) or not re.search("[0-9]{4}-[0-9]{2}-[0-9]{2}", str(fn))
    ):
        print_inventory(fn)



## `train/london/input/counters_2019-07-01.parquet`

| Attribute     | Example      | Data Type | Description |
|---------------|--------------|-----------|-------------|
| node_id | 101818 | int64 |    |
| day | 2019-07-01 | string |    |
| t | 4 | int64 |    |
| volumes_1h | [nan nan nan nan] | list<item: double> |    |


## `train/london/labels/cc_labels_2019-07-01.parquet`

| Attribute     | Example      | Data Type | Description |
|---------------|--------------|-----------|-------------|
| u | 78112 | int64 |    |
| v | 25508583 | int64 |    |
| day | 2019-07-01 | string |    |
| t | 9 | int64 |    |
| cc | 2 | int64 |    |


## `road_graph/london/cell_mapping.parquet`

| Attribute     | Example      | Data Type | Description |
|---------------|--------------|-----------|-------------|
| u | 78112 | int64 |    |
| v | 25508583 | int64 |    |
| cells | [(172, 223, 2, 0.0), (173, 223, 2, 1.0), (173, 223, 3, 1.0), (172, 223, 3, 0.0)] | string |    |


## `road_graph/london/road_gr