In [1]:
from pathlib import Path
from ThreeWToolkit.dataset import ParquetDataset
from ThreeWToolkit.core.base_dataset import ParquetDatasetConfig, EventPrefixEnum

## Download from figshare if not available locally

If the dataset is not found at the given `path`, it will be downloaded automatically.

If you use `force_download=True`, the dataset will be downloaded regardless of whether it already exists locally. If it exists locally, it will be replaced by the new one. `force_download` defaults to `False`.


Downloading from figshare may take a while: the dataset is 1.79GB and takes about 2-3 minutes to download.

Only dataset v2.0.0 is supported.

In [3]:
# Dataset directory, given as a relative path.
dataset_path = Path("../../dataset")
# NOTE: this does NOT load every event. target_class=[0, 4] keeps only events
# of classes 0 and 4 (937 events in the stored output, versus 2228 when no
# class filter is given). split=None presumably means that no file-list
# filtering is applied — TODO confirm against ParquetDatasetConfig.
ds_config = ParquetDatasetConfig(
    path=dataset_path, split=None, target_class=[0, 4]
)  # target column is not set here; presumably it defaults to 'class'
ds = ParquetDataset(ds_config)

[ParquetDataset] Dataset found at ../../dataset
[ParquetDataset] Validating dataset integrity...
[ParquetDataset] Dataset integrity check passed!


## Get number of events

In [3]:
# number of loaded events (only classes 0 and 4 were requested in the config)
len(ds)

937

## Signal and label split
Each dataset item is returned as a dictionary.

Metadata may be added to the dictionary as needed.

In [4]:
# dictionary keys of a single event; index 220 is presumably arbitrary —
# any valid index should expose the same keys (TODO confirm)
print(list(ds[220].keys()))

['signal', 'label', 'file_name']


In [None]:
# 'signal' entry of the first event; the stored output shows a
# timestamp-indexed table with one column per sensor variable
ds[0]["signal"]

Unnamed: 0_level_0,ABER-CKGL,ABER-CKP,ESTADO-DHSV,ESTADO-M1,ESTADO-M2,ESTADO-PXO,ESTADO-SDV-GL,ESTADO-SDV-P,ESTADO-W1,ESTADO-W2,...,P-JUS-CKGL,P-JUS-CKP,P-MON-CKP,P-PDG,P-TPT,QGL,T-JUS-CKP,T-MON-CKP,T-PDG,T-TPT
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-03-16 11:02:03,0.0,0.0,0.867921,0.414652,-0.681653,-0.094347,-1.094009,0.312558,0.650525,-0.563169,...,-1.062410,0.0,0.901666,0.0,0.022043,-0.846093,-0.016113,0.0,0.0,0.662663
2017-03-16 11:02:04,0.0,0.0,0.867921,0.414652,-0.681653,-0.094347,-1.094009,0.312558,0.650525,-0.563169,...,-1.062410,0.0,0.901959,0.0,0.022101,-0.846093,-0.015950,0.0,0.0,0.662680
2017-03-16 11:02:05,0.0,0.0,0.867921,0.414652,-0.681653,-0.094347,-1.094009,0.312558,0.650525,-0.563169,...,-1.062410,0.0,0.902253,0.0,0.022159,-0.846093,-0.015788,0.0,0.0,0.662696
2017-03-16 11:02:06,0.0,0.0,0.867921,0.414652,-0.681653,-0.094347,-1.094009,0.312558,0.650525,-0.563169,...,-1.062410,0.0,0.902546,0.0,0.022216,-0.846093,-0.015626,0.0,0.0,0.662716
2017-03-16 11:02:07,0.0,0.0,0.867921,0.414652,-0.681653,-0.094347,-1.094009,0.312558,0.650525,-0.563169,...,-1.062410,0.0,0.902840,0.0,0.022274,-0.846093,-0.015463,0.0,0.0,0.662732
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-03-16 13:59:56,0.0,0.0,0.867921,0.414652,-0.681653,-0.094347,-1.094009,0.312558,0.650525,-0.563169,...,-1.062633,0.0,0.794132,0.0,-0.013949,-0.846093,-0.007269,0.0,0.0,0.659817
2017-03-16 13:59:57,0.0,0.0,0.867921,0.414652,-0.681653,-0.094347,-1.094009,0.312558,0.650525,-0.563169,...,-1.062633,0.0,0.793818,0.0,-0.014005,-0.846093,-0.007315,0.0,0.0,0.659824
2017-03-16 13:59:58,0.0,0.0,0.867921,0.414652,-0.681653,-0.094347,-1.094009,0.312558,0.650525,-0.563169,...,-1.062633,0.0,0.793504,0.0,-0.014061,-0.846093,-0.007360,0.0,0.0,0.659833
2017-03-16 13:59:59,0.0,0.0,0.867921,0.414652,-0.681653,-0.094347,-1.094009,0.312558,0.650525,-0.563169,...,-1.062633,0.0,0.793190,0.0,-0.014117,-0.846093,-0.007406,0.0,0.0,0.659840


In [None]:
# 'label' entry of the first event; the stored output shows a
# timestamp-indexed table with a single 'class' column
ds[0]["label"]

Unnamed: 0_level_0,class
timestamp,Unnamed: 1_level_1
2017-03-16 11:02:03,4
2017-03-16 11:02:04,4
2017-03-16 11:02:05,4
2017-03-16 11:02:06,4
2017-03-16 11:02:07,4
...,...
2017-03-16 13:59:56,4
2017-03-16 13:59:57,4
2017-03-16 13:59:58,4
2017-03-16 13:59:59,4


## Customizable label selection
For instance, the label can be left out entirely by unsetting the target column.

In [None]:
# Rebuild the dataset without a target column, so that items carry no
# separate 'label' entry. No class filter is given here, so all events are
# loaded (2228 in the stored output).
ds_config = ParquetDatasetConfig(
    path=dataset_path, target_column=None
)  # unset target column
ds = ParquetDataset(ds_config)

[ParquetDataset] Dataset found at /data/3w_dataset
[ParquetDataset] Validating dataset integrity...
[ParquetDataset] Dataset integrity check passed!


In [None]:
# check content: number of events and the keys of the first item
len(ds), list(ds[0].keys())  # should have no 'label'

(2228, ['signal', 'file_name'])

In [None]:
# With no target column split off, the class values stay inside the signal table.
ds[0]["signal"]  # Should contain 'class' column

Unnamed: 0_level_0,ABER-CKGL,ABER-CKP,ESTADO-DHSV,ESTADO-M1,ESTADO-M2,ESTADO-PXO,ESTADO-SDV-GL,ESTADO-SDV-P,ESTADO-W1,ESTADO-W2,...,P-JUS-CKP,P-MON-CKP,P-PDG,P-TPT,QGL,T-JUS-CKP,T-MON-CKP,T-PDG,T-TPT,class
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-09-05 20:44:36,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-1.062317,-2.703362,-2.374172,0.0,-0.131847,0.0,0.0,0.589097,0.0
2018-09-05 20:44:37,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-1.062317,-2.703362,-2.374172,0.0,-0.131847,0.0,0.0,0.589097,0.0
2018-09-05 20:44:38,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-1.062317,-2.703362,-2.374172,0.0,-0.131847,0.0,0.0,0.589097,0.0
2018-09-05 20:44:39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-1.062317,-2.703362,-2.374172,0.0,-0.131847,0.0,0.0,0.589097,0.0
2018-09-05 20:44:40,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-1.062317,-2.703362,-2.374172,0.0,-0.131847,0.0,0.0,0.589097,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-09-07 20:44:31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-1.062344,-2.703361,-2.374192,0.0,-1.996049,0.0,0.0,-2.900260,0.0
2018-09-07 20:44:32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-1.062344,-2.703361,-2.374192,0.0,-1.996049,0.0,0.0,-2.900260,0.0
2018-09-07 20:44:33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-1.062344,-2.703361,-2.374192,0.0,-1.996049,0.0,0.0,-2.900260,0.0
2018-09-07 20:44:34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-1.062344,-2.703361,-2.374192,0.0,-1.996049,0.0,0.0,-2.900260,0.0


## Event type splitting
Any combination of drawn, simulated and real events can be selected.

In [10]:
# Event types to keep: drawn and simulated events (real events are left out).
event_types = [EventPrefixEnum.DRAWN, EventPrefixEnum.SIMULATED]

In [None]:
# Keep only the events whose type is listed in event_types.
ds_config = ParquetDatasetConfig(path=dataset_path, event_type=event_types)
ds = ParquetDataset(ds_config)
len(ds)  # should be smaller than the full dataset (2228 events in the stored outputs)

[ParquetDataset] Dataset found at /data/3w_dataset
[ParquetDataset] Validating dataset integrity...
[ParquetDataset] Dataset integrity check passed!


1109

In [None]:
# Same filter, this time keeping only real events.
event_types = [EventPrefixEnum.REAL]
ds_config = ParquetDatasetConfig(path=dataset_path, event_type=event_types)
ds = ParquetDataset(ds_config)
# Should be smaller than the full dataset. In the stored outputs the two
# filters are complementary: 1119 real + 1109 drawn/simulated = 2228 events.
len(ds)

[ParquetDataset] Dataset found at /data/3w_dataset
[ParquetDataset] Validating dataset integrity...
[ParquetDataset] Dataset integrity check passed!


1119

## Event class splitting
Any combination of event classes {0, 1, 2, ...} can be selected.

In [13]:
# If no target class is provided (equivalent to `target_class=None`), all classes are loaded
ds_config = ParquetDatasetConfig(path=dataset_path)
ds = ParquetDataset(ds_config)
len(ds)  # total number of events across all classes

[ParquetDataset] Dataset found at /data/3w_dataset
[ParquetDataset] Validating dataset integrity...
[ParquetDataset] Dataset integrity check passed!


2228

In [14]:
# Only loading events from class 4
target_class = [4]  # passed as a list of class ids, even for a single class
ds_config = ParquetDatasetConfig(path=dataset_path, target_class=target_class)
ds = ParquetDataset(ds_config)
len(ds)  # number of class-4 events

[ParquetDataset] Dataset found at /data/3w_dataset
[ParquetDataset] Validating dataset integrity...
[ParquetDataset] Dataset integrity check passed!


343

In [15]:
# Loading events from classes 0 and 4 together
target_class = [0, 4]
ds_config = ParquetDatasetConfig(path=dataset_path, target_class=target_class)
ds = ParquetDataset(ds_config)
len(ds)  # number of events belonging to either class

[ParquetDataset] Dataset found at /data/3w_dataset
[ParquetDataset] Validating dataset integrity...
[ParquetDataset] Dataset integrity check passed!


937

## File list splitting
A list of file names can be used to select which events are loaded.  

Useful for customized train/val/test splitting.

In [16]:
# Read the split definition: one file entry per line, in the form
# "<class folder>/<file name>.parquet" (see the stored output below).
# NOTE(review): the path is absolute and machine-specific; adjust it to
# wherever the split file lives in your environment.
# The encoding is pinned so the result does not depend on the platform
# default, and blank lines are skipped so that no empty file name ends up
# in the list later passed as `file_list`.
with open("/data/3w_split.txt", "r", encoding="utf-8") as f:
    split = [name for name in (line.strip() for line in f) if name]
split, len(split)

(['0/WELL-00008_20170817140222.parquet',
  '3/SIMULATED_00061.parquet',
  '4/WELL-00004_20140806090103.parquet',
  '6/SIMULATED_00117.parquet',
  '0/WELL-00001_20170201110124.parquet',
  '5/SIMULATED_00138.parquet',
  '4/WELL-00005_20170624070158.parquet',
  '8/SIMULATED_00044.parquet',
  '5/SIMULATED_00303.parquet',
  '9/SIMULATED_00028.parquet',
  '8/SIMULATED_00072.parquet',
  '7/WELL-00022_20180802233838.parquet',
  '0/WELL-00003_20170812110000.parquet',
  '9/SIMULATED_00115.parquet',
  '1/SIMULATED_00025.parquet',
  '9/SIMULATED_00065.parquet',
  '6/SIMULATED_00041.parquet',
  '5/SIMULATED_00329.parquet',
  '4/WELL-00004_20141118160016.parquet',
  '6/SIMULATED_00095.parquet'],
 20)

In [17]:
# To get only files in split, we need to set split="list"
# and hand the file entries over through file_list.
ds_config = ParquetDatasetConfig(path=dataset_path, split="list", file_list=split)
ds = ParquetDataset(ds_config)
len(ds)  # expected to match the number of entries in the list

[ParquetDataset] Dataset found at /data/3w_dataset
[ParquetDataset] Validating dataset integrity...
[ParquetDataset] Dataset integrity check passed!


20

In [None]:
# 'signal' entry of the first event in the list-based split; in the stored
# output its first timestamp (2017-08-17 14:02:22) matches the first listed
# file name (WELL-00008_20170817140222)
ds[0]["signal"]

Unnamed: 0_level_0,ABER-CKGL,ABER-CKP,ESTADO-DHSV,ESTADO-M1,ESTADO-M2,ESTADO-PXO,ESTADO-SDV-GL,ESTADO-SDV-P,ESTADO-W1,ESTADO-W2,...,P-JUS-CKGL,P-JUS-CKP,P-MON-CKP,P-PDG,P-TPT,QGL,T-JUS-CKP,T-MON-CKP,T-PDG,T-TPT
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-08-17 14:02:22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.327528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-08-17 14:02:23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.327528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-08-17 14:02:24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.327528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-08-17 14:02:25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.327528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-08-17 14:02:26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.327528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-08-17 19:57:09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.327528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-08-17 19:57:10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.327528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-08-17 19:57:11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.327528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-08-17 19:57:12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.327528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# 'label' entry of the same event; the stored output shows class 0, which
# presumably corresponds to the "0/" folder of the first listed file
ds[0]["label"]

Unnamed: 0_level_0,class
timestamp,Unnamed: 1_level_1
2017-08-17 14:02:22,0
2017-08-17 14:02:23,0
2017-08-17 14:02:24,0
2017-08-17 14:02:25,0
2017-08-17 14:02:26,0
...,...
2017-08-17 19:57:09,0
2017-08-17 19:57:10,0
2017-08-17 19:57:11,0
2017-08-17 19:57:12,0
