In [1]:
from pathlib import Path
from ThreeWToolkit.dataset import ParquetDataset, DatasetConfig
from ThreeWToolkit.core.base_dataset import EventPrefixEnum

## Download from figshare if not on local yet.
Instantiating the dataset with `download=True` raises an error if the dataset files already exist locally.

May take a while to download from figshare.

For now, only v2.0.0 is implemented, but we check for correctness of other versions as well.

In [4]:
dataset_path = Path("./dataset")
# Load every event (split=None); the target column is left at its default, 'class'.
ds_config = DatasetConfig(path=dataset_path, split=None, file_type="parquet")
# Only request the figshare download (~1.8 GB) when no parquet file is present locally:
# download=True raises if the data already exists, which would make this cell fail on a
# "Restart Kernel -> Run All" after the first successful run.
has_local_data = dataset_path.is_dir() and any(dataset_path.rglob("*.parquet"))
ds = ParquetDataset(ds_config, download=not has_local_data)

3w_dataset_2.0.0.zip: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.79G/1.79G [38:09<00:00, 784kB/s]


## Get number of events

In [4]:
# Number of loaded events (the dataset yields one item per event).
len(ds)

2228

## Signal and label split
Each item is returned as a dictionary with `signal` and `label` entries.

Further metadata entries may be added to the dictionary as needed.

In [36]:
# Inspect which entries a single event provides (index 220 is just an example item).
sample_event = ds[220]
print(list(sample_event.keys()))

['signal', 'label']


In [37]:
# Sensor readings of the first event, indexed by timestamp. The 'class' column is not part of
# the signal here because it is returned separately under 'label'.
ds[0]['signal']

Unnamed: 0_level_0,ABER-CKGL,ABER-CKP,ESTADO-DHSV,ESTADO-M1,ESTADO-M2,ESTADO-PXO,ESTADO-SDV-GL,ESTADO-SDV-P,ESTADO-W1,ESTADO-W2,...,P-PDG,PT-P,P-TPT,QBS,QGL,T-JUS-CKP,T-MON-CKP,T-PDG,T-TPT,state
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-06-24 23:00:38,0.0,27.90459,0.0,1.0,0.0,0.0,,1.0,1.0,1.0,...,,,20752050.0,,,65.26942,,,106.0993,
2017-06-24 23:00:39,0.0,27.90458,0.0,1.0,0.0,0.0,,1.0,1.0,1.0,...,,,20753200.0,,,65.27189,,,106.0998,
2017-06-24 23:00:40,0.0,27.90458,0.0,1.0,0.0,0.0,,1.0,1.0,1.0,...,,,20752050.0,,,65.27435,,,106.0953,
2017-06-24 23:00:41,0.0,27.90457,0.0,1.0,0.0,0.0,,1.0,1.0,1.0,...,,,20750900.0,,,65.27682,,,106.0908,
2017-06-24 23:00:42,0.0,27.90456,0.0,1.0,0.0,0.0,,1.0,1.0,1.0,...,,,20752050.0,,,65.27928,,,106.0923,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-06-25 01:59:56,0.0,27.84673,0.0,1.0,0.0,0.0,,1.0,1.0,1.0,...,,,20818930.0,,,67.28551,,,106.1403,0
2017-06-25 01:59:57,0.0,27.84673,0.0,1.0,0.0,0.0,,1.0,1.0,1.0,...,,,20819160.0,,,67.29739,,,106.1448,0
2017-06-25 01:59:58,0.0,27.84674,0.0,1.0,0.0,0.0,,1.0,1.0,1.0,...,,,20819390.0,,,67.30928,,,106.1493,0
2017-06-25 01:59:59,0.0,27.84674,0.0,1.0,0.0,0.0,,1.0,1.0,1.0,...,,,20819620.0,,,67.32116,,,106.1538,0


In [38]:
# Per-timestamp 'class' values of the same event ('class' is the default target column).
ds[0]['label']

Unnamed: 0_level_0,class
timestamp,Unnamed: 1_level_1
2017-06-24 23:00:38,
2017-06-24 23:00:39,
2017-06-24 23:00:40,
2017-06-24 23:00:41,
2017-06-24 23:00:42,
...,...
2017-06-25 01:59:56,4
2017-06-25 01:59:57,4
2017-06-25 01:59:58,4
2017-06-25 01:59:59,4


## Customizable label selection
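The target column is configurable. For instance, setting `target_column=None` disables label extraction, so items contain only `signal` and the `class` column stays inside it.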

In [39]:
# Unset the target column so that no column is split out as the label.
ds_config = DatasetConfig(
    path=dataset_path,
    split=None,
    file_type="parquet",
    target_column=None,
)
# The data is already on disk, so skip the download
# (download=True would raise because the files already exist).
ds = ParquetDataset(ds_config, download=False)

In [40]:
# Sanity check: same number of events as before, but items no longer carry a 'label' entry.
n_events = len(ds)
first_item_keys = list(ds[0].keys())
n_events, first_item_keys

(2228, ['signal'])

In [42]:
# NOTE(review): the recorded output below starts on 2018-04-26, whereas the earlier ds[0] started
# on 2017-06-24 — item order may differ between dataset instances; confirm before relying on indices.
ds[0]['signal'] # should now contain the 'class' column, since no target column is split out

Unnamed: 0_level_0,ABER-CKGL,ABER-CKP,ESTADO-DHSV,ESTADO-M1,ESTADO-M2,ESTADO-PXO,ESTADO-SDV-GL,ESTADO-SDV-P,ESTADO-W1,ESTADO-W2,...,PT-P,P-TPT,QBS,QGL,T-JUS-CKP,T-MON-CKP,T-PDG,T-TPT,class,state
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-04-26 00:20:00,,,,,,,,,,,...,,1.988112e+07,,,84.083661,86.093931,,122.925200,0,0
2018-04-26 00:20:01,,,,,,,,,,,...,,1.988117e+07,,,84.083650,86.093920,,122.925200,0,0
2018-04-26 00:20:02,,,,,,,,,,,...,,1.988122e+07,,,84.083640,86.093901,,122.925200,0,0
2018-04-26 00:20:03,,,,,,,,,,,...,,1.988127e+07,,,84.083620,86.093890,,122.925299,0,0
2018-04-26 00:20:04,,,,,,,,,,,...,,1.988132e+07,,,84.083611,86.093872,,122.925300,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-04-26 08:28:14,,,,,,,,,,,...,,1.780136e+07,,,82.477910,84.143450,,122.319786,5,0
2018-04-26 08:28:15,,,,,,,,,,,...,,1.780115e+07,,,82.477910,84.143450,,122.319610,5,0
2018-04-26 08:28:16,,,,,,,,,,,...,,1.780122e+07,,,82.477910,84.143450,,122.319600,5,0
2018-04-26 08:28:17,,,,,,,,,,,...,,1.780135e+07,,,82.477910,84.143450,,122.319798,5,0


## Event type splitting
Can select any combination of drawn, simulated and real events

In [43]:
# Keep only drawn and simulated events; real events are left out.
event_types = [EventPrefixEnum.DRAWN, EventPrefixEnum.SIMULATED]
ds_config = DatasetConfig(
    path=dataset_path,
    split=None,
    file_type="parquet",
    event_type=event_types,
)
ds = ParquetDataset(ds_config, download=False)
# Expect a subset of the 2228 events loaded without any filter.
len(ds)

1109

In [44]:
# Keep only real events.
event_types = [EventPrefixEnum.REAL]
ds_config = DatasetConfig(
    path=dataset_path,
    split=None,
    file_type="parquet",
    event_type=event_types,
)
ds = ParquetDataset(ds_config, download=False)
# Expect a subset of the 2228 events loaded without any filter.
len(ds)

1119

## Event class splitting
Events can be filtered by class with `target_class`: pass any combination of class ids (0, 1, 2, ...).

In [45]:
# Keep only events of class 0.
target_class = [0]
ds_config = DatasetConfig(
    path=dataset_path,
    split=None,
    file_type="parquet",
    target_class=target_class,
)
ds = ParquetDataset(ds_config, download=False)
len(ds)

594

In [46]:
# Keep only events of class 4.
target_class = [4]
ds_config = DatasetConfig(
    path=dataset_path,
    split=None,
    file_type="parquet",
    target_class=target_class,
)
ds = ParquetDataset(ds_config, download=False)
len(ds)

343

In [47]:
# Keep events of class 0 or class 4; the count should equal the sum of the two
# single-class counts above.
target_class = [0, 4]
ds_config = DatasetConfig(
    path=dataset_path,
    split=None,
    file_type="parquet",
    target_class=target_class,
)
ds = ParquetDataset(ds_config, download=False)
len(ds)

937

## File list splitting
A list of file paths can be used to select which events are loaded.
This is useful for custom train/val/test splits.

In [49]:
# Read the custom split file: one parquet path per line
# (presumably relative to the dataset root — see the recorded output below).
split_file = dataset_path / "my_split.txt"  # reuse the configured root instead of repeating "./dataset"
with open(split_file, "r", encoding="utf-8") as f:
    # Skip blank lines (e.g. a trailing newline) so no empty path ends up in the file list.
    split = [line.strip() for line in f if line.strip()]
split, len(split)

(['./0/WELL-00008_20170817140222.parquet',
  './3/SIMULATED_00061.parquet',
  './4/WELL-00004_20140806090103.parquet',
  './6/SIMULATED_00117.parquet',
  './0/WELL-00001_20170201110124.parquet',
  './5/SIMULATED_00138.parquet',
  './4/WELL-00005_20170624070158.parquet',
  './8/SIMULATED_00044.parquet',
  './5/SIMULATED_00303.parquet',
  './9/SIMULATED_00028.parquet',
  './8/SIMULATED_00072.parquet',
  './7/WELL-00022_20180802233838.parquet',
  './0/WELL-00003_20170812110000.parquet',
  './9/SIMULATED_00115.parquet',
  './1/SIMULATED_00025.parquet',
  './9/SIMULATED_00065.parquet',
  './6/SIMULATED_00041.parquet',
  './5/SIMULATED_00329.parquet',
  './4/WELL-00004_20141118160016.parquet',
  './6/SIMULATED_00095.parquet'],
 20)

In [52]:
# split="list" restricts loading to the files named in file_list.
ds_config = DatasetConfig(path=dataset_path, split="list", file_type="parquet", file_list=split)
# Pass download=False explicitly, as in the other cells: the data is already local.
ds = ParquetDataset(ds_config, download=False)
len(ds)  # expected to equal len(split): one item per listed file

20

In [53]:
# Signal of the first event in the list-based dataset. The recorded timestamps start at
# 2017-08-17 14:02:22, matching the first entry of the split (WELL-00008_20170817140222).
ds[0]['signal']

Unnamed: 0_level_0,ABER-CKGL,ABER-CKP,ESTADO-DHSV,ESTADO-M1,ESTADO-M2,ESTADO-PXO,ESTADO-SDV-GL,ESTADO-SDV-P,ESTADO-W1,ESTADO-W2,...,P-PDG,PT-P,P-TPT,QBS,QGL,T-JUS-CKP,T-MON-CKP,T-PDG,T-TPT,state
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-08-17 14:02:22,,,,,,,,,,,...,,,,,,,,,,
2017-08-17 14:02:23,,,,,,,,,,,...,,,,,,,,,,
2017-08-17 14:02:24,,,,,,,,,,,...,,,,,,,,,,
2017-08-17 14:02:25,,,,,,,,,,,...,,,,,,,,,,
2017-08-17 14:02:26,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-08-17 19:57:09,,,,,,,,,,,...,,,,,,,,,,0
2017-08-17 19:57:10,,,,,,,,,,,...,,,,,,,,,,0
2017-08-17 19:57:11,,,,,,,,,,,...,,,,,,,,,,0
2017-08-17 19:57:12,,,,,,,,,,,...,,,,,,,,,,0


In [54]:
# Per-timestamp 'class' labels of the first event in the list-based dataset.
ds[0]['label']

Unnamed: 0_level_0,class
timestamp,Unnamed: 1_level_1
2017-08-17 14:02:22,
2017-08-17 14:02:23,
2017-08-17 14:02:24,
2017-08-17 14:02:25,
2017-08-17 14:02:26,
...,...
2017-08-17 19:57:09,0
2017-08-17 19:57:10,0
2017-08-17 19:57:11,0
2017-08-17 19:57:12,0
