In [1]:
from ThreeWToolkit.dataset import ParquetDataset
from ThreeWToolkit.core.base_dataset import ParquetDatasetConfig, EventPrefixEnum

## Download from figshare if not available locally yet

If the dataset is not found at the given `path`, it will be downloaded automatically.

If you use `force_download=True`, the dataset will be downloaded regardless of whether it already exists locally. If it exists locally, it will be replaced by the new one. `force_download` defaults to `False`.


Downloading from figshare may take a while: the dataset is 1.79GB and takes about 2-3 minutes to download.

Only dataset v2.0.0 is supported.

In [2]:
# Modify this path to the folder where your dataset is stored locally.
# Per the notes above, the dataset is downloaded automatically if it is not found at this path.
dataset_path = "../../dataset"

In [3]:
# Load all files, keeping only two of the classes (0 and 4)
ds_config = ParquetDatasetConfig(
    path=dataset_path,
    target_class=[0, 4],
)
ds = ParquetDataset(ds_config)

[ParquetDataset] Dataset found at ../../dataset
[ParquetDataset] Validating dataset integrity...
[ParquetDataset] Dataset integrity check passed!


Get the number of events:

In [4]:
# Number of events loaded into the dataset
len(ds)

937

## Signal and label split

Our dataset object is a list of dictionaries.

Let's see the keys of the first element (dictionary).

In [5]:
# Keys of the first element (a dictionary) of the dataset
ds[0].keys()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["class"].ffill(inplace=True) # forward fill of gaps in annotations


dict_keys(['signal', 'label', 'file_name'])

What is in the "signal" key?

In [6]:
# Content stored under the "signal" key of the first element
ds[0]["signal"]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["class"].ffill(inplace=True) # forward fill of gaps in annotations


Unnamed: 0_level_0,ABER-CKGL,ABER-CKP,ESTADO-DHSV,ESTADO-M1,ESTADO-M2,ESTADO-PXO,ESTADO-SDV-GL,ESTADO-SDV-P,ESTADO-W1,ESTADO-W2,...,P-JUS-CKGL,P-JUS-CKP,P-MON-CKP,P-PDG,P-TPT,QGL,T-JUS-CKP,T-MON-CKP,T-PDG,T-TPT
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-06-24 23:00:38,-0.635461,-0.508689,-1.152179,0.414652,-0.681653,-0.094347,0.0,0.312558,0.650525,1.775691,...,-1.099144,0.0,2.501570,0.0,1.150266,0.0,-0.209870,0.0,0.0,0.301153
2017-06-24 23:00:39,-0.635461,-0.508689,-1.152179,0.414652,-0.681653,-0.094347,0.0,0.312558,0.650525,1.775691,...,-1.099115,0.0,2.502288,0.0,1.150461,0.0,-0.209740,0.0,0.0,0.301170
2017-06-24 23:00:40,-0.635461,-0.508689,-1.152179,0.414652,-0.681653,-0.094347,0.0,0.312558,0.650525,1.775691,...,-1.099085,0.0,2.503003,0.0,1.150266,0.0,-0.209611,0.0,0.0,0.301022
2017-06-24 23:00:41,-0.635461,-0.508690,-1.152179,0.414652,-0.681653,-0.094347,0.0,0.312558,0.650525,1.775691,...,-1.099056,0.0,2.503718,0.0,1.150071,0.0,-0.209481,0.0,0.0,0.300875
2017-06-24 23:00:42,-0.635461,-0.508690,-1.152179,0.414652,-0.681653,-0.094347,0.0,0.312558,0.650525,1.775691,...,-1.099026,0.0,2.504433,0.0,1.150266,0.0,-0.209352,0.0,0.0,0.300924
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-06-25 01:59:56,-0.635461,-0.510721,-1.152179,0.414652,-0.681653,-0.094347,0.0,0.312558,0.650525,1.775691,...,-1.184848,0.0,2.685734,0.0,1.161625,0.0,-0.103950,0.0,0.0,0.302496
2017-06-25 01:59:57,-0.635461,-0.510721,-1.152179,0.414652,-0.681653,-0.094347,0.0,0.312558,0.650525,1.775691,...,-1.184848,0.0,2.685066,0.0,1.161664,0.0,-0.103325,0.0,0.0,0.302644
2017-06-25 01:59:58,-0.635461,-0.510720,-1.152179,0.414652,-0.681653,-0.094347,0.0,0.312558,0.650525,1.775691,...,-1.184848,0.0,2.684399,0.0,1.161703,0.0,-0.102701,0.0,0.0,0.302791
2017-06-25 01:59:59,-0.635461,-0.510720,-1.152179,0.414652,-0.681653,-0.094347,0.0,0.312558,0.650525,1.775691,...,-1.184848,0.0,2.683731,0.0,1.161742,0.0,-0.102077,0.0,0.0,0.302938


Let's see the ids of the classes that we have in our loaded dataset:

In [None]:
# Accumulate, across every loaded event, the distinct class ids found in its labels
set_of_labels = set()
for event in ds:
    # update() adds the ids to the existing set in place
    set_of_labels.update(event["label"]["class"].unique())
set_of_labels


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["class"].ffill(inplace=True) # forward fill of gaps in annotations
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["class"].ffill(inplace=True) # forward fill of gaps in annotations
The behavior will change in pandas 3.0. This inplace method will never work because the 

## Customizable label selection

Let's reload the dataset omitting the target column:

In [None]:
# Reload the dataset without a target column (target_column=None)
ds_config = ParquetDatasetConfig(
    path=dataset_path,
    target_column=None,
)
ds = ParquetDataset(ds_config)

We expect not to have the "label" key.

In [None]:
# Inspect the keys again; the "label" key is expected to be absent now
ds[0].keys()

In [None]:
# The signal should contain the 'class' column here (the dataset was loaded with target_column=None)
ds[0]["signal"]

-------

## Event type splitting

Defining 2 types of events: drawn and simulated

In [None]:
# The two event types to keep: drawn and simulated
event_types = [EventPrefixEnum.DRAWN, EventPrefixEnum.SIMULATED]

In [None]:
# Load only the events whose type is listed in `event_types`, then report how many were found
ds_config = ParquetDatasetConfig(
    path=dataset_path,
    event_type=event_types,
)
ds = ParquetDataset(ds_config)
print(f"Filtering events of types {event_types}, there are {len(ds)} events")

Only real events now:

In [None]:
# Switch the filter to real events only, reload, and report how many were found
event_types = [EventPrefixEnum.REAL]
ds_config = ParquetDatasetConfig(
    path=dataset_path,
    event_type=event_types,
)
ds = ParquetDataset(ds_config)
print(f"Filtering events of types {event_types}, there are {len(ds)} events")

------

## Event class splitting

We can select any combination of class events.

First, let's see how many events the whole dataset has.

In [None]:
# If no target class is provided (equivalent to `target_class=None`), all classes are loaded
ds_config = ParquetDatasetConfig(path=dataset_path)
ds = ParquetDataset(ds_config)
# Number of events when every class is loaded
len(ds)

Now, let's select only events of one class:

In [None]:
# Restrict loading to the events of class 4
target_class = [4]
ds_config = ParquetDatasetConfig(
    path=dataset_path,
    target_class=target_class,
)
ds = ParquetDataset(ds_config)
len(ds)

Now, let's select events of two classes:

In [None]:
# Load the events of two classes: 0 and 4
target_class = [0, 4]
ds_config = ParquetDatasetConfig(
    path=dataset_path,
    target_class=target_class,
)
ds = ParquetDataset(ds_config)
len(ds)

-------

## File list splitting

We can use a list to select loaded events. This is useful for customized train/val/test splitting.

In [None]:
# Explicit list of the event files to load
split = [
    "6/SIMULATED_00012.parquet",
    "6/SIMULATED_00049.parquet",
    "0/WELL-00001_20170201010207.parquet",
    "4/WELL-00001_20170316110203.parquet",
]

In [None]:
# Setting split="list" is required to load only the files given in `file_list`
ds_config = ParquetDatasetConfig(
    path=dataset_path,
    split="list",
    file_list=split,
)
ds = ParquetDataset(ds_config)
len(ds)

In [None]:
# Signal of the first event loaded from the file list
ds[0]["signal"]

In [None]:
# Print the distinct class values found in the labels of each loaded event
for event in ds:
    print(event["label"]["class"].unique())