**Adaptation to recognize the project root. For demonstration purposes only.**

In [1]:
import sys
import os

# Add the project root (two directory levels above the current working
# directory) to sys.path so that project packages can be imported from here.
project_root = os.path.abspath(os.path.join(os.getcwd(), '../../'))
# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
import numpy as np

from pathlib import Path
from ThreeWToolkit.dataset import ParquetDataset
from ThreeWToolkit.core.base_dataset import EventPrefixEnum, ParquetDatasetConfig

## Download from figshare if not available locally yet
With `download=True`, an existing local copy is detected and reused instead of being downloaded again (see the log output below).

May take a while to download from figshare.

For now, only v2.0.0 is implemented, but we check for correctness of other versions as well.

In [3]:
# Relative path of the folder holding the raw dataset.
dataset_path = Path("../../data/raw")

# No split is requested, so all files are loaded; the target is left at its
# default ('class').
ds_config = ParquetDatasetConfig(
    path=dataset_path,
    split=None,
    download=True,
    clean_data=False,
)
ds = ParquetDataset(ds_config)

[ParquetDataset] Found existing dataset at ..\..\data\raw\download.
[ParquetDataset] Dataset already extracted at ..\..\data\raw.


## Get number of events

In [4]:
# Total number of events loaded into the dataset
len(ds)

2228

## Signal and label split
Each item of the dataset is a dictionary with the keys `signal`, `label` and `file_name`.

Metadata may be added to dict as needed.

In [5]:
# Keys of the dictionary returned for a single event
# (index 220 appears to be an arbitrary example).
print(list(ds[220].keys()))

['signal', 'label', 'file_name']


In [6]:
# Signal table of the first event (indexed by timestamp, see the output)
ds[0]['signal']

Unnamed: 0_level_0,ABER-CKGL,ABER-CKP,ESTADO-DHSV,ESTADO-M1,ESTADO-M2,ESTADO-PXO,ESTADO-SDV-GL,ESTADO-SDV-P,ESTADO-W1,ESTADO-W2,...,P-PDG,PT-P,P-TPT,QBS,QGL,T-JUS-CKP,T-MON-CKP,T-PDG,T-TPT,state
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-02-01 01:02:07,,,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,,10074540.0,,0.0,84.64463,,0.0,119.0781,
2017-02-01 01:02:08,,,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,,10074540.0,,0.0,84.63828,,0.0,119.0781,
2017-02-01 01:02:09,,,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,,10074540.0,,0.0,84.63194,,0.0,119.0781,
2017-02-01 01:02:10,,,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,,10074540.0,,0.0,84.62558,,0.0,119.0781,
2017-02-01 01:02:11,,,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,,10074540.0,,0.0,84.61923,,0.0,119.0781,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-02-01 06:59:56,,,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,,10014690.0,,0.0,83.44021,,0.0,119.0453,0
2017-02-01 06:59:57,,,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,,10014690.0,,0.0,83.45413,,0.0,119.0452,0
2017-02-01 06:59:58,,,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,,10014690.0,,0.0,83.46806,,0.0,119.0451,0
2017-02-01 06:59:59,,,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,,10014690.0,,0.0,83.48199,,0.0,119.0450,0


In [7]:
# Label table of the first event: a single 'class' column indexed by timestamp
ds[0]['label']

Unnamed: 0_level_0,class
timestamp,Unnamed: 1_level_1
2017-02-01 01:02:07,
2017-02-01 01:02:08,
2017-02-01 01:02:09,
2017-02-01 01:02:10,
2017-02-01 01:02:11,
...,...
2017-02-01 06:59:56,0
2017-02-01 06:59:57,0
2017-02-01 06:59:58,0
2017-02-01 06:59:59,0


## Customizable label selection
For instance, only events from selected classes can be loaded (here, classes 1 and 2).

In [8]:
# Restrict the dataset to target classes 1 and 2, with clean_data enabled.
# download=False: no need to download if the file already exists.
ds_config = ParquetDatasetConfig(
    path=dataset_path,
    target_class=[1, 2],
    download=False,
    clean_data=True,
)
ds = ParquetDataset(ds_config)

In [9]:
# Check content: number of events and the keys of the first event
len(ds), list(ds[0].keys()) # NOTE: the 'label' key is still present (see the output below)

(166, ['signal', 'label', 'file_name'])

In [10]:
# For every event, take the first entry of the sorted unique label values
# (i.e. the smallest one), then display the distinct values found.
c_ = [np.unique(event["label"])[0] for event in ds]
set(c_)

{1, 2}

## Event type splitting
Can select any combination of drawn, simulated and real events

In [11]:
# Load only drawn and simulated events (real events are left out).
event_types = [EventPrefixEnum.DRAWN, EventPrefixEnum.SIMULATED]
ds_config = ParquetDatasetConfig(
    path=dataset_path,
    split=None,
    event_type=event_types,
    download=False,
)
ds = ParquetDataset(ds_config)
len(ds)  # expected to be smaller than the full dataset

1109

In [12]:
# Load only real events.
event_types = [EventPrefixEnum.REAL]
ds_config = ParquetDatasetConfig(
    path=dataset_path,
    split=None,
    event_type=event_types,
    download=False,
)
ds = ParquetDataset(ds_config)
len(ds)  # expected to be smaller than the full dataset

1119

## Event class splitting
Events can be filtered by class: pass any combination of class codes (0, 1, 2, ...) as `target_class`.

In [13]:
# Load only events of class 0.
target_class = [0]
ds_config = ParquetDatasetConfig(
    path=dataset_path,
    split=None,
    target_class=target_class,
    download=False,
)
ds = ParquetDataset(ds_config)
len(ds)

594

In [14]:
# Load only events of class 4.
target_class = [4]
ds_config = ParquetDatasetConfig(
    path=dataset_path,
    split=None,
    target_class=target_class,
    download=False,
)
ds = ParquetDataset(ds_config)
len(ds)

343

In [15]:
# Load events of classes 0 and 4 together.
target_class = [0, 4]
ds_config = ParquetDatasetConfig(
    path=dataset_path,
    split=None,
    target_class=target_class,
    download=False,
)
ds = ParquetDataset(ds_config)
len(ds)

937

## File list splitting
A list of file paths may be used to select which events are loaded.
This is useful for customized train/val/test splitting.

In [16]:
# Explicit list of event files to load, written as './<folder>/<file>.parquet'.
# NOTE(review): presumably resolved relative to the dataset root — confirm.
split_config = [
    "./0/WELL-00008_20170817140222.parquet",
    "./3/SIMULATED_00061.parquet",
    "./4/WELL-00004_20140806090103.parquet",
    "./6/SIMULATED_00117.parquet",
    "./0/WELL-00001_20170201110124.parquet",
    "./5/SIMULATED_00138.parquet",
    "./4/WELL-00005_20170624070158.parquet",
    "./8/SIMULATED_00044.parquet",
    "./5/SIMULATED_00303.parquet",
    "./9/SIMULATED_00028.parquet",
    "./8/SIMULATED_00072.parquet",
    "./7/WELL-00022_20180802233838.parquet",
    "./0/WELL-00003_20170812110000.parquet",
    "./9/SIMULATED_00115.parquet",
    "./1/SIMULATED_00025.parquet",
    "./9/SIMULATED_00065.parquet",
    "./6/SIMULATED_00041.parquet",
    "./5/SIMULATED_00329.parquet",
    "./4/WELL-00004_20141118160016.parquet",
    "./6/SIMULATED_00095.parquet",
]

In [17]:
# Load only the files named in split_config.
ds_config = ParquetDatasetConfig(
    path=dataset_path,
    split="list",
    file_list=split_config,
)
ds = ParquetDataset(ds_config)
len(ds)

20

In [18]:
# Signal table of the first event of the list-based selection
ds[0]['signal']

Unnamed: 0_level_0,ABER-CKGL,ABER-CKP,ESTADO-DHSV,ESTADO-M1,ESTADO-M2,ESTADO-PXO,ESTADO-SDV-GL,ESTADO-SDV-P,ESTADO-W1,ESTADO-W2,...,P-JUS-CKGL,P-JUS-CKP,P-MON-CKP,P-PDG,P-TPT,QGL,T-JUS-CKP,T-MON-CKP,T-PDG,T-TPT
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-08-17 14:02:22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.327528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-08-17 14:02:23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.327528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-08-17 14:02:24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.327528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-08-17 14:02:25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.327528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-08-17 14:02:26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.327528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-08-17 19:57:09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.327528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-08-17 19:57:10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.327528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-08-17 19:57:11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.327528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-08-17 19:57:12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.327528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# Label table ('class' column) of the first event of the list-based selection
ds[0]['label']

Unnamed: 0_level_0,class
timestamp,Unnamed: 1_level_1
2017-08-17 14:02:22,0
2017-08-17 14:02:23,0
2017-08-17 14:02:24,0
2017-08-17 14:02:25,0
2017-08-17 14:02:26,0
...,...
2017-08-17 19:57:09,0
2017-08-17 19:57:10,0
2017-08-17 19:57:11,0
2017-08-17 19:57:12,0
