In [1]:
%load_ext autoreload
%autoreload 2
from dataclasses import dataclass
from pathlib import Path
import pandas as pd
import sys
sys.path.append('../../')
from omegaconf import OmegaConf
from dacite import from_dict

from tsbench.tslib.loading.csv_loader import CSVTimeSeriesDataset,MultiCSVTimeSeriesDataset
from tsbench.tslib.traindataset_generator import TimeSeriesTrainDatasetGeneratorConfig, TimeSeriesTrainDatasetGenerator

### With a normalized dataset

Next, we create a configuration for tslib. Its structure must mirror the config dataclass `TimeSeriesTrainDatasetGeneratorConfig` defined in `tslib/traindataset_generator.py`:

In [3]:
# YAML description of the dataset pipeline. Its structure mirrors
# TimeSeriesTrainDatasetGeneratorConfig (imported from tslib/traindataset_generator.py).
# Each stage (raw text -> OmegaConf object -> typed dataclass) gets its own name so that
# `cfg` only ever refers to the final, validated config object.
cfg_yaml = """
pipeline:
  dataset:
    name: multicsvloader
    kwargs:
      data_folder: '../../datafiles/stocknewseventssentiment-snes-10'
      meta_columns: []
  windowing:
    window_size: 20 # each time series for the model will have length 20
    stride: 5 # consecutive windows start 5 timesteps apart
  # TODO add normalization of the features
  # store normalizer values in a file and load them in the normalizer
  normalizer:
    normalizer_file: ../../notebooks_students/mustafa/normalizer_snef.json
  feature_selector:
    select_features:
        - 'Open'
        - 'High'
        - 'Low'
        - 'Close'
        - 'Adj Close'
        - 'Volume'
        - 'News - All News Volume'
        - 'News - Volume'
        - 'News - Positive Sentiment'
        - 'News - Negative Sentiment'
        - 'News - New Products'
        - 'News - Layoffs'
        - 'News - Analyst Comments'
        - 'News - Stocks'
        - 'News - Dividends'
        - 'News - Corporate Earnings'
        - 'News - Mergers & Acquisitions'
        - 'News - Store Openings'
        - 'News - Product Recalls'
        - 'News - Adverse Events'
        - 'News - Personnel Changes'
        - 'News - Stock Rumors'
    drop_features: []
  target_generator:
    name: many_to_many_regression
    kwargs:
      input_features:
        - 'Close'
        - 'Adj Close'
        - 'Volume'
        - 'News - All News Volume'
        - 'News - Volume'
        - 'News - Positive Sentiment'
        - 'News - Negative Sentiment'
        - 'News - New Products'
        - 'News - Layoffs'
        - 'News - Analyst Comments'
        - 'News - Stocks'
        - 'News - Dividends'
        - 'News - Corporate Earnings'
        - 'News - Mergers & Acquisitions'
        - 'News - Store Openings'
        - 'News - Product Recalls'
        - 'News - Adverse Events'
        - 'News - Personnel Changes'
        - 'News - Stock Rumors'
      target_features:
        - 'Open'
        - 'High'
        - 'Low'
      target_shift: 0
      added_meta_features: []
split:
  name: random_split
  kwargs:
    lengths: [0.8, 0.2] # train, val
"""
# Parse the YAML text into an OmegaConf config object.
cfg_omega = OmegaConf.create(cfg_yaml)

# Convert to plain Python containers and let dacite build the typed config dataclass.
cfg = from_dict(data_class=TimeSeriesTrainDatasetGeneratorConfig, data=OmegaConf.to_container(cfg_omega))

In [4]:
# Display the typed config object. In the recorded output, fields that were not set in
# the YAML (e.g. partition_filter, initial_offset) appear filled in - presumably with
# the dataclass defaults.
cfg

TimeSeriesTrainDatasetGeneratorConfig(pipeline=ComposedTimeSeriesDatasetConfig(dataset=NameAndKwargs(name='multicsvloader', kwargs={'data_folder': '../../datafiles/stocknewseventssentiment-snes-10', 'meta_columns': []}), windowing=TimeSeriesWindowDatasetConfig(window_size=20, stride=5, initial_offset=0, end_offset=0, future_steps=0, past_steps=0), partition_filter=NameAndKwargs(name='no_filter', kwargs={}), feature_selector=FeatureSelectorConfig(select_features=['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'News - All News Volume', 'News - Volume', 'News - Positive Sentiment', 'News - Negative Sentiment', 'News - New Products', 'News - Layoffs', 'News - Analyst Comments', 'News - Stocks', 'News - Dividends', 'News - Corporate Earnings', 'News - Mergers & Acquisitions', 'News - Store Openings', 'News - Product Recalls', 'News - Adverse Events', 'News - Personnel Changes', 'News - Stock Rumors'], drop_features=[]), normalizer=NormalizerConfig(normalizer_values={}, normalize_fea

In [5]:
# Build the dataset generator from the typed config, then run it.
# The recorded output of generate_dataset() shows the data being sampled, a window index
# being generated, and a report of the timesteps dropped due to windowing.
ds_gen = TimeSeriesTrainDatasetGenerator(cfg)
ds_gen.generate_dataset()

Sample: 100%|█████████████████████████████████████| 1/1 [00:00<00:00, 14.08it/s]
Sample: 100%|█████████████████████████████████████| 1/1 [00:00<00:00, 23.82it/s]
Generating window index: 100%|████████████████████| 1/1 [00:00<00:00,  2.45it/s]
Total number of dropped timesteps due to windowing: 1


In [6]:
# Keep a handle on the training split produced by the generator
# (the config requests a random_split with lengths [0.8, 0.2] for train/val).
train_ds = ds_gen.train_split

In [7]:
# Show the split object itself; the recorded repr is a TimeSeriesTrainDatasetSubset.
train_ds

<tsbench.tslib.postprocessing.dataset_subset.TimeSeriesTrainDatasetSubset at 0x101a1cf10>

In [8]:
# Metadata summary table for the training split. In the recorded output every listed row
# has num_timesteps == 20 (the configured window_size) and the window indices are not in
# ascending order, which is consistent with the random split.
train_ds.get_meta_data_summary()

Unnamed: 0,ComposedTimeSeriesDataset_index,MultiCSVTimeSeriesDataset_index,key,index,num_timesteps
0,13648,0,data_wi[|68240:68240|68240:68260|68260:68260|],13648,20
1,9808,0,data_wi[|49040:49040|49040:49060|49060:49060|],9808,20
2,7036,0,data_wi[|35180:35180|35180:35200|35200:35200|],7036,20
3,4780,0,data_wi[|23900:23900|23900:23920|23920:23920|],4780,20
4,10212,0,data_wi[|51060:51060|51060:51080|51080:51080|],10212,20
...,...,...,...,...,...
34843,37673,0,data_wi[|188365:188365|188365:188385|188385:18...,37673,20
34844,39013,0,data_wi[|195065:195065|195065:195085|195085:19...,39013,20
34845,28893,0,data_wi[|144465:144465|144465:144485|144485:14...,28893,20
34846,2748,0,data_wi[|13740:13740|13740:13760|13760:13760|],2748,20


In [9]:
# Fetch the first training sample. The recorded output is a dict whose 'x' entry is a
# float tensor; the output is truncated, so any further keys are not visible here.
train_ds[0]

{'x': tensor([[-0.2371, -0.2445, -0.1889,  1.4560,  0.0584, -0.1180, -0.1115, -0.1446,
          -0.0290,  0.2745,  0.2365, -0.2022,  3.2323, -0.0993, -0.0840, -0.0362,
          -0.1795, -0.1169, -0.0499],
         [-0.2402, -0.2474, -0.2099,  0.1594, -0.2133, -0.1534, -0.2363, -0.1446,
          -0.0290,  0.0495,  0.0217, -0.2022,  0.2206, -0.1882, -0.0840, -0.0362,
           0.0972, -0.1169, -0.0499],
         [-0.2427, -0.2499, -0.2268, -1.1109, -0.1930, -0.1534, -0.2363, -0.1446,
          -0.0290, -0.0469, -0.0704, -0.2022, -0.0357, -0.1882, -0.0840, -0.0362,
           0.2633, -0.1169, -0.0499],
         [-0.2404, -0.2476, -0.2651,  0.3611,  0.0503, -0.1180, -0.1115, -0.1446,
          -0.0290,  0.4673,  0.4206,  0.2226, -0.1639, -0.0993, -0.0840, -0.0362,
           0.2079, -0.1169, -0.0499],
         [-0.2396, -0.2469, -0.2198,  0.6258, -0.1403, -0.1534,  1.1982, -0.1446,
          -0.0290, -0.0147,  0.0217,  0.2226, -0.2280, -0.0104, -0.0840, -0.0362,
          -0.2348, -0.1

In [10]:
# Shape of one sample's input tensor. The recorded result is [20, 19], matching
# window_size=20 and the 19 entries listed under input_features in the config.
train_ds[0]['x'].shape

torch.Size([20, 19])

In [11]:
# Number of samples in the training and validation splits, plus the validation split's type.
len(train_ds), len(ds_gen.validation_split), type(ds_gen.validation_split)

(34848,
 8711,
 tsbench.tslib.postprocessing.dataset_subset.TimeSeriesTrainDatasetSubset)

In [12]:
# Input dimension and context length reported by the split
# (recorded: 19 and 20, i.e. the number of input_features and the window_size).
train_ds.input_dim, train_ds.context_length

(19, 20)

In [13]:
# Target dimension and target length reported by the split
# (recorded: 3 and 20; the config lists three target_features: Open, High, Low).
train_ds.target_dim, train_ds.target_length

(3, 20)

In [14]:
# Summarise the original (un-windowed) time series by walking down four nested
# `.dataset` attributes from the training split to the wrapped dataset.
# The recorded output lists a single series with key 'data' and 217811 timesteps.
# NOTE(review): this depends on the exact number of wrapper layers in the pipeline -
# confirm the nesting depth if the pipeline composition changes.
train_ds.dataset.dataset.dataset.dataset.get_meta_data_summary()

Sample: 100%|█████████████████████████████████████| 1/1 [00:00<00:00, 17.29it/s]


Unnamed: 0,MultiCSVTimeSeriesDataset_index,key,index,num_timesteps
0,0,data,0,217811


In [15]:
from tqdm import tqdm
import torch

# Number of leading windows to stack per dataset for inspection.
N_WINDOWS = 100

# Gather the input tensor ('x') of each window in a list and concatenate once at the end.
# Calling torch.cat inside the loop would re-copy the growing tensor on every iteration.
windows = []
for ds in [train_ds]:
    # Never index past the end of the dataset if it holds fewer than N_WINDOWS samples.
    for idx in tqdm(range(min(N_WINDOWS, len(ds)))):
        windows.append(ds[idx]['x'])

# All collected windows stacked along dim 0 (the time axis of each window);
# stays None when nothing was collected, as in the original sentinel-based version.
all_series = torch.cat(windows, dim=0) if windows else None

100%|█████████████████████████████████████████| 100/100 [00:27<00:00,  3.63it/s]
