In [1]:
from minerva.models.ssl.tfc import TFC_Model
from minerva.models.nets.tfc import TFC_Backbone
from minerva.models.nets.tnc import TSEncoder
import numpy as np
import pandas as pd
import os
import warnings
from torch.utils.data import Dataset
from sklearn.preprocessing import StandardScaler
from utils.timefeatures import time_features
import warnings
import lightning as L
from torch.utils.data import DataLoader
from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint
from lightning.pytorch.loggers import CSVLogger
from torch.nn import TransformerEncoder, TransformerEncoderLayer
import random
import tqdm


from minerva.models.ssl.tfc import TFC_Model
from minerva.models.nets.tfc import TFC_Backbone
import warnings
import warnings
import lightning as L
from torch.utils.data import DataLoader, ConcatDataset
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.loggers import CSVLogger
from minerva.data.datasets.series_dataset import MultiModalSeriesCSVDataset
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


## aLLM4TS Pretrain Dataset

In [2]:
class Dataset_pretrain(Dataset):
    # X is the input data and Y is the input data shifted by stride (16)
    def __init__(
        self,
        pt_data="ETTh1_ETTm1_ETTh2_ETTm2_weather_traffic_electricity_illness",
        patch_len=16,  # Not used
        stride=16,  # Will be used as the shift (y = x + stride)
        root_path="/workspaces/HIAAC-KR-Dev-Container/workspace/aLLM4TS/dataset/",
        flag="train",
        size=[1024, 0, 1024],  # [seq_len, label_len, pred_len]
        features="M",
        data_path="ETTh1.csv",  # Not used
        target="OT",
        scale=True,
        timeenc=1,
        freq="h",
        percent=100,
        return_values="x_y",
    ):
        assert return_values in ["x", "x_y", "x_y_mark"]
        self.return_values = return_values
        self.pt_data = pt_data
        self.patch_len = patch_len
        self.stride = stride
        if size == None:
            raise NotImplementedError
        else:
            self.seq_len = size[0]
            self.label_len = size[1]
            self.pred_len = size[2]

        assert flag in ["train", "test", "val"]
        type_map = {"train": 0, "val": 1, "test": 2}
        self.set_type = type_map[flag]  # 0: train, 1: val, 2: test

        self.features = features
        self.target = target
        self.scale = scale
        self.timeenc = timeenc
        self.freq = freq
        self.percent = percent

        self.root_path = root_path
        self.data_path = data_path
        self.__read_data__()

    def __read_data__(self):
        self.scaler = StandardScaler()
        pt_datasets = self.pt_data.split("_")

        data_list = []
        data_stamp_list = []
        for pt_dataset in pt_datasets:
            df_raw = pd.read_csv(
                os.path.join(self.root_path, f"{pt_dataset}.csv")
            )
            dataset_len = len(df_raw)
            # Handle datasets
            if "ETTh" in pt_dataset:
                border1s = [
                    0,
                    12 * 30 * 24 - self.seq_len,
                    12 * 30 * 24 + 4 * 30 * 24 - self.seq_len,
                ]
                border2s = [
                    12 * 30 * 24,
                    12 * 30 * 24 + 4 * 30 * 24,
                    12 * 30 * 24 + 8 * 30 * 24,
                ]
                border1 = border1s[self.set_type]
                border2 = border2s[self.set_type]
            elif "ETTm" in pt_dataset:
                border1s = [
                    0,
                    12 * 30 * 24 * 4 - self.seq_len,
                    12 * 30 * 24 * 4 + 4 * 30 * 24 * 4 - self.seq_len,
                ]
                border2s = [
                    12 * 30 * 24 * 4,
                    12 * 30 * 24 * 4 + 4 * 30 * 24 * 4,
                    12 * 30 * 24 * 4 + 8 * 30 * 24 * 4,
                ]
                border1 = border1s[self.set_type]
                border2 = border2s[self.set_type]
            else:
                num_train = int(dataset_len * 0.7)
                num_test = int(dataset_len * 0.2)
                num_vali = dataset_len - num_train - num_test
                border1s = [
                    0,
                    num_train - self.seq_len,
                    dataset_len - num_test - self.seq_len,
                ]
                border2s = [num_train, num_train + num_vali, dataset_len]
                border1 = border1s[self.set_type]
                border2 = border2s[self.set_type]

            # Hendle some options
            if self.set_type == 0:
                border2 = (
                    border2 - self.seq_len
                ) * self.percent // 100 + self.seq_len
            if self.features == "M" or self.features == "MS":
                cols_data = df_raw.columns[1:]
                df_data = df_raw[cols_data]
            elif self.features == "S":
                df_data = df_raw[[self.target]]

            df_data = df_data.values

            # Handle scaling. Scaling is done on each subset of the data, 
            # separately. Train data will be normalizedon train data, 
            # validation data will be normalized on validation data, etc.
            if self.scale:
                train_data = df_data[border1s[0] : border2s[0]]
                self.scaler.fit(train_data)
                data = self.scaler.transform(df_data)
            else:
                data = df_data

            data = data[border1:border2]
            data = data.reshape((len(data) * len(cols_data), 1))
            df_stamp = df_raw[["date"]][border1:border2]
            df_stamp["date"] = pd.to_datetime(df_stamp.date)
            if self.timeenc == 0:
                df_stamp["month"] = df_stamp.date.apply(
                    lambda row: row.month, 1
                )
                df_stamp["day"] = df_stamp.date.apply(lambda row: row.day, 1)
                df_stamp["weekday"] = df_stamp.date.apply(
                    lambda row: row.weekday(), 1
                )
                df_stamp["hour"] = df_stamp.date.apply(lambda row: row.hour, 1)
                data_stamp = df_stamp.drop(["date"], axis=1).values
            elif self.timeenc == 1:
                data_stamp = time_features(
                    pd.to_datetime(df_stamp["date"].values), freq=self.freq
                )
                data_stamp = data_stamp.transpose(1, 0)

            data_list.append(data)
            df_stamp = np.array(
                [data_stamp for i in range(len(cols_data))]
            ).reshape((len(data_stamp) * len(cols_data), 4))
            data_stamp_list.append(df_stamp)

        self.data = np.concatenate(data_list, axis=0)
        self.data_stamp = np.concatenate(data_stamp_list, axis=0)

    def __getitem__(self, index):
        s_begin = index
        s_end = s_begin + self.seq_len
        r_begin = s_begin + self.stride
        r_end = s_end + self.stride
        # print(s_begin, s_end, r_begin, r_end)

        # print(f"Values: seq_len={self.seq_len}, patch_len={self.patch_len}, stride={self.stride}, s_begin={s_begin}, s_end={s_end}, r_begin={r_begin}, r_end={r_end}")
        # print(f"Seq_x: from {s_begin} to {s_end}")
        # print(f"Seq_y: from {r_begin} to {r_end}")

        seq_x = self.data[s_begin:s_end].swapaxes(0, 1)
        seq_y = self.data[r_begin:r_end].swapaxes(0, 1)

        if self.return_values == "x":
            return seq_x
        elif self.return_values == "x_y":
            return seq_x, seq_y
        else:
            seq_x_mark = self.data_stamp[s_begin:s_end]
            seq_y_mark = self.data_stamp[r_begin:r_end]
            return seq_x, seq_y, seq_x_mark, seq_y_mark

    def __len__(self):
        return len(self.data) - self.seq_len - self.patch_len + 1

    def inverse_transform(self, data):
        return self.scaler.inverse_transform(data)

In [3]:
index = 100
stride = 16

pt_dset_train = Dataset_pretrain(stride=stride, return_values="x_y", flag="train")
x, y = pt_dset_train[index]
print(f"Dataset has {len(pt_dset_train)} samples. x.shape={x.shape}, y.shape={y.shape}")

# Y is same as X shifted by stride (16)
np.testing.assert_allclose(x[0, stride:1024], y[0, 0:1024-stride])
print(f"Test passed! Y is same as X shifted by stride ({stride})")

Dataset has 17878732 samples. x.shape=(1, 1024), y.shape=(1, 1024)
Test passed! Y is same as X shifted by stride (16)


In [4]:
index = 100
stride = 16

pt_dset_val = Dataset_pretrain(stride=stride, return_values="x_y", flag="val")
x, y = pt_dset_val[index]
print(f"Dataset has {len(pt_dset_val)} samples. x.shape={x.shape}, y.shape={y.shape}")

# Y is same as X shifted by stride (16)
np.testing.assert_allclose(x[0, stride:1024], y[0, 0:1024-stride])
print(f"Test passed! Y is same as X shifted by stride ({stride})")

Dataset has 3932428 samples. x.shape=(1, 1024), y.shape=(1, 1024)
Test passed! Y is same as X shifted by stride (16)


In [5]:
index = 100
stride = 16

pr_dset_test = Dataset_pretrain(stride=stride, return_values="x_y", flag="val")
x, y = pr_dset_test[index]
print(f"Dataset has {len(pr_dset_test)} samples. x.shape={x.shape}, y.shape={y.shape}")

# Y is same as X shifted by stride (16)
np.testing.assert_allclose(x[0, stride:1024], y[0, 0:1024-stride])
print(f"Test passed! Y is same as X shifted by stride ({stride})")

Dataset has 3932428 samples. x.shape=(1, 1024), y.shape=(1, 1024)
Test passed! Y is same as X shifted by stride (16)


## ExtraSensory Dataset

In [6]:
class Base_Har_Dataset(Dataset):
    def __init__(
        self,
        root_path: str,
        feature_prefixes=[
            "accel-x",
            "accel-y",
            "accel-z",
            "gyro-x",
            "gyro-y",
            "gyro-z",
        ],
        flag: str = "train",
        size=[1024, 0, 1024],  # [seq_len, label_len, pred_len]
        stride=16,
        return_values="x_y",
        val_split: float = 0.1,
        test_split: float = 0.2,
        seed: int = 42,
        scale=True,
        percent: int = 100,
        
        *args,
        **kwargs,
    ):
        assert return_values in ["x", "x_y"]
        assert flag in ["train", "test", "val"]

        # From arguments
        self.root_dir = Path(root_path)
        self.feature_prefixes = feature_prefixes
        self.flag = flag
        self.seq_len, self.label_len, self.pred_len = size[0], size[1], size[2]
        self.stride = stride
        self.return_values = return_values
        self.seed = seed
        self.scale = scale
        self.percent = percent

        # Split the data
        self.train_split = 1 - val_split - test_split
        self.val_split = val_split
        self.test_split = test_split
        assert (
            self.train_split + self.val_split + self.test_split == 1
        ), "Splits should sum to 1"

        # Read the data and set the data and x, y begin and end indices
        self.scaler = StandardScaler()
        self.data, self.effective_stride = self.__read_data__()

    def _get_dataset_data(self) -> np.ndarray:
        raise NotImplementedError

    def __read_data__(self):
        data = self._get_dataset_data()
        original_shape = data.shape
        effective_stride = self.stride * original_shape[1]
        # Maintin only first dimension and flatten the rest
        print(
            f"Data shape before flatten: {data.shape}. Effective stride: {effective_stride} ({self.stride}*{original_shape[1]})"
        )
        # Flatten each sample in F order
        data = np.ascontiguousarray(
            data.reshape(original_shape[0], -1, order="F")
        )
        print(
            f"Data shape after flatten (F-order): {data.shape}. Min: {data.min()}, Max: {data.max()}"
        )

        if self.scale:
            shape = data.shape
            data = self.scaler.fit_transform(data.reshape(-1, 1)).reshape(shape)
            # Reshape, do the scaling and reshape back
            print(
                f"Data shape after scaling: {data.shape}. Min: {data.min()}, Max: {data.max()}"
            )

        if self.percent < 100:
            # Take only the first self.percent samples
            data = data[: int(data.shape[0] * self.percent / 100)]
            print(f"Data shape after taking {self.percent}%: {data.shape}")

        # Now, we have data of shape (num_samples, num_features * num_timesteps)
        # Where the last dimension is the flattened version of the original data
        # interleaved (F-order)
        return (data, effective_stride)

    def __getitem__(self, index):
        data = self.data[index]
        seq_x = data[: -self.effective_stride]
        seq_y = data[self.effective_stride :]

        # 1024-264 = 760 zeros at the end
        pad_size = max(0, self.seq_len - len(seq_x))
        seq_x = np.pad(seq_x, (0, pad_size), mode="constant", constant_values=0)
        seq_y = np.pad(seq_y, (0, pad_size), mode="constant", constant_values=0)

        # Add a channel dimension at the beginning
        seq_x = seq_x.reshape(1, -1)
        seq_y = seq_y.reshape(1, -1)

        if self.return_values == "x":
            return seq_x
        return seq_x, seq_y

    def __len__(self):
        return len(self.data)
    
    def inverse_transform(self, data):
        shape = data.shape
        data = data.reshape(-1, 1)
        data = self.scaler.inverse_transform(data)
        return data.reshape(shape)

In [7]:
class Dataset_ExtraSensory(Base_Har_Dataset):
    def _get_dataset_data(self) -> np.ndarray:
        rng = random.Random(self.seed)
        # List files and shuffle it using the seed
        files = sorted(list(self.root_dir.glob("es_full.*.csv")))
        rng.shuffle(files)

        # Split the files
        num_train = int(len(files) * self.train_split)
        num_val = int(len(files) * self.val_split)

        if self.flag == "train":
            # First num_train files
            files = files[:num_train]
        elif self.flag == "val":
            # Files from num_train to num_train + num_val
            files = files[num_train : num_train + num_val]
        else:
            # Files from num_train + num_val to the end (last num_test files)
            files = files[num_train + num_val :]

        datasets = []
        tqdm_bar = tqdm.tqdm(enumerate(files), total=len(files), desc="Reading files...")
        for i, f in tqdm_bar:
            tqdm_bar.set_description(f"Reading {f.name}")
            dataset = MultiModalSeriesCSVDataset(
                f, self.feature_prefixes, label=None, features_as_channels=True
            )

            # This should return a numpy array of shape (num_samples, num_features, num_timesteps)
            # In this case, (num_samples, 6, 60)
            data = dataset[:].astype(np.float32)
            datasets.append(data)

        # Data is an array of shape (num_samples, num_features, num_timesteps)
        data = np.concatenate(datasets, axis=0)

        return data

In [8]:
train_es_dataset = Dataset_ExtraSensory(
    root_path="/workspaces/HIAAC-KR-Dev-Container/some_datasets/ES_Raw/",
    stride=stride,
    return_values="x_y",
    flag="train",
)
len(train_es_dataset)

Reading es_full.100000.csv: 100%|██████████| 25/25 [03:20<00:00,  8.04s/it]


Data shape before flatten: (5469725, 6, 60). Effective stride: 96 (16*6)
Data shape after flatten (F-order): (5469725, 360). Min: -79.97591400146484, Max: 92.86629486083984
Data shape after scaling: (5469725, 360). Min: -37.31132888793945, Max: 43.129364013671875


5469725

In [9]:
index = 100
x, y = train_es_dataset[index]
print(f"Dataset has {len(train_es_dataset)} samples. x.shape={x.shape}, y.shape={y.shape}")

np.testing.assert_allclose(
    x[0, train_es_dataset.effective_stride:360-train_es_dataset.effective_stride],
    y[0, 0:360-(train_es_dataset.effective_stride)*2]
)

Dataset has 5469725 samples. x.shape=(1, 1024), y.shape=(1, 1024)


In [10]:
val_es_dataset = Dataset_ExtraSensory(
    root_path="/workspaces/HIAAC-KR-Dev-Container/some_datasets/ES_Raw/",
    stride=stride,
    return_values="x_y",
    flag="val",
)
len(val_es_dataset)

Reading es_full.280000.csv: 100%|██████████| 3/3 [00:32<00:00, 10.71s/it]


Data shape before flatten: (449675, 6, 60). Effective stride: 96 (16*6)
Data shape after flatten (F-order): (449675, 360). Min: -36.92852783203125, Max: 39.63801193237305
Data shape after scaling: (449675, 360). Min: -21.122934341430664, Max: 22.436803817749023


449675

In [11]:
test_es_dataset = Dataset_ExtraSensory(
    root_path="/workspaces/HIAAC-KR-Dev-Container/some_datasets/ES_Raw/",
    stride=stride,
    return_values="x_y",
    flag="test",
)
len(test_es_dataset)

Reading es_full.150000.csv: 100%|██████████| 8/8 [02:40<00:00, 20.10s/it]


Data shape before flatten: (1527750, 6, 60). Effective stride: 96 (16*6)
Data shape after flatten (F-order): (1527750, 360). Min: -52.94058609008789, Max: 98.81349182128906
Data shape after scaling: (1527750, 360). Min: -26.968708038330078, Max: 50.25152587890625


1527750

# DAGHAR

In [12]:
class Dataset_DAGHAR(Base_Har_Dataset):
    def _get_dataset_data(self) -> np.ndarray:
        if self.flag == "train":
            files = self.root_dir.rglob("train.csv")
        elif self.flag == "val":
            files = self.root_dir.rglob("validation.csv")
        else:
            files = self.root_dir.rglob("test.csv")
        files = sorted(list(files))
        print(f"Selected {len(files)} files for {self.flag}: {files}")

        datasets = []
        tqdm_bar = tqdm.tqdm(
            enumerate(files), total=len(files), desc="Reading files..."
        )
        for i, f in tqdm_bar:
            tqdm_bar.set_description(f"Reading {f.name}")
            dataset = MultiModalSeriesCSVDataset(
                f, self.feature_prefixes, label=None, features_as_channels=True
            )

            # This should return a numpy array of shape (num_samples, num_features, num_timesteps)
            # In this case, (num_samples, 6, 60)
            data = dataset[:].astype(np.float32)
            datasets.append(data)

        # Data is an array of shape (num_samples, num_features, num_timesteps)
        data = np.concatenate(datasets, axis=0)

        return data

In [13]:
train_daghar_dataset = Dataset_DAGHAR(
    root_path="/workspaces/HIAAC-KR-Dev-Container/some_datasets/DAGHAR/standardized_view/",
    stride=stride,
    return_values="x_y",
    flag="train",
)
len(train_daghar_dataset)

Selected 6 files for train: [PosixPath('/workspaces/HIAAC-KR-Dev-Container/some_datasets/DAGHAR/standardized_view/KuHar/train.csv'), PosixPath('/workspaces/HIAAC-KR-Dev-Container/some_datasets/DAGHAR/standardized_view/MotionSense/train.csv'), PosixPath('/workspaces/HIAAC-KR-Dev-Container/some_datasets/DAGHAR/standardized_view/RealWorld_thigh/train.csv'), PosixPath('/workspaces/HIAAC-KR-Dev-Container/some_datasets/DAGHAR/standardized_view/RealWorld_waist/train.csv'), PosixPath('/workspaces/HIAAC-KR-Dev-Container/some_datasets/DAGHAR/standardized_view/UCI/train.csv'), PosixPath('/workspaces/HIAAC-KR-Dev-Container/some_datasets/DAGHAR/standardized_view/WISDM/train.csv')]


Reading train.csv: 100%|██████████| 6/6 [00:06<00:00,  1.08s/it]


Data shape before flatten: (36788, 6, 60). Effective stride: 96 (16*6)
Data shape after flatten (F-order): (36788, 360). Min: -66.1843032836914, Max: 72.65300750732422
Data shape after scaling: (36788, 360). Min: -25.996206283569336, Max: 28.536834716796875


36788

In [14]:
index = 100
x, y = train_daghar_dataset[index]
print(f"Dataset has {len(train_daghar_dataset)} samples. x.shape={x.shape}, y.shape={y.shape}")

np.testing.assert_allclose(
    x[0, train_daghar_dataset.effective_stride:360-train_daghar_dataset.effective_stride],
    y[0, 0:360-(train_daghar_dataset.effective_stride)*2]
)

Dataset has 36788 samples. x.shape=(1, 1024), y.shape=(1, 1024)


In [15]:
validation_daghar_dataset = Dataset_DAGHAR(
    root_path="/workspaces/HIAAC-KR-Dev-Container/some_datasets/DAGHAR/standardized_view/",
    stride=stride,
    return_values="x_y",
    flag="val",
)
len(validation_daghar_dataset)

Selected 6 files for val: [PosixPath('/workspaces/HIAAC-KR-Dev-Container/some_datasets/DAGHAR/standardized_view/KuHar/validation.csv'), PosixPath('/workspaces/HIAAC-KR-Dev-Container/some_datasets/DAGHAR/standardized_view/MotionSense/validation.csv'), PosixPath('/workspaces/HIAAC-KR-Dev-Container/some_datasets/DAGHAR/standardized_view/RealWorld_thigh/validation.csv'), PosixPath('/workspaces/HIAAC-KR-Dev-Container/some_datasets/DAGHAR/standardized_view/RealWorld_waist/validation.csv'), PosixPath('/workspaces/HIAAC-KR-Dev-Container/some_datasets/DAGHAR/standardized_view/UCI/validation.csv'), PosixPath('/workspaces/HIAAC-KR-Dev-Container/some_datasets/DAGHAR/standardized_view/WISDM/validation.csv')]


Reading validation.csv: 100%|██████████| 6/6 [00:01<00:00,  4.21it/s]

Data shape before flatten: (5844, 6, 60). Effective stride: 96 (16*6)
Data shape after flatten (F-order): (5844, 360). Min: -50.034881591796875, Max: 52.441993713378906
Data shape after scaling: (5844, 360). Min: -20.453800201416016, Max: 21.43722915649414





5844

In [16]:
test_daghar_dataset = Dataset_DAGHAR(
    root_path="/workspaces/HIAAC-KR-Dev-Container/some_datasets/DAGHAR/standardized_view/",
    stride=stride,
    return_values="x_y",
    flag="test",
)
len(test_daghar_dataset)

Selected 6 files for test: [PosixPath('/workspaces/HIAAC-KR-Dev-Container/some_datasets/DAGHAR/standardized_view/KuHar/test.csv'), PosixPath('/workspaces/HIAAC-KR-Dev-Container/some_datasets/DAGHAR/standardized_view/MotionSense/test.csv'), PosixPath('/workspaces/HIAAC-KR-Dev-Container/some_datasets/DAGHAR/standardized_view/RealWorld_thigh/test.csv'), PosixPath('/workspaces/HIAAC-KR-Dev-Container/some_datasets/DAGHAR/standardized_view/RealWorld_waist/test.csv'), PosixPath('/workspaces/HIAAC-KR-Dev-Container/some_datasets/DAGHAR/standardized_view/UCI/test.csv'), PosixPath('/workspaces/HIAAC-KR-Dev-Container/some_datasets/DAGHAR/standardized_view/WISDM/test.csv')]


Reading test.csv: 100%|██████████| 6/6 [00:02<00:00,  2.90it/s]

Data shape before flatten: (9990, 6, 60). Effective stride: 96 (16*6)
Data shape after flatten (F-order): (9990, 360). Min: -42.68782043457031, Max: 52.27903366088867
Data shape after scaling: (9990, 360). Min: -16.971426010131836, Max: 20.78189468383789





9990

## Concatenated Dataset

In [17]:
concat_es_allm4ts_train = ConcatDataset([pt_dset_train, train_es_dataset])
print(f"Concatenated train dataset has {len(concat_es_allm4ts_train)} samples")
print(f"Percentage of ES dataset: {len(train_es_dataset)/len(concat_es_allm4ts_train)*100:.2f}%")

Concatenated train dataset has 23348457 samples
Percentage of ES dataset: 23.43%


In [18]:
concat_es_allm4ts_val = ConcatDataset([pt_dset_val, val_es_dataset])
print(f"Concatenated validation dataset has {len(concat_es_allm4ts_val)} samples")
print(f"Percentage of ES dataset: {len(val_es_dataset)/len(concat_es_allm4ts_val)*100:.2f}%")

Concatenated validation dataset has 4382103 samples
Percentage of ES dataset: 10.26%


In [19]:
concat_es_allm4ts_test = ConcatDataset([pr_dset_test, test_es_dataset])
print(f"Concatenated test dataset has {len(concat_es_allm4ts_test)} samples")
print(f"Percentage of ES dataset: {len(test_es_dataset)/len(concat_es_allm4ts_test)*100:.2f}%")

Concatenated test dataset has 5460178 samples
Percentage of ES dataset: 27.98%


In [20]:
concat_daghar_allm4ts_train = ConcatDataset([pt_dset_train, train_daghar_dataset])
print(f"Concatenated train dataset has {len(concat_daghar_allm4ts_train)} samples")
print(f"Percentage of DAGHAR data: {len(train_daghar_dataset) / len(concat_daghar_allm4ts_train) * 100:.2f}%")

Concatenated train dataset has 17915520 samples
Percentage of DAGHAR data: 0.21%


In [21]:
concat_daghar_allm4ts_val = ConcatDataset([pt_dset_val, validation_daghar_dataset])
print(f"Concatenated validation dataset has {len(concat_daghar_allm4ts_val)} samples")
print(f"Percentage of DAGHAR data: {len(validation_daghar_dataset) / len(concat_daghar_allm4ts_val) * 100:.2f}%")

Concatenated validation dataset has 3938272 samples
Percentage of DAGHAR data: 0.15%


In [22]:
concat_daghar_allm4ts_test = ConcatDataset([pr_dset_test, test_daghar_dataset])
print(f"Concatenated test dataset has {len(concat_daghar_allm4ts_test)} samples")
print(f"Percentage of DAGHAR data: {len(test_daghar_dataset) / len(concat_daghar_allm4ts_test) * 100:.2f}%")

Concatenated test dataset has 3942418 samples
Percentage of DAGHAR data: 0.25%


## Saving the Datasets

In [None]:
import gc
save_location = Path("/workspaces/HIAAC-KR-Dev-Container/workspace/aLLM4TS/dataset/")
save_location.mkdir(parents=True, exist_ok=True)

for dset, name in zip(
    [concat_es_allm4ts_train, concat_es_allm4ts_val, concat_es_allm4ts_test, concat_daghar_allm4ts_train, concat_daghar_allm4ts_val, concat_daghar_allm4ts_test],
    ["concat_es_allm4ts_train", "concat_es_allm4ts_val", "concat_es_allm4ts_test", "concat_daghar_allm4ts_train", "concat_daghar_allm4ts_val", "concat_daghar_allm4ts_test"],
):
    gc.collect()
    X, Y = [], []
    for i in tqdm.tqdm(range(len(dset)), total=len(dset), desc=f"Processing {name}"):
        x, y = dset[i]
        x = x.astype(np.float32)
        y = y.astype(np.float32)
        X.append(x)
        Y.append(y)
    print(f"Concatenating arrays")
    X = np.stack(X)
    Y = np.stack(Y)
    print(f"Size of X (in MB): {X.nbytes / 1024 / 1024:.2f}. Size of Y (in MB): {Y.nbytes / 1024 / 1024:.2f}")
    
    fname = save_location / f"{name}.npz"
    
    print(f"Saving to {fname} with X.shape={X.shape}, Y.shape={Y.shape}")
    np.savez_compressed(save_location / f"{name}.npz", X=X, Y=Y)
    print(f"Saved {name} to {save_location / f'{name}.npz'}")
    print()
    
    del X, Y

Processing concat_es_allm4ts_train: 100%|██████████| 23348457/23348457 [03:44<00:00, 104189.05it/s]


Concatenating arrays
Saving to /workspaces/HIAAC-KR-Dev-Container/workspace/aLLM4TS/dataset/concat_es_allm4ts_train.npz with X.shape=(23348457, 1, 1024), Y.shape=(23348457, 1, 1024)
