In [8]:
import h5py
import numpy as np
import pandas as pd
from sklearn import preprocessing
from scipy.stats import pearsonr
from torch.utils.data import Dataset
from tqdm import tqdm
import torch
from sklearn.decomposition import PCA

class ECG_Dataset(Dataset):
    def __init__(self, 
                tracings_file_path,
                labels_file_path,
                start = 0,
                end = -1):
        self.f = h5py.File(tracings_file_path, 'r')

        # Get tracings
        self.trace_ids = np.array(self.f['exam_id'])[start:end]
        self.tracings = self.f['tracings']

        # Defining start and end
        self.start = start
        self.end   = (end if end != -1 else len(self.tracings)-1)

        # Get labels
        labels_df = pd.read_csv(labels_file_path)
        self.labels    = {labels_df["exam_id"][i]:labels_df["classe"][i] for i in range(len(self.tracings))}

    def __len__(self):
        return self.end - self.start

    def __getitem__(self, idx):
        # Get tracing
        tracing_idx = self.start + idx
        tracing = np.transpose(self.tracings[tracing_idx])
        
        # Get label
        label = self.labels[self.trace_ids[idx]]

        return tracing, label

In the next cell I try to apply a tucker decomposition, witch is (or should be) a SVD equivalent but for tensors. 

```
from tensorly.decomposition import tucker
import tensorly as tl

with h5py.File('../data/train_dccweek2023.h5', 'r') as f:
    X = f['tracings'][:5000]
    

# aplica o HOSVD mantendo a primeira dimensão
factors = tucker(X, rank=[5000, 2000, 2])

# reconstrói o tensor original usando os componentes principais
X_hosvd = tl.kruskal_to_tensor(factors)

# verifica as dimensões do novo tensor
print(X_hosvd.shape)
```

The above cell uses just way too much memory and takes an eternity to run, and since it is 10x smaller than the full dataset and it needs to be loaded all at once, it cant be a good solution.

The next test should be to iter over each sample individualy, and reduce its dimension using PCA from (1, 4096, 12) to (1, 4096, 2) and build a new training dataset.

In [9]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

dataset = ECG_Dataset('../data/train_dccweek2023.h5',
                      '../data/train_dccweek2023-labels.csv',
                      start = 0, end = -1)

In [10]:
dataset.tracings

<HDF5 dataset "tracings": shape (51432, 4096, 12), type "<f4">

In [26]:
class ECG_Dataset_Features(ECG_Dataset):
    def __getitem__(self, idx):
        # Get tracing
        tracing_idx = self.start + idx
        tracing = self.tracings[tracing_idx]
        
        # Adding pca of each signal to features
        pca = PCA(n_components=2)
        pca.fit(tracing)
        transformed = np.transpose(pca.transform(tracing))
        
        # Adding fft analysis
        harmonics = []
        for signal in np.transpose(tracing):
            dft = np.fft.fft(signal)
            spectrum = np.abs(dft)
            T = 1/4096
            N = 4096
            f = np.fft.fftfreq(N, T)
            frequencias = f[:N // 2]
            amplitudes = np.abs(dft)[:N // 2] * 1 / N
            best_5 = sorted(zip(amplitudes, frequencias), key = lambda x: x[0])[:5]
            harmonics.append(best_5)
        
        label = self.labels[self.trace_ids[idx]]

        return transformed, harmonics, label

In [27]:
pca_dataset = ECG_Dataset_Features('../data/train_dccweek2023.h5',
                      '../data/train_dccweek2023-labels.csv',
                      start = 0, end = -1)

In [28]:
pca_dataset[0]

(array([[ 0.00132167,  0.00132167,  0.00132167, ...,  0.00132167,
          0.00132167,  0.00132167],
        [-0.00620164, -0.00620164, -0.00620164, ..., -0.00620164,
         -0.00620164, -0.00620164]], dtype=float32),
 [[(6.89322247574178e-07, 2039.0),
   (1.8502183420517583e-06, 2020.0),
   (2.1952691244240816e-06, 1916.0),
   (2.941896539934087e-06, 625.0),
   (3.068819175589408e-06, 1697.0)],
  [(1.0637980531465152e-07, 1951.0),
   (1.467753041207812e-07, 1824.0),
   (1.9297881857952446e-07, 1803.0),
   (1.9485257723717336e-07, 1923.0),
   (2.0419500242756645e-07, 1870.0)],
  [(5.899598739481765e-07, 1060.0),
   (1.2422680153014398e-06, 1609.0),
   (1.371204365222291e-06, 1694.0),
   (1.378336734449453e-06, 1675.0),
   (1.7462074346731318e-06, 1583.0)],
  [(1.3158089795101346e-07, 1953.0),
   (2.655521338384478e-07, 1759.0),
   (2.891059721951097e-07, 1847.0),
   (3.193347297695923e-07, 1893.0),
   (3.8046228062081805e-07, 1932.0)],
  [(7.621093773273258e-07, 2039.0),
   (1.19351