In [2]:
import h5py
import numpy as np
import pandas as pd
from sklearn import preprocessing
from scipy.stats import pearsonr
from torch.utils.data import Dataset
from tqdm import tqdm
import torch

class ECG_Dataset(Dataset):
    """Map-style dataset over a contiguous window of ECG tracings stored in HDF5.

    The HDF5 file must contain an ``exam_id`` dataset and a ``tracings``
    dataset; the CSV must have ``exam_id`` and ``classe`` columns. Item
    ``idx`` is the pair ``(transposed tracing, label)`` for row
    ``start + idx`` of the file, where the label is looked up through the
    exam id stored at that same row.

    Args:
        tracings_file_path: Path to the HDF5 file holding the tracings.
        labels_file_path: Path to the CSV file holding the labels.
        start: Index of the first tracing of the window (inclusive).
        end: Index one past the last tracing of the window. ``-1`` (the
            default) or ``None`` selects everything up to and including
            the last tracing in the file. Values beyond the end of the
            file are clamped to the file length.

    Raises:
        ValueError: If ``start`` is negative or lies beyond ``end``.
    """

    def __init__(self,
                 tracings_file_path,
                 labels_file_path,
                 start = 0,
                 end = -1):
        # The handle stays open for the dataset's lifetime because
        # individual tracings are read from it on demand in __getitem__.
        self.f = h5py.File(tracings_file_path, 'r')
        try:
            self.tracings = self.f['tracings']

            # Resolve the window first so that the ids and __len__ agree
            self.start, self.end = self._resolve_window(
                start, end, len(self.tracings)
            )

            # Exam ids for exactly the rows of the window
            self.trace_ids = self.f['exam_id'][self.start:self.end]

            # exam_id -> class; built from whole columns instead of one
            # pandas lookup per row, and independent of the tracing count
            labels_df = pd.read_csv(labels_file_path)
            self.labels = dict(zip(labels_df["exam_id"].to_numpy(),
                                   labels_df["classe"].to_numpy()))
        except Exception:
            # Do not leak the file handle when construction fails
            self.f.close()
            raise

    @staticmethod
    def _resolve_window(start, end, total):
        """Turn the user-facing (start, end) pair into a half-open row range.

        Args:
            start: Requested first row (inclusive).
            end: Requested end row (exclusive); ``-1`` or ``None`` means
                "through the last row".
            total: Number of rows available.

        Returns:
            A ``(start, end)`` tuple with ``0 <= start <= end <= total``.

        Raises:
            ValueError: If no such range can be formed.
        """
        if end is None or end == -1:
            end = total
        else:
            end = min(end, total)
        if start < 0 or start > end:
            raise ValueError(
                f"invalid window: start={start}, end={end} for {total} tracings"
            )
        return start, end

    def close(self):
        """Close the underlying HDF5 file; items cannot be read afterwards."""
        self.f.close()

    def __len__(self):
        """Return the number of tracings in the window."""
        return self.end - self.start

    def __getitem__(self, idx):
        """Return ``(tracing, label)`` for position ``idx`` of the window.

        Negative indices count from the end of the window.

        Raises:
            IndexError: If ``idx`` falls outside the window.
        """
        n_items = len(self)
        if idx < 0:
            idx += n_items
        if not 0 <= idx < n_items:
            raise IndexError(f"index out of range for dataset of length {n_items}")

        # Row in the file; the axes of the stored tracing are reversed
        tracing = np.transpose(self.tracings[self.start + idx])

        # trace_ids is already restricted to the window, so it takes idx directly
        label = self.labels[self.trace_ids[idx]]

        return tracing, label

In the next cell I try to apply a Tucker decomposition, which is (or should be) the tensor equivalent of an SVD.

```
from tensorly.decomposition import tucker
import tensorly as tl

with h5py.File('../data/train_dccweek2023.h5', 'r') as f:
    X = f['tracings'][:5000]

# Apply the HOSVD (Tucker decomposition), keeping the first dimension at full size
core, factors = tucker(X, rank=[5000, 2000, 2])

# Rebuild a tensor from the Tucker components. tucker() returns a
# (core, factors) pair, which has to be expanded with tucker_to_tensor;
# kruskal_to_tensor expects a CP (Kruskal) decomposition instead.
X_hosvd = tl.tucker_to_tensor((core, factors))

# Check the dimensions of the reconstructed tensor
print(X_hosvd.shape)
```

The cell above uses far too much memory and takes an eternity to run. Since this subset is already about 10x smaller than the full dataset and still has to be loaded all at once, this can't be a good solution.

The next test is to iterate over each sample individually, reduce it with PCA from (1, 4096, 12) to (1, 4096, 2), and build a new training dataset from the results.

In [3]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Training set over the default window (end=-1 is the constructor's
# "no explicit end" sentinel)
dataset = ECG_Dataset(
    tracings_file_path='../data/train_dccweek2023.h5',
    labels_file_path='../data/train_dccweek2023-labels.csv',
    start=0,
    end=-1,
)

In [4]:
# Show the HDF5 'tracings' dataset handle to check its shape and dtype
dataset.tracings

<HDF5 dataset "tracings": shape (51432, 4096, 12), type "<f4">

In [5]:
from sklearn.decomposition import PCA
import numpy as np

# Toy input: the integers 1..12 laid out as 4 samples x 3 features
X = np.arange(1, 13).reshape(4, 3)

# Keep two principal components; fit() hands back the fitted estimator
pca = PCA(n_components=2).fit(X)

# Project the samples onto the fitted components
X_transformed = pca.transform(X)

# Show the input next to its projection
print("Original Data:", X, sep="\n")
print("Transformed Data:", X_transformed, sep="\n")

Original Data:
[[ 1  2  3]
 [ 4  5  6]
 [ 7  8  9]
 [10 11 12]]
Transformed Data:
[[ 7.79422863e+00 -1.69309011e-15]
 [ 2.59807621e+00 -6.38378239e-16]
 [-2.59807621e+00  6.38378239e-16]
 [-7.79422863e+00  1.69309011e-15]]


In [41]:
# Fit a two-component PCA on one tracing and project that same tracing
n = 3
sample = dataset.tracings[n]
pca = PCA(n_components=2).fit(sample)
transformed = pca.transform(sample)

In [42]:
# Display the two-component projection of the selected tracing
transformed

array([[-1.0921661 , -0.34675354],
       [-1.077329  , -0.3770971 ],
       [-1.068098  , -0.38688132],
       ...,
       [ 0.14603835,  0.5883383 ],
       [ 0.16443667,  0.57407165],
       [ 0.18471095,  0.56191504]], dtype=float32)

In [43]:
# Check the projection's shape: one row per row of the tracing, one column per component
transformed.shape

(4096, 2)

In [55]:
class ECG_Dataset_PCA(ECG_Dataset):
    """ECG_Dataset variant whose items are per-sample PCA projections.

    Each tracing is treated as a 2-D matrix. A fresh PCA is fitted on that
    single matrix, the matrix is projected onto the fitted components, and
    the projection is returned transposed (components first).
    """

    # Number of principal components kept per tracing; override in a
    # subclass or on an instance to change it.
    n_components = 2

    def __getitem__(self, idx):
        """Return ``(projection, label)`` for position ``idx`` of the window.

        Negative indices count from the end of the window.

        Raises:
            IndexError: If ``idx`` falls outside the window.
        """
        # Normalize and bounds-check idx so the tracing row and the exam id
        # always refer to the same sample, including for negative indices.
        n_items = len(self)
        if idx < 0:
            idx += n_items
        if not 0 <= idx < n_items:
            raise IndexError(f"index out of range for dataset of length {n_items}")

        # Row in the file for this position of the window
        tracing = self.tracings[self.start + idx]

        # The components are estimated from this one sample only, so they
        # differ from item to item.
        pca = PCA(n_components=self.n_components)
        pca.fit(tracing)
        transformed = np.transpose(pca.transform(tracing))

        # trace_ids is already restricted to the window, so it takes idx directly
        label = self.labels[self.trace_ids[idx]]

        return transformed, label

In [56]:
# Same files and window as the plain dataset, served through the PCA variant
pca_dataset = ECG_Dataset_PCA(
    tracings_file_path='../data/train_dccweek2023.h5',
    labels_file_path='../data/train_dccweek2023-labels.csv',
    start=0,
    end=-1,
)

In [57]:
# Fetch one item to sanity-check the (projection, label) pair.
# NOTE(review): in the recorded output each row holds a single repeated value,
# which suggests this tracing is flat (e.g. all padding) — confirm before
# relying on the projection for training.
pca_dataset[1]

(array([[ 0.00643325,  0.00643325,  0.00643325, ...,  0.00643325,
          0.00643325,  0.00643325],
        [-0.00280107, -0.00280107, -0.00280107, ..., -0.00280107,
         -0.00280107, -0.00280107]], dtype=float32),
 0)