In [1]:
%load_ext autoreload
%autoreload 2

In [12]:
import sys
sys.path.insert(0, "../../")

from hydra import compose, initialize
import pytorch_lightning as L
import torch
import numpy as np
from torch.utils.data import WeightedRandomSampler

from helpers.dataset import get_datasets

### Obtain mean and std of the training set

In [3]:
# Compose the Hydra config `train.yaml` from the config directory,
# overriding the `username` key for this run.
cfg_dir_path = "../../configs"
cfg_overrides = ["username=cizinsky"]

with initialize(version_base="1.1", config_path=cfg_dir_path):
    cfg = compose(
        config_name="train.yaml",
        overrides=cfg_overrides,
    )

In [74]:
# Seed the random number generators through Lightning, using the seed stored in
# the composed config. The trailing `;` hides the call's return value in the
# cell output.
L.seed_everything(cfg.seed);

Seed set to 42


In [75]:
# Build the two datasets returned by `get_datasets` for the composed config
# (presumably the training and validation splits, judging by the names).
trn_dataset, val_dataset = get_datasets(cfg)

FYI: using the following signal transform: None


In [76]:
# Turn the first element of every training item into a tensor, then stack the
# results along a new leading dimension (one entry per dataset item).
all_trn_samples = [
    torch.tensor(trn_dataset[idx][0]) for idx in range(len(trn_dataset))
]

trn_samples = torch.stack(all_trn_samples, dim=0)
trn_samples.shape

torch.Size([11693, 3000, 19])

In [77]:
# Reduce over the first two dimensions, so one statistic remains per entry of
# the last dimension.
mean = torch.mean(trn_samples, dim=(0, 1))
std = torch.std(trn_samples, dim=(0, 1))

In [78]:
# Show the mean computed over the first two dimensions of `trn_samples`.
mean

tensor([  5.5334,   8.1308,   3.1236,   3.1441,   0.6178,  -1.2991,  -1.3546,
         -2.5935,   4.0302,   8.1645, -20.4061,  -0.3159,   0.1831,  -5.1402,
         -3.1034,   2.0452,   0.2752,  -3.0503,   2.0152], dtype=torch.float64)

In [79]:
# Show the standard deviation computed over the first two dimensions of `trn_samples`.
std

tensor([111.0848, 164.8593, 126.1502,  89.6225,  77.4358, 145.0964, 122.0255,
         83.0889, 151.1107, 143.5339, 335.1986, 100.2232, 141.4831,  93.8744,
         73.3125,  74.9382, 135.6183,  67.7481, 339.8404], dtype=torch.float64)

In [80]:
# Persist both statistics under the data directory for later reuse.
# NOTE(review): the recorded run that produced these values reported
# "signal transform: None" — confirm that matches the intended preprocessing.
for stat, out_path in (
    (mean, "../../data/trn_mean.pt"),
    (std, "../../data/trn_std.pt"),
):
    torch.save(stat, out_path)

In [81]:
# Reference statistics kept for comparison with the values computed above.
# NOTE(review): presumably obtained by a collaborator with a different
# preprocessing pipeline — confirm the provenance before relying on them.
martins_mean = torch.tensor(
    [
        1.9907e-03, -1.1654e-03, 2.0860e-03, -7.4935e-04, 4.5839e-03,
        2.8718e-04, -4.0319e-04, -7.7969e-05, -4.2803e-03, -1.6963e-03,
        1.2549e-03, -2.8467e-04, 2.5901e-04, 6.5623e-03, -3.3983e-03,
        1.2480e-03, 1.3762e-03, -1.7694e-03, -5.8233e-03,
    ],
    dtype=torch.float64,
)
martins_std = torch.tensor(
    [
        158.2415, 159.4878, 152.8925, 150.5167, 150.6428,
        148.8908, 153.9944, 153.8446, 152.9122, 153.9926,
        152.8411, 151.9071, 151.2794, 153.7461, 151.1686,
        150.3015, 151.7241, 152.1091, 156.2211,
    ],
    dtype=torch.float64,
)
# Backward-compatible alias: the original cell spelled this name with a double "s".
martinss_std = martins_std

### Weighted sampler

In [None]:
# Load the datasets again under new names for the sampler experiment.
# NOTE(review): the recorded output of this cell reports the `fft_filtering`
# signal transform, while the earlier `get_datasets(cfg)` call reported `None`,
# and this cell has no execution count. The config seems to have changed
# between runs — re-run from a fresh kernel to confirm which transform applies.
dataset_tr, dataset_val = get_datasets(cfg)

FYI: using the following signal transform: fft_filtering


In [8]:
# 1) collect the integer label (second element of each item) for the whole training set
labels = np.array([int(dataset_tr[idx][1]) for idx in range(len(dataset_tr))])

# share of samples whose label is 1 versus the rest
# (the sum equals the positive count only if labels are 0/1 — TODO confirm)
n_labels = len(labels)
pos_percentage = np.sum(labels) / n_labels
neg_percentage = 1 - pos_percentage
print(f"Positive percentage: {pos_percentage:.4f}, Negative percentage: {neg_percentage:.4f}")

Positive percentage: 0.1947, Negative percentage: 0.8053


In [11]:
# 2) count samples per class and invert the counts to get class weights
class_counts = np.bincount(labels, minlength=2)  # entry k = number of samples labelled k
smoothed_counts = class_counts + 1e-8            # small epsilon avoids dividing by zero
class_weights = 1.0 / smoothed_counts            # rarer class -> larger weight

# 3) look up every example's weight through its label
sample_weights = np.take(class_weights, labels)  # one weight per entry of `labels`
class_weights, len(sample_weights), sample_weights[:10]

(array([0.0001062 , 0.00043917]),
 11693,
 array([0.0001062, 0.0001062, 0.0001062, 0.0001062, 0.0001062, 0.0001062,
        0.0001062, 0.0001062, 0.0001062, 0.0001062]))

In [13]:
# 4) build the sampler: it draws as many indices as there are samples, with
#    replacement, using the per-sample weights as (unnormalised) probabilities
n_draws = len(sample_weights)
sampler = WeightedRandomSampler(
    weights=torch.tensor(sample_weights, dtype=torch.float64),
    num_samples=n_draws,
    replacement=True,
)