In [1]:
import os
import torch
import numpy as np
from sklearn.utils import shuffle

In [2]:
# Directory (relative to this notebook) that is scanned for the per-dataset .npz archives.
folder = '../../Datasets/'

In [3]:
# Keep only the expected dataset archives found in `folder`, sorted by file name.
wanted_archives = ("SEED.npz", "DEAP.npz", "DREAMER.npz", "SEED_IV.npz")
files = [name for name in sorted(os.listdir(folder)) if name in wanted_archives]

# 1. Loading all datasets

In [4]:
# Load every archive into three parallel lists (one entry per dataset):
#   X - the 'X' array, Y - the 'Y' array, S - the 'subject' array cast to int.
X, Y, S = [], [], []
for file in files:
    print(f"loading {file}")
    # np.load on an .npz returns a lazily-read NpzFile that keeps the file open;
    # the context manager closes it once the arrays have been read into memory.
    with np.load(os.path.join(folder, file)) as dataset:
        X.append(dataset['X'])
        Y.append(dataset['Y'])
        S.append(dataset['subject'].astype(int))

loading DEAP.npz
loading DREAMER.npz
loading SEED.npz
loading SEED_IV.npz


# 2. Analyzing the number of samples

In [5]:
# Per-dataset, per-subject class balance report.
# MIN_SAMPLES_PER_CLASS[d][i] is the size of the smallest class (-1 / 0 / 1) of the
# i-th entry of np.unique(S[d]) in dataset d.
MIN_SAMPLES_PER_CLASS = []
for file, x, y, s in zip(files, X, Y, S):
    print("_____________________________")
    print(f" ######### {file} ######### ")
    print(" S | Negative | Neutral | Positive | Total | Least per Class")
    min_samples_per_class = []
    for subjectID in np.unique(s):
        # Build the subject mask once and count each class once (vectorised),
        # instead of re-summing the same boolean arrays for every use.
        subject_mask = s == subjectID
        subject_labels = y[subject_mask]
        n_negative, n_neutral, n_positive = (
            int(np.count_nonzero(subject_labels == label)) for label in (-1, 0, 1)
        )
        n_total = int(np.count_nonzero(subject_mask))
        least = min(n_negative, n_neutral, n_positive)
        min_samples_per_class.append(least)
        print(f"{int(subjectID) :02d} |   {n_negative :05d}  |  {n_neutral :05d}  |  {n_positive :05d}   | {n_total}  | {least}")
    MIN_SAMPLES_PER_CLASS.append(min_samples_per_class)

_____________________________
 ######### DEAP.npz ######### 
 S | Negative | Neutral | Positive | Total | Least per Class
01 |   00638  |  00174  |  00348   | 1160  | 174
02 |   00551  |  00145  |  00464   | 1160  | 145
03 |   00522  |  00638  |  00000   | 1160  | 0
04 |   00725  |  00174  |  00261   | 1160  | 174
05 |   00493  |  00348  |  00319   | 1160  | 319
06 |   00319  |  00493  |  00348   | 1160  | 319
07 |   00435  |  00348  |  00377   | 1160  | 348
08 |   00493  |  00348  |  00319   | 1160  | 319
09 |   00522  |  00319  |  00319   | 1160  | 319
10 |   00551  |  00261  |  00348   | 1160  | 261
11 |   00580  |  00377  |  00203   | 1160  | 203
12 |   00638  |  00087  |  00435   | 1160  | 87
13 |   00725  |  00058  |  00377   | 1160  | 58
14 |   00580  |  00232  |  00348   | 1160  | 232
15 |   00609  |  00319  |  00232   | 1160  | 232
16 |   00754  |  00261  |  00145   | 1160  | 145
17 |   00522  |  00348  |  00290   | 1160  | 290
18 |   00435  |  00348  |  00377   | 1160  | 348


Total samples:

samples_per_subject_per_class * n_subjects * n_classes * n_datasources

= 93 * 15 * 3 * 4

= 16740

Samples per datasource: 4185


Test set: 3 subjects per datasource: 3 subjects * 3 classes * 93 samples (* 4 datasources) = 837 (3348)


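Precision (smallest resolvable accuracy step, i.e. 1 / test-set size): 1/837 ≈ 0.12 % per datasource (1/3348 ≈ 0.03 % over all datasources)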

# 3. Sampling

In [6]:
# Number of subjects drawn from each dataset when sampling.
n_subjects = 15

In [7]:
# Guard: every dataset must contain at least `n_subjects` distinct subject IDs.
# Raises ValueError naming the offending dataset otherwise.
for file, s in zip(files, S):
    n_available = len(np.unique(s))
    if n_available < n_subjects:
        raise ValueError(
            f"{file} holds samples of only {n_available} subjects, fewer than the {n_subjects} you want to use. "
            "Try again with a lower value of `n_subjects`."
        )

In [8]:
# For each dataset take the n_subjects-th largest "smallest class size"; the minimum of
# these over all datasets is the per-subject, per-class sample count used for sampling.
per_dataset_budget = [sorted(counts)[-n_subjects] for counts in MIN_SAMPLES_PER_CLASS]
n_samples_per_subject_per_class = min(per_dataset_budget)

In [9]:
# Draw a class-balanced subset from every dataset: `n_subjects` subjects, and for each of
# them `n_samples_per_subject_per_class` samples of every class (-1, 0, 1).
subject_rng = np.random.default_rng(77)  # seeded so the subject selection is reproducible
X_SAMPLED, Y_SAMPLED, S_SAMPLED = [], [], []
for x, y, s, min_samples_per_class in zip(X, Y, S, MIN_SAMPLES_PER_CLASS):
    # min_samples_per_class was filled while iterating np.unique(s), so entry n belongs to
    # subject_ids[n]. Map through the real IDs instead of assuming they are exactly 1..N.
    subject_ids = np.unique(s)
    eligible_subject_IDs = [
        subject_ids[n]
        for n in np.argsort(min_samples_per_class)
        if min_samples_per_class[n] >= n_samples_per_subject_per_class
    ]
    used_subjectIDs = subject_rng.choice(eligible_subject_IDs, n_subjects, replace=False)
    # Shuffle once so that the first k hits per (subject, class) form a random subset.
    x, y, s = shuffle(x, y, s, random_state=77)
    x_sampled, y_sampled, s_sampled = [], [], []
    for subjectID in used_subjectIDs:
        for class_idx in [-1, 0, 1]:
            # Compute the (subject, class) selection once and reuse it for x, y and s.
            selected = np.flatnonzero((s == subjectID) & (y == class_idx))[:n_samples_per_subject_per_class]
            x_sampled.append(x[selected])
            y_sampled.append(y[selected])
            s_sampled.append(s[selected])
    X_SAMPLED.append(np.concatenate(x_sampled))
    Y_SAMPLED.append(np.concatenate(y_sampled))
    S_SAMPLED.append(np.concatenate(s_sampled))

In [10]:
# Class balance report for the sampled subsets (same table layout as the raw-data report).
for file, x_sampled, y_sampled, s_sampled in zip(files, X_SAMPLED, Y_SAMPLED, S_SAMPLED):
    print("_____________________________")
    print(f" ######### {file} ######### ")
    print(" S | Negative | Neutral | Positive | Total | Least per Class")
    for subjectID in np.unique(s_sampled):
        # Build the subject mask once and count each class once (vectorised).
        subject_mask = s_sampled == subjectID
        subject_labels = y_sampled[subject_mask]
        n_negative, n_neutral, n_positive = (
            int(np.count_nonzero(subject_labels == label)) for label in (-1, 0, 1)
        )
        n_total = int(np.count_nonzero(subject_mask))
        least = min(n_negative, n_neutral, n_positive)
        print(f"{int(subjectID) :02d} |   {n_negative :05d}  |  {n_neutral :05d}  |  {n_positive :05d}   | {n_total}  | {least}")

_____________________________
 ######### DEAP.npz ######### 
 S | Negative | Neutral | Positive | Total | Least per Class
02 |   00093  |  00093  |  00093   | 279  | 93
05 |   00093  |  00093  |  00093   | 279  | 93
07 |   00093  |  00093  |  00093   | 279  | 93
08 |   00093  |  00093  |  00093   | 279  | 93
10 |   00093  |  00093  |  00093   | 279  | 93
15 |   00093  |  00093  |  00093   | 279  | 93
18 |   00093  |  00093  |  00093   | 279  | 93
20 |   00093  |  00093  |  00093   | 279  | 93
22 |   00093  |  00093  |  00093   | 279  | 93
26 |   00093  |  00093  |  00093   | 279  | 93
27 |   00093  |  00093  |  00093   | 279  | 93
28 |   00093  |  00093  |  00093   | 279  | 93
29 |   00093  |  00093  |  00093   | 279  | 93
30 |   00093  |  00093  |  00093   | 279  | 93
32 |   00093  |  00093  |  00093   | 279  | 93
_____________________________
 ######### DREAMER.npz ######### 
 S | Negative | Neutral | Positive | Total | Least per Class
02 |   00093  |  00093  |  00093   | 279  | 93
0

# 4. Ensure that samples at the same index across data sources share the same label [Optional]

In [52]:
#shuffled_indices = torch.randperm(len(X_SAMPLED[0]))
#for datasource_idx, (file, x_sampled,y_sampled,s_sampled) in enumerate(zip(files, X_SAMPLED,Y_SAMPLED,S_SAMPLED)):
#    sorted_indices = np.argsort(y_sampled)
#    X_SAMPLED[datasource_idx] = x_sampled[sorted_indices][shuffled_indices]
#    Y_SAMPLED[datasource_idx] = y_sampled[sorted_indices][shuffled_indices]
#    S_SAMPLED[datasource_idx] = s_sampled[sorted_indices][shuffled_indices]
#
#
## --> Intentionally disabled: aligning labels by index is only possible if we have labeled data across ALL data sources

# 5. Verifications

In [42]:
# Spot-check per data source: randomly chosen sampled examples must be found in the
# original arrays with the same label and the same subject ID.
check_rng = np.random.default_rng(77)  # seeded so the same examples are checked on every run
for dataset_idx, (x_sampled, y_sampled, s_sampled, x, y, s) in enumerate(zip(X_SAMPLED, Y_SAMPLED, S_SAMPLED, X, Y, S)):
    print(f"Analyzing dataset number {dataset_idx}")
    # Never ask for more examples than exist.
    n_checks = min(10, x_sampled.shape[0])
    exemplary_sample_IDs = check_rng.choice(x_sampled.shape[0], size=n_checks, replace=False)
    # Compare over all non-sample axes, whatever the dimensionality of x is.
    feature_axes = tuple(range(1, x.ndim))
    for exemplary_x, exemplary_y, exemplary_s in zip(x_sampled[exemplary_sample_IDs], y_sampled[exemplary_sample_IDs], s_sampled[exemplary_sample_IDs]):
        idx = (x == exemplary_x).all(axis=feature_axes).nonzero()[0]
        # 1. the sampled example must exist in the original data
        assert len(idx) >= 1, f"dataset {dataset_idx}: sampled example not found in the original data"
        # 2./3. identical examples may occur more than once in the original data; instead of
        # skipping those, require that at least one occurrence carries the sampled label AND subject id
        assert np.any((y[idx] == exemplary_y) & (s[idx] == exemplary_s)), (
            f"dataset {dataset_idx}: label/subject mismatch for a sampled example "
            f"(label {exemplary_y}, subject {exemplary_s})"
        )

Analyzing dataset number 0
Analyzing dataset number 1
Analyzing dataset number 2
Analyzing dataset number 3


# 6. Split-off Test-Set

Hold out all samples of 3 randomly chosen subjects per dataset as the test set.

In [284]:
# Empty per-dataset containers for the train/test split (six independent lists).
X_train, Y_train, S_train = ([] for _ in range(3))
X_test, Y_test, S_test = ([] for _ in range(3))

In [313]:
# Candidate test subjects: the distinct subject IDs present in each sampled dataset.
ELIGIBLE_TEST_SUBJECTS = []
for sampled_subject_ids in S_SAMPLED:
    ELIGIBLE_TEST_SUBJECTS.append(np.unique(sampled_subject_ids).tolist())

In [316]:
# Pick 3 distinct test subjects per dataset (sorted by ID).
# A seeded generator keeps the train/test split identical across kernel restarts.
test_subject_rng = np.random.default_rng(77)
TEST_SUBJECTS = [
    sorted(test_subject_rng.choice(eligible_test_subjects, 3, replace=False))
    for eligible_test_subjects in ELIGIBLE_TEST_SUBJECTS
]

In [347]:
# Subject-wise split: every sample of a test subject goes to the test set, all others to train.
# The containers are (re-)created here so that re-running this cell cannot append the split twice.
X_train, Y_train, S_train = [], [], []
X_test, Y_test, S_test = [], [], []
for x, y, s, test_subjects in zip(X_SAMPLED, Y_SAMPLED, S_SAMPLED, TEST_SUBJECTS):
    # True where the sample belongs to any of the test subjects (works for any number of them).
    is_test = np.isin(s, test_subjects)
    is_train = ~is_test
    X_train.append(x[is_train])
    Y_train.append(y[is_train])
    S_train.append(s[is_train])
    X_test.append(x[is_test])
    Y_test.append(y[is_test])
    S_test.append(s[is_test])

# 7. Insights

In [359]:
# Shape of the test features of the first data source.
X_test[0].shape

(837, 32, 256)

# 8. Save Dataset

In [368]:
# Persist the training split (one list entry per data source) in a single file.
# NOTE(review): the raw archives are read from '../../Datasets/' but this writes to
# '../Datasets/' - confirm the target directory is intended.
train_payload = dict(X_train=X_train, Y_train=Y_train, S_train=S_train)
torch.save(train_payload, '../Datasets/multisource_train.pt')

In [369]:
# Persist the test split (one list entry per data source) in a single file.
# NOTE(review): the raw archives are read from '../../Datasets/' but this writes to
# '../Datasets/' - confirm the target directory is intended.
test_payload = dict(X_test=X_test, Y_test=Y_test, S_test=S_test)
torch.save(test_payload, '../Datasets/multisource_test.pt')

In [371]:
# Read the training file back so the saved content can be compared with what is in memory.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True, which may
# reject the pickled NumPy arrays stored here - confirm against the installed version.
verify_train_data = torch.load('../Datasets/multisource_train.pt')
verify_X_train, verify_Y_train = (verify_train_data[key] for key in ('X_train', 'Y_train'))

In [379]:
# Round-trip check over EVERY data source (not just index 2) and over the labels as well:
# evaluates to True only if all reloaded arrays equal the in-memory ones.
(
    len(X_train) == len(verify_X_train)
    and len(Y_train) == len(verify_Y_train)
    and all(np.array_equal(saved, loaded) for saved, loaded in zip(X_train, verify_X_train))
    and all(np.array_equal(saved, loaded) for saved, loaded in zip(Y_train, verify_Y_train))
)

True

: 

In [2]:
# Load the saved training split (a dict with 'X_train' / 'Y_train' / 'S_train' lists).
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True, which may
# reject the pickled NumPy arrays stored in this file - confirm against the installed version.
dataset = torch.load('../Datasets/multisource_train.pt')

In [3]:
# Pull the per-data-source feature and label lists out of the loaded dict.
X_train, Y_train = dataset['X_train'], dataset['Y_train']

In [11]:
# Distinct training labels of data source index 3 and how often each occurs.
np.unique(Y_train[3], return_counts=True)

(array([-1.,  0.,  1.]), array([1116, 1116, 1116]))