In [1]:

from utils.path_utils import project_root

import os

import torch

import numpy as np
import pandas as pd

import tqdm


In [2]:

def csv_to_pt(max_time_step=336):
    """
    Read every per-patient CSV and stack them into zero-padded tensors.

    Files are read from `<project_root>/data/pt_files` in sorted file-name order and are paired
    positionally with the rows of `data/processed/lengths.txt` and `data/processed/is_sepsis.txt`.
    Each patient table is zero-padded along the row axis up to `max_time_step` rows.

    Args:
        max_time_step: number of rows every patient table is padded to. A file with more rows
            than this is rejected.
    Returns:
        A tuple `(all_patients, lengths, is_sepsis)` where
        `all_patients` is a dict with
            'samples': float32 tensor of shape (n_patients, max_time_step, n_columns)
            'labels':  float32 tensor of shape (n_patients,), taken from the first column of is_sepsis.txt
        and `lengths` / `is_sepsis` are the raw 2-D numpy arrays read from the two text files.
    Raises:
        ValueError: if a patient file has more than `max_time_step` rows.
    """
    patient_dir = os.path.join(project_root(), 'data', 'pt_files')
    patient_files = sorted(os.listdir(patient_dir))

    processed_dir = os.path.join(project_root(), 'data', 'processed')
    lengths = pd.read_csv(os.path.join(processed_dir, 'lengths.txt'), header=None).values
    is_sepsis = pd.read_csv(os.path.join(processed_dir, 'is_sepsis.txt'), header=None).values

    samples, labels = [], []

    records = zip(patient_files, lengths, is_sepsis)
    for file_name, length, sepsis in tqdm.tqdm(records,
                                               desc="Converting csv to .pt format: ",
                                               total=len(patient_files)):

        frame = pd.read_csv(os.path.join(patient_dir, file_name))
        n_rows = len(frame)

        # Check BEFORE padding: after padding every table has exactly max_time_step rows, so a
        # post-padding check can never fire, and np.pad itself fails on a negative pad width.
        if n_rows > max_time_step:
            raise ValueError(f"Patient file {file_name} has {n_rows} rows, more than "
                             f"max_time_step={max_time_step} (recorded length: {length})")

        padded = np.pad(frame, pad_width=((0, max_time_step - n_rows), (0, 0)),
                        mode='constant').astype(np.float32)

        samples.append(torch.from_numpy(padded).unsqueeze(0))
        labels.append(torch.tensor(sepsis[0], dtype=torch.float32).unsqueeze(0))

    print('samples: ', type(samples), 'labels: ', type(labels))

    all_patients = {'samples': torch.cat(samples, dim=0), 'labels': torch.cat(labels, dim=0)}

    return all_patients, lengths, is_sepsis

# Build the padded sample/label tensors once; later cells reuse these three notebook globals.
all_patients, lengths, is_sepsis = csv_to_pt()


Converting csv to .pt format: 100%|██████████| 20336/20336 [00:49<00:00, 407.71it/s]


samples:  <class 'list'> labels:  <class 'list'>


# Masking the Original Time Series

In [3]:

import math

def geom_noise_mask_single(L, lm, masking_ratio):
    """
    Randomly create a boolean mask of length `L`, consisting of subsequences of average length lm, masking with 0s a
    `masking_ratio` proportion of the sequence L. The lengths of masked and unmasked streaks follow geometric
    distributions (the mask is the trace of a two-state Markov chain).
    Args:
        L: length of mask and sequence to be masked
        lm: average length of masking subsequences (streaks of 0s)
        masking_ratio: proportion of L to be masked. A value >= 1 masks everything.
    Returns:
        (L, ) boolean numpy array intended to mask ('drop') with 0s a sequence of length L
    """
    if masking_ratio >= 1:
        # The unmasked-streak stop probability below divides by (1 - masking_ratio); a ratio of 1
        # would raise ZeroDivisionError. Masking "all of it" has an obvious answer, so return it.
        return np.zeros(L, dtype=bool)

    keep_mask = np.ones(L, dtype=bool)
    p_m = 1 / lm  # probability of each masking sequence stopping. parameter of geometric distribution.
    # probability of each unmasked sequence stopping. parameter of geometric distribution.
    p_u = p_m * masking_ratio / (1 - masking_ratio)
    p = [p_m, p_u]  # indexed by state: p[0] leaves the masking state, p[1] leaves the unmasked state

    # Start in state 0 with masking_ratio probability
    state = int(np.random.rand() > masking_ratio)  # state 0 means masking, 1 means not masking
    for i in range(L):
        keep_mask[i] = state  # the state id doubles as the mask value (0 -> False/masked, 1 -> True/kept)
        if np.random.rand() < p[state]:
            state = 1 - state

    return keep_mask


def noise_mask(X, masking_ratio=0.25, lm=3, distribution='geometric', exclude_feats=None):
    """
    Creates a random boolean mask of the same shape as X, with 0s at places where a value should be masked.
    Args:
        X: 3-D array/tensor; only `X.shape` is read. The notebook's caller passes it as
            (batch_size, channels, time_steps). Masked streaks run along the LAST axis.
        masking_ratio: proportion of positions to be masked.
        lm: average length of masking subsequences (streaks of 0s). Used only when `distribution` is 'geometric'.
        distribution: one of
            'geometric'   - one Markov-chain mask is drawn over the flattened array and reshaped to X.shape,
                            giving geometrically distributed masked streaks of mean length `lm`
            'masked_tail' - keep the first ceil(T * (1 - masking_ratio)) positions of the last axis, mask the rest
            'masked_head' - mask the first ceil(T * masking_ratio) positions of the last axis, keep the rest
            anything else - every position is an independent Bernoulli draw with P(keep) = 1 - masking_ratio
        exclude_feats: iterable of indices along axis 1 (the feature/channel axis) to be excluded from
            masking (i.e. to remain all 1s). None means no feature is excluded.
    Returns:
        boolean torch tensor with the same shape as X, with 0s at places where a value should be masked
    """
    if distribution == 'geometric':  # stateful (Markov chain)
        mask = geom_noise_mask_single(X.shape[0] * X.shape[1] * X.shape[2], lm, masking_ratio)
        mask = mask.reshape(X.shape[0], X.shape[1], X.shape[2])

    elif distribution == 'masked_tail':
        n_keep = math.ceil(X.shape[2] * (1 - masking_ratio))
        mask = np.zeros(X.shape, dtype=bool)
        mask[:, :, :n_keep] = True  # keep the head of the last axis, drop the tail

    elif distribution == 'masked_head':
        n_drop = math.ceil(X.shape[2] * masking_ratio)
        mask = np.zeros(X.shape, dtype=bool)
        mask[:, :, n_drop:] = True  # drop the head of the last axis, keep the tail

    else:  # each position is independent Bernoulli with p = 1 - masking_ratio
        mask = np.random.choice(np.array([True, False]), size=X.shape, replace=True,
                                p=(1 - masking_ratio, masking_ratio))

    # Previously accepted but silently ignored: force excluded features back to "keep".
    excluded = sorted(set(exclude_feats)) if exclude_feats is not None else []
    if excluded:
        mask[:, excluded, :] = True

    return torch.tensor(mask)

def data_transform_masked4cl(sample, masking_ratio, lm, positive_nums=None, distribution='geometric'):
    """
    Build `positive_nums` independently masked copies of every series in a batch.

    Args:
        sample: 3-D tensor holding one batch. Its last two axes are swapped before masking and
            swapped back afterwards, so the outputs use the same axis order as the input.
            NOTE(review): which of the two axes is time depends on the caller - confirm.
        masking_ratio: proportion of positions to mask (forwarded to `noise_mask`).
        lm: mean masked-streak length (forwarded to `noise_mask`).
        positive_nums: number of masked copies per series; when None it defaults to
            ceil(1.5 / (1 - masking_ratio)).
        distribution: mask sampling scheme (forwarded to `noise_mask`).
    Returns:
        (x_masked, mask): both have first dimension batch_size * positive_nums; `mask` is
        boolean, and `x_masked` is the repeated batch multiplied by the mask (zeros where masked).
    """
    n_views = positive_nums if positive_nums is not None else math.ceil(1.5 / (1 - masking_ratio))

    # Swap the last two axes, then tile the whole batch n_views times along the batch axis.
    tiled = sample.permute(0, 2, 1).repeat(n_views, 1, 1)

    keep = noise_mask(tiled, masking_ratio, lm, distribution=distribution)
    masked = keep * tiled

    # Undo the axis swap so outputs line up with the input layout.
    return masked.permute(0, 2, 1), keep.permute(0, 2, 1)

# data_masked_m, mask = data_transform_masked4cl(all_patients['samples'][:32], 0.5, 3, positive_nums=1, distribution='geometric')


In [4]:
from torch.utils.data import Dataset


class Load_Dataset(Dataset):
    """
    In-memory dataset over a `{'samples': ..., 'labels': ...}` dict.

    On construction the sample/label pairs are shuffled once (with numpy's global RNG), the
    samples are brought to at least 3 dimensions, the last two axes are swapped when the smallest
    dimension is not already axis 1, and the last axis is cropped to `TSlength_aligned`.

    Args:
        dataset: mapping with "samples" and "labels"; iterating each must yield tensors.
        TSlength_aligned: maximum length kept along the last axis.
        training_mode: stored on the instance as `training_mode`; not otherwise used here.
        target_dataset_size: with `subset=True`, only the first `target_dataset_size * 10`
            shuffled samples are kept.
        subset: enable the debugging subset described above.
    """

    def __init__(self, dataset, TSlength_aligned, training_mode, target_dataset_size=64, subset=False):

        super().__init__()
        self.training_mode = training_mode

        # Shuffle samples and labels together so each pair stays aligned.
        pairs = list(zip(dataset["samples"], dataset["labels"]))
        np.random.shuffle(pairs)

        shuffled_x, shuffled_y = zip(*pairs)
        features = torch.stack(list(shuffled_x), dim=0)
        targets = torch.stack(list(shuffled_y), dim=0)

        # Guarantee a trailing axis for 2-D input.
        if features.dim() < 3:
            features = features.unsqueeze(2)

        # Heuristic: the smallest dimension is taken to be the channel axis and moved to position 1.
        smallest_axis = list(features.shape).index(min(features.shape))
        if smallest_axis != 1:
            features = features.permute(0, 2, 1)

        # Align the series length: crop the last axis to TSlength_aligned.
        features = features[:, :, :int(TSlength_aligned)]

        # Subset for debugging: keep only the first target_dataset_size * 10 shuffled samples.
        if subset == True:
            n_kept = target_dataset_size * 10
            features = features[:n_kept]
            targets = targets[:n_kept]

        if isinstance(features, np.ndarray):
            self.x_data = torch.from_numpy(features)
            self.y_data = torch.from_numpy(targets).long()
        else:
            self.x_data, self.y_data = features, targets

        self.len = features.shape[0]

    def __getitem__(self, index):
        """Return the `(sample, label)` pair stored at `index`."""
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        """Number of samples kept after the optional subsetting."""
        return self.len
    

In [6]:
# Wrap the padded patient tensors in a Dataset; the last axis is cropped to 300 positions.
sepsis_data = Load_Dataset(all_patients, TSlength_aligned=300, training_mode='pre_train', 
                           target_dataset_size=64, subset=False)
# Shuffled mini-batches of 32; drop_last discards a final batch that has fewer than 32 samples.
train_loader = torch.utils.data.DataLoader(dataset=sepsis_data, batch_size=32, shuffle=True, 
                                           drop_last=True, num_workers=4)

# Smoke test: pull a single batch (i = samples, j = labels) and stop.
for i, j in train_loader:
    
    break

# Shape of one batch of samples as delivered by the loader.
print(i.shape)


torch.Size([32, 133, 300])


# Args

In [12]:
import argparse

# Command-line style hyper-parameters for the pre-training / fine-tuning experiments.
# Every option has a default, so `args` is fully populated even when no flags are supplied.
parser = argparse.ArgumentParser()

# Current working directory; used below as the default for --home_path.
home_dir = os.getcwd()
parser.add_argument('--run_description', default='run1', type=str, help='Experiment Description')
parser.add_argument('--seed', default=2023, type=int, help='seed value')

# Training mode and dataset names.
# NOTE(review): the dataset defaults name SleepEEG/Epilepsy although this notebook loads the
# sepsis patient tensors built above - confirm whether these options are read anywhere.
parser.add_argument('--training_mode', default='pre_train', type=str, help='pre_train, fine_tune')
parser.add_argument('--pretrain_dataset', default='SleepEEG', type=str,
                    help='Dataset of choice: SleepEEG, FD_A, HAR, ECG')
parser.add_argument('--target_dataset', default='Epilepsy', type=str,
                    help='Dataset of choice: Epilepsy, FD_B, Gesture, EMG')

# Logging, device, and optimisation schedule.
parser.add_argument('--logs_save_dir', default='experiments_logs', type=str, help='saving directory')
parser.add_argument('--device', default='cuda', type=str, help='cpu or cuda')
parser.add_argument('--home_path', default=home_dir, type=str, help='Project home directory')
parser.add_argument('--subset', action='store_true', default=False, help='use the subset of datasets')
parser.add_argument('--log_epoch', default=5, type=int, help='print loss and metrix')
parser.add_argument('--draw_similar_matrix', default=10, type=int, help='draw similarity matrix')
parser.add_argument('--pretrain_lr', default=0.0001, type=float, help='pretrain learning rate')
parser.add_argument('--lr', default=0.0001, type=float, help='learning rate')
parser.add_argument('--use_pretrain_epoch_dir', default=None, type=str,
                    help='choose the pretrain checkpoint to finetune')
parser.add_argument('--pretrain_epoch', default=10, type=int, help='pretrain epochs')
parser.add_argument('--finetune_epoch', default=300, type=int, help='finetune epochs')

# Masking hyper-parameters; model_pretrain forwards these to data_transform_masked4cl.
parser.add_argument('--masking_ratio', default=0.5, type=float, help='masking ratio')
parser.add_argument('--positive_nums', default=3, type=int, help='positive series numbers')
parser.add_argument('--lm', default=3, type=int, help='average masked lenght')

parser.add_argument('--finetune_result_file_name', default="finetune_result.json", type=str,
                    help='finetune result json name')
parser.add_argument('--temperature', type=float, default=0.2, help='temperature')

# parse_known_args returns the parsed namespace plus a list of unrecognised arguments instead of
# exiting on them - presumably so that the notebook kernel's own argv does not abort parsing.
args, unknown = parser.parse_known_args()


In [80]:
class Config:
    """
    Hyper-parameter bundle handed to the TFC model constructor and to the Adam optimizer setup.

    All values are plain instance attributes set in `__init__`.
    """

    def __init__(self):
        # Pre-training (encoder) settings
        self.input_channels, self.kernel_size, self.stride = 133, 3, 3
        self.dropout = 0.2
        self.final_out_channels, self.CNNoutput_channel = 2, 10

        # Optimizer: the notebook passes these as Adam's betas=(beta1, beta2)
        self.beta1, self.beta2 = 0.9, 0.99
    

# Model Pre-Training

In [81]:
def model_pretrain(model, model_optimizer, model_scheduler, train_loader, configs, args, device):
    """
    Run one pre-training epoch over `train_loader` and return the mean losses.

    For every batch the original series are concatenated with their masked copies, the model is
    called with `pretrain=True`, and the returned total loss is back-propagated. The scheduler is
    stepped once after the last batch.

    Previously the function returned from inside the loop after the first forward pass, so no
    gradient step or scheduler step was ever taken; the training steps are restored here.

    Args:
        model: module whose call `model(x, pretrain=True)` returns `(loss, loss_cl, loss_rb)`.
        model_optimizer: optimizer over the model parameters.
        model_scheduler: learning-rate scheduler, stepped once per call.
        train_loader: iterable yielding `(data, labels)` batches; labels are not used here.
        configs: unused; kept so existing callers keep working.
        args: namespace providing `masking_ratio`, `lm` and `positive_nums`.
        device: device the concatenated batch is moved to before the forward pass.
    Returns:
        (total_loss, total_cl_loss, total_rb_loss): scalar tensors holding the per-batch means
        of the three losses (NaN if the loader yields no batch).
    """
    total_loss = []
    total_cl_loss = []
    total_rb_loss = []

    model.train()
    for data, _labels in train_loader:

        model_optimizer.zero_grad()

        # Masked views come back in the same axis order as `data`, with
        # batch_size * positive_nums rows (masking is done on the loader's device-free tensors).
        data_masked_m, _mask = data_transform_masked4cl(data, args.masking_ratio, args.lm, args.positive_nums)

        # Originals first, masked copies after them, along the batch axis.
        data_masked_om = torch.cat([data, data_masked_m], 0).float().to(device)

        # Produce embeddings of original and masked samples (data_masked_om = data samples + masked samples)
        loss, loss_cl, loss_rb = model(data_masked_om, pretrain=True)

        loss.backward()
        model_optimizer.step()

        total_loss.append(loss.item())
        total_cl_loss.append(loss_cl.item())
        total_rb_loss.append(loss_rb.item())

    total_loss = torch.tensor(total_loss).mean()
    total_cl_loss = torch.tensor(total_cl_loss).mean()
    total_rb_loss = torch.tensor(total_rb_loss).mean()

    # One scheduler step per epoch, after the optimizer steps.
    model_scheduler.step()

    return total_loss, total_cl_loss, total_rb_loss


In [82]:
from simmtm.model import TFC

configs = Config()

# Build the model on the device chosen by --device (default 'cuda') instead of a hard-coded
# string, so the model and the batches moved inside model_pretrain always agree.
model = TFC(configs, args).to(args.device)
# print(model)

# Pre-Training
# Adam over all model parameters; the cosine schedule spans the configured number of pre-training epochs.
params_group = [{'params': model.parameters()}]
model_optimizer = torch.optim.Adam(params_group, lr=args.pretrain_lr, betas=(configs.beta1, configs.beta2),
                                   weight_decay=0)
model_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer=model_optimizer, T_max=args.pretrain_epoch)
loss, loss_cl, loss_rb = model_pretrain(model=model, model_optimizer=model_optimizer, model_scheduler=model_scheduler,
                                        train_loader=train_loader, configs=configs, args=args, device=args.device)



RuntimeError: mat1 and mat2 shapes cannot be multiplied (128x450 and 30x256)

In [83]:
model

TFC(
  (conv_block1): Sequential(
    (0): Conv1d(133, 32, kernel_size=(3,), stride=(3,), padding=(1,), bias=False)
    (1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool1d(kernel_size=2, stride=2, padding=1, dilation=1, ceil_mode=False)
    (4): Dropout(p=0.2, inplace=False)
  )
  (conv_block2): Sequential(
    (0): Conv1d(32, 64, kernel_size=(8,), stride=(1,), padding=(4,), bias=False)
    (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool1d(kernel_size=2, stride=2, padding=1, dilation=1, ceil_mode=False)
  )
  (conv_block3): Sequential(
    (0): Conv1d(64, 30, kernel_size=(8,), stride=(1,), padding=(4,), bias=False)
    (1): BatchNorm1d(30, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool1d(kernel_size=2, stride=2, padding=1, dilation=1, ceil_mode=False)
  )
  (dense): Sequential(
    (0): Linear(in_feat

# SleepEEG

In [76]:
# from simmtm.config_files.SleepEEG_Configs import Config 
# 
# original_configs = Config()
# model = TFC(original_configs, args)
# 
# model

In [None]:
# datasetpath = os.path.join(project_root(), 'data', 'simmtm_datasets', 'datasets', 'classification', 'dataset', 'Gesture', 'train.pt')
# print(datasetpath)
# 
# samples = torch.load(datasetpath)['samples']
# labels = torch.load(datasetpath)['labels']
# 
# print(type(samples), samples.shape)
# print(type(labels), labels.shape)

# data=Load_Dataset(dataset=torch.load(datasetpath), TSlength_aligned=178, training_mode='', target_dataset_size=64, subset=False)
# train_loader = torch.utils.data.DataLoader(dataset=data, batch_size=32, shuffle=True, 
#                                            drop_last=True, num_workers=0)
# for i, j in train_loader:
#     break
#     
# print(i.shape)
