# Track 1

In [1]:
import os
import sys
import copy
import torch
import wandb
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.notebook import tqdm
from sklearn.impute import KNNImputer
from monai.config import KeysCollection
from torch.nn import ConstantPad1d, ReplicationPad1d
from monai.transforms import Randomizable, MapTransform, Transform
from monai.data import CacheDataset, DataLoader, DistributedSampler
from torch.utils.data import RandomSampler, SequentialSampler, BatchSampler




In [6]:
# Accepted (lower, upper) bounds, both inclusive, for every monitored column.
valid_range = {
    "acc_X": (-19.6, 19.6),
    "acc_Y": (-19.6, 19.6),
    "acc_Z": (-19.6, 19.6),
    "gyr_X": (-573, 573),
    "gyr_Y": (-573, 573),
    "gyr_Z": (-573, 573),
    "heartRate": (0, 255),
    "rRInterval": (0, 2000),
}


def validate(window):
    """Return the fraction of rows in ``window`` that are entirely in range.

    A row counts as invalid as soon as one of the columns listed in
    ``valid_range`` falls below its lower bound or above its upper bound.

    Args:
        window: DataFrame containing every column named in ``valid_range``.

    Returns:
        float in [0, 1]: ``1 - invalid_rows / total_rows``.

    Raises:
        ZeroDivisionError: if ``window`` has no rows.
    """
    flagged = pd.Series(False, window.index)
    for column, (lower, upper) in valid_range.items():
        values = window[column]
        too_low = values < lower
        too_high = values > upper
        flagged = flagged | too_low | too_high
    n_invalid = int(flagged.sum())
    return 1 - (n_invalid / len(window))

def get_observations(root_dir, user, split, sample, w_size_h, w_stride_h, val_percentage, drop_short_sequences=True):
    """Slice one recording into fixed-size windows and describe each of them.

    Args:
        root_dir: dataset root; ``data.csv`` is read from below this folder.
        user: user folder name of the form ``"<prefix>_<int>"``; ignored when
            ``split == 'test'``.
        split: ``'train'``, ``'val'`` or ``'test'``.
        sample: name of the recording folder.
        w_size_h: window length in hours.
        w_stride_h: distance between consecutive window starts, in hours.
        val_percentage: minimum fraction of in-range rows (see ``validate``)
            for a window to be flagged ``valid``.
        drop_short_sequences: when true, training recordings shorter than one
            window are discarded instead of being kept as a single window.

    Returns:
        list of dicts with keys ``data_file``, ``step_file`` (paths relative
        to ``root_dir``), ``label``, ``valid``, ``start_data_row`` and
        ``end_data_row`` (end excluded). ``label`` is ``-1`` for the test
        split, where the user has to be predicted.
    """
    if split == 'test':
        sample_dir = Path('test_data')/Path(sample)
        user_id = -1 # to be predicted
    else:
        sample_dir = Path('training_data')/Path(user)/Path(split)/Path(sample)
        user_id = int(user.split("_")[1])
    data_file = sample_dir/"data.csv"
    step_file = sample_dir/"step.csv"

    # hours -> rows: the code assumes 12 rows per minute
    w_size = int(w_size_h*60*12)
    w_stride = int(w_stride_h*60*12)
    data = pd.read_csv(Path(root_dir)/data_file)

    if len(data) < w_size:
        if split == 'train' and drop_short_sequences:
            return []
        # Short recordings are kept whole for validation and test
        validity = validate(data)
        return [{
            'data_file' : data_file,
            'step_file' : step_file,
            'label' : user_id,
            'valid' : validity >= val_percentage,
            'start_data_row' : 0,
            'end_data_row' : len(data)
        }]

    obs = []
    # "+ 1" keeps the window that ends exactly on the last row; without it a
    # recording of exactly w_size rows would produce no observation at all.
    for start in range(0, len(data)-w_size+1, w_stride):
        stop = start + w_size # excluded
        window = data.loc[start:stop-1] # .loc upper bound is included
        validity = validate(window)
        obs.append({
            'data_file' : data_file,
            'step_file' : step_file,
            'label' : user_id,
            'valid' : validity >= val_percentage,
            'start_data_row' : start,
            'end_data_row' : stop
        })
    return obs
    
def create_dataset_list(root_dir, split='train', w_size_h=3, w_stride_h=3, val_percentage=2.5/3, drop_short_sequences=True):
    """Collect the window descriptions of every recording in a split.

    Args:
        root_dir: dataset root (``str`` or ``Path``) holding ``training_data``
            and ``test_data``.
        split: ``'train'``, ``'val'`` or ``'test'``.
        w_size_h, w_stride_h, val_percentage, drop_short_sequences: forwarded
            unchanged to ``get_observations``.

    Returns:
        list of observation dicts, in directory-listing order.
    """
    # Accept plain strings as well: the "/" joins below need a Path.
    root_dir = Path(root_dir)
    dataset_list = []

    if split == 'test':
        # test recordings are not grouped by user
        for sample in tqdm(os.listdir(root_dir/'test_data')):
            dataset_list += get_observations(root_dir, -1, split, sample, w_size_h, w_stride_h, val_percentage, drop_short_sequences)
        return dataset_list

    for user in tqdm(os.listdir(root_dir/'training_data')):
        split_dir = root_dir/'training_data'/user/split
        for sample in tqdm(os.listdir(split_dir)):
            dataset_list += get_observations(root_dir, user, split, sample, w_size_h, w_stride_h, val_percentage, drop_short_sequences)

    return dataset_list

def _create_offsets(x):
    if len(x[x.valid]) == 0:
        return list(zip(x.start_data_row, x.end_data_row))
    return list(zip(x[x.valid].start_data_row, x[x.valid].end_data_row))

def save_dataset(root_dir, output_dir, split='train', w_size_h=3, w_stride_h=3, val_percentage=2.5/3, drop_short_sequences=True):
    """Build the window index of a split and write it to ``<split>_dataset.csv``.

    For ``split == 'train'`` the CSV has one row per window. For any other
    split the windows are grouped per recording: one row per ``data_file``
    with an ``offsets`` list of ``(start, end)`` pairs and ``valid`` forced
    to ``True``.

    Args:
        root_dir: dataset root.
        output_dir: destination folder (``str`` or ``Path``); created if it
            does not exist.
        split, w_size_h, w_stride_h, val_percentage, drop_short_sequences:
            forwarded to ``create_dataset_list``.
    """
    dataset_list = create_dataset_list(root_dir, split, w_size_h, w_stride_h, val_percentage, drop_short_sequences)
    dataset = pd.DataFrame(dataset_list)
    if split != 'train':
        # One record per recording. _create_offsets falls back to every
        # window when none is valid, so no recording ends up without offsets.
        records = [
            {
                'data_file': group.data_file.iloc[0],
                'step_file': group.step_file.iloc[0],
                'label' : group.label.iloc[0],
                'valid' : True,
                'offsets': _create_offsets(group),
            }
            for _, group in dataset.groupby('data_file')
        ]
        dataset = pd.DataFrame.from_records(records)
    # to_csv does not create missing folders
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    dataset.to_csv(output_dir/f"{split}_dataset.csv")

def compute_metrics():
    """Run one sweep trial: log how balanced the training labels are.

    Reads ``w_size_h``, ``w_stride_h`` and ``val_percentage`` from
    ``wandb.config``, builds the training window list from the module-level
    ``root_dir``, keeps the valid windows and logs:

    * ``min_count``: number of windows of the least represented label;
    * ``l2``: Euclidean distance between the label distribution and the
      uniform distribution over the labels that are present.

    Configurations with a stride larger than the window are not evaluated;
    they are logged with ``l2 = 1000`` and ``min_count = 0``.
    """
    # init wandb run (populates wandb.config)
    wandb.init()
    split = 'train'
    w_size_h = wandb.config.w_size_h
    w_stride_h = wandb.config.w_stride_h
    val_percentage = wandb.config.val_percentage

    if w_stride_h > w_size_h:
        wandb.log({
            'w_size_h' : w_size_h,
            'w_stride_h' : w_stride_h,
            'val_percentage' : 0,
            'min_count' : 0,
            'l2' : 1000
        })
        return

    dataset_list = create_dataset_list(root_dir, split, w_size_h, w_stride_h, val_percentage)
    dataset = pd.DataFrame(dataset_list)

    # keep valid windows only
    dataset = dataset[dataset.valid]

    # per-label window counts, ordered by label
    count_distribution = dataset.label.value_counts().sort_index().to_numpy()
    min_count = min(count_distribution)
    num_samples = sum(count_distribution)
    distribution = count_distribution / num_samples

    # Uniform reference: every label has probability 1 / number_of_labels.
    # (The previous code used the number of labels itself as probability.)
    num_labels = len(count_distribution)
    uniform = np.full(num_labels, 1 / num_labels)
    l2 = np.linalg.norm(uniform - distribution)

    wandb.log({
        'w_size_h' : w_size_h,
        'w_stride_h' : w_stride_h,
        'val_percentage' : val_percentage,
        'min_count' : min_count,
        'l2' : l2
    })

def run_sweep():
    """Register a grid sweep on Weights & Biases and run it in this process.

    Every combination of window size, window stride and validity threshold
    is evaluated by ``compute_metrics``; the sweep metric is ``l2``
    (minimized).
    """
    project = 'spgc_data_distribution'

    # values explored for each parameter
    search_space = {
        'w_size_h': {'values': [1, 3, 6, 9, 12, 15, 18]},
        'w_stride_h': {'values': [1, 3, 6, 9]},
        'val_percentage': {'values': [0.75, 2.5/3, 0.9]},
    }
    sweep_configuration = {
        'method': 'grid',
        'name': 'distribution_sweep',
        'metric': {'goal': 'minimize', 'name': 'l2'},
        'parameters': search_space,
    }

    # create the sweep, then start an agent that executes its trials
    sweep_id = wandb.sweep(sweep=sweep_configuration, project=project)
    wandb.agent(sweep_id, function=compute_metrics, project=project)

In [8]:
# Folder that receives the generated "<split>_dataset.csv" index.
output_dir = Path("../data/track1/")
# Dataset root (the functions above look for training_data/ and test_data/ below it).
root_dir = Path("../../datasets/SPGC_challenge_track_1_release")

# NOTE: `split` is also the name of a helper function defined in a later
# cell; running cells out of order rebinds this name.
split = 'val'
w_size_h = 1.5  # window length in hours
w_stride_h = 1.5  # hours between window starts (same as w_size_h: no overlap)
val_percentage = 1/3  # minimum fraction of in-range rows for a window to be valid
drop_short_sequences = True  # only has an effect for split == 'train'

save_dataset(root_dir, output_dir, split, w_size_h, w_stride_h, val_percentage, drop_short_sequences)
#run_sweep()

  0%|          | 0/521 [00:00<?, ?it/s]

## Create Folds for Cross Validation
The general idea is to use the validation set as one fold and to split the training set into 4 further folds, giving 5 folds in total.

In [3]:
# data dir
root_dir = Path("../../datasets/SPGC_challenge_track_1_release")

# For every user, count the entries of the train and val folders.
# Nothing is kept between iterations: the ratio print at the end is commented
# out, so after the loop the names only hold the values of the last user.
for user in os.listdir(root_dir/Path("training_data")):
    train_samples = root_dir/Path("training_data")/Path(user)/Path("train")
    val_samples = root_dir/Path("training_data")/Path(user)/Path("val")
    # the names are rebound from folder paths to entry counts
    train_samples = len(os.listdir(train_samples))
    val_samples = len(os.listdir(val_samples))
    total = train_samples+ val_samples
    #print(train_samples/total, val_samples/total)

In [4]:
# get paths

def split(a, n):
    """Lazily cut sequence ``a`` into ``n`` contiguous, near-equal chunks.

    The first ``len(a) % n`` chunks hold one element more than the others;
    chunks are empty when ``n`` exceeds ``len(a)``. Returns a generator.
    """
    base, extra = divmod(len(a), n)

    def _bounds(i):
        # chunk i starts after i base-sized chunks plus the extras handed out so far
        return i * base + min(i, extra), (i + 1) * base + min(i + 1, extra)

    return (a[lo:hi] for lo, hi in map(_bounds, range(n)))

# Folds 0..num_folds-1 come from the training recordings; the extra fold
# (index num_folds) holds the original validation recordings.
num_folds=4
folds = {x : [] for x in range(num_folds+1)}

for user in os.listdir(root_dir/Path("training_data")):
    # recording paths are stored relative to root_dir
    train_samples = os.listdir(root_dir/Path("training_data")/Path(user)/Path("train"))
    train_samples = list(map(lambda x: Path("training_data")/Path(user)/Path("train")/Path(x), train_samples))
    #print(len(train_samples))
    # split each user's recordings separately, so every user contributes to
    # every training fold (as far as the user has enough recordings)
    user_folds = list(split(train_samples, num_folds))
    #print(sum([len(fold) for fold in user_folds]))
    for i in range(num_folds):
        folds[i].extend(user_folds[i])
    val_samples = os.listdir(root_dir/Path("training_data")/Path(user)/Path("val"))
    val_samples = list(map(lambda x: Path("training_data")/Path(user)/Path("val")/Path(x), val_samples))
    folds[num_folds].extend(val_samples)

In [5]:
# Print the number of recordings in each fold, in fold-id order.
for fold in folds:
    print(len(folds[fold]))

593
580
573
558
495


In [6]:
def parse_path(path):
    """Decompose a recording path ``<root>/<prefix>_<user>/<split>/<id>``.

    Args:
        path: ``str`` or ``Path`` with exactly four components, e.g.
            ``training_data/user_41/train/37``.

    Returns:
        ``(user, split, id)`` with ``user`` and ``id`` converted to ``int``.

    Raises:
        ValueError: if the path does not have four components or the numeric
            parts cannot be parsed.
    """
    # Path.parts splits on the platform separator and ignores a trailing one,
    # unlike a hard-coded split on "/".
    _, user_dir, split_name, sample_dir = Path(path).parts
    return int(user_dir.split("_")[1]), split_name, int(sample_dir)

# Sanity check on the first recording of fold 0.
parse_path(folds[0][0])

(41, 'train', 37)

In [4]:
# Accepted (lower, upper) bounds, both inclusive, for every monitored column.
valid_range = {
    "acc_X": (-19.6, 19.6),
    "acc_Y": (-19.6, 19.6),
    "acc_Z": (-19.6, 19.6),
    "gyr_X": (-573, 573),
    "gyr_Y": (-573, 573),
    "gyr_Z": (-573, 573),
    "heartRate": (0, 255),
    "rRInterval": (0, 2000),
}


def validate(window):
    """Return the fraction of rows in ``window`` that are entirely in range.

    A row counts as invalid as soon as one of the columns listed in
    ``valid_range`` falls below its lower bound or above its upper bound.

    Args:
        window: DataFrame containing every column named in ``valid_range``.

    Returns:
        float in [0, 1]: ``1 - invalid_rows / total_rows``.

    Raises:
        ZeroDivisionError: if ``window`` has no rows.
    """
    flagged = pd.Series(False, window.index)
    for column, (lower, upper) in valid_range.items():
        values = window[column]
        too_low = values < lower
        too_high = values > upper
        flagged = flagged | too_low | too_high
    n_invalid = int(flagged.sum())
    return 1 - (n_invalid / len(window))

def get_observations(root_dir, sample_path, split, w_size_h, w_stride_h, val_percentage, drop_short_sequences=True):
    """Slice one recording into fixed-size windows and describe each of them.

    Args:
        root_dir: dataset root; ``data.csv`` is read from below this folder.
        sample_path: recording folder relative to ``root_dir``, in the layout
            understood by ``parse_path``.
        split: role of the recording in the current fold (``'train'`` or
            ``'val'``); it only decides how short recordings are handled.
        w_size_h: window length in hours.
        w_stride_h: distance between consecutive window starts, in hours.
        val_percentage: minimum fraction of in-range rows (see ``validate``)
            for a window to be flagged ``valid``.
        drop_short_sequences: when true, training recordings shorter than one
            window are discarded instead of being kept as a single window.

    Returns:
        list of dicts with keys ``data_file``, ``step_file`` (paths relative
        to ``root_dir``), ``label`` (user id), ``valid``, ``start_data_row``
        and ``end_data_row`` (end excluded).
    """
    data_file = Path(sample_path)/"data.csv"
    step_file = Path(sample_path)/"step.csv"
    user_id, _, _ = parse_path(sample_path)

    # hours -> rows: the code assumes 12 rows per minute
    w_size = int(w_size_h*60*12)
    w_stride = int(w_stride_h*60*12)
    data = pd.read_csv(Path(root_dir)/data_file)

    if len(data) < w_size:
        if split == 'train' and drop_short_sequences:
            return []
        # Short recordings are kept whole for validation and test
        validity = validate(data)
        return [{
            'data_file' : data_file,
            'step_file' : step_file,
            'label' : user_id,
            'valid' : validity >= val_percentage,
            'start_data_row' : 0,
            'end_data_row' : len(data)
        }]

    obs = []
    # "+ 1" keeps the window that ends exactly on the last row; without it a
    # recording of exactly w_size rows would produce no observation at all.
    for start in range(0, len(data)-w_size+1, w_stride):
        stop = start + w_size # excluded
        window = data.loc[start:stop-1] # .loc upper bound is included
        validity = validate(window)
        obs.append({
            'data_file' : data_file,
            'step_file' : step_file,
            'label' : user_id,
            'valid' : validity >= val_percentage,
            'start_data_row' : start,
            'end_data_row' : stop
        })

    return obs
    
def create_fold(root_dir, folds, eval_fold_id=0, w_size_h=3, w_stride_h=3, val_percentage_train=2.5/3, val_percentage_eval=1/3, drop_short_sequences=True):
    """Build the train and evaluation window lists for one cross-validation split.

    Args:
        root_dir: dataset root.
        folds: mapping ``fold id -> list of recording paths``.
        eval_fold_id: fold held out for evaluation; every other fold is used
            for training.
        w_size_h, w_stride_h, drop_short_sequences: forwarded to
            ``get_observations``.
        val_percentage_train: validity threshold for training windows.
        val_percentage_eval: validity threshold for evaluation windows.

    Returns:
        ``(train_observations, eval_observations)``, two lists of dicts.
    """
    def _collect(samples, split_name, threshold):
        # windows of all the given recordings, in order
        collected = []
        for sample in tqdm(samples):
            collected.extend(
                get_observations(root_dir, sample, split_name, w_size_h, w_stride_h, threshold, drop_short_sequences)
            )
        return collected

    held_out = folds[eval_fold_id]
    training_pool = [
        sample
        for fold_id in folds
        if fold_id != eval_fold_id
        for sample in folds[fold_id]
    ]

    train_observations = _collect(training_pool, 'train', val_percentage_train)
    eval_observations = _collect(held_out, 'val', val_percentage_eval)
    return train_observations, eval_observations

In [8]:

def _create_offsets(x):
    if len(x[x.valid]) == 0:
        return list(zip(x.start_data_row, x.end_data_row))
    return list(zip(x[x.valid].start_data_row, x[x.valid].end_data_row))

def save_dataset(root_dir, output_dir, folds, eval_fold_id=0, w_size_h=3, w_stride_h=3, val_percentage_train=2.5/3, val_percentage_eval=1/3, drop_short_sequences=True):
    """Write the train/val window indexes of one cross-validation split.

    Two files are written to ``<output_dir>/fold<eval_fold_id>/``:

    * ``train_dataset.csv``: one row per training window;
    * ``val_dataset.csv``: one row per held-out recording, with an
      ``offsets`` list of ``(start, end)`` pairs and ``valid`` forced to
      ``True``.

    Args:
        root_dir: dataset root.
        output_dir: destination root (``str`` or ``Path``); the fold folder is
            created if it does not exist.
        folds, eval_fold_id, w_size_h, w_stride_h, val_percentage_train,
        val_percentage_eval, drop_short_sequences: forwarded to
            ``create_fold``.
    """
    train_obs, eval_obs = create_fold(root_dir, folds, eval_fold_id, w_size_h, w_stride_h, val_percentage_train, val_percentage_eval, drop_short_sequences)

    # to_csv does not create missing folders
    fold_dir = Path(output_dir)/f"fold{eval_fold_id}"
    fold_dir.mkdir(parents=True, exist_ok=True)

    # train index: one row per window
    pd.DataFrame(train_obs).to_csv(fold_dir/"train_dataset.csv")

    # eval index: one record per recording. _create_offsets falls back to
    # every window when none is valid, so no recording is left without offsets.
    eval_windows = pd.DataFrame(eval_obs)
    records = [
        {
            'data_file': group.data_file.iloc[0],
            'step_file': group.step_file.iloc[0],
            'label' : group.label.iloc[0],
            'valid' : True,
            'offsets': _create_offsets(group),
        }
        for _, group in eval_windows.groupby('data_file')
    ]
    pd.DataFrame.from_records(records).to_csv(fold_dir/"val_dataset.csv")
    

In [None]:
# Destination root; every split is written below its own "fold<k>" folder.
output_dir = Path("../data/track1/")
# Export five splits, holding out each of the folds 0-4 in turn.
for held_out_fold in range(5):
    save_dataset(root_dir, output_dir, folds, eval_fold_id=held_out_fold)

## Cache Dataset Creation

In [22]:
class EPreventionDataset(CacheDataset):
    """Cached dataset built from a ``<split>_dataset.csv`` index file.

    Args:
        split_path: folder (``Path``) containing the index CSV.
        split: split name; selects ``<split>_dataset.csv``.
        transforms: transform pipeline handed to ``CacheDataset``.
        max_samples: number of CSV rows to read (``None`` reads all). The
            limit is applied before invalid rows are filtered out, so fewer
            items may remain.
        cache_num, cache_rate, num_workers: forwarded to ``CacheDataset``.
    """

    def __init__(self, split_path, split, transforms, max_samples=None, cache_num = sys.maxsize, cache_rate=1.0, num_workers=1):
        self.split = split
        self.max_samples = max_samples

        records = self._generate_data_list(split_path/f"{split}_dataset.csv")
        super().__init__(
            records,
            transforms,
            cache_num=cache_num,
            cache_rate=cache_rate,
            num_workers=num_workers,
        )

    def _generate_data_list(self, split_path):
        """Read the index CSV, keep valid rows and record the label distribution.

        Returns the remaining rows as a list of dicts (one per row).
        """
        table = pd.read_csv(split_path, index_col=0, nrows=self.max_samples)
        table = table[table.valid]

        # Share of each label among the kept rows, ordered by label value.
        # Labels that do not occur in the CSV have no entry.
        per_label_counts = table.label.value_counts().sort_index().to_numpy()
        self.distribution = per_label_counts / len(table)

        return table.to_dict('records')

    def get_label_proportions(self):
        """Return the label proportions computed when the index was loaded."""
        return self.distribution

In [24]:
# Smoke test: read the first 100 rows of the fold-0 validation index without
# any transform and show the first item.
ds = EPreventionDataset(split_path=Path("../data/track1/fold0"), split='val', transforms=None, max_samples=100)
ds[0]

Loading dataset: 100%|██████████| 100/100 [00:00<00:00, 898137.90it/s]


{'data_file': 'training_data/user_00/train/03/data.csv',
 'step_file': 'training_data/user_00/train/03/step.csv',
 'label': 0,
 'valid': True,
 'offsets': '[(0, 2160), (2160, 4320), (4320, 6480), (6480, 8640), (8640, 10800)]'}

## Transforms


In [8]:
class AppendRootDirD(MapTransform):
    """Turn the relative paths stored under ``keys`` into paths below ``root_dir``.

    Despite the name, ``root_dir`` is placed in front of the stored path.
    """

    def __init__(self, keys: KeysCollection, root_dir):
        super().__init__(keys)
        self.root_dir = root_dir

    def __call__(self, data):
        """Return a deep copy of ``data`` with ``root_dir`` joined to each key."""
        result = copy.deepcopy(data)
        for key in self.keys:
            relative_path = result[key]
            result[key] = os.path.join(self.root_dir, relative_path)
        return result

In [9]:
class LoadDataD(MapTransform):
    """Read the sensor CSV referenced by ``keys`` into ``data['data']``.

    Args:
        keys: entries holding the path of the CSV file.
        split: with ``'train'`` only the rows of the item's window
            (``start_data_row`` .. ``end_data_row``) are read; any other
            value reads the whole file.
        use_sleeping: also load the ``sleeping`` column.
    """

    def __init__(self, keys: KeysCollection, split, use_sleeping):
        super().__init__(keys)
        self.split = split
        columns = ['acc_X', 'acc_Y', 'acc_Z', 'gyr_X', 'gyr_Y', 'gyr_Z', 'heartRate', 'rRInterval', 'timecol']
        if use_sleeping:
            columns.append('sleeping')
        self.cols = columns

    def __call__(self, data):
        """Return a copy of ``data`` with the table loaded and bookkeeping keys removed."""
        result = copy.deepcopy(data)
        for key in self.keys:
            if self.split == 'train':
                first_row = data['start_data_row']
                # file line 0 is the header; lines 1..first_row precede the window
                skipped = range(1, first_row + 1)
                result['data'] = pd.read_csv(
                    data[key],
                    skiprows=lambda line: line in skipped,
                    nrows=data['end_data_row'] - first_row,
                    usecols=self.cols,
                )
            else:
                result['data'] = pd.read_csv(data[key], usecols=self.cols)
            del result[key]
        # these fields are no longer needed once the table is loaded
        for field in ('valid', 'start_data_row', 'end_data_row'):
            result.pop(field, None)
        return result

In [10]:
class ExtractTimeD(MapTransform):
    """Move the ``timecol`` column of each table in ``keys`` to ``data['time']``."""

    def __call__(self, data):
        """Return a copy of ``data`` with ``time`` added and ``timecol`` dropped."""
        result = copy.deepcopy(data)
        for key in self.keys:
            table = result[key]
            result['time'] = table.timecol.astype('datetime64[ns]')
            table.drop('timecol', inplace=True, axis=1)
        return result

class DeleteTimeD(MapTransform):
    """Remove the entries named in ``keys`` from the item."""

    def __call__(self, data):
        """Return a copy of ``data`` without ``keys``; a missing key raises ``KeyError``."""
        result = copy.deepcopy(data)
        for key in self.keys:
            result.pop(key)
        return result

In [11]:
class LoadStepD(MapTransform):
    """Read the step CSV referenced by ``keys`` into ``data['step']``.

    Args:
        keys: entries holding the path of the step file.
        use_calories: also load the ``calories`` column.
    """

    def __init__(self, keys: KeysCollection, use_calories):
        super().__init__(keys)
        columns = ['start_time', 'end_time', 'totalSteps', 'distance']
        if use_calories:
            columns.append('calories')
        self.cols = columns

    def __call__(self, data):
        """Return a copy of ``data`` with the step table loaded and the path removed."""
        result = copy.deepcopy(data)
        for key in self.keys:
            result['step'] = pd.read_csv(data[key], usecols=self.cols)
            del result[key]
        return result

In [12]:
class ConvertToSequenceD(MapTransform):
    """Expand the step table into per-sample sequences aligned with ``data['time']``.

    The result, stored in ``data['step']``, is an array of shape
    ``(2, len(time))`` or ``(3, len(time))`` when the step table has a
    ``calories`` column. ``data['time']`` is removed afterwards.
    """

    def __call__(self, data):
        """Return a copy of ``data`` with the step table replaced by sequences."""
        d = copy.deepcopy(data)
        for k in self.keys:
            # 2 or 3 aligned sequences, depending on the 'calories' column
            sequences = self._create_step_sequences(d['time'], d[k])
            d['step'] = np.stack(sequences, axis=0)
        d.pop('time', None)
        return d

    def _create_step_sequences(self, time, step):
        """Spread every step record over the samples that fall inside its interval.

        Args:
            time: sequence of timestamps, one per sample.
            step: DataFrame with ``start_time``, ``end_time``, ``totalSteps``,
                ``distance`` and optionally ``calories``. It is modified in
                place (datetime conversion, extra ``period`` column).

        Returns:
            ``(velocity_m, velocity_s)`` or, with calories,
            ``(velocity_m, velocity_s, calories)``; samples outside every
            interval stay at 0.
        """
        has_calories = 'calories' in step.columns
        n_samples = len(time)

        velocity_m = np.zeros(n_samples)
        velocity_s = np.zeros(n_samples)
        if has_calories:
            calories = np.zeros(n_samples)

        # duration of every record
        step['start_time'] = pd.to_datetime(step['start_time'])
        step['end_time'] = pd.to_datetime(step['end_time'])
        step['period'] = step['end_time']-step['start_time']

        for _, s in step.iterrows():
            # samples strictly inside the record's interval
            idx = np.where((time > s.start_time) & (time < s.end_time))[0]
            if len(idx) == 0:
                continue
            # distance per second over the whole interval. total_seconds()
            # is used because Timedelta.seconds drops the days component and
            # any fraction of a second.
            velocity_m[idx] = s.distance / s.period.total_seconds()
            # steps per covered sample
            velocity_s[idx] = s.totalSteps / len(idx)
            if has_calories:
                # calories per covered sample
                calories[idx] = s.calories / len(idx)

        if has_calories:
            return velocity_m, velocity_s, calories
        return velocity_m, velocity_s

In [13]:
class ToArrayD(MapTransform):
    """Convert the tables in ``keys`` to NumPy arrays with columns as rows."""

    def __call__(self, data):
        """Return a copy of ``data`` where each key holds the transposed array."""
        result = copy.deepcopy(data)
        for key in self.keys:
            as_array = result[key].to_numpy()
            result[key] = as_array.transpose()
        return result

In [14]:
# (min, max) per signal, used by NormalizeDataD to rescale each channel.
# The key order matters: NormalizeDataD builds its min/max vectors in this
# order and applies them positionally to the data channels.
valid_ranges = {
    "acc_X" : (-19.6, 19.6),
    "acc_Y" : (-19.6, 19.6),
    "acc_Z" : (-19.6, 19.6),
    "gyr_X" : (-573, 573),
    "gyr_Y" : (-573, 573),
    "gyr_Z" : (-573, 573),
    "heartRate" : (0, 255),
    "rRInterval" : (0, 2000),
    "sleeping" : (0, 1)
}

class NormalizeDataD(MapTransform):
    """Min-max rescale each channel with the bounds given in ``valid_ranges``.

    Args:
        keys: entries holding arrays whose first axis is the channel axis.
        valid_ranges: mapping ``channel name -> (min, max)``. The bounds are
            applied positionally, in the mapping's order.
            NOTE(review): this assumes the channel order of the data matches
            the key order of ``valid_ranges`` - confirm against the loader.
        use_sleeping: when false, the ``sleeping`` entry (if any) is ignored.
    """

    def __init__(self, keys: KeysCollection, valid_ranges, use_sleeping):
        super().__init__(keys)
        self.valid_ranges = valid_ranges
        # Filter instead of list.remove(): a mapping without a 'sleeping'
        # entry must not raise when use_sleeping is false.
        vr_keys = [name for name in valid_ranges if use_sleeping or name != 'sleeping']
        self.min = np.array([valid_ranges[k][0] for k in vr_keys])
        self.max = np.array([valid_ranges[k][1] for k in vr_keys])

    def __call__(self, data):
        """Return a copy of ``data`` with every key rescaled; in-range values land in [0, 1]."""
        d = copy.deepcopy(data)
        for k in self.keys:
            # transpose so that the per-channel bounds broadcast over the last axis
            d[k] = ((d[k].transpose() - self.min)/(self.max - self.min)).transpose()
        return d

class NormalizeStepD(MapTransform):
    """Min-max rescale every row of the arrays in ``keys`` using the row's own range."""

    def __call__(self, data):
        """Return a copy of ``data`` with each row mapped to [0, 1].

        Constant rows would have a zero range; their divisor is set to 1, so
        they become all zeros.
        """
        result = copy.deepcopy(data)
        for key in self.keys:
            sequences = result[key]
            lowest = sequences.min(axis=1)
            span = sequences.max(axis=1) - lowest
            span[span == 0] = 1
            result[key] = ((sequences.transpose() - lowest) / span).transpose()
        return result

In [15]:
class InterpolateDataD(MapTransform):
    """Replace out-of-range values of normalized signals by KNN imputation."""

    def __call__(self, data):
        """Return a copy of ``data`` with every key passed through the imputer."""
        result = copy.deepcopy(data)
        for key in self.keys:
            result[key] = self._impute_invalid_values(result[key])
        return result

    def _impute_invalid_values(self, signals):
        """Impute the entries of ``signals`` that lie outside [0, 1].

        ``signals`` is modified in place: out-of-range entries are overwritten
        with the sentinel ``-1`` before the imputer runs.

        NOTE(review): KNNImputer treats rows as samples and columns as
        features - confirm this matches the array layout produced upstream.
        """
        # untouched copy, returned if the imputer yields no rows
        original = copy.deepcopy(signals)

        # mark artifacts with the sentinel the imputer looks for
        out_of_range = (signals < 0) | (signals > 1)
        signals[out_of_range] = -1.

        knn = KNNImputer(missing_values=-1., n_neighbors=5, weights="distance")
        imputed = knn.fit_transform(signals)

        # Preserve the dimensionality of short invalid sequences
        return original if imputed.shape[0] == 0 else imputed

In [16]:
class ConcatenateStepD(MapTransform):
    """Append the rows of ``data['step']`` to the arrays in ``keys``."""

    def __call__(self, data):
        """Return a copy of ``data`` with ``step`` merged along axis 0 and removed."""
        result = copy.deepcopy(data)
        for key in self.keys:
            stacked = [result[key], result['step']]
            result[key] = np.concatenate(stacked, axis=0)
        result.pop('step', None)
        return result

In [17]:
class PadShortSequenceD(MapTransform):
    """Pad the last axis of the tensors in ``keys`` up to ``output_size``.

    Args:
        keys: entries to pad.
        output_size: target length of the last axis; converted to ``int`` so
            that values such as ``1.5*60*12`` are accepted.
        padding: ``'replication'`` (repeat the edge value) or ``'zero'``.
        mode: where the padding goes: ``'head'`` (before), ``'tail'`` (after)
            or ``'center'`` (both sides; the extra element of an odd amount
            goes after).
    """

    def __init__(self, keys: KeysCollection, output_size, padding, mode):
        super().__init__(keys)
        assert padding in ['replication', 'zero'], "Select Proper Padding Mode: Allowed replication and zero"
        assert mode in ['head', 'center', 'tail'], "Select Proper Mode: Allowed head, center and tail"
        # pad widths must be integers for the torch padding layers
        self.output_size = int(output_size)
        self.padding = padding
        self.mode = mode

    def __call__(self, data):
        """Return a copy of ``data`` padded to ``output_size``.

        The current length is read from ``data['data']``; items that are
        already long enough are returned unchanged.
        """
        d = copy.deepcopy(data)
        w_in = d['data'].shape[-1]
        if w_in >= self.output_size:
            return d
        pad_size = self.output_size - w_in
        if self.mode == 'head':
            padding = (pad_size, 0)
        elif self.mode == 'tail':
            padding = (0, pad_size)
        else:
            # center: split evenly, remainder on the right
            left = pad_size // 2
            padding = (left, pad_size - left)
        pad_fn = self._get_pad_fn(padding)
        for k in self.keys:
            d[k] = pad_fn(d[k])
        return d

    def _get_pad_fn(self, padding):
        """Build the torch padding layer selected by ``self.padding``."""
        return ConstantPad1d(padding, 0) if self.padding == 'zero' else ReplicationPad1d(padding)

In [18]:
class CreateVotingBatchD(MapTransform):
    """Cut each tensor in ``keys`` into the windows listed in ``data['offsets']``.

    The windows are stacked along a new leading axis, so one item carries all
    the windows of a recording. ``offsets`` is removed afterwards.
    """

    def __init__(self, keys: KeysCollection):
        super().__init__(keys)

    def __call__(self, data):
        """Return a copy of ``data`` with each key replaced by its stacked windows."""
        result = copy.deepcopy(data)
        # SECURITY(review): 'offsets' is a string read back from the index CSV
        # and eval() executes whatever it contains. This is only acceptable
        # while the CSV is produced locally; consider ast.literal_eval after
        # checking that the stored text is a plain literal.
        offsets = eval(result['offsets'])
        for key in self.keys:
            signal = result[key]
            windows = [signal[:, start:stop].unsqueeze(0) for (start, stop) in offsets]
            result[key] = torch.cat(windows, dim=0)
        result.pop('offsets', None)
        return result

In [None]:
from monai.transforms import Compose, ToTensorD

#from transforms import AppendRootDirD
class AppendRootDirD(MapTransform):
    """Turn the relative paths stored under ``keys`` into paths below ``root_dir``.

    Despite the name, ``root_dir`` is placed in front of the stored path.
    """

    def __init__(self, keys: KeysCollection, root_dir):
        super().__init__(keys)
        self.root_dir = root_dir

    def __call__(self, data):
        """Return a deep copy of ``data`` with ``root_dir`` joined to each key."""
        result = copy.deepcopy(data)
        for key in self.keys:
            relative_path = result[key]
            result[key] = os.path.join(self.root_dir, relative_path)
        return result

root_dir = Path("../../datasets/SPGC_challenge_track_1_release/")
split = 'val'
w_size_h = 3
w_stride_h = 3

# Full validation pipeline. The order matters: every transform consumes keys
# produced by the ones before it.
transforms = [
        ToTensorD(['label'],dtype=torch.long),
        # resolve the CSV paths against the dataset root
        AppendRootDirD(['data_file', 'step_file'], root_dir),
        # 'data_file' -> 'data' (whole recording, because split != 'train')
        LoadDataD(['data_file'], split=split, use_sleeping=True),
        # 'data.timecol' -> 'time', needed by ConvertToSequenceD
        ExtractTimeD(['data']),
        # 'step_file' -> 'step' table -> per-sample sequences
        LoadStepD(['step_file'], use_calories=True),
        ConvertToSequenceD(['step']),
        # sensor table -> normalized array with out-of-range values imputed
        ToArrayD(['data']),
        NormalizeDataD(['data'], valid_ranges=valid_ranges, use_sleeping=True),
        InterpolateDataD(['data']),
        # step sequences are rescaled and appended as extra channels
        NormalizeStepD(['step']),
        ConcatenateStepD(['data']),
        ToTensorD(['data'], dtype=torch.float),
        # split the recording into its windows, then pad short ones
        CreateVotingBatchD(['data']),
        PadShortSequenceD(['data'], output_size=w_size_h*60*12, padding='replication', mode='center')
]

transforms = Compose(transforms)
ds = EPreventionDataset(split_path=Path("../data/track1/fold0"), split='val', transforms=transforms)


In [32]:
# Shape of the first item's window batch.
ds[0]['data'].size()

torch.Size([5, 12, 2160])

In [33]:
def get_loader(args):
    """Build the train/val/test data loaders described by ``args``.

    ``args`` must provide: ``root_dir``, ``split_path``, ``use_sleeping``,
    ``use_steps``, ``use_calories``, ``valid_ranges``, ``window_size``,
    ``padding_mode``, ``padding_loc``, ``max_samples``, ``cache_rate``,
    ``distributed`` and ``batch_size``.

    Returns:
        ``(loaders, samplers, loss_weights)``: two dicts keyed by ``'train'``,
        ``'val'`` and ``'test'``, plus a tensor holding the label proportions
        of the training set.

    Raises:
        ValueError: if ``args.root_dir`` is not an existing directory.
    """
    if not os.path.isdir(args.root_dir):
        raise ValueError("Root directory root_dir must be a directory.")

    # --- transform stages shared by every split ---------------------------
    shared_prefix = [
        ToTensorD(['label'], dtype=torch.long),
        AppendRootDirD(['data_file', 'step_file'], args.root_dir),
    ]

    # training reads one window per item, evaluation reads whole recordings
    train_read = LoadDataD(['data_file'], split='train', use_sleeping=args.use_sleeping)
    eval_read = LoadDataD(['data_file'], split='val', use_sleeping=args.use_sleeping)

    shared_suffix = [
        ExtractTimeD(['data']),
        ToArrayD(['data']),
        NormalizeDataD(['data'], valid_ranges=args.valid_ranges, use_sleeping=args.use_sleeping),
        InterpolateDataD(['data']),
    ]
    if args.use_steps:
        shared_suffix += [
            LoadStepD(['step_file'], use_calories=args.use_calories),
            ConvertToSequenceD(['step']),
            NormalizeStepD(['step']),
            ConcatenateStepD(['data']),
        ]
    else:
        # without step features the timestamps are simply discarded
        shared_suffix.append(DeleteTimeD(['time']))
    shared_suffix.append(ToTensorD(['data'], dtype=torch.float))

    train_steps = shared_prefix + [train_read] + shared_suffix
    eval_steps = shared_prefix + [eval_read] + shared_suffix + [
        CreateVotingBatchD(['data']),
        PadShortSequenceD(['data'], output_size=args.window_size, padding=args.padding_mode, mode=args.padding_loc),
    ]

    train_transforms = Compose(train_steps)
    val_transforms = Compose(eval_steps)

    # --- datasets (test reuses the validation pipeline) -------------------
    pipelines = {"train": train_transforms, "val": val_transforms, "test": val_transforms}
    dataset = {
        name: EPreventionDataset(
            split_path=args.split_path,
            split=name,
            transforms=pipeline,
            max_samples=args.max_samples,
            cache_rate=args.cache_rate,
        )
        for name, pipeline in pipelines.items()
    }

    # --- samplers: only the training set is shuffled ----------------------
    if args.distributed:
        samplers = {
            name: DistributedSampler(dataset[name], shuffle=(name == "train"))
            for name in dataset
        }
    else:
        samplers = {
            "train": RandomSampler(dataset["train"]),
            "val": SequentialSampler(dataset["val"]),
            "test": SequentialSampler(dataset["test"]),
        }

    # evaluation items already carry a stack of windows, hence batch size 1
    batch_sampler = {
        'train': BatchSampler(samplers["train"], args.batch_size, drop_last=True),
        'val': BatchSampler(samplers["val"], 1, drop_last=False),
        'test': BatchSampler(samplers["test"], 1, drop_last=False),
    }

    loaders = {
        name: DataLoader(
            dataset[name],
            batch_sampler=batch_sampler[name],
            num_workers=2,
            pin_memory=True,
            persistent_workers=True,
        )
        for name in dataset
    }

    loss_weights = torch.Tensor(dataset["train"].get_label_proportions())

    return loaders, samplers, loss_weights


In [36]:
import argparse

# Configuration consumed by get_loader; collected in a dict first and turned
# into an attribute-style namespace at the end.
args={}

# feature selection
args['use_sleeping'] = True
args['use_steps'] = True
args['use_calories'] = True
args['valid_ranges'] = valid_ranges
# padding of evaluation windows shorter than window_size (in rows)
args['window_size'] = 3*60*12
args['padding_mode'] = 'replication' # or 'zero'
args['padding_loc'] = 'center' # or 'head' or 'tail'

args['root_dir'] = Path("../../datasets/SPGC_challenge_track_1_release") # where data are placed
# folder holding the train/val/test "<split>_dataset.csv" index files
args['split_path'] = Path("../data/track1/fold0")

# read only the first rows of each index for this quick check
args['max_samples'] = 20
args['cache_rate'] = 1.0

args['distributed'] = False
args['batch_size'] = 8

args = argparse.Namespace(**args)


In [37]:
# get_loader returns (loaders, samplers, loss_weights); `samples` holds the samplers.
loaders, samples, w_loss = get_loader(args)

Loading dataset: 100%|██████████| 20/20 [00:01<00:00, 13.15it/s]
Loading dataset: 100%|██████████| 20/20 [00:09<00:00,  2.09it/s]
Loading dataset: 100%|██████████| 20/20 [00:10<00:00,  1.98it/s]


In [38]:
# Iterate the validation loader once and print the shape of every batch.
for batch in tqdm(loaders['val']):
    print(batch['data'].size())

  0%|          | 0/20 [00:00<?, ?it/s]

torch.Size([1, 5, 12, 2160])
torch.Size([1, 6, 12, 2160])
torch.Size([1, 3, 12, 2160])
torch.Size([1, 3, 12, 2160])
torch.Size([1, 5, 12, 2160])
torch.Size([1, 6, 12, 2160])
torch.Size([1, 5, 12, 2160])
torch.Size([1, 3, 12, 2160])
torch.Size([1, 6, 12, 2160])
torch.Size([1, 3, 12, 2160])
torch.Size([1, 4, 12, 2160])
torch.Size([1, 6, 12, 2160])
torch.Size([1, 6, 12, 2160])
torch.Size([1, 3, 12, 2160])
torch.Size([1, 3, 12, 2160])
torch.Size([1, 4, 12, 2160])
torch.Size([1, 6, 12, 2160])
torch.Size([1, 4, 12, 2160])
torch.Size([1, 6, 12, 2160])
torch.Size([1, 7, 12, 2160])
