In [1]:
import os 
import numpy as np 
import pandas as pd 
from glob import glob
import random

In [2]:
with open('../data/DaLiAc_Dataset/dataset_1.txt', 'r') as pf:
    all_data = [l.rstrip('\n').split(',') for l in pf.readlines()]


In [3]:
np_data = np.array(all_data)
all_labels = np_data[:, -1]
np.unique(all_labels, return_counts=True)

(array(['1', '10', '11', '12', '13', '2', '3', '4', '5', '6', '7', '8',
        '9'], dtype='<U11'),
 array([12290, 25397, 24577, 24577, 12907, 12289, 12289, 24578, 12289,
        18433, 52021,  6964,  6965], dtype=int64))

In [4]:
[
    (1, 'sitting'),
    (2, 'lying'),
    (3, 'standing'),
    (4, 'washing dishes'),
    (5, 'vacuuming'),
    (6, 'sweeping'),
    (7, 'walking'),
    (8, 'ascending stairs'),
    (9, 'descending stairs'),
    (10, 'treadmill running'),
    (11, '50W cycling'),
    (12, '100W cycling'),
    (13, 'rope jumping')
]

[(1, 'sitting'),
 (2, 'lying'),
 (3, 'standing'),
 (4, 'washing dishes'),
 (5, 'vacuuming'),
 (6, 'sweeping'),
 (7, 'walking'),
 (8, 'ascending stairs'),
 (9, 'descending stairs'),
 (10, 'treadmill running'),
 (11, '50W cycling'),
 (12, '100W cycling'),
 (13, 'rope jumping')]

In [2]:
class DaLiAcReader(object):
    def __init__(self, root_path):
        self.root_path = root_path
        self.readDaliac()

    def readFile(self, file_path):
        all_data = {"data": {}, "target": {}, 'collection': []}
        prev_action = -1
        starting = True
        # action_seq = []
        action_ID = 0

        for l in open(file_path).readlines():
            s = l.rstrip('\n').split(',')
            act = int(s[-1])
            if act == 12:
                act = 11
            elif act == 13:
                act = 12

            if (prev_action != act):
                if not(starting):
                    # df = pd.DataFrame(action_seq)
                    # intep_df = df.interpolate(method='linear', limit_direction='backward', axis=0)
                    # intep_data = intep_df.values 
                    intep_data = action_seq
                    all_data['data'][action_ID] = np.array(intep_data)
                    all_data['target'][action_ID] = prev_action
                    action_ID+=1
                action_seq = []
            else:
                starting = False
            data_seq = np.array(s[:-1]).astype(np.float16)
            # data_seq[np.isnan(data_seq)] = 0
            action_seq.append(data_seq)
            prev_action = act
            
            # print(prev_action)
            all_data['collection'].append(data_seq)
        else: 
            if len(action_seq) > 1:
                df = pd.DataFrame(action_seq)
                intep_df = df.interpolate(method='linear', limit_direction='backward', axis=0)
                intep_data = intep_df.values
                all_data['data'][action_ID] = np.array(intep_data)
                all_data['target'][action_ID] = prev_action
        return all_data

    def readDaliAcFiles(self, filelist, labelToId):
        data = []
        labels = []
        collection = []
        for i, filename in enumerate(filelist):
            print('Reading file %d of %d' % (i+1, len(filelist)))
            fpath = os.path.join(self.root_path, filename)
            file_data = self.readFile(fpath)
            data.extend(list(file_data['data'].values()))
            labels.extend(list(file_data['target'].values()))
            collection.extend(file_data['collection'])
        return np.asarray(data), np.asarray(labels, dtype=int), np.array(collection)

    def readDaliac(self):
        files = [f'dataset_{i}.txt' for i in range(1, 20)]
            
        label_map = [
            (1, 'sitting'),
            (2, 'lying'),
            (3, 'standing'),
            (4, 'washing dishes'),
            (5, 'vacuuming'),
            (6, 'sweeping'),
            (7, 'walking'),
            (8, 'ascending stairs'),
            (9, 'descending stairs'),
            (10, 'treadmill running'),
            (11, 'cycling'),
            (12, 'rope jumping')
        ]
        labelToId = {x[0]: i for i, x in enumerate(label_map)}
        # print "label2id=",labelToId
        idToLabel = [x[1] for x in label_map]
        # print "id2label=",idToLabel
        cols = [
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
                35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53
                ]
        # print "cols",cols
        self.data, self.targets, self.all_data = self.readDaliAcFiles(files, labelToId)
        # print(self.data)
        # nan_perc = np.isnan(self.data).astype(int).mean()
        # print("null value percentage ", nan_perc)
        # f = lambda x: labelToId[x]
        self.targets = np.array([labelToId[i] for i in list(self.targets)])
        self.label_map = label_map
        self.idToLabel = idToLabel
        # return data, idToLabel

    def dataTableOptimizerUpdated(self, mat_file):
        our_data = mat_file['d_iner']
        data = []
        frame_size = len(our_data[0][0])-1
        for each in range(0,frame_size):
            data_flatten = our_data[:,:,each].flatten()
            data_flatten = data_flatten
            data.append(data_flatten)
        return data,frame_size

    def resample(self, signal, freq=10):
        step_size = int(100/freq)
        seq_len, _ = signal.shape 
        resample_indx = np.arange(0, seq_len, step_size)
        resampled_sig = signal[resample_indx, :]
        return resampled_sig

    def windowing(self, signal, window_len, overlap):
        seq_len = int(window_len*50) # 100Hz compensation 
        overlap_len = int(overlap*50) # 100Hz
        l, _ = signal.shape
        if l > seq_len:
            windowing_points = np.arange(start=0, stop=l-seq_len, step=seq_len-overlap_len, dtype=int)[:-1]

            windows = [signal[p:p+seq_len, :] for p in windowing_points]
        else:
            windows = []
        return windows

    def resampling(self, data, targets, window_size, window_overlap, resample_freq):
        assert len(data) == len(targets), "# action data & # action labels are not matching"
        all_data, all_ids, all_labels = [], [], []
        for i, d in enumerate(data):
            # print(">>>>>>>>>>>>>>>  ", np.isnan(d).mean())
            label = targets[i]
            windows = self.windowing(d, window_size, window_overlap)
            for w in windows:
                # print(np.isnan(w).mean(), label, i)
                resample_sig = self.resample(w, resample_freq)
                # print(np.isnan(resample_sig).mean(), label, i)
                all_data.append(resample_sig)
                all_ids.append(i+1)
                all_labels.append(label)

        return all_data, all_ids, all_labels

    def generate(self, unseen_classes, window_size=5.21, window_overlap=1, resample_freq=20, seen_ratio=0.2, unseen_ratio=0.8):
        # assert all([i in list(self.label_map.keys()) for i in unseen_classes]), "Unknown Class label!"
        seen_classes = [i for i in range(len(self.idToLabel)) if i not in unseen_classes]
        unseen_mask = np.in1d(self.targets, unseen_classes)
        
        # build seen dataset 
        seen_data = self.data[np.invert(unseen_mask)]
        seen_targets = self.targets[np.invert(unseen_mask)]
        print(f"data shape : {self.data.shape}, seen_data shape : {seen_data.shape}")
        ids, cnts = np.unique(self.targets, return_counts=True)
        print({self.idToLabel[ids[e]]: cnts[e] for e in range(len(ids))})
        
        # build unseen dataset
        unseen_data = self.data[unseen_mask]
        unseen_targets = self.targets[unseen_mask]

        # # resampling seen and unseen datasets 
        seen_data, seen_ids, seen_targets = self.resampling(seen_data, seen_targets, window_size, window_overlap, resample_freq)
        unseen_data, unseen_ids, unseen_targets = self.resampling(unseen_data, unseen_targets, window_size, window_overlap, resample_freq)

        seen_data, seen_targets = np.array(seen_data), np.array(seen_targets)
        unseen_data, unseen_targets = np.array(unseen_data), np.array(unseen_targets)
       # train-val split
        seen_index = list(range(len(seen_targets)))
        random.shuffle(seen_index)
        split_point = int((1-seen_ratio)*len(seen_index))
        fst_index, sec_index = seen_index[:split_point], seen_index[split_point:]
        # print(fst_index)
        # print(type(fst_index), type(sec_index), type(seen_data), type(seen_targets))
        X_seen_train, X_seen_val = seen_data[fst_index, :], seen_data[sec_index, :]
        y_seen_train, y_seen_val = seen_targets[fst_index], seen_targets[sec_index]
        
        # val-test split
        unseen_index = list(range(len(unseen_targets)))
        random.shuffle(unseen_index)
        split_point = int((1-unseen_ratio)*len(unseen_index))
        fst_index, sec_index = unseen_index[:split_point], unseen_index[split_point:]

        X_unseen_val, X_unseen_test = unseen_data[fst_index, :], unseen_data[sec_index, :]
        y_unseen_val, y_unseen_test = unseen_targets[fst_index], unseen_targets[sec_index]

        data = {'train': {
                        'X': X_seen_train,
                        'y': y_seen_train
                        },
                'eval-seen':{
                        'X': X_seen_val,
                        'y': y_seen_val
                        },
                'eval-unseen':{
                        'X': X_unseen_val,
                        'y': y_unseen_val
                        },
                'test': {
                        'X': X_unseen_test,
                        'y': y_unseen_test
                        },
                'seen_classes': seen_classes,
                'unseen_classes': unseen_classes
                }

        return data

In [3]:
dataReader = DaLiAcReader(root_path='../data/DaLiAc_Dataset/')

Reading file 1 of 19
Reading file 2 of 19
Reading file 3 of 19
Reading file 4 of 19
Reading file 5 of 19
Reading file 6 of 19
Reading file 7 of 19
Reading file 8 of 19
Reading file 9 of 19
Reading file 10 of 19
Reading file 11 of 19
Reading file 12 of 19
Reading file 13 of 19
Reading file 14 of 19
Reading file 15 of 19
Reading file 16 of 19
Reading file 17 of 19
Reading file 18 of 19
Reading file 19 of 19


  return np.asarray(data), np.asarray(labels, dtype=int), np.array(collection)


In [4]:
data_dict = dataReader.generate(unseen_classes=[1, 4, 7], window_size=5.21, window_overlap=1, resample_freq=50)

data shape : (247,), seen_data shape : (190,)
{'sitting': 19, 'lying': 19, 'standing': 19, 'washing dishes': 19, 'vacuuming': 19, 'sweeping': 19, 'walking': 38, 'ascending stairs': 19, 'descending stairs': 19, 'treadmill running': 19, 'cycling': 19, 'rope jumping': 19}


In [5]:
dataReader.idToLabel

['sitting',
 'lying',
 'standing',
 'washing dishes',
 'vacuuming',
 'sweeping',
 'walking',
 'ascending stairs',
 'descending stairs',
 'treadmill running',
 'cycling',
 'rope jumping']

In [10]:
def sum_dict(d1, d2):
    return {k: d1[k]+d2[k] for k in d1.keys()}

In [11]:
# training dataset
train_X, train_y = data_dict['train']['X'], data_dict['train']['y']
print("number of training samples : ", len(train_y))
s = np.unique(train_y, return_counts=True)
std = dict(zip(s[0], s[1]))
print("per class count : ", std)

number of training samples :  15177
per class count :  {0: 848, 2: 846, 3: 1798, 5: 1422, 6: 3953, 8: 509, 9: 1761, 10: 3550, 11: 490}


In [12]:
# Seen Evaluation dataset
Seval_X, Seval_y = data_dict['eval-seen']['X'], data_dict['eval-seen']['y']
print("number of training samples : ", len(Seval_y))
s = np.unique(Seval_y, return_counts=True)
sed = dict(zip(s[0], s[1]))
print("per class count : ", sed)

number of training samples :  3795
per class count :  {0: 232, 2: 228, 3: 450, 5: 351, 6: 965, 8: 120, 9: 421, 10: 910, 11: 118}


In [13]:
sum_dict(std, sed)

{0: 1080,
 2: 1074,
 3: 2248,
 5: 1773,
 6: 4918,
 8: 629,
 9: 2182,
 10: 4460,
 11: 608}

In [14]:
# Unseen Eval dataset
Ueval_X, Ueval_y = data_dict['eval-unseen']['X'], data_dict['eval-unseen']['y']
print("number of training samples : ", len(Ueval_y))
s = np.unique(Ueval_y, return_counts=True)
utd = dict(zip(s[0], s[1]))
print("per class count : ", utd)

number of training samples :  581
per class count :  {1: 215, 4: 221, 7: 145}


In [15]:
# Unseen Eval dataset
test_X, test_y = data_dict['test']['X'], data_dict['test']['y']
print("number of training samples : ", len(test_y))
s = np.unique(test_y, return_counts=True)
ued = dict(zip(s[0], s[1]))
print("per class count : ", ued)

number of training samples :  2329
per class count :  {1: 871, 4: 863, 7: 595}


In [16]:
sum_dict(utd, ued)

{1: 1086, 4: 1084, 7: 740}

In [15]:
len(data_dict['test']['X'])

2329

In [16]:
print("Total number of data points : ", len(test_y)+len(Ueval_y)+len(Seval_y)+len(train_y))
print("Total number of unseen data : ", len(test_y)+len(Ueval_y))
print("Total number of seen data : ", len(Seval_y)+len(train_y))

Total number of data points :  21844
Total number of unseen data :  2910
Total number of seen data :  18934


In [17]:
all_classes = dataReader.idToLabel
seen_classes = data_dict['seen_classes']
unseen_classes = data_dict['unseen_classes']

In [18]:
data_dict['train']['X'].shape

(15147, 130, 24)

In [19]:
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch

In [24]:
class DaLiAcDataset(Dataset):
    def __init__(self, data, actions, attributes, action_feats, action_classes, seq_len=120):
        super(DaLiAcDataset, self).__init__()
        self.data = torch.from_numpy(data)
        self.actions = actions
        self.attributes = torch.from_numpy(attributes)
        self.action_feats = torch.from_numpy(action_feats)
        self.target_feat = torch.from_numpy(action_feats[action_classes, :])
        self.seq_len = seq_len
        # build action to id mapping dict
        self.n_action = len(self.actions)
        self.action2Id = dict(zip(action_classes, range(self.n_action)))

    def __getitem__(self, ind):
        x = self.data[ind, ...]
        x_mask = np.array([0]) #self.padding_mask[ind, ...]
        target = self.actions[ind]
        y = torch.from_numpy(np.array([self.action2Id[target]]))
        y_feat = self.action_feats[target, ...]
        attr = self.attributes[target, ...]
        return x, y, y_feat, attr, x_mask

    def __len__(self):
        return len(self.data)

In [25]:
all_classes = dataReader.idToLabel
seen_classes = data_dict['seen_classes']
unseen_classes = data_dict['unseen_classes']

In [26]:
attr_mat = np.zeros((13, 32))
feat_mat = np.zeros((13, 42))
train_dt = DaLiAcDataset(data=data_dict['train']['X'], actions=data_dict['train']['y'], attributes=attr_mat, action_feats=feat_mat, action_classes=seen_classes, seq_len=120)
train_dl = DataLoader(train_dt, batch_size=32, shuffle=True, pin_memory=True)

In [27]:
for b in train_dl:
    x, y, yf, attr, xm = b
    print(x.shape)
    break

torch.Size([32, 130, 24])
