In [1]:
import os 
import numpy as np 
import pandas as pd 
from glob import glob
import random

from torch.utils.data import Dataset, DataLoader
import torch

In [2]:
class OPPReader(object):
    def __init__(self, root_path):
        self.root_path = root_path
        self.readOPP()

    def readFile(self, file_path, active_cols):
        all_data = {"data": {}, "target": {}, 'collection': []}
        prev_action = -1
        starting = True
        # action_seq = []
        accepted_actions = [406516, 406517, 404516, 404517, 406520, 404520, 406505, 404505, 406519, 404519, 406511, 404511, 406508, 404508, 408512, 407521, 405506]
        action_ID = 0
        cols = list(range(37,133))
        for l in open(file_path).readlines():
            s = l.rstrip('\n').split(' ')
            act = int(s[-1])
            if act in accepted_actions:
                if (prev_action != act):
                    if not(starting):
                        df = pd.DataFrame(action_seq)
                        intep_df = df.interpolate(method='linear', limit_direction='both', axis=0)
                        intep_data = intep_df.values[:, cols]
                        # print(intep_data[:, 1:])
                        k = np.isnan(intep_data[:, :]).mean()
                        if k == 0:
                            # intep_data = action_seq
                            all_data['data'][action_ID] = np.array(intep_data)
                            # print(all_data['data'][action_ID].shape)
                            all_data['target'][action_ID] = prev_action
                            action_ID+=1
                    action_seq = []
                else:
                    starting = False

                data_seq = np.array(s[:-1]).astype(np.float16)                # data_seq[np.isnan(data_seq)] = 0
                action_seq.append(data_seq)
                prev_action = act
                
                # print(prev_action)
                all_data['collection'].append(data_seq)
        else: 
            if len(action_seq) > 1:
                df = pd.DataFrame(action_seq)
                intep_df = df.interpolate(method='linear', limit_direction='backward', axis=0)
                intep_data = intep_df.values[:, cols]
                # print(np.isnan(intep_data[:, :]).mean(axis=0))
                if np.isnan(intep_data).sum() == 0:
                    all_data['data'][action_ID] = np.array(intep_data)
                    all_data['target'][action_ID] = prev_action
        return all_data

    def readOPPFiles(self, filelist, labelToId):
        data = []
        labels = []
        collection = []
        accepted_cols = list(range(1, 101))
        for i, fpath in enumerate(filelist):
            print('Reading file %d of %d' % (i+1, len(filelist)))
            # fpath = os.path.join(self.root_path, filename)
            file_data = self.readFile(fpath, accepted_cols)
            # print(np.array(list(file_data['data'].values())).shape)
            data.extend(list(file_data['data'].values()))
            labels.extend(list(file_data['target'].values()))
            collection.extend(file_data['collection'])
        return np.asarray(data), np.asarray(labels, dtype=int), np.array(collection)

    def readOPP(self):
        files = []
        for p in glob(f'{self.root_path}/S*-ADL*.dat'):
            files.append(p)
            # files = [f'dataset_{i}.txt' for i in range(1, 20)]
            
        label_map = [
            (406516, 'Open Door 1'),
            (406517, 'Open Door 2'),
            (404516, 'Close Door 1'),
            (404517, 'Close Door 2'),
            (406520, 'Open Fridge'),
            (404520, 'Close Fridge'),
            (406505, 'Open Dishwasher'),
            (404505, 'Close Dishwasher'),
            (406519, 'Open Drawer 1'),
            (404519, 'Close Drawer 1'),
            (406511, 'Open Drawer 2'),
            (404511, 'Close Drawer 2'),
            (406508, 'Open Drawer 3'),
            (404508, 'Close Drawer 3'),
            (408512, 'Clean Table'),
            (407521, 'Drink from Cup'),
            (405506, 'Toggle Switch'),
        ]
        labelToId = {x[0]: i for i, x in enumerate(label_map)}
        # print "label2id=",labelToId
        idToLabel = [x[1] for x in label_map]
        # print "id2label=",idToLabel
        # print "cols",cols
        self.data, self.targets, self.all_data = self.readOPPFiles(files, labelToId)
        # print(self.data)
        # nan_perc = np.isnan(self.data).astype(int).mean()
        # print("null value percentage ", nan_perc)
        # f = lambda x: labelToId[x]
        self.targets = np.array([labelToId[i] for i in list(self.targets)])
        self.label_map = label_map
        self.idToLabel = idToLabel
        # return data, idToLabel

    def dataTableOptimizerUpdated(self, mat_file):
        our_data = mat_file['d_iner']
        data = []
        frame_size = len(our_data[0][0])-1
        for each in range(0,frame_size):
            data_flatten = our_data[:,:,each].flatten()
            data_flatten = data_flatten
            data.append(data_flatten)
        return data,frame_size

    def resample(self, signal, freq=10):
        step_size = int(30/freq)
        seq_len, _ = signal.shape 
        resample_indx = np.arange(0, seq_len, step_size)
        resampled_sig = signal[resample_indx, :]
        return resampled_sig

    def windowing(self, signal, window_len, overlap):
        seq_len = int(window_len*30) # 30Hz compensation 
        overlap_len = int(overlap*30) # 30Hz
        l, _ = signal.shape
        if l > seq_len:
            windowing_points = np.arange(start=0, stop=l-seq_len, step=seq_len-overlap_len, dtype=int)[:-1]

            windows = [signal[p:p+seq_len, :] for p in windowing_points]
            # print(np.array(windows).shape)
        else:
            windows = []
        return windows

    def resampling(self, data, targets, window_size, window_overlap, resample_freq):
        assert len(data) == len(targets), "# action data & # action labels are not matching"
        all_data, all_ids, all_labels = [], [], []
        for i, d in enumerate(data):
            # print(">>>>>>>>>>>>>>>  ", np.isnan(d).mean())
            label = targets[i]
            windows = self.windowing(d, window_size, window_overlap)
            for w in windows:
                # print(np.isnan(w).mean(), label, i)
                resample_sig = w#self.resample(w, resample_freq)
                # print(np.isnan(resample_sig).mean(), label, i)
                # print(resample_sig.shape)
                all_data.append(resample_sig)
                all_ids.append(i+1)
                all_labels.append(label)

        return all_data, all_ids, all_labels

    def generate(self, unseen_classes, window_size=5.21, window_overlap=1, resample_freq=20, seen_ratio=0.2, unseen_ratio=0.8):
        # assert all([i in list(self.label_map.keys()) for i in unseen_classes]), "Unknown Class label!"
        seen_classes = [i for i in range(len(self.idToLabel)) if i not in unseen_classes]
        unseen_mask = np.in1d(self.targets, unseen_classes)
        
        # build seen dataset 
        seen_data = self.data[np.invert(unseen_mask)]
        seen_targets = self.targets[np.invert(unseen_mask)]
        print(f"data shape : {self.data.shape}, seen_data shape : {seen_data.shape}")
        ids, cnts = np.unique(self.targets, return_counts=True)
        print({self.idToLabel[ids[e]]: cnts[e] for e in range(len(ids))})
        
        # build unseen dataset
        unseen_data = self.data[unseen_mask]
        unseen_targets = self.targets[unseen_mask]

        # # resampling seen and unseen datasets 
        seen_data, seen_ids, seen_targets = self.resampling(seen_data, seen_targets, window_size, window_overlap, resample_freq)
        unseen_data, unseen_ids, unseen_targets = self.resampling(unseen_data, unseen_targets, window_size, window_overlap, resample_freq)

        seen_data, seen_targets = np.array(seen_data), np.array(seen_targets)
        unseen_data, unseen_targets = np.array(unseen_data), np.array(unseen_targets)
       # train-val split
        seen_index = list(range(len(seen_targets)))
        random.shuffle(seen_index)
        split_point = int((1-seen_ratio)*len(seen_index))
        fst_index, sec_index = seen_index[:split_point], seen_index[split_point:]
        # print(fst_index)
        # print(type(fst_index), type(sec_index), type(seen_data), type(seen_targets))
        X_seen_train, X_seen_val = seen_data[fst_index, :], seen_data[sec_index, :]
        y_seen_train, y_seen_val = seen_targets[fst_index], seen_targets[sec_index]
        
    
        data = {'train': {
                        'X': X_seen_train,
                        'y': y_seen_train
                        },
                'eval-seen':{
                        'X': X_seen_val,
                        'y': y_seen_val
                        },
                'test': {
                        'X': unseen_data,
                        'y': unseen_targets
                        },
                'seen_classes': seen_classes,
                'unseen_classes': unseen_classes
                }

        return data

In [3]:
dataReader = OPPReader(root_path='../data/OPP/')

Reading file 1 of 20
Reading file 2 of 20
Reading file 3 of 20
Reading file 4 of 20
Reading file 5 of 20
Reading file 6 of 20
Reading file 7 of 20
Reading file 8 of 20
Reading file 9 of 20
Reading file 10 of 20
Reading file 11 of 20
Reading file 12 of 20
Reading file 13 of 20
Reading file 14 of 20
Reading file 15 of 20
Reading file 16 of 20
Reading file 17 of 20
Reading file 18 of 20
Reading file 19 of 20
Reading file 20 of 20


  return np.asarray(data), np.asarray(labels, dtype=int), np.array(collection)


In [4]:
data_dict = dataReader.generate(unseen_classes=[2, 5, 7], window_size=1, window_overlap=0.5, resample_freq=30)

data shape : (978,), seen_data shape : (756,)
{'Open Door 1': 45, 'Open Door 2': 43, 'Close Door 1': 38, 'Close Door 2': 40, 'Open Fridge': 129, 'Close Fridge': 131, 'Open Dishwasher': 54, 'Close Dishwasher': 53, 'Open Drawer 1': 50, 'Close Drawer 1': 49, 'Open Drawer 2': 45, 'Close Drawer 2': 44, 'Open Drawer 3': 56, 'Close Drawer 3': 57, 'Clean Table': 17, 'Drink from Cup': 56, 'Toggle Switch': 71}


In [5]:
np.isnan(data_dict['train']['X']).mean()

0.0

In [6]:
data_dict['train']['X'].shape

(3118, 30, 96)

In [134]:
dataReader.idToLabel

['Open Door 1',
 'Open Door 2',
 'Close Door 1',
 'Close Door 2',
 'Open Fridge',
 'Close Fridge',
 'Open Dishwasher',
 'Close Dishwasher',
 'Open Drawer 1',
 'Close Drawer 1',
 'Open Drawer 2',
 'Close Drawer 2',
 'Open Drawer 3',
 'Close Drawer 3',
 'Clean Table',
 'Drink from Cup',
 'Toggle Switch']

In [146]:
class OPPDataset(Dataset):
    def __init__(self, data, actions, action_classes, seq_len=120):
        super(OPPDataset, self).__init__()
        cols = list(range(4, 9))+list(range(16,18))+list(range(22,34))+list(range(37,133))
        self.data = torch.from_numpy(data)[:, :, cols] # get only subject related IMU features
        self.actions = actions
        # self.attributes = torch.from_numpy(attributes)
        # self.action_feats = torch.from_numpy(action_feats)
        # self.target_feat = torch.from_numpy(action_feats[action_classes, :])
        self.seq_len = seq_len
        # build action to id mapping dict
        self.n_action = len(self.actions)
        self.action2Id = dict(zip(action_classes, range(self.n_action)))

    def __getitem__(self, ind):
        x = self.data[ind, ...]
        x_mask = np.array([0]) #self.padding_mask[ind, ...]
        target = self.actions[ind]
        y = torch.from_numpy(np.array([self.action2Id[target]]))
        # y_feat = self.action_feats[target, ...]
        # attr = self.attributes[target, ...]
        return x, y

    def __len__(self):
        return self.data.shape[0]

In [None]:
data_dict = dataReader.generate(unseen_classes=[2, 5, 7], seen_ratio=0.2, unseen_ratio=0.8, window_size=1, window_overlap=0.5, resample_freq=30)
sample_dt = OPPDataset(data=data_dict['train']['X'], actions=data_dict['train']['y'], action_classes=data_dict['seen_classes'], seq_len=100)
sample_dl = DataLoader(sample_dt, batch_size=32, shuffle=True, pin_memory=True, drop_last=True)
for d in sample_dl:
    print(d[0].shape, torch.isnan(d[0]).numpy().mean(axis=0).mean(axis=0))
    

In [158]:
np.isnan(data_dict['train']['X']).mean()

0.02675010550247693

In [26]:
def sum_dict(d1, d2):
    return {k: d1[k]+d2[k] for k in d1.keys()}

In [7]:
# training dataset
train_X, train_y = data_dict['train']['X'], data_dict['train']['y']
print("number of training samples : ", len(train_y))
s = np.unique(train_y, return_counts=True)
std = dict(zip(s[0], s[1]))
print("per class count : ", std)

number of training samples :  3118
per class count :  {0: 238, 1: 262, 3: 192, 4: 350, 6: 123, 8: 65, 9: 53, 10: 58, 11: 40, 12: 90, 13: 79, 14: 272, 15: 1191, 16: 105}


In [8]:
# Seen Evaluation dataset
Seval_X, Seval_y = data_dict['eval-seen']['X'], data_dict['eval-seen']['y']
print("number of training samples : ", len(Seval_y))
s = np.unique(Seval_y, return_counts=True)
sed = dict(zip(s[0], s[1]))
print("per class count : ", sed)

number of training samples :  780
per class count :  {0: 59, 1: 54, 3: 47, 4: 87, 6: 40, 8: 14, 9: 7, 10: 13, 11: 10, 12: 28, 13: 26, 14: 80, 15: 299, 16: 16}


In [9]:
# Unseen Eval dataset
Ueval_X, Ueval_y = data_dict['eval-unseen']['X'], data_dict['eval-unseen']['y']
print("number of training samples : ", len(Ueval_y))
s = np.unique(Ueval_y, return_counts=True)
utd = dict(zip(s[0], s[1]))
print("per class count : ", utd)

KeyError: 'eval-unseen'

In [10]:
# Unseen Eval dataset
test_X, test_y = data_dict['test']['X'], data_dict['test']['y']
print("number of training samples : ", len(test_y))
s = np.unique(test_y, return_counts=True)
ued = dict(zip(s[0], s[1]))
print("per class count : ", ued)

number of training samples :  742
per class count :  {2: 235, 5: 360, 7: 147}


In [30]:
sum_dict(utd, ued)

NameError: name 'utd' is not defined

In [13]:
all_labels = np.concatenate([train_y, Seval_y, test_y])
all_labels.shape

(4640,)

In [14]:
np.unique(all_labels, return_counts=True)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16]),
 array([ 297,  316,  235,  239,  437,  360,  163,  147,   79,   60,   71,
          50,  118,  105,  352, 1490,  121], dtype=int64))

In [19]:
len(data_dict['test']['X'])

143

In [11]:
print("Total number of data points : ", len(test_y)+len(Seval_y)+len(train_y))
print("Total number of unseen data : ", len(test_y))
print("Total number of seen data : ", len(Seval_y)+len(train_y))

Total number of data points :  4640
Total number of unseen data :  742
Total number of seen data :  3898


In [21]:
all_classes = dataReader.idToLabel
seen_classes = data_dict['seen_classes']
unseen_classes = data_dict['unseen_classes']

In [12]:
data_dict['train']['X'].shape

(3118, 30, 96)

In [19]:
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch

In [24]:
class DaLiAcDataset(Dataset):
    def __init__(self, data, actions, attributes, action_feats, action_classes, seq_len=120):
        super(DaLiAcDataset, self).__init__()
        self.data = torch.from_numpy(data)
        self.actions = actions
        self.attributes = torch.from_numpy(attributes)
        self.action_feats = torch.from_numpy(action_feats)
        self.target_feat = torch.from_numpy(action_feats[action_classes, :])
        self.seq_len = seq_len
        # build action to id mapping dict
        self.n_action = len(self.actions)
        self.action2Id = dict(zip(action_classes, range(self.n_action)))

    def __getitem__(self, ind):
        x = self.data[ind, ...]
        x_mask = np.array([0]) #self.padding_mask[ind, ...]
        target = self.actions[ind]
        y = torch.from_numpy(np.array([self.action2Id[target]]))
        y_feat = self.action_feats[target, ...]
        attr = self.attributes[target, ...]
        return x, y, y_feat, attr, x_mask

    def __len__(self):
        return len(self.data)

In [25]:
all_classes = dataReader.idToLabel
seen_classes = data_dict['seen_classes']
unseen_classes = data_dict['unseen_classes']

In [26]:
attr_mat = np.zeros((13, 32))
feat_mat = np.zeros((13, 42))
train_dt = DaLiAcDataset(data=data_dict['train']['X'], actions=data_dict['train']['y'], attributes=attr_mat, action_feats=feat_mat, action_classes=seen_classes, seq_len=120)
train_dl = DataLoader(train_dt, batch_size=32, shuffle=True, pin_memory=True)

In [27]:
for b in train_dl:
    x, y, yf, attr, xm = b
    print(x.shape)
    break

torch.Size([32, 130, 24])


In [None]:
# get IMU feature columns in OPP 
