In [1]:
import os
import json
import random 
import math
import numpy as np
import pandas as pd
from tqdm import tqdm
from copy import deepcopy

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, Tensor
from torch.nn import functional as F 
from torch.optim import Adam

from scipy.signal import resample
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler

import matplotlib.pyplot as plt

In [14]:
# build PAMAP2 dataset data reader
class PAMAP2Reader(object):
    """Reader for PAMAP2-style ``.dat`` recordings.

    Constructing the reader immediately parses every subject file listed in
    ``readPamap2`` from ``root_path`` and stores:

    * ``data``      -- 1-D object array holding one 2-D array
                       (rows x selected columns) per activity segment.
    * ``targets``   -- integer class id per segment, i.e. the position of the
                       raw activity id inside ``label_map``.
    * ``all_data``  -- every kept row, in reading order.
    * ``label_map`` -- list of ``(raw activity id, name)`` pairs.
    * ``idToLabel`` -- list of activity names indexed by class id.
    """

    def __init__(self, root_path):
        """Remember ``root_path`` and load the whole dataset from it."""
        self.root_path = root_path
        self.readPamap2()

    @staticmethod
    def _interpolate_segment(rows):
        """Stack ``rows`` into a 2-D array and linearly fill NaNs along time."""
        frame = pd.DataFrame(rows)
        filled = frame.interpolate(method='linear', limit_direction='both', axis=0)
        return np.array(filled.values)

    def readFile(self, file_path, cols):
        """Parse one whitespace-separated recording into activity segments.

        Field 1 of every row is the activity id; rows whose id is ``"0"`` are
        ignored (they do not terminate the running segment). Fields from
        index 3 onwards are the sensor values, of which ``cols`` are kept.

        Args:
            file_path: path of the ``.dat`` file.
            cols: indices (relative to field 3) of the columns to keep.

        Returns:
            dict with
              ``'data'``       -- ``{segment index: 2-D array}``, NaNs filled by
                                  linear interpolation,
              ``'target'``     -- ``{segment index: raw activity id}``,
              ``'collection'`` -- list of every kept row (before interpolation).
        """
        all_data = {"data": {}, "target": {}, 'collection': []}
        prev_action = -1
        # Stays True until two consecutive rows share a label; while it is
        # True a label change does not flush the rows gathered so far.
        starting = True
        action_ID = 0
        # Defined up-front so a file without labelled rows returns an empty
        # result instead of failing on an undefined name.
        action_seq = []

        # `with` guarantees the handle is closed; iterating the handle avoids
        # loading the whole file into memory at once.
        with open(file_path) as handle:
            for line in handle:
                s = line.split()
                if len(s) < 2 or s[1] == "0":
                    # blank/malformed line or unlabelled row
                    continue
                action = int(s[1])
                if prev_action != action:
                    if not starting:
                        all_data['data'][action_ID] = self._interpolate_segment(action_seq)
                        all_data['target'][action_ID] = prev_action
                        action_ID += 1
                    action_seq = []
                else:
                    starting = False
                # NOTE(review): float16 keeps only ~3 significant digits and
                # overflows to inf above 65504 -- confirm this is intended.
                data_seq = np.array(s[3:])[cols].astype(np.float16)
                action_seq.append(data_seq)
                prev_action = action
                all_data['collection'].append(data_seq)

        # Flush the trailing segment (only when it has more than one row).
        if len(action_seq) > 1:
            all_data['data'][action_ID] = self._interpolate_segment(action_seq)
            all_data['target'][action_ID] = prev_action
        return all_data

    def readPamap2Files(self, filelist, cols, labelToId):
        """Read every file in ``filelist`` (relative to ``root_path``).

        Args:
            filelist: file names to read.
            cols: column indices forwarded to ``readFile``.
            labelToId: accepted for interface compatibility; not used here.

        Returns:
            ``(segments, labels, collection)`` where ``segments`` is a 1-D
            object array of per-segment 2-D arrays, ``labels`` the matching
            raw activity ids and ``collection`` an array of all kept rows.
        """
        data = []
        labels = []
        collection = []
        for i, filename in enumerate(filelist):
            print('Reading file %d of %d' % (i+1, len(filelist)))
            fpath = os.path.join(self.root_path, filename)
            file_data = self.readFile(fpath, cols)
            data.extend(list(file_data['data'].values()))
            labels.extend(list(file_data['target'].values()))
            collection.extend(file_data['collection'])

        # Segments have different lengths. np.asarray() on such a ragged list
        # warns on older NumPy and raises ValueError on NumPy >= 1.24, so the
        # object array is filled explicitly.
        segments = np.empty(len(data), dtype=object)
        for idx, segment in enumerate(data):
            segments[idx] = segment
        return segments, np.asarray(labels, dtype=int), np.array(collection)

    def readPamap2(self):
        """Load all subject files and populate the reader's attributes.

        Raw activity ids are remapped to consecutive class ids given by their
        position in ``label_map``.
        """
        files = ['subject101.dat', 'subject102.dat','subject103.dat','subject104.dat', 'subject105.dat', 'subject106.dat', 'subject107.dat', 'subject108.dat', 'subject109.dat', 'subject110.dat', 'subject111.dat', 'subject112.dat', 'subject113.dat', 'subject114.dat']

        label_map = [
            (0, 'other'),
            (1, 'lying'),
            (2, 'sitting'),
            (3, 'standing'),
            (4, 'walking'),
            (5, 'running'),
            (6, 'cycling'),
            (7, 'Nordic walking'),
            (9, 'watching TV'),
            (10, 'computer work'),
            (11, 'car driving'),
            (12, 'ascending stairs'),
            (13, 'descending stairs'),
            (16, 'vacuum cleaning'),
            (17, 'ironing'),
            (18, 'folding laundry'),
            (19, 'house cleaning'),
            (20, 'playing soccer'),
            (24, 'rope jumping')
        ]
        labelToId = {x[0]: i for i, x in enumerate(label_map)}
        idToLabel = [x[1] for x in label_map]
        # NOTE(review): the index groups are spaced 16 apart (1, 17, 33) while
        # the public PAMAP2 readme describes 17 columns per IMU block --
        # confirm these indices select the intended sensor channels.
        cols = [1,2,3,7,8,9,17,18,19,23,24,25,33,34,35,39,40,41]
        self.data, self.targets, self.all_data = self.readPamap2Files(files, cols, labelToId)
        print(np.unique(self.targets))
        self.targets = np.array([labelToId[i] for i in list(self.targets)])
        print(np.unique(self.targets))
        self.label_map = label_map
        self.idToLabel = idToLabel

    def aggregate(self, signal):
        """Summarise one window as interleaved per-column mean and std.

        Args:
            signal: 2-D array (rows x columns).

        Returns:
            1-D float64 array ``[mean_0, std_0, mean_1, std_1, ...]``.
        """
        # Cast once; statistics are computed in float64.
        values = signal.astype(np.float64)
        means = values.mean(axis=0)
        stds = values.std(axis=0)
        # Column-major flattening of the (2, n_columns) stack interleaves
        # each column's mean with its std.
        return np.vstack((means, stds)).reshape((-1,), order='F')

    def windowing(self, signal, window_len, overlap, sampling_rate=100):
        """Cut ``signal`` into fixed-length, overlapping windows.

        Args:
            signal: 2-D array (rows x columns).
            window_len: window length in seconds.
            overlap: overlap between consecutive windows in seconds.
            sampling_rate: samples per second used to convert seconds to rows
                (defaults to the previously hard-coded 100).

        Returns:
            list of 2-D slices of ``signal``; empty when ``signal`` has no more
            rows than one window.

        Raises:
            ValueError: if the window is empty or the overlap is not smaller
                than the window (the step would be zero or negative).
        """
        # round() instead of plain int(): 0.29 * 100 is 28.999999999999996 and
        # would otherwise be truncated to 28 samples.
        seq_len = int(round(window_len * sampling_rate))
        overlap_len = int(round(overlap * sampling_rate))
        step = seq_len - overlap_len
        if seq_len <= 0:
            raise ValueError("window_len must cover at least one sample")
        if step <= 0:
            raise ValueError("overlap must be smaller than window_len")

        n_rows = signal.shape[0]
        if n_rows <= seq_len:
            return []
        # NOTE(review): the exclusive stop plus the [:-1] drop the last one or
        # two windows that would still fit; kept as-is so existing sample
        # counts do not change -- confirm whether this trimming is intended.
        starts = np.arange(start=0, stop=n_rows - seq_len, step=step, dtype=int)[:-1]
        return [signal[p:p + seq_len, :] for p in starts]

    def resampling(self, data, targets, window_size, window_overlap, resample_freq):
        """Window every segment and aggregate each window into one feature row.

        Args:
            data: sequence of 2-D segment arrays.
            targets: class id per segment (same length as ``data``).
            window_size: window length in seconds.
            window_overlap: window overlap in seconds.
            resample_freq: accepted for interface compatibility; not used.

        Returns:
            ``(features, ids, labels)`` -- three parallel lists with one entry
            per window; ``ids`` holds the 1-based index of the source segment.
        """
        assert len(data) == len(targets), "# action data & # action labels are not matching"
        all_data, all_ids, all_labels = [], [], []
        for i, d in enumerate(data):
            label = targets[i]
            for w in self.windowing(d, window_size, window_overlap):
                all_data.append(self.aggregate(w))
                all_ids.append(i+1)
                all_labels.append(label)

        return all_data, all_ids, all_labels

    def generate(self, unseen_classes, resampling=True, window_size=5.21, window_overlap=1, resample_freq=10, seen_ratio=0.2, unseen_ratio=0.8, random_state=None):
        """Build the seen-train / seen-eval / unseen-test feature sets.

        Args:
            unseen_classes: class ids (indices into ``idToLabel``) held out.
            resampling: accepted for interface compatibility; not used.
            window_size: window length in seconds.
            window_overlap: window overlap in seconds.
            resample_freq: forwarded to ``resampling`` (unused there).
            seen_ratio: fraction of seen windows placed in ``'eval-seen'``.
            unseen_ratio: accepted for interface compatibility; the returned
                ``'test'`` set always contains every unseen window.
            random_state: optional seed for a private shuffler so the split is
                reproducible; ``None`` uses the global ``random`` module.

        Returns:
            dict with ``'train'``, ``'eval-seen'`` and ``'test'`` (each a dict
            of ``'X'`` features and ``'y'`` class ids) plus ``'seen_classes'``
            and ``'unseen_classes'``.

        Raises:
            ValueError: if ``unseen_classes`` contains an id outside
                ``range(len(idToLabel))``.
        """
        n_classes = len(self.idToLabel)
        unknown = [c for c in unseen_classes if c not in range(n_classes)]
        if unknown:
            raise ValueError("Unknown class label(s): %s" % (unknown,))

        seen_classes = [i for i in range(n_classes) if i not in unseen_classes]
        # np.isin replaces np.in1d, which is deprecated in NumPy 2.
        unseen_mask = np.isin(self.targets, list(unseen_classes))
        seen_mask = ~unseen_mask

        s = np.unique(self.targets, return_counts=True)
        print("per class count : ", dict(zip([self.idToLabel[i] for i in s[0]], s[1])))

        # Window + aggregate the seen and the unseen segments separately.
        seen_data, seen_ids, seen_targets = self.resampling(self.data[seen_mask], self.targets[seen_mask], window_size, window_overlap, resample_freq)
        unseen_data, unseen_ids, unseen_targets = self.resampling(self.data[unseen_mask], self.targets[unseen_mask], window_size, window_overlap, resample_freq)

        seen_data, seen_targets = np.array(seen_data), np.array(seen_targets)
        unseen_data, unseen_targets = np.array(unseen_data), np.array(unseen_targets)

        if seen_data.size:
            # value-range sanity check; skipped when there is nothing to reduce
            print('>> ', seen_data.min(), seen_data.max())

        # Shuffle the seen windows, then split them into train / eval-seen.
        shuffler = random if random_state is None else random.Random(random_state)
        seen_index = list(range(len(seen_targets)))
        shuffler.shuffle(seen_index)
        split_point = int((1-seen_ratio)*len(seen_index))
        train_index = np.array(seen_index[:split_point], dtype=int)
        eval_index = np.array(seen_index[split_point:], dtype=int)

        # First-axis indexing also works when no window was produced at all.
        data = {'train': {
                        'X': seen_data[train_index],
                        'y': seen_targets[train_index]
                        },
                'eval-seen':{
                        'X': seen_data[eval_index],
                        'y': seen_targets[eval_index]
                        },
                'test': {
                        'X': unseen_data,
                        'y': unseen_targets
                        },
                'seen_classes': seen_classes,
                'unseen_classes': unseen_classes
                }

        return data

In [15]:
# Build the reader; the constructor parses every subject file under this
# directory, so this cell performs all of the file I/O.
dataReader = PAMAP2Reader('../data/PAMAP2_Dataset/Protocol/')
# Activity names indexed by class id.
actionList = dataReader.idToLabel

Reading file 1 of 14
Reading file 2 of 14
Reading file 3 of 14
Reading file 4 of 14
Reading file 5 of 14
Reading file 6 of 14
Reading file 7 of 14
Reading file 8 of 14
Reading file 9 of 14
Reading file 10 of 14
Reading file 11 of 14
Reading file 12 of 14
Reading file 13 of 14
Reading file 14 of 14


  return np.asarray(data), np.asarray(labels, dtype=int), np.array(collection)


[ 1  2  3  4  5  6  7  9 10 11 12 13 16 17 18 19 20 24]
[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18]


In [16]:
# Hold out class ids 1 ('lying') and 3 ('standing') as the unseen test classes.
# Windows are 5.21 s long with 4.21 s overlap, i.e. a 1 s step.
# NOTE: resampling, resample_freq and unseen_ratio are accepted by generate()
# but do not affect the dictionary it returns.
data_dict = dataReader.generate(unseen_classes=[1, 3], resampling=True, seen_ratio=0.2, unseen_ratio=0.8, window_size=5.21, window_overlap=4.21, resample_freq=10)

per class count :  {'lying': 8, 'sitting': 8, 'standing': 8, 'walking': 8, 'running': 6, 'cycling': 7, 'Nordic walking': 7, 'watching TV': 1, 'computer work': 4, 'car driving': 1, 'ascending stairs': 16, 'descending stairs': 17, 'vacuum cleaning': 8, 'ironing': 8, 'folding laundry': 4, 'house cleaning': 5, 'playing soccer': 2, 'rope jumping': 6}
>>  -24.17870812589971 38.55218330134357
<class 'list'> <class 'list'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>


In [17]:
# Largest value in the second activity segment (quick magnitude check).
dataReader.data[1].max()

34.3

In [None]:
# Print, for every activity segment, how many NaN and how many infinite
# entries it contains.
for segment in dataReader.data:
    nan_count = np.isnan(segment).sum()
    inf_count = np.isinf(segment).sum()
    print(nan_count, inf_count)

In [19]:
# Shape of the training feature matrix: (windows, features).
data_dict['train']['X'].shape

(18252, 36)

In [20]:
# Number of infinite entries in the training features.
np.isinf(data_dict['train']['X']).sum()

0

In [21]:
# Shape of the training feature matrix: (windows, features).
# NOTE(review): same expression as an earlier cell; one of the two can go.
data_dict['train']['X'].shape

(18252, 36)

In [22]:
# Number of NaN entries in the training features.
np.isnan(data_dict['train']['X']).sum()

0

In [6]:
# Training split (seen classes only): size and per-class window counts.
train_X = data_dict['train']['X']
train_y = data_dict['train']['y']
print("number of training samples : ", len(train_y))
# s is the (class ids, counts) pair; zip(*s) pairs each id with its count.
s = np.unique(train_y, return_counts=True)
print("per class count : ", dict(zip(*s)))

number of training samples :  18252
per class count :  {2: 1459, 4: 1844, 5: 760, 6: 1269, 7: 1472, 8: 685, 9: 2483, 10: 443, 11: 841, 12: 770, 13: 1367, 14: 1879, 15: 781, 16: 1451, 17: 377, 18: 371}


In [7]:
# Seen evaluation split: held-out windows from the seen classes.
Seval_X, Seval_y = data_dict['eval-seen']['X'], data_dict['eval-seen']['y']
# The label used to read "training samples", which mislabelled this split.
print("number of seen evaluation samples : ", len(Seval_y))
# s is the (class ids, counts) pair returned by np.unique.
s = np.unique(Seval_y, return_counts=True)
print("per class count : ", dict(zip(s[0], s[1])))

number of training samples :  4563
per class count :  {2: 348, 4: 499, 5: 189, 6: 336, 7: 370, 8: 146, 9: 594, 10: 96, 11: 240, 12: 183, 13: 342, 14: 462, 15: 195, 16: 391, 17: 81, 18: 91}


In [8]:
# Unseen test split: every window of the held-out (unseen) classes.
test_X, test_y = data_dict['test']['X'], data_dict['test']['y']
# The label used to read "training samples", which mislabelled this split.
print("number of unseen test samples : ", len(test_y))
# s is the (class ids, counts) pair returned by np.unique.
s = np.unique(test_y, return_counts=True)
print("per class count : ", dict(zip(s[0], s[1])))

number of training samples :  3734
per class count :  {1: 1880, 3: 1854}


In [9]:
# Overall sample counts: everything, the unseen part, and the seen part
# (seen = train + seen-eval).
print("Total number of data points : ", sum(len(y) for y in (test_y, Seval_y, train_y)))
print("Total number of unseen data : ", len(test_y))
print("Total number of seen data : ", sum(len(y) for y in (Seval_y, train_y)))

Total number of data points :  26549
Total number of unseen data :  3734
Total number of seen data :  22815
