In [37]:
import numpy as np
import h5py

# Folder holding the per-participant CSV recordings (relative to the notebook).
DATA_PATH = '../User Identification From Walking Activity/'
# One CSV per participant, named 1.csv ... 21.csv.
# `range` (instead of the Python-2-only `xrange`) keeps this cell runnable on
# both Python 2 and Python 3.
PARTICIPANTS_PATHS = [DATA_PATH + str(i) + '.csv' for i in range(1, 22)]

In [39]:
def split(array, chunk_number, sample_size, overlaping_size):
    '''
    Cut a 2d time series into fixed-length, overlapping windows.

    Each window holds sample_size + overlaping_size consecutive rows of array.
    The first window is aligned with the start of array and the last one with
    its end. Every interior window i covers the rows
    [sample_size*i, sample_size*(i+1)) extended by overlaping_size // 2 rows
    before and the remaining overlap rows after; when overlaping_size is odd
    the extra row is taken from the following timestamps.

    Parameters
    ----------
    array : 2d numpy array, rows are timestamps
    chunk_number : number of windows to produce (coerced with int())
    sample_size : number of timestamps of the core of each window (>= 1)
    overlaping_size : number of extra timestamps added to each window (>= 0)

    Returns
    -------
    3d numpy array (np.zeros default dtype) of shape
    (chunk_number, sample_size + overlaping_size, array.shape[1]).
    With chunk_number == 0 an empty array of that shape is returned.

    Raises
    ------
    ValueError if the sizes are invalid or if array is too short to provide
    the requested windows.
    '''
    if sample_size < 1 or overlaping_size < 0:
        raise ValueError('sample_size must be >= 1 and overlaping_size >= 0')

    chunk_number = int(chunk_number)
    window = sample_size + overlaping_size
    n_rows = array.shape[0]
    array_split = np.zeros((chunk_number, window, array.shape[1]))

    # Nothing to fill: avoids indexing row 0 of an empty result.
    if chunk_number == 0:
        return array_split
    if n_rows < window:
        raise ValueError('array has {} rows but each window needs {}'.format(n_rows, window))

    # Floor division keeps the slice bounds integers on Python 2 and 3, and
    # left + right == overlaping_size even when overlaping_size is odd.
    left = overlaping_size // 2
    right = overlaping_size - left

    # Interior windows must lie entirely inside array.
    if chunk_number > 2:
        first_start = sample_size - left
        last_stop = sample_size * (chunk_number - 1) + right
        if first_start < 0 or last_stop > n_rows:
            raise ValueError('array is too short for {} windows of {} rows'.format(chunk_number, window))

    # Bounds: the whole overlap is taken from the only available side.
    array_split[0, :, :] = array[:window]
    array_split[-1, :, :] = array[-window:]
    # Body
    for i in range(1, chunk_number - 1):
        start = sample_size * i - left
        array_split[i, :, :] = array[start: start + window]
    return array_split

In [51]:
# Windowing parameters used to cut each recording into overlapping samples,
# stored as a 3d matrix indexed by (sample_id, timestamp, coordinates).
#
# sample_size: number of timestamps in each sample
# overlaping_percentage: fraction of sample_size taken from the surrounding
#                        timestamps and added to each sample
sample_size, overlaping_percentage = 50, 0.4

# Overlap expressed as a whole number of timestamps (nearest integer).
overlaping_size = int(round(sample_size * overlaping_percentage))

In [52]:
# Build the train/test chunks of every participant.
# Both dictionaries are keyed by the 1-based participant number.
participants_train = {}
participants_test = {}
for i, p_path in enumerate(PARTICIPANTS_PATHS):
    # Loading data
    participant = np.loadtxt(p_path, delimiter=',')
    # Test set: the last 10 percent of the time series
    test_index = int(0.9*participant.shape[0])
    train = participant[:test_index]
    test = participant[test_index:]
    # Number of chunks: how many whole samples fit in each part.
    # Floor division keeps the counts integers on Python 3 as well; with `/`
    # they become floats there and cannot be used as an array dimension.
    chunk_number_train = train.shape[0] // sample_size
    chunk_number_test = test.shape[0] // sample_size
    train_split = split(train, chunk_number_train, sample_size, overlaping_size)
    test_split = split(test, chunk_number_test, sample_size, overlaping_size)

    # enumerate is 0-based, participant numbers start at 1
    participants_train[i+1] = train_split
    participants_test[i+1] = test_split

In [55]:
# Saving the split: one HDF5 dataset per participant and per subset, named
# train_<participant> / test_<participant>. The file name records the
# windowing parameters that produced it.
with h5py.File('../data/split_{}_{}.hdf5'.format(sample_size, overlaping_percentage), "w") as f:
    # .items() exists on both Python 2 and 3 (iteritems() is Python 2 only)
    for k, v in participants_train.items():
        f['train_{}'.format(k)] = v
    for k, v in participants_test.items():
        f['test_{}'.format(k)] = v

In [59]:
# Loading the split written for the current windowing parameters back into
# dictionaries keyed by the 1-based participant number.
participants_train = {}
participants_test = {}
filename = '../data/split_{}_{}.hdf5'.format(sample_size, overlaping_percentage)
with h5py.File(filename,'r') as hf:
    # Same participant numbering as PARTICIPANTS_PATHS (1 .. number of files)
    for k in range(1, len(PARTICIPANTS_PATHS) + 1):
        # Indexing the file raises KeyError when a dataset is missing, whereas
        # np.array(hf.get(...)) would silently store an array wrapping None.
        # [...] reads the whole dataset into memory as a numpy array.
        participants_train[k] = hf['train_{}'.format(k)][...]
        participants_test[k] = hf['test_{}'.format(k)][...]