In [1]:
import pandas as pd
import numpy as np
import os

**Merge the individual CSVs into one DataFrame per device**

In [2]:
# Build one DataFrame per device by outer-joining every CSV that belongs to
# it. Device files are recognised by the '128' filename prefix, and the
# device id is the part of the filename before the first underscore.
data_frames = {}
dataset_path = '../../datasets/synergy-kitchen-mites/'

for root, dirs, files in os.walk(dataset_path):
    for file in files:
        if not file.startswith('128'):
            continue

        device_id = file.split('_')[0]

        # Resolve the file against `root` (the directory os.walk is currently
        # visiting) so files found in sub-directories are opened from where
        # they actually live. read_csv(index_col=0, parse_dates=True) mirrors
        # the defaults of the removed DataFrame.from_csv.
        df_new = pd.read_csv(os.path.join(root, file), index_col=0, parse_dates=True)

        # The first file of a device is joined onto an empty frame; later
        # files are joined onto what has been accumulated so far.
        df = data_frames.get(device_id, pd.DataFrame())
        data_frames[device_id] = df.join(df_new, how='outer')

**Merge with labeled activities**

In [3]:
# Outer-join the activity table onto every device frame. The table is the
# same for all devices, so it is read once instead of once per device.
# read_csv(index_col=0, parse_dates=True) mirrors the defaults of the removed
# DataFrame.from_csv.
df_act = pd.read_csv(os.path.join(dataset_path, 'activities.csv'), index_col=0, parse_dates=True)

for device_id in data_frames:
    df = data_frames[device_id]
    data_frames[device_id] = df.join(df_act, how='outer')

**Fill missing values**

In [4]:
# Fill missing values in every device frame: carry the last known value
# forward, then back-fill whatever is still missing at the start.
for device_id, frame in data_frames.items():
    forward_filled = frame.ffill()
    data_frames[device_id] = forward_filled.bfill()

**Remove duplicate indices**

In [5]:
# Keep only the first row for each index value in every device frame.
for device_id, frame in data_frames.items():
    is_repeat = frame.index.duplicated(keep='first')
    data_frames[device_id] = frame[~is_repeat]

**Resample to 10Hz**

In [6]:
# Put every device frame onto a regular 100 ms grid (10 Hz) and report how
# the row count changed.
# '100ms' is the same 100-millisecond frequency as the legacy '100L' alias,
# which newer pandas releases deprecate.
resample_rule = '100ms'

for device_id in data_frames:
    df = data_frames[device_id]

    previous_length = len(df)
    # Forward-fill each bin from the last earlier sample, then back-fill any
    # bins left empty at the very start.
    df = df.resample(resample_rule).ffill().bfill()
    print('Resampled from ' + str(previous_length) + ' to ' + str(len(df)))

    data_frames[device_id] = df

Resampled from 202735 to 21268
Resampled from 202853 to 21270
Resampled from 203379 to 21270
Resampled from 197783 to 21254


**Remove activities with ID -1 (unlabeled activities)**

In [11]:
# Drop every row whose activity id is -1, keeping only labelled rows.
for device_id, frame in data_frames.items():
    is_labeled = frame['id'] != -1
    data_frames[device_id] = frame[is_labeled]

**Split into 5s windows**

In [27]:
# Cut each device's recording into fixed-length windows, one activity at a
# time. Every window gets a running integer written into its `id` column,
# and `windowed_labels` maps that integer to the window's activity label.
# read_csv(index_col=0, parse_dates=True) mirrors the defaults of the removed
# DataFrame.from_csv.
labels = pd.read_csv(os.path.join(dataset_path, 'activity_labels.csv'), index_col=0, parse_dates=True)
windowed_labels = pd.DataFrame(columns=['label'])

# Single source of truth for the window length; both the first window and
# every following step use it.
window_size_secs = 5
window_length = pd.DateOffset(seconds=window_size_secs)

for device_id in data_frames:
    df = data_frames[device_id]
    new_dfs = []

    # The window counter restarts for every device, so rows written to
    # `windowed_labels` for one device are overwritten by the next one.
    i = 0
    for activity_id in labels.index:
        label = labels.loc[activity_id]['label']

        activity_df = df[df['id'] == activity_id]
        if activity_df.empty:
            # This activity has no rows for this device; nothing to window.
            continue

        since = activity_df.index[0]
        until = since + window_length

        # Slice the activity's own rows so windowing stops where the activity
        # ends instead of running on into rows of later activities.
        # NOTE: label slicing includes both endpoints, so a row stamped
        # exactly at `until` appears in two consecutive windows.
        window = activity_df[since:until]
        while len(window):
            new_df = window.copy()
            new_df['id'] = i
            new_dfs.append(new_df)

            windowed_labels.loc[i] = [label]
            i += 1

            since = until
            until = until + window_length
            window = activity_df[since:until]

    df = pd.concat(new_dfs)
    data_frames[device_id] = df

**Replace timestamp indices with integer ones**

In [44]:
# Swap each frame's current index for a 0..n-1 integer index named
# `i_index`. The counter is first inserted as the leading column and then
# promoted to the index, which discards the previous index.
for device_id, frame in data_frames.items():
    row_numbers = pd.Series(range(len(frame.index)), index=frame.index)
    frame.insert(0, 'i_index', row_numbers)
    data_frames[device_id] = frame.set_index(['i_index'])

**Save as CSVs to disk**

In [45]:
# Write the window labels and one CSV per device to the output directory.
output_path = '../../datasets/synergy-kitchen-mites-processed/'

# to_csv does not create missing directories, so make sure the target exists
# before writing anything.
if not os.path.isdir(output_path):
    os.makedirs(output_path)

windowed_labels.to_csv(os.path.join(output_path, 'activity_labels.csv'))
for device_id, df in data_frames.items():
    df.to_csv(os.path.join(output_path, device_id + '.csv'))