In [1]:
import pandas as pd
import numpy as np
import os
from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from pprint import pprint
from sqlalchemy import create_engine

  from pandas.core import datetools


**Dataset configuration**

In [2]:
# Name of the dataset folder (under ../datasets/) that this notebook preprocesses.
dataset = 'robotics-final'

# Raw CSVs are read from dataset_path; the per-device pickles are written
# to output_path. Both keep a trailing slash because later cells build file
# names by plain string concatenation.
dataset_path = '../datasets/{}/'.format(dataset)
output_path = '../datasets/{}-complete/'.format(dataset)

In [3]:
# Identifiers of the devices to merge. A CSV file belongs to a device when its
# file name starts with one of these strings. The entries are grouped by
# device family below; the overall order of the list is unchanged.
devices_to_use = (
    # Devices identified by IP address.
    [
        '128.237.246.127',
        '128.237.248.186',
        '128.237.247.134',
        '128.237.234.0',
        '128.237.237.122',
        '128.237.239.234',
        '128.237.254.195',  # this Mite is only in the Synergy kitchen dataset
    ]
    # DialogIoT boards.
    + ['DialogIoT 591844595', 'DialogIoT 591844599', 'DialogIoT 591844765']
    # Matrix boards.
    + ['Matrix b827eb96f31a', 'Matrix b827ebe6e0f8', 'Matrix b827eb41f96f']
    # TI SensorTags (first group).
    + ['TI SensorTag 604', 'TI SensorTag 690', 'TI SensorTag 85']
    # XDK boards.
    + ['xdk_1', 'xdk_2', 'xdk_3']
    # TI SensorTags (second group).
    + ['TI SensorTag 33', 'TI SensorTag 535', 'TI SensorTag 709']
)

# Column-name prefixes of the sensor channels to keep, joined with '|' into a
# single alternation pattern (presumably meant for a regex column filter such
# as DataFrame.filter(regex=...); it is not referenced by the cells visible
# here -- TODO confirm where it is consumed).
# NOTE(review): the last prefix used to read 'COLOT_', which looks like a typo
# for 'COLOR_' and could never match a colour-sensor column. Confirm against
# the actual CSV headers.
columns_to_keep = '|'.join([
    'ACCEL_',
    'MICROPHONE_',
    'BAROMETER_',
    'COLOR_',
])

**Merge the individual CSVs into one DataFrame per device for the chosen devices**

In [4]:
# Build one DataFrame per selected device: every CSV whose file name starts
# with a device identifier is loaded, snapped to a 100 ms timestamp grid and
# joined column-wise (outer join on the timestamp index) with the other files
# of the same device.
data_frames = {}

for root, dirs, files in os.walk(dataset_path):
    # os.walk yields entries in arbitrary order; sorting the directories
    # (in place, which steers the traversal) and the files makes the column
    # order of the merged frames reproducible across runs.
    dirs.sort()

    for file in sorted(files):
        device_id = None

        # The last configured device whose identifier prefixes the file name wins.
        for device in devices_to_use:
            if file.startswith(device):
                device_id = device

        if device_id is None:
            continue

        # read_csv(index_col=0, parse_dates=True) reproduces the defaults of
        # DataFrame.from_csv, which is deprecated and removed in pandas 1.0.
        # The path is built from `root` (not `dataset_path`) so files that
        # os.walk finds in subdirectories are opened at their real location.
        df_new = pd.read_csv(os.path.join(root, file), index_col=0, parse_dates=True)
        df_new.index = pd.to_datetime(df_new.index).round('100ms')

        # Drop the 'time' column if present (Index.difference also returns the
        # remaining column names sorted), then keep only the first row of every
        # 100 ms bucket so the index is unique before joining.
        df_new = df_new[df_new.columns.difference(['time'])]
        df_new = df_new[~df_new.index.duplicated(keep='first')]

        if device_id in data_frames:
            df = data_frames[device_id]
            df = pd.concat([df, df_new], join='outer', axis=1)
            data_frames[device_id] = df
        else:
            data_frames[device_id] = df_new

**Add labels**

In [5]:
# Load the activity annotations and the id -> label lookup table.
# read_csv(index_col=0, parse_dates=True) reproduces the defaults of
# DataFrame.from_csv, which is deprecated and removed in pandas 1.0.
activities = pd.read_csv(dataset_path + 'activities.csv', index_col=0, parse_dates=True)
labels = pd.read_csv(dataset_path + 'activity_labels.csv', index_col=0, parse_dates=True)

# Snap the annotation timestamps to the same 100 ms grid as the sensor data.
activities.index = pd.to_datetime(activities.index).round('100ms')

# Replace every activity id with its label from the lookup table; -1 is kept
# as-is and is filtered out by a later cell.
activities['id'] = [
    -1 if activity_id == -1 else labels.label[activity_id]
    for activity_id in activities['id']
]
activities = activities.rename(columns={'id': 'label'})
activities = activities[~activities.index.duplicated(keep='first')]

# Attach the label column to every device frame (outer join on timestamp).
for device_id in data_frames:
    df = data_frames[device_id]
    df = pd.concat([df, activities], join='outer', axis=1)
    data_frames[device_id] = df

**Fill missing values (forward fill, then backward fill)**

In [6]:
# Fill the gaps left by the outer joins: carry the last known value forward,
# then back-fill whatever is still missing at the start of each frame.
for device_id, frame in data_frames.items():
    data_frames[device_id] = frame.ffill().bfill()

**Remove rows without an activity label (label == -1)**

In [7]:
# Keep only the rows whose label differs from the -1 sentinel.
for device_id, frame in data_frames.items():
    has_label = frame['label'] != -1
    data_frames[device_id] = frame.loc[has_label]

**Replace timestamp indices with integer ones**

In [8]:
# Swap the timestamp index for a 0..n-1 integer index named 'i_index'.
# The timestamps themselves are discarded.
for device_id, frame in data_frames.items():
    row_numbers = range(len(frame.index))
    data_frames[device_id] = frame.assign(i_index=row_numbers).set_index(['i_index'])

**Save as Pickle files to disk**

In [9]:
# Persist each device's preprocessed DataFrame as '<output_path><device_id>.p'.
# to_pickle does not create missing parent directories, so make sure the
# output folder exists first; otherwise the notebook fails at its very last
# step, after all the preprocessing has already run.
if not os.path.isdir(output_path):
    os.makedirs(output_path)

for device_id in data_frames:
    df = data_frames[device_id]
    df.to_pickle(output_path + device_id + '.p')