In [1]:
import pandas as pd
import numpy as np
import os
from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from pprint import pprint

**Dataset configuration**

In [2]:
dataset = 'robotics-final'

# Raw input, processed output and extracted features all live side by side
# under the same datasets directory. The trailing slashes are deliberate:
# other cells build file names by plain string concatenation.
datasets_root = '../datasets/'

dataset_path = datasets_root + dataset + '/'
output_path = datasets_root + dataset + '-processed/'
features_output_path = datasets_root + dataset + '-features/'

In [3]:
# Device names, grouped by their common name prefix. File names in the
# dataset directory are matched against these entries with str.startswith.
ip_addressed_devices = [
    '128.237.246.127',
    '128.237.248.186',
    '128.237.247.134',
    '128.237.254.195',  # this Mite is only in the Synergy kitchen dataset
]
dialog_devices = [
    'DialogIoT 591844595',
    'DialogIoT 591844599',
    'DialogIoT 591844765',
]
matrix_devices = [
    'Matrix b827eb96f31a',
    'Matrix b827ebe6e0f8',
    'Matrix b827eb41f96f',
]
sensortag_devices = [
    'TI SensorTag 604',
    'TI SensorTag 690',
    'TI SensorTag 85',
]
xdk_devices = ['xdk_1', 'xdk_2', 'xdk_3']

devices_to_use = (
    ip_addressed_devices
    + dialog_devices
    + matrix_devices
    + sensortag_devices
    + xdk_devices
)

# Raw column name -> canonical column name used throughout the notebook.
# Note that two different raw microphone columns map to the same target.
columns_to_rename = dict([
    # accelerometer
    ('ACCEL_sst_0_avg', 'accel_x'),
    ('ACCEL_sst_1_avg', 'accel_y'),
    ('ACCEL_sst_2_avg', 'accel_z'),
    # magnetometer
    ('MAGNETOMETER_sst_0_avg', 'mag_x'),
    ('MAGNETOMETER_sst_1_avg', 'mag_y'),
    ('MAGNETOMETER_sst_2_avg', 'mag_z'),
    # ambient sensors
    ('HUMIDITY_sst_0_avg', 'humidity'),
    ('ILLUMINATION_sst_0_avg', 'light'),
    ('BAROMETER_sst_0_avg', 'pressure'),
    ('TEMPERATURE_sst_0_avg', 'temperature'),
    ('MICROPHONE_sst_0_avg', 'microphone'),
    # alternative raw spellings
    ('microphone_avg', 'microphone'),
    ('magnetometer_x', 'mag_x'),
    ('magnetometer_y', 'mag_y'),
    ('magnetometer_z', 'mag_z'),
])

# Canonical column names that survive the merge step; every other column of a
# device CSV is discarded.
accelerometer_columns = ['accel_x', 'accel_y', 'accel_z']
gyroscope_columns = ['gyro_x', 'gyro_y', 'gyro_z']
magnetometer_columns = ['mag_x', 'mag_y', 'mag_z']
ambient_columns = ['humidity', 'light', 'pressure', 'temperature', 'microphone']

columns_to_keep = (
    accelerometer_columns
    + gyroscope_columns
    + magnetometer_columns
    + ambient_columns
)

**Merge the individual CSVs into one DataFrame per device for the chosen devices**

In [4]:
# Maps device name -> DataFrame holding all kept columns of that device,
# outer-joined on the CSV index.
data_frames = {}

for root, dirs, files in os.walk(dataset_path):
    for file in files:
        # A file belongs to the last listed device whose name is a prefix of
        # the file name; files matching no device are ignored.
        device_id = None
        for device in devices_to_use:
            if file.startswith(device):
                device_id = device

        if device_id is None:
            continue

        # Build the path from `root`: os.walk also yields files found in
        # sub-directories, which `dataset_path + file` would fail to open.
        # read_csv(index_col=0, parse_dates=True) is the documented
        # replacement for the removed DataFrame.from_csv.
        df_new = pd.read_csv(os.path.join(root, file), index_col=0, parse_dates=True)

        # index=str turns the index labels into strings; a later cell converts
        # the index back to datetimes.
        df_new = df_new.rename(index=str, columns=columns_to_rename)

        cols = [column for column in df_new if column in columns_to_keep]
        if not cols:
            # Nothing of interest in this file.
            continue

        df = data_frames.get(device_id, pd.DataFrame())
        data_frames[device_id] = df.join(df_new[cols], how='outer')

**Remove duplicate indices**

In [5]:
# For every device keep only the first row seen for each index label.
for device_id, frame in data_frames.items():
    is_repeated = frame.index.duplicated(keep='first')
    data_frames[device_id] = frame[~is_repeated]

**Convert index to datetime**

In [6]:
# Parse each device's index labels as timestamps. The index is assigned on the
# existing frame object, so the dictionary entries need no re-binding.
for frame in data_frames.values():
    frame.index = pd.to_datetime(frame.index)

**Fill missing values**

In [7]:
# Fill gaps left by the outer join: carry the last known value forward first,
# then back-fill whatever is still missing at the start of each column.
for device_id, frame in data_frames.items():
    forward_filled = frame.ffill()
    data_frames[device_id] = forward_filled.bfill()

**Split each labeled activity into windows of up to 5 seconds (windows shorter than 3 seconds are dropped)**

In [8]:
window_size_secs = 5 # seconds
smallest_size_secs = 3 # seconds

# read_csv(index_col=0, ...) replaces the removed DataFrame.from_csv.
# activities.csv: the first column becomes the index and is parsed as
# timestamps; the `id` column holds the activity id (-1 means "unlabeled").
activities = pd.read_csv(dataset_path + 'activities.csv', index_col=0, parse_dates=True)
# activity_labels.csv: indexed by its first column, which is looked up with
# activity ids below, so it is deliberately not parsed as dates.
labels = pd.read_csv(dataset_path + 'activity_labels.csv', index_col=0)
labeled_activities = activities.loc[activities.id != -1]

window_length = pd.Timedelta(seconds=window_size_secs)

# One dict per window with the keys since / until / label / length / id.
activity_windows = []

for label_id in labeled_activities.id.unique():
    rows = labeled_activities.loc[labeled_activities.id == label_id]
    start = rows.index.min()
    end = rows.index.max()

    since = start
    while since < end:
        label = labels.loc[label_id, 'label']

        # Every window, including the first one, is clamped to the end of the
        # activity so that no window reaches into data recorded afterwards.
        until = min(since + window_length, end)

        # Whole seconds covered by the window; total_seconds() does not wrap
        # around at day boundaries the way the `.seconds` component does.
        length = int((until - since).total_seconds())
        if length >= smallest_size_secs:
            activity_windows.append({
                'since': since,
                'until': until,
                'label': label,
                'length': length,
                'id': label_id
            })

        # `until` is strictly greater than `since` here, so the loop always
        # advances and stops once the activity end has been reached.
        since = until

**Create per-activity dataframes which are not missing more than 1 second of data**

In [9]:
# activity_data_frames: running window number -> samples of one device inside
#                       one activity window.
# device_labels:        device name -> table (indexed by that running number)
#                       describing each retained window.
activity_data_frames = {}
device_labels = {}

# Per "<device>, activity <label>" key: windows dropped / windows examined.
removed_activities = {}
all_activities = {}
activity_id = 0

label_columns = ['id', 'label', 'activity_id', 'since', 'until', 'window_id']

# Iterate over a snapshot of the keys: devices that end up without a single
# usable window are removed from `data_frames` inside the loop.
for device_id in list(data_frames):
    df = data_frames[device_id]
    # Placeholder; the entry is rebuilt from the per-window frames in the
    # "Update device data frames" cell.
    data_frames[device_id] = None
    df['id'] = -1

    device_activities = []

    for i, window in enumerate(activity_windows):
        # NOTE(review): both bounds are inclusive, so a sample stamped exactly
        # on a window boundary lands in two consecutive windows - confirm this
        # is intended.
        in_window = (df.index >= window['since']) & (df.index <= window['until'])
        activity_df = df.loc[in_window].copy()

        # Distinct (rounded) seconds for which this device has any sample.
        seconds = activity_df.index.round('s').unique()

        key = device_id + ', activity ' + str(window['label'])
        if key not in all_activities:
            all_activities[key] = 0
            removed_activities[key] = 0
        all_activities[key] += 1

        # Drop the window when more than one second of it has no samples.
        if len(seconds) + 1 < window['length']:
            removed_activities[key] += 1
            continue

        activity_df['id'] = activity_id
        activity_data_frames[activity_id] = activity_df
        device_activities.append({
            'id': activity_id,
            'label': window['label'],
            'activity_id': window['id'],
            'since': window['since'],
            'until': window['until'],
            'window_id': i
        })
        activity_id += 1

    if not device_activities:
        # Without this guard the label table cannot be built and the later
        # cells have nothing to concatenate for this device.
        print('No usable activity windows for ' + device_id + ', skipping device')
        del data_frames[device_id]
        continue

    # Build the label table in one go instead of concatenating many
    # single-row frames.
    df_labels = pd.DataFrame(device_activities, columns=label_columns)
    device_labels[device_id] = df_labels.set_index('id')

# print stats
for key in all_activities:
    if removed_activities[key] > 0:
        print(str(removed_activities[key]) + ' out of ' + str(all_activities[key]) + ' removed for ' + key)

1 out of 74 removed for 128.237.246.127, activity 1
27 out of 74 removed for DialogIoT 591844599, activity 8


**Resample to 10Hz and fill missing values**

In [10]:
# Put every activity window on a regular 100 ms (10 Hz) grid.
# '100ms' is the supported spelling of the deprecated '100L' offset alias.
for activity_id in activity_data_frames:
    df = activity_data_frames[activity_id]
    # Forward-fill onto the new grid, then back-fill any leading gaps.
    df = df.resample('100ms').ffill().bfill()
    activity_data_frames[activity_id] = df

**Rebuild each device's data frame from its resampled activity windows**

In [11]:
# Stitch the retained activity windows of each device back into one frame,
# in the order in which they appear in the device's label table.
for device_id, window_labels in device_labels.items():
    window_frames = [activity_data_frames[window_key] for window_key in window_labels.index]
    data_frames[device_id] = pd.concat(window_frames)

**Replace timestamp indices with integer ones**

In [12]:
# Swap the timestamp index of every device frame for a running integer index
# named 'i_index'.
for device_id, frame in data_frames.items():
    # The counter series shares the frame's index so it lines up row by row.
    row_numbers = pd.Series(range(len(frame.index)), index=frame.index)
    frame.insert(0, 'i_index', row_numbers)
    data_frames[device_id] = frame.set_index(['i_index'])

**Save as Pickle files to disk**

In [13]:
# to_pickle does not create missing directories, so make sure the target
# directory exists before writing (it is absent on a fresh checkout).
if not os.path.isdir(output_path):
    os.makedirs(output_path)

# Write two files per device: the processed samples and the window labels.
for device_id in data_frames:
    df = data_frames[device_id]
    df.to_pickle(os.path.join(output_path, device_id + '.p'))

    df_labels = device_labels[device_id]
    df_labels.to_pickle(os.path.join(output_path, device_id + '_labels.p'))

**Extract features**

In [14]:
def extract_device_features(df):
    """Extract tsfresh features for one device.

    Rows of ``df`` are grouped by their ``id`` column, so one set of features
    is computed per id value.

    ``impute`` is called for its effect on the extracted table (its return
    value is not used) - presumably it replaces NaN/inf values in place;
    see the tsfresh documentation.

    Returns the extracted feature table.
    """
    features = extract_features(df, column_id="id")
    impute(features)
    return features

# device name -> extracted feature table
data_frame_features = {}

for device_id, device_frame in data_frames.items():
    data_frame_features[device_id] = extract_device_features(device_frame)

Feature Extraction: 100%|██████████| 7/7 [02:24<00:00, 28.01s/it]
 'mag_z__friedrich_coefficients__m_3__r_30__coeff_3'
 'mag_z__friedrich_coefficients__m_3__r_30__coeff_2'
 'mag_z__friedrich_coefficients__m_3__r_30__coeff_1'
 'mag_z__friedrich_coefficients__m_3__r_30__coeff_0'
 'temperature__max_langevin_fixed_point__m_3__r_30'
 'temperature__friedrich_coefficients__m_3__r_30__coeff_3'
 'temperature__friedrich_coefficients__m_3__r_30__coeff_2'
 'temperature__friedrich_coefficients__m_3__r_30__coeff_1'
 'temperature__friedrich_coefficients__m_3__r_30__coeff_0'
 'accel_x__max_langevin_fixed_point__m_3__r_30'
 'accel_x__friedrich_coefficients__m_3__r_30__coeff_3'
 'accel_x__friedrich_coefficients__m_3__r_30__coeff_2'
 'accel_x__friedrich_coefficients__m_3__r_30__coeff_1'
 'accel_x__friedrich_coefficients__m_3__r_30__coeff_0'
 'mag_y__max_langevin_fixed_point__m_3__r_30'
 'mag_y__friedrich_coefficients__m_3__r_30__coeff_3'
 'mag_y__friedrich_coefficients__m_3__r_30__coeff_2'
 'mag_y__fried

Feature Extraction: 100%|██████████| 11/11 [02:44<00:00, 11.06s/it]
 'mag_y__friedrich_coefficients__m_3__r_30__coeff_3'
 'mag_y__friedrich_coefficients__m_3__r_30__coeff_2'
 'mag_y__friedrich_coefficients__m_3__r_30__coeff_1'
 'mag_y__friedrich_coefficients__m_3__r_30__coeff_0'
 'light__max_langevin_fixed_point__m_3__r_30'
 'light__friedrich_coefficients__m_3__r_30__coeff_3'
 'light__friedrich_coefficients__m_3__r_30__coeff_2'
 'light__friedrich_coefficients__m_3__r_30__coeff_1'
 'light__friedrich_coefficients__m_3__r_30__coeff_0'
 'microphone__max_langevin_fixed_point__m_3__r_30'
 'microphone__friedrich_coefficients__m_3__r_30__coeff_3'
 'microphone__friedrich_coefficients__m_3__r_30__coeff_2'
 'microphone__friedrich_coefficients__m_3__r_30__coeff_1'
 'microphone__friedrich_coefficients__m_3__r_30__coeff_0'
 'accel_y__max_langevin_fixed_point__m_3__r_30'
 'accel_y__friedrich_coefficients__m_3__r_30__coeff_3'
 'accel_y__friedrich_coefficients__m_3__r_30__coeff_2'
 'accel_y__friedrich_

Feature Extraction: 100%|██████████| 10/10 [02:36<00:00, 11.34s/it]
 'gyro_y__friedrich_coefficients__m_3__r_30__coeff_3'
 'gyro_y__friedrich_coefficients__m_3__r_30__coeff_2'
 'gyro_y__friedrich_coefficients__m_3__r_30__coeff_1'
 'gyro_y__friedrich_coefficients__m_3__r_30__coeff_0'
 'light__max_langevin_fixed_point__m_3__r_30'
 'light__friedrich_coefficients__m_3__r_30__coeff_3'
 'light__friedrich_coefficients__m_3__r_30__coeff_2'
 'light__friedrich_coefficients__m_3__r_30__coeff_1'
 'light__friedrich_coefficients__m_3__r_30__coeff_0'
 'mag_y__max_langevin_fixed_point__m_3__r_30'
 'mag_y__friedrich_coefficients__m_3__r_30__coeff_3'
 'mag_y__friedrich_coefficients__m_3__r_30__coeff_2'
 'mag_y__friedrich_coefficients__m_3__r_30__coeff_1'
 'mag_y__friedrich_coefficients__m_3__r_30__coeff_0'
 'gyro_x__max_langevin_fixed_point__m_3__r_30'
 'gyro_x__friedrich_coefficients__m_3__r_30__coeff_3'
 'gyro_x__friedrich_coefficients__m_3__r_30__coeff_2'
 'gyro_x__friedrich_coefficients__m_3__r_30__

Feature Extraction: 100%|██████████| 12/12 [02:48<00:00,  7.33s/it]
 'accel_z__friedrich_coefficients__m_3__r_30__coeff_3'
 'accel_z__friedrich_coefficients__m_3__r_30__coeff_2'
 'accel_z__friedrich_coefficients__m_3__r_30__coeff_1'
 'accel_z__friedrich_coefficients__m_3__r_30__coeff_0'
 'mag_y__max_langevin_fixed_point__m_3__r_30'
 'mag_y__friedrich_coefficients__m_3__r_30__coeff_3'
 'mag_y__friedrich_coefficients__m_3__r_30__coeff_2'
 'mag_y__friedrich_coefficients__m_3__r_30__coeff_1'
 'mag_y__friedrich_coefficients__m_3__r_30__coeff_0'
 'mag_x__max_langevin_fixed_point__m_3__r_30'
 'mag_x__friedrich_coefficients__m_3__r_30__coeff_3'
 'mag_x__friedrich_coefficients__m_3__r_30__coeff_2'
 'mag_x__friedrich_coefficients__m_3__r_30__coeff_1'
 'mag_x__friedrich_coefficients__m_3__r_30__coeff_0'
 'gyro_y__max_langevin_fixed_point__m_3__r_30'
 'gyro_y__friedrich_coefficients__m_3__r_30__coeff_3'
 'gyro_y__friedrich_coefficients__m_3__r_30__coeff_2'
 'gyro_y__friedrich_coefficients__m_3__r_

**Sort feature columns**

In [15]:
# Order the feature columns alphabetically so every device frame has a
# predictable column layout.
for device_id in data_frame_features:
    df = data_frame_features[device_id]
    # DataFrame.reindex_axis was removed from pandas; reindex(columns=...)
    # performs the same column re-ordering.
    df = df.reindex(columns=sorted(df.columns))
    data_frame_features[device_id] = df

**Store extracted features on disk**

In [16]:
# to_pickle does not create missing directories, so make sure the features
# directory exists before writing (it is absent on a fresh checkout).
if not os.path.isdir(features_output_path):
    os.makedirs(features_output_path)

# Write two files per device: the extracted features and the window labels.
for device_id in data_frame_features:
    extracted = data_frame_features[device_id]
    extracted.to_pickle(os.path.join(features_output_path, device_id + '.p'))

    df_labels = device_labels[device_id]
    df_labels.to_pickle(os.path.join(features_output_path, device_id + '_labels.p'))

**Reload extracted datasets from disk**

In [17]:
# device name -> extracted feature table / window label table
data_frame_features = {}
device_labels = {}

# Look up the exact file names written by the "Store extracted features" cell.
# Matching on the device-name prefix alone would also pick up
# '<device>_selected.p' (written by the last cell on an earlier run) and could
# load it in place of the real feature table.
# NOTE: unpickling executes code from the file - only load pickles that this
# notebook produced.
for device_id in devices_to_use:
    features_file = os.path.join(features_output_path, device_id + '.p')
    labels_file = os.path.join(features_output_path, device_id + '_labels.p')

    if os.path.isfile(features_file):
        data_frame_features[device_id] = pd.read_pickle(features_file)
    if os.path.isfile(labels_file):
        device_labels[device_id] = pd.read_pickle(labels_file)

**Do feature selection**

In [18]:
# device name -> feature table reduced to the columns tsfresh selects as
# relevant for the window labels.
data_frame_selected = {}

for device_id, window_labels in device_labels.items():
    device_features = data_frame_features[device_id]
    data_frame_selected[device_id] = select_features(device_features, window_labels['label'])













Feature Selection: 100%|██████████| 1554/1554 [00:00<00:00, 3258.98it/s]














Feature Selection: 100%|██████████| 2442/2442 [00:00<00:00, 3442.04it/s]
















Feature Selection: 100%|██████████| 2220/2220 [00:00<00:00, 3649.16it/s]
















Feature Selection: 100%|██████████| 2664/2664 [00:00<00:00, 3560.99it/s]














Feature Selection: 100%|██████████| 666/666 [00:00<00:00, 2542.19it/s]



































Feature Selection: 100%|██████████| 2442/2442 [00:00<00:00, 2484.95it/s]














Feature Selection: 100%|██████████| 2220/2220 [00:00<00:00, 3280.53it/s]




















Feature Selection: 100%|██████████| 3108/3108 [00:00<00:00, 3720.25it/s]












Feature Selection: 100%|██████████| 1554/1554 [00:00<00:00, 3155.40it/s]










Feature Selection: 100%|██████████| 1554/1554 [00:00<00:00, 3178.75it/s]


















Feature Selection: 100%|██████████| 2664/2664 [00:00<00:00, 3633.56it/s]


























Feature Selection: 100%|██████████| 2220/2220 [00:00<00:00, 3183.72it/s]



































Feature Selection: 100%|██████████| 3108/3108 [00:00<00:00, 3515.58it/s]


**Save to disk**

In [19]:
# Persist the selected features of each device next to its full feature table.
for device_id, selected_features in data_frame_selected.items():
    target_file = features_output_path + device_id + '_selected.p'
    selected_features.to_pickle(target_file)