In [1]:
import pandas as pd
import numpy as np
import os
from tsfresh import extract_features, select_features
from tsfresh.feature_extraction import MinimalFCParameters, EfficientFCParameters
from tsfresh.utilities.dataframe_functions import impute, impute_dataframe_zero
from pprint import pprint

  from pandas.core import datetools


**Dataset configuration**

In [2]:
# Name of the dataset directory to process.
dataset = 'synergy-final-iter1'

# Raw input, processed output and extracted-feature output all live next to
# each other under the same root and share the dataset name as a prefix.
datasets_root = '../../datasets/'
dataset_path = datasets_root + dataset + '/'
output_path = datasets_root + dataset + '-1s-processed/'
features_output_path = datasets_root + dataset + '-1s-features/'

In [3]:
# Identifiers of the devices to process. A file is attributed to a device when
# its file name starts with one of these strings (see the merge and reload
# cells), so every entry must be a file-name prefix.
devices_to_use = [
    '128.237.246.127',
    '128.237.248.186',
    '128.237.247.134',
    '128.237.234.0',
    '128.237.237.122',
    '128.237.239.234',
    '128.237.254.195', # this Mite is only in the Synergy kitchen dataset
    '128.237.227.76',
    '128.237.247.190',
    '128.237.250.218',
    'DialogIoT 591844595',
    'DialogIoT 591844599',
    'DialogIoT 591844765',
    'Matrix b827eb96f31a',
    'Matrix b827ebe6e0f8',
    'Matrix b827eb41f96f',
    'TI SensorTag 604',
    'TI SensorTag 690',
    'TI SensorTag 85',
    'xdk_1',
    'xdk_2',
    'xdk_3',
    'TI SensorTag 33',
    'TI SensorTag 535',
    'TI SensorTag 709'
]

# Maps raw CSV column names to the short names used from here on. Several raw
# spellings collapse onto one target name: 'MICROPHONE_sst_0_avg' and
# 'microphone_avg' both become 'microphone', and the 'MAGNETOMETER_sst_*_avg'
# and 'magnetometer_*' variants both become 'mag_*'.
# NOTE(review): presumably the different spellings come from different device
# families - confirm against the raw CSV headers.
columns_to_rename = {
    'ACCEL_sst_0_avg': 'accel_x',
    'ACCEL_sst_1_avg': 'accel_y',
    'ACCEL_sst_2_avg': 'accel_z',
    'MAGNETOMETER_sst_0_avg': 'mag_x',
    'MAGNETOMETER_sst_1_avg': 'mag_y',
    'MAGNETOMETER_sst_2_avg': 'mag_z',
    'HUMIDITY_sst_0_avg': 'humidity',
    'ILLUMINATION_sst_0_avg': 'light',
    'BAROMETER_sst_0_avg': 'pressure',
    'TEMPERATURE_sst_0_avg': 'temperature',
    'MICROPHONE_sst_0_avg': 'microphone',
    'MICROPHONE_sst_0_min': 'microphone_min',
    'MICROPHONE_sst_0_max': 'microphone_max',
    'MICROPHONE_sst_0_sum': 'microphone_sum',
    'MICROPHONE_sst_0_variance': 'microphone_variance',
    'MICROPHONE_sst_0_range': 'microphone_range',
    'MICROPHONE_sst_0_centroid': 'microphone_centroid',
    'microphone_avg': 'microphone',
    'magnetometer_x': 'mag_x',
    'magnetometer_y': 'mag_y',
    'magnetometer_z': 'mag_z'
}

# Name fragments of the columns that survive filtering. They are joined with
# '|' into a single regex alternation that is later handed to
# DataFrame.filter(regex=...), applied after the renaming step.
_keep_fragments = (
    'accel_',
    'gyro_',
    'mag_',
    'humidity',
    'light',
    'pressure',
    'temperature',
    'microphone',
    'ACCEL_fft_',
    'ACCEL_sst_',
    'MICROPHONE_fft_',
    'EMI',
    'IRMOTION',
)
columns_to_keep = '|'.join(_keep_fragments)

**Merge the individual CSVs into one DataFrame per device for the chosen devices**

In [4]:
# device id -> DataFrame with all kept columns of that device, indexed by
# timestamp rounded to 100 ms.
data_frames = {}

for root, _dirs, files in os.walk(dataset_path):
    for file in files:
        # A file belongs to the device whose id is a prefix of its name.
        # Pick the longest matching id so that one id being a prefix of
        # another can never mis-assign a file.
        matching_devices = [d for d in devices_to_use if file.startswith(d)]
        if not matching_devices:
            continue
        device_id = max(matching_devices, key=len)

        # os.walk also descends into sub-directories, so build the path from
        # `root` instead of assuming the file sits directly in dataset_path.
        # read_csv(index_col=0, parse_dates=True) is the documented
        # replacement for the deprecated DataFrame.from_csv.
        df_new = pd.read_csv(os.path.join(root, file), index_col=0, parse_dates=True)
        df_new = df_new.rename(index=str, columns=columns_to_rename)

        # Snap timestamps to a 100 ms grid so frames from different files of
        # the same device can be aligned on their index.
        df_new.index = pd.to_datetime(df_new.index).round('100ms')

        df_new = df_new.filter(regex=columns_to_keep)
        # Rounding can make timestamps collide; keep the first row of each.
        df_new = df_new[~df_new.index.duplicated(keep='first')]

        if len(df_new.columns) > 0:
            if device_id in data_frames:
                # Append the new columns next to the existing ones, aligning
                # on the union of both indices.
                data_frames[device_id] = pd.concat(
                    [data_frames[device_id], df_new], join='outer', axis=1)
            else:
                data_frames[device_id] = df_new

  if self.run_code(code, result):
  if self.run_code(code, result):
  if self.run_code(code, result):
  self._values[0] < other_diff[0]
  if self.run_code(code, result):


KeyboardInterrupt: 

**Remove duplicate indices**

In [None]:
# For every device keep only the first row of each repeated index value.
for device_id, frame in data_frames.items():
    is_repeat = frame.index.duplicated(keep='first')
    data_frames[device_id] = frame[~is_repeat]

**Convert index to datetime**

In [None]:
# Re-assign each frame's index as datetimes; the frames are updated in place,
# so the dict entries keep pointing at the same objects.
for frame in data_frames.values():
    frame.index = pd.to_datetime(frame.index)

**Fill missing values**

In [None]:
# Fill gaps with the previous known value first, then fill whatever is still
# missing at the start of each column with the next known value.
for device_id in list(data_frames):
    forward_filled = data_frames[device_id].ffill()
    data_frames[device_id] = forward_filled.bfill()

**Create 1 sec activity windows**

In [None]:
window_size_secs = 1 # seconds
smallest_size_secs = 1 # seconds

# read_csv(index_col=0, ...) is the documented replacement for the deprecated
# DataFrame.from_csv. The activities file is indexed by timestamp; the labels
# file is looked up by activity id below, so its index is left unparsed.
activities = pd.read_csv(dataset_path + 'activities.csv', index_col=0, parse_dates=True)
labels = pd.read_csv(dataset_path + 'activity_labels.csv', index_col=0)

# Rows with id -1 are excluded from windowing.
labeled_activities = activities.loc[activities.id != -1]

# One dict per window: 'since'/'until' timestamps, the activity 'label', the
# window 'length' in whole seconds and the activity 'id' it was cut from.
activity_windows = []
window_step = pd.DateOffset(seconds=window_size_secs)

for label_id in labeled_activities.id.unique():
    timestamps = labeled_activities.loc[labeled_activities.id == label_id].index
    start = timestamps.min()
    end = timestamps.max()

    if not start < end:
        # Zero-length activity: no window can be cut from it.
        continue

    # The label is the same for every window of this activity.
    label = labels.loc[label_id]['label']

    since = start
    # Clamp the first window to the activity's end as well, exactly like all
    # following windows, so no window reaches past the labeled interval.
    until = min(start + window_step, end)

    while since < end:
        # total_seconds() covers the whole span; Timedelta.seconds would
        # silently drop whole days.
        length = int((until - since).total_seconds())
        if length >= smallest_size_secs:
            activity_windows.append({
                'since': since,
                'until': until,
                'label': label,
                'length': length,
                'id': label_id
            })

        since = until
        until = min(until + window_step, end)

**Create per-activity dataframes which are not missing more than 1 second of data**

In [None]:
# window-frame id -> DataFrame holding one device's rows for one window
activity_data_frames = {}
# device id -> DataFrame (indexed by window-frame id) describing its windows
device_labels = {}

# Per "device, activity label" counters used for the summary printed below.
removed_activities = {}
all_activities = {}
activity_id = 0

label_columns = ['id', 'label', 'activity_id', 'since', 'until', 'window_id']

# Iterate over a snapshot of the keys: devices without any usable window are
# removed from data_frames inside the loop.
for device_id in list(data_frames):
    df = data_frames[device_id]
    # Drop the dict's reference to the full frame; the entry is rebuilt from
    # the per-window frames in a later cell.
    data_frames[device_id] = None
    df['id'] = -1

    device_activities = []

    for i, window in enumerate(activity_windows):
        # Both window bounds are inclusive.
        in_window = (df.index >= window['since']) & (df.index <= window['until'])
        activity_df = df.loc[in_window].copy()

        seconds = activity_df.index.round('s').unique()

        key = device_id + ', activity ' + str(window['label'])
        if key not in all_activities:
            all_activities[key] = 0
            removed_activities[key] = 0

        # Reject windows that cover fewer distinct (rounded) seconds than the
        # window length, or that hold three rows or fewer.
        if len(seconds) < window['length'] or len(activity_df) <= 3:
            removed_activities[key] += 1
        else:
            activity_df['id'] = activity_id
            activity_data_frames[activity_id] = activity_df
            device_activities.append({
                'id': activity_id,
                'label': window['label'],
                'activity_id': window['id'],
                'since': window['since'],
                'until': window['until'],
                'window_id': i
            })
            activity_id += 1

        all_activities[key] += 1

    if not device_activities:
        # pd.concat / set_index on nothing would fail here and the device
        # would break the following cells, so drop it explicitly instead.
        print('No usable activity windows for ' + device_id + ', skipping device')
        del data_frames[device_id]
        continue

    # Build the label table once from the collected records.
    df_labels = pd.DataFrame(device_activities, columns=label_columns)
    df_labels = df_labels.set_index('id')
    device_labels[device_id] = df_labels

# print stats
for key in all_activities:
    if removed_activities[key] > 0:
        print(str(removed_activities[key]) + ' out of ' + str(all_activities[key]) + ' removed for ' + key)

**Resample to 10Hz and fill missing values**

In [None]:
for activity_id in activity_data_frames:
    df = activity_data_frames[activity_id]
    # 100 ms bins = 10 Hz. '100ms' is the same frequency as the deprecated
    # '100L' alias and matches the spelling used when rounding the index.
    # Up-sampled gaps are forward-filled, then any leading gap is back-filled.
    df = df.resample('100ms').ffill().bfill()
    activity_data_frames[activity_id] = df

**Update device data frames**

In [None]:
# Rebuild each device's frame by stacking its per-window frames in the order
# in which the windows appear in that device's label table.
for device_id, labels in device_labels.items():
    window_frames = []
    for window_key in labels.index:
        window_frames.append(activity_data_frames[window_key])
    data_frames[device_id] = pd.concat(window_frames)

**Replace timestamp indices with integer ones**

In [None]:
# Swap the timestamp index for a running row number stored as 'i_index'.
for device_id, frame in data_frames.items():
    row_numbers = pd.Series(range(len(frame.index)), index=frame.index)
    frame.insert(0, 'i_index', row_numbers)
    data_frames[device_id] = frame.set_index(['i_index'])

**Save as Pickle files to disk**

In [17]:
# to_pickle does not create missing directories, and nothing earlier in the
# notebook creates output_path, so make sure it exists before writing.
if not os.path.isdir(output_path):
    os.makedirs(output_path)

# Per device, write the data frame as '<device>.p' and its label table as
# '<device>_labels.p'.
for device_id in data_frames:
    df = data_frames[device_id]
    df.to_pickle(os.path.join(output_path, device_id + '.p'))

    df_labels = device_labels[device_id]
    df_labels.to_pickle(os.path.join(output_path, device_id + '_labels.p'))

**Reload extracted datasets from disk**

In [None]:
data_frames = {}
device_labels = {}

for root, _dirs, files in os.walk(output_path):
    for file in files:
        # A file belongs to the device whose id is a prefix of its name.
        # Pick the longest matching id so that one id being a prefix of
        # another can never mis-assign a file.
        matching_devices = [d for d in devices_to_use if file.startswith(d)]
        if not matching_devices:
            continue
        device_id = max(matching_devices, key=len)

        # Build the path from `root`: os.walk also yields files from
        # sub-directories of output_path.
        # NOTE: unpickling can execute arbitrary code; only load files that
        # were written by this notebook.
        df = pd.read_pickle(os.path.join(root, file))
        if file.endswith('_labels.p'):
            device_labels[device_id] = df
        else:
            data_frames[device_id] = df

**Extract features**

In [None]:
# device id -> DataFrame of extracted features
data_frame_features = {}

def extract_device_features(df):
    """Run tsfresh feature extraction on one device's frame.

    Columns whose name contains '_fft_' are extracted with
    MinimalFCParameters, all remaining columns with EfficientFCParameters;
    both extractions group rows by the 'id' column. When both kinds of
    columns exist, the two results are placed side by side.

    `impute` is called on `df` and `impute_dataframe_zero` on the extracted
    features; their return values are not used.

    Parameters:
        df: frame containing an 'id' column plus the value columns.

    Returns:
        DataFrame with the extracted features.
    """
    impute(df)

    def run_extraction(frame, settings):
        # Single place for the extract_features call shared by all branches.
        return extract_features(frame,
                                column_id="id",
                                default_fc_parameters=settings)

    spectrum_cols = [name for name in df.columns if '_fft_' in name]

    if spectrum_cols:
        # Every column without '_fft_' in its name, which includes 'id'.
        other_cols = [name for name in df.columns if '_fft_' not in name]

        print('Start extracting FFT features')
        from_fft = run_extraction(df[spectrum_cols + ['id']], MinimalFCParameters())

        print('Start extracting non-FFT features')
        from_rest = run_extraction(df[other_cols], EfficientFCParameters())

        extracted_features = pd.concat([from_fft, from_rest], axis=1)
    else:
        print('Start extracting non-FFT features')
        extracted_features = run_extraction(df, EfficientFCParameters())

    print('Finished extracting features')
    impute_dataframe_zero(extracted_features)
    print('Imputed data')

    return extracted_features

# Extract features device by device, printing the device id first so the
# progress output can be attributed.
for device_id, device_frame in data_frames.items():
    print(device_id)
    data_frame_features[device_id] = extract_device_features(device_frame)

DialogIoT 591844599
Start extracting non-FFT features


Feature Extraction: 100%|██████████| 7/7 [05:44<00:00, 67.90s/it] 


Finished extracting features
Imputed data
Matrix b827ebe6e0f8
Start extracting non-FFT features


Feature Extraction: 100%|██████████| 18/18 [09:35<00:00, 13.98s/it]


Finished extracting features
Imputed data
128.237.254.195
Start extracting FFT features


Feature Extraction: 100%|██████████| 512/512 [03:34<00:00,  2.39it/s] 


Start extracting non-FFT features


Feature Extraction: 100%|██████████| 49/49 [24:17<00:00, 24.50s/it]  


Finished extracting features
Imputed data
128.237.248.186
Start extracting FFT features


Feature Extraction: 100%|██████████| 512/512 [03:57<00:00,  2.15it/s] 


Start extracting non-FFT features


Feature Extraction: 100%|██████████| 49/49 [27:15<00:00, 16.91s/it]  


Finished extracting features
Imputed data
Matrix b827eb96f31a
Start extracting non-FFT features


Feature Extraction: 100%|██████████| 9/9 [05:55<00:00, 35.70s/it] 


Finished extracting features
Imputed data
TI SensorTag 85
Start extracting non-FFT features


Feature Extraction: 100%|██████████| 10/10 [05:01<00:00, 22.47s/it]


Finished extracting features
Imputed data
xdk_1
Start extracting non-FFT features


Feature Extraction:  86%|████████▌ | 12/14 [06:25<00:29, 14.86s/it]

**Sort feature columns**

In [15]:
for device_id in data_frame_features:
    df = data_frame_features[device_id]
    # Order the feature columns alphabetically. DataFrame.reindex(columns=...)
    # replaces the deprecated reindex_axis(..., axis=1).
    df = df.reindex(columns=sorted(df.columns))
    data_frame_features[device_id] = df

**Store extracted features on disk**

In [16]:
# to_pickle does not create missing directories, and nothing earlier in the
# notebook creates features_output_path, so make sure it exists.
if not os.path.isdir(features_output_path):
    os.makedirs(features_output_path)

# Per device, write the feature frame as '<device>.p' and a copy of its label
# table as '<device>_labels.p'.
for device_id in data_frame_features:
    extracted = data_frame_features[device_id]
    extracted.to_pickle(os.path.join(features_output_path, device_id + '.p'))

    df_labels = device_labels[device_id]
    df_labels.to_pickle(os.path.join(features_output_path, device_id + '_labels.p'))

**Do feature selection**

In [17]:
# device id -> DataFrame restricted to the features chosen by select_features
data_frame_selected = {}

# Iterate over the devices that actually have extracted features. Looping over
# device_labels instead raises a KeyError as soon as feature extraction has
# only covered some of the labeled devices (e.g. after an interrupted run).
for device_id in data_frame_features:
    df = data_frame_features[device_id]
    df_labels = device_labels[device_id]
    selected = select_features(df, df_labels['label'])
    data_frame_selected[device_id] = selected







**Save to disk**

In [18]:
# to_pickle does not create missing directories, so make sure the features
# directory exists even when this cell is run on its own.
if not os.path.isdir(features_output_path):
    os.makedirs(features_output_path)

# Write each device's selected features as '<device>_selected.p'.
for device_id in data_frame_selected:
    selected = data_frame_selected[device_id]
    selected.to_pickle(os.path.join(features_output_path, device_id + '_selected.p'))

In [None]:
1  # leftover scratch expression: a bare literal that no cell shown here depends on