In [21]:
import pandas as pd
import numpy as np

In [22]:
def read_data_frame(name):
    return pd.DataFrame.from_csv('../../datasets/matus-home/csv/' + name + '.csv')

labels = read_data_frame('labels')
activity_ids = read_data_frame('activity_ids')

def sensor_tag_accelerometer(sensor_tag_id):
    x = read_data_frame(sensor_tag_id + '_accelerometer_x')
    y = read_data_frame(sensor_tag_id + '_accelerometer_y')
    z = read_data_frame(sensor_tag_id + '_accelerometer_z')
    return [x, y, z]

def sensor_tag_magnetometer(sensor_tag_id):
    x = read_data_frame(sensor_tag_id + '_magnetometer_x')
    y = read_data_frame(sensor_tag_id + '_magnetometer_y')
    z = read_data_frame(sensor_tag_id + '_magnetometer_z')
    return [x, y, z]

def sensor_tag_gyroscope(sensor_tag_id):
    x = read_data_frame(sensor_tag_id + '_gyroscope_x')
    y = read_data_frame(sensor_tag_id + '_gyroscope_y')
    z = read_data_frame(sensor_tag_id + '_gyroscope_z')
    return [x, y, z]

def iphone_accelerometer():
    x = read_data_frame('matus_iphone_accelerometer_x')
    y = read_data_frame('matus_iphone_accelerometer_y')
    z = read_data_frame('matus_iphone_accelerometer_z')
    return [x, y, z]

def iphone_magnetometer():
    x = read_data_frame('matus_iphone_magnetometer_x')
    y = read_data_frame('matus_iphone_magnetometer_y')
    z = read_data_frame('matus_iphone_magnetometer_z')
    return [x, y, z]

def iphone_gravity():
    x = read_data_frame('matus_iphone_gravity_x')
    y = read_data_frame('matus_iphone_gravity_y')
    z = read_data_frame('matus_iphone_gravity_z')
    return [x, y, z]

def mac_microphone():
    return [read_data_frame('matus_mac_microphone')]

def wemo_power():
    return [read_data_frame('221445K12000EF_power')]


def build_df(sets):
    df = activity_ids
    for s in sets:
        df = df.join(s)
    df = df.ffill().bfill()
    return df

sets = []
# sets += sensor_tag_accelerometer('61e7e6b71abe49218f132bad935fcebe')
# sets += sensor_tag_accelerometer('778c5d0e6a744c1c961cdecdb2b29d3c')
# sets += sensor_tag_accelerometer('3a9d6f1b70dd493088305f679f4aa386')
# sets += sensor_tag_accelerometer('f0c688e5fc9e4df4ba87cbd394737884')
# sets += sensor_tag_magnetometer('61e7e6b71abe49218f132bad935fcebe')
# sets += sensor_tag_magnetometer('778c5d0e6a744c1c961cdecdb2b29d3c')
# sets += sensor_tag_magnetometer('3a9d6f1b70dd493088305f679f4aa386')
# sets += sensor_tag_magnetometer('f0c688e5fc9e4df4ba87cbd394737884')
# sets += sensor_tag_gyroscope(sensor_tag_id)
sets += iphone_accelerometer()
# sets += iphone_magnetometer()
# sets += iphone_gravity()
# sets += mac_microphone()
# sets += wemo_power()

df_source = build_df(sets)
df_source.columns = ['id', 'x', 'y', 'z']

sets = []
# sets += sensor_tag_accelerometer('61e7e6b71abe49218f132bad935fcebe')
sets += sensor_tag_accelerometer('778c5d0e6a744c1c961cdecdb2b29d3c')
# sets += sensor_tag_accelerometer('3a9d6f1b70dd493088305f679f4aa386')
# sets += sensor_tag_accelerometer('f0c688e5fc9e4df4ba87cbd394737884')
# sets += sensor_tag_magnetometer('61e7e6b71abe49218f132bad935fcebe')
# sets += sensor_tag_magnetometer('778c5d0e6a744c1c961cdecdb2b29d3c')
# sets += sensor_tag_magnetometer('3a9d6f1b70dd493088305f679f4aa386')
# sets += sensor_tag_magnetometer('f0c688e5fc9e4df4ba87cbd394737884')
# sets += sensor_tag_gyroscope(sensor_tag_id)
# sets += iphone_accelerometer()
# sets += iphone_magnetometer()
# sets += iphone_gravity()
# sets += mac_microphone()
# sets += wemo_power()


df_target = build_df(sets)
df_target.columns = ['id', 'x', 'y', 'z']

In [23]:
import sklearn.utils

def process_data_frame(df):
    # remove items with a duplicate index
    df = df[~df.index.duplicated(keep='first')]
    df.index.get_duplicates() # should be empty

    # split into 5s windows and resample for 100ms sampling rate
    windowed_labels = pd.DataFrame(columns=['label'])
    new_dfs = []

    window_size_secs = 5
    i = 0
    for id in labels.index:
        label = labels.loc[id]['label']

        activity_df = df[df['id'] == id]
        since = activity_df.index[0]
        until = since + pd.DateOffset(seconds=window_size_secs)

        while len(df[since:until]):
            new_df = df[since:until].copy()
            new_df.id = i

            # resample
            new_df = new_df.resample('100L').ffill().bfill()

            new_dfs.append(new_df)

            since = until
            until = until + pd.DateOffset(seconds=5)

            windowed_labels.loc[i] = [label]
            i += 1

    windowed_df = pd.concat(new_dfs)
    
    #windowed_df.reindex(pd.Series(index=range(len(windowed_df.index)))
    windowed_df['new_index'] = pd.Series(range(len(windowed_df.index)), index=windowed_df.index)
    windowed_df = windowed_df.set_index(['new_index'])

    return windowed_df, windowed_labels

In [24]:
df_source, source_labels = process_data_frame(df_source)
df_target, target_labels = process_data_frame(df_target)

y_source = source_labels['label']
y_source = sklearn.utils.shuffle(y_source)
X_source = pd.DataFrame(index=y_source.index)

y_target = target_labels['label']
y_target = sklearn.utils.shuffle(y_target)
X_target = pd.DataFrame(index=y_target.index)


# def training_testing_split(df, labels):
#     indexes = df.index.tolist()
#     np.random.shuffle(indexes)
#     split = round(0.7*len(indexes))
#     training_i = indexes[:split]
#     testing_i = indexes[split:]

#     df_train = df.loc[df.id.isin(training_i)]
#     df_test = df.loc[df.id.isin(testing_i)]

#     y_train = labels.loc[labels.index.isin(training_i)]
#     y_test = labels.loc[labels.index.isin(testing_i)]

#     y_train = y_train['label']
#     y_test = y_test['label']

#     X_train = pd.DataFrame(index=y_train.index)
#     X_test = pd.DataFrame(index=y_test.index)

#     return df_train, X_train, y_train, df_test, X_test, y_test

In [25]:
from tsfresh.transformers import RelevantFeatureAugmenter, FeatureAugmenter
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.decomposition import PCA

In [26]:
from sklearn.base import BaseEstimator, TransformerMixin
from pprint import pprint


class ImputeInf(BaseEstimator, TransformerMixin):
    def fit(self, X, y):
        #pprint(X)
        #pprint(y)
        return self

    def transform(self, X):
        #pprint(X)
        #np.set_printoptions(threshold=np.nan)
        X[X == np.inf] = 0.0
        #pprint(np.isfinite(X))
        return X

In [27]:
ppl = Pipeline([
#     ('fresh', FeatureAugmenter(column_id='id')),
    ('fresh', RelevantFeatureAugmenter(column_id='id')),
    ('impute', Imputer()),
    ('imput_inf', ImputeInf()),
    #('impute_inf', Imputer(missing_values='inf', strategy='mean', axis=0)),
    ('scale', StandardScaler()),
#     ('pca', PCA()),
    #('standardscaler', StandardScaler(with_mean=True, with_std=True)),
    ('clf', RandomForestClassifier())
])

ppl.set_params(fresh__timeseries_container=df_target)
ppl.fit(X_target, y_target)

Feature Extraction: 100%|██████████| 3/3 [00:23<00:00,  7.88s/it]
Feature Extraction: 100%|██████████| 3/3 [00:08<00:00,  3.74s/it]


Pipeline(steps=[('fresh', RelevantFeatureAugmenter(column_id=None, column_kind=None, column_sort=None,
             column_value=None, evaluate_only_added_features=True,
             feature_extraction_settings=None,
             feature_selection_settings=None,
             timeseries_container=            ...imators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))])

In [28]:
ppl.set_params(fresh__timeseries_container=df_source)

y_pred = ppl.predict(X_source)

# print(classification_report(y_test, y_pred))
y_pred

Feature Extraction: 100%|██████████| 3/3 [00:08<00:00,  3.72s/it]


array([ 1.,  1.,  1.,  1.,  2.,  1.,  1.,  1.,  1.,  1.,  2.,  2.,  1.,
        2.,  1.,  1.,  2.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        2.,  1.,  1.,  2.,  1.,  1.,  2.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  2.,  1.,  1.,  2.,  1.,  1.,  1.,  1.,  2.,  2.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  2.,  2.,  1.,  1.,  2.,  1.,  1.,
        1.,  1.,  1.,  1.,  2.,  1.,  1.,  2.,  1.,  2.,  1.,  1.,  2.,
        1.,  1.,  2.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  2.,  1.,  1.,  1.,  1.,  2.,  1.,  1.,  1.,  2.,  1.,  1.,
        2.,  1.,  1.,  1.,  1.,  1.,  2.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  2.,  2.,  1.,  1.,
        2.,  1.,  1.,  1.,  2.,  1.,  1.,  2.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  2.,  2.,  1.,  1.,  1.,  2.,  2.,  1.,  2.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  2.,  1.,  1.,  1.,  1.,  2.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1

In [29]:
filtered_index = y_source[y_source == y_pred].index
X_source_filtered = pd.DataFrame(index=filtered_index)
y_source_filtered = y_source[y_source == y_pred]

In [30]:
ppl = Pipeline([
#     ('fresh', FeatureAugmenter(column_id='id')),
    ('fresh', RelevantFeatureAugmenter(column_id='id')),
    ('impute', Imputer()),
    ('imput_inf', ImputeInf()),
    #('impute_inf', Imputer(missing_values='inf', strategy='mean', axis=0)),
    ('scale', StandardScaler()),
#     ('pca', PCA()),
    #('standardscaler', StandardScaler(with_mean=True, with_std=True)),
    ('clf', RandomForestClassifier())
])

ppl.set_params(fresh__timeseries_container=df_source)
ppl.fit(X_source_filtered, y_source_filtered)

ppl.set_params(fresh__timeseries_container=df_target)

y_pred = ppl.predict(X_target)

print(classification_report(y_target, y_pred))

Feature Extraction: 100%|██████████| 3/3 [00:03<00:00,  1.09s/it]



ValueError: Found array with 0 feature(s) (shape=(39, 0)) while a minimum of 1 is required.