In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

width = 12
height = 7
plt.rcParams["figure.figsize"] = (width, height)


from sqlalchemy import create_engine
import pandas as pd
import numpy as np
from tsfresh import extract_features
from tsfresh.feature_extraction import MinimalFeatureExtractionSettings, ReasonableFeatureExtractionSettings
import tflscripts
import json
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope
from scipy.spatial.distance import cdist
import math
from scipy.spatial.distance import pdist, squareform


configuration = tflscripts.read_configuration()

  from pandas.core import datetools


In [2]:
def read_complete_dataset(dataset,
                          device,
                          activities,
                          anomaly_percentile=100):

    dataset_folder = '../datasets/'

    dataset_path = dataset_folder + dataset + '-complete/'
    df = pd.read_pickle(dataset_path + device + '.p')
#     df = df.filter(regex=sensor_streams + '|label')

    value_columns = df.filter(regex='^(?!label)').columns

    mean = df[value_columns].mean()
    std = df[value_columns].std()
    df[value_columns] = df[value_columns].clip(mean - 2 * std, mean + 2 * std, axis=1)

    null_df = df.loc[df.label == configuration['activities'].index('Dishes')]

    null_mean = null_df[value_columns].mean()
    null_std = null_df[value_columns].std()
    df[value_columns] = (df[value_columns] - null_mean) / null_std
    df = df.replace([np.inf, -np.inf, np.nan], 0)

    if anomaly_percentile < 100:
        anomalies = (df[value_columns] ** 2).sum(axis=1).apply(np.sqrt)
        df['anomalies'] = anomalies

#     df[value_columns] = StandardScaler().fit_transform(df[value_columns])

    activities_i = [configuration['activities'].index(a) for a in activities]
    df = df.loc[df.label.isin(activities_i)]

    if anomaly_percentile < 100:
        anomaly_threshold = np.percentile(df.anomalies.values, 100 - anomaly_percentile)
        print(anomaly_threshold)
        df = df.loc[df.anomalies > anomaly_threshold]

    split = np.random.rand(len(df)) < 0.2
    df = df[split]

    return df

In [4]:
def get_x_and_y():
    activities = [
        "Dishes",
        "Microwave",
        "Coffee",
    #     "Null",
        "Kettle",
        "Chopping food",
    #     "Conversation",
    #     "Eating popcorn",
    #     "Making popcorn in microwave",
        "Knocking",
    #     "Phone vibrating"
    ]

    tests = [
        ['synergy-final-iter1', '128.237.254.195'],
#         ['synergy-final-iter2', '128.237.248.186'],
#         ['scott-final-iter1', '128.237.247.134'],
#         ['scott-final-iter1', '128.237.248.186'],
#         ['robotics-final', '128.237.246.127'],
#         ['robotics-final', '128.237.247.134'],
    ]

    test_labels = [
        'Synergy, iteration 1, sink',
        'Synergy, iteration 2, sink',
        'Scott, right',
        'Scott, left',
        'Robotics, coffee',
        'Robotics, sink',
    ]

    dfs = []

    for k, test in enumerate(tests):
        df = read_complete_dataset(anomaly_percentile=100,
                                   activities=activities,
                                   dataset=test[0],
                                   device=test[1])
        dfs.append(df)

    df = pd.concat(dfs)
    x = df.filter(regex='^(?!label)')
    y = df['label']

    return x, y

x, y = get_x_and_y()

In [5]:
clf = svm.SVC(kernel='linear', decision_function_shape='ovr')
clf.fit(x, y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [7]:
activities = [
    "Dishes",
    "Microwave",
    "Coffee",
    "Kettle",
    "Chopping food",
    "Knocking",
]

df = read_complete_dataset(anomaly_percentile=100,
                                   activities=activities,
                                   dataset='synergy-final-iter2',
                                   device='128.237.248.186')
x_test = df.filter(regex='^(?!label)')
y_test = df['label']
accuracy_score(clf.predict(x_test), y_test)

0.20086980157651535

In [9]:
coefficients = pd.DataFrame(clf.coef_)
coefficients.columns = x.columns
coefficients.index = [configuration['activities'][int(i)] for i in clf.classes_]
coefficients = coefficients.T
coefficients['max'] = coefficients.max(axis=1)
coefficients = coefficients.sort_values('max', ascending=False)

ValueError: Length mismatch: Expected axis has 15 elements, new values have 6 elements

In [8]:
coefficients[coefficients.index.str.startswith('MICROPHONE')].plot.bar()

NameError: name 'coefficients' is not defined