In [12]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

# Default (width, height) applied to every matplotlib figure in this notebook.
width, height = 12, 7
plt.rcParams.update({"figure.figsize": (width, height)})


import json
import os
from statistics import mode

import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sqlalchemy import create_engine
from tsfresh import select_features

import tflscripts

In [3]:
# Activities (by name) that the experiments below are restricted to.
activities = [
    "Dishes",
    "Microwave",
    "Coffee",
    "Null",
    "Kettle",
    "Chopping food",
    "Conversation",
    "Eating popcorn",
    "Knocking",
    "Phone vibrating"
]

configuration = tflscripts.read_configuration()

# Translate activity names into their positions in the configuration's
# activity list. Check up front so that a typo is reported with the full list
# of offending names rather than a bare "'x' is not in list".
known_activities = configuration['activities']
unknown_activities = [a for a in activities if a not in known_activities]
if unknown_activities:
    raise ValueError(
        'Activities missing from configuration: {}'.format(unknown_activities))
activities_i = [known_activities.index(a) for a in activities]

# Root folder of the datasets. The default is the original author's location;
# set the TFL_DATASET_FOLDER environment variable to run the notebook on
# another machine without editing this cell.
dataset_folder = (os.environ.get('TFL_DATASET_FOLDER')
                  or '/home/giotto/transfer-learning-playground/datasets/')
tflscripts.set_dataset_folder(dataset_folder)

# Regex matching feature names that do NOT start with "mag" or "light".
# NOTE(review): the cells visible below pass use_features='.*' instead of this
# value — confirm whether this filter is still meant to be applied.
use_features = "^(?!mag|light)"

In [8]:
# Within-device baseline: for every activity, fit a binary (activity vs. rest)
# linear SVM on one split of a single device's data and score it on the other.
dataset = 'synergy-final-iter1-1s'
device = '128.237.254.195'

df, df_labels = tflscripts.read_and_filter_dataset(
        dataset,
        device,
        use_features='.*',
        use_activities=activities_i,
        scale=True,
        with_feature_selection=False)

for label in df_labels.label.unique():
    # Collapse to a binary problem: every other activity becomes class -1.
    df_labels_modified = df_labels.copy()
    df_labels_modified.loc[df_labels.label != label, 'label'] = -1

    # presumably returns one (features, labels) pair per requested fraction,
    # in the order given — confirm against tflscripts
    dfs = tflscripts.take_multiple_percentages_of_data(
            df, df_labels_modified,
            [0.7, 0.3])

    X_train, y_train = dfs[0]
    X_test, y_test = dfs[1]

    y_train = y_train['label']
    y_test = y_test['label']

    ppl = Pipeline([
        ('impute', Imputer()),
        ('clf', svm.SVC(kernel='linear', decision_function_shape='ovr', probability=True))
    ])

    ppl.fit(X_train, y_train)

    predicted = ppl.predict(X_test)

    # predict_proba columns follow the fitted classifier's `classes_`, so take
    # the names from there instead of assuming the order [-1, label].
    proba = pd.DataFrame(ppl.predict_proba(X_test),
                         columns=ppl.named_steps['clf'].classes_)

    # The original cell called plt.show() without drawing anything (only a
    # stray `[1]` expression was left behind). Plot the positive-class
    # probability so the computed probabilities are actually shown.
    ax = proba[label].plot()
    ax.set(title='Predicted probability: {}'.format(configuration['activities'][label]),
           xlabel='test sample',
           ylabel='probability')
    plt.show()

    accuracy = accuracy_score(y_test, predicted)
    print(configuration['activities'][label], accuracy)

Null 0.987027027027
Microwave 0.996756756757
Kettle 0.971891891892
Chopping food 0.961081081081
Coffee 0.907027027027
Dishes 1.0
Conversation 0.995675675676
Eating popcorn 0.967567567568
Phone vibrating 0.96972972973
Knocking 0.998918918919


In [13]:
# Cross-device transfer: fit on the source device/iteration, evaluate on the
# target device/iteration, one binary (activity vs. rest) model per activity.
source_dataset = 'synergy-final-iter1'
source_device = '128.237.254.195'
target_dataset = 'synergy-final-iter2'
target_device = '128.237.248.186'

# Source and target are read with exactly the same filtering options.
read_options = dict(
    use_features='.*',
    use_activities=activities_i,
    scale=True,
    with_feature_selection=False,
)

df_source, df_source_labels = tflscripts.read_and_filter_dataset(
    source_dataset + '-1s', source_device, **read_options)
df_target, df_target_labels = tflscripts.read_and_filter_dataset(
    target_dataset + '-1s', target_device, **read_options)

# Keep only feature rows that have a label, then only labels that have a
# feature row.
# NOTE(review): this matches on index membership only; it assumes features and
# labels already come back in the same row order — confirm against tflscripts.
source_has_label = df_source.index.isin(df_source_labels.index)
target_has_label = df_target.index.isin(df_target_labels.index)
df_source = df_source.loc[source_has_label]
df_target = df_target.loc[target_has_label]

source_has_features = df_source_labels.index.isin(df_source.index)
target_has_features = df_target_labels.index.isin(df_target.index)
df_source_labels = df_source_labels.loc[source_has_features]
df_target_labels = df_target_labels.loc[target_has_features]

for label in df_source_labels.label.unique():
    # Binary relabelling: the current activity keeps its id, all others -> -1.
    df_source_labels_modified = df_source_labels.assign(
        label=df_source_labels['label'].where(df_source_labels['label'] == label, -1))
    df_target_labels_modified = df_target_labels.assign(
        label=df_target_labels['label'].where(df_target_labels['label'] == label, -1))

    y_train = df_source_labels_modified['label']
    y_test = df_target_labels_modified['label']

    # Disabled experiment: per-label feature selection with tsfresh
    # (select_features(df_source, y_train), then restricting df_target to the
    # selected columns) was tried at this point.

    pipeline_steps = [
        ('impute', Imputer()),
        ('clf', svm.SVC(kernel='linear', decision_function_shape='ovr')),
    ]
    ppl = Pipeline(pipeline_steps)
    ppl.fit(df_source, y_train)

    predicted = ppl.predict(df_target)
    smoothed = tflscripts.smooth_predictions(predicted, label)

    # Disabled experiment: plotting predicted vs. smoothed vs. actual labels.

    # Accuracy is reported on the smoothed predictions, not the raw ones.
    accuracy = accuracy_score(y_test, smoothed)
    print(configuration['activities'][label], accuracy)

Null 0.853782013103
Microwave 0.984216795712
Kettle 0.820131030375
Chopping food 0.945205479452
Coffee 0.750744490768
Dishes 0.997022036927
Conversation 0.876116736152
Eating popcorn 0.82400238237
Phone vibrating 0.847528290649
Knocking 0.999702203693


In [30]:
from sklearn.model_selection import train_test_split

# Sanity check: hold-out accuracy *within* the source device for a single
# activity ("Knocking" vs. rest), to compare against the transfer score above.
#
# The labels are rebuilt here explicitly. Previously this cell reused the
# `y_train` left over from the last iteration of the loop in the cell above,
# so its result depended on hidden kernel state.
knocking_i = configuration['activities'].index('Knocking')
y_source = df_source_labels['label'].where(
    df_source_labels['label'] == knocking_i, -1)

# The positive class is tiny, so stratify to keep it represented in both
# splits, and fix the seed so the reported accuracy is reproducible.
X_train, X_test, y_train_s, y_test_s = train_test_split(
    df_source, y_source,
    stratify=y_source,
    random_state=0)

ppl = Pipeline([
    ('impute', Imputer()),
    ('clf', svm.SVC(kernel='linear', decision_function_shape='ovr'))
])

ppl.fit(X_train, y_train_s)

predicted = ppl.predict(X_test)
# NOTE: with so few positive samples, plain accuracy is dominated by the
# negative class; read this number with that in mind.
accuracy_score(y_test_s, predicted)

0.99870466321243523

In [31]:
# Show the training labels that belong to the positive class (anything but -1).
y_train_s[y_train_s.ne(-1)]

4511    18
4516    18
4512    18
4509    18
4514    18
4513    18
4510    18
Name: label, dtype: int64