In [1]:
import tflscripts
from tflscripts import take_percentage_of_data, read_dataset, filter_by_features, X_sort, filter_by_activities_transfer, classify, easy_domain_adaptation_update_dataframes, take_multiple_percentages_of_data, concat_and_reindex, filter_by_activities, read_and_filter_dataset, build_pipeline
from sklearn.metrics import accuracy_score
import numpy as np
from scipy.stats import mode
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from itertools import combinations
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix

In [2]:
configuration = tflscripts.read_configuration()
use_features = '.*'
force_columns = None
use_columns = None
use_activities = [configuration['activities'].index(a) for a in configuration['activity_sets'][0]]

training_source_data_ratio = 0.6
training_target_data_ratio = 0.6
testing_target_data_ratio = 0.4

tflscripts.set_dataset_folder('/home/giotto/transfer-learning-playground/datasets/')

In [3]:
sources = [
    {
        'dataset': 'synergy-final-iter1-2s',
        'device': '128.237.254.195'
    },
    {
        'dataset': 'synergy-final-iter1-2s',
        'device': '128.237.246.127'
    },
    {
        'dataset': 'synergy-final-iter2-2s',
        'device': '128.237.248.186'
    },
    {
        'dataset': 'synergy-final-iter2-2s',
        'device': '128.237.254.195'
    },
    {
        'dataset': 'robotics-final-2s',
        'device': '128.237.246.127'
    },
    {
        'dataset': 'robotics-final-2s',
        'device': '128.237.247.134'
    }
]

target_dataset = 'scott-final-iter1-2s'
target_device = '128.237.247.134'
target_teacher = 'xdk_2'

In [4]:
source_ppls = []

for source in sources:
    # read datasets
    df, df_labels = read_and_filter_dataset(
        source['dataset'],
        source['device'],
        use_features=use_features,
        force_columns=force_columns,
        use_columns=use_columns,
        use_activities=use_activities,
        scale=True,
        with_feature_selection=False)

    ppl = build_pipeline(df, df_labels, scale=True)

    source_ppls.append(ppl)

In [5]:
df_target, df_target_labels = read_and_filter_dataset(
        target_dataset,
        target_device,
        use_features=use_features,
        force_columns=force_columns,
        use_columns=use_columns,
        use_activities=use_activities,
        scale=True,
        with_feature_selection=False)

In [6]:
all_probs = [ppl.predict_proba(df_target) for ppl in source_ppls]
for i, probs in enumerate(all_probs):
    df_target['source_' + str(i) + '_pred'] = \
        [ppl.classes_[np.argmax(prob)] for prob in probs]
    df_target['source_' + str(i) + '_prob'] = \
        [np.max(prob) for prob in probs]

df_target = df_target.filter(regex='source_')

In [7]:
dfs = take_multiple_percentages_of_data(df_target, df_target_labels, [0.7, 0.3])
df_train, df_train_labels = dfs[0]
df_test, df_test_labels = dfs[1]

ppl = build_pipeline(df_train, df_train_labels, scale=True)
accuracy_score(df_test_labels['label'], ppl.predict(df_test))

0.67875647668393779

In [8]:
preds = [mode(preds)[0][0] for preds in df_target.filter(regex='source_.*_pred').values]

accuracy_score(df_target_labels['label'], preds)

0.23212435233160622

In [9]:
args = [np.argmax(probs) for probs in df_target.filter(regex='source_.*_prob').values]
preds = [preds[args[i]] for i, preds in enumerate(df_target.filter(regex='source_.*_pred').values)]
accuracy_score(df_target_labels['label'], preds)

0.19170984455958548

In [10]:
preds = df_target.filter(regex='source_.*_pred').values
pot = [label in preds[i] for i, label in enumerate(df_target_labels['label'])]
len([p for p in pot if p]) / len(pot)

0.6077720207253886

In [11]:
df_target, df_target_labels = read_and_filter_dataset(
        target_dataset,
        target_device,
        use_features=use_features,
        force_columns=force_columns,
        use_columns=use_columns,
        use_activities=use_activities,
        scale=True,
        with_feature_selection=False)
df_target_labels

Unnamed: 0,activity_id,label,since,until,window_id
0,1,3,2017-03-30 01:51:36.170,2017-03-30 01:51:38.170,0
1,1,3,2017-03-30 01:51:38.170,2017-03-30 01:51:40.170,1
2,1,3,2017-03-30 01:51:40.170,2017-03-30 01:51:42.170,2
3,1,3,2017-03-30 01:51:42.170,2017-03-30 01:51:44.170,3
4,1,3,2017-03-30 01:51:44.170,2017-03-30 01:51:46.170,4
5,1,3,2017-03-30 01:51:46.170,2017-03-30 01:51:48.170,5
6,1,3,2017-03-30 01:51:48.170,2017-03-30 01:51:50.170,6
7,1,3,2017-03-30 01:51:50.170,2017-03-30 01:51:52.170,7
8,1,3,2017-03-30 01:51:52.170,2017-03-30 01:51:54.170,8
9,1,3,2017-03-30 01:51:54.170,2017-03-30 01:51:56.170,9


In [12]:
source_datasets = ['synergy-final-iter1', 'robotics-final']
target_dataset = 'scott-final-iter1'

def dataset_devices(dataset):
    configuration = tflscripts.read_configuration()
    all_devices = configuration['device_roles'][dataset].keys()
    return [d for d in all_devices if d.startswith('128.237')]

In [13]:
def test_ensemble(num_source_datasets, source_device_list, target_dataset, use_features, force_columns, use_columns, use_activities):
    target_devices = dataset_devices(target_dataset)
    results = []

    for target_device in target_devices:

        df_target, df_target_labels = read_and_filter_dataset(
            target_dataset,
            target_device,
            use_features=use_features,
            force_columns=force_columns,
            use_columns=use_columns,
            use_activities=use_activities,
            scale=True,
            with_feature_selection=False)
        le_ = LabelEncoder()
        le_.fit(df_target_labels['label'])

        source_ppls = []

        for device, dataset in source_device_list:
            df, df_labels = read_and_filter_dataset(
                dataset,
                device,
                use_features=use_features,
                force_columns=force_columns,
                use_columns=use_columns,
                use_activities=use_activities,
                scale=True,
                with_feature_selection=False)

            ppl = build_pipeline(df, df_labels, scale=True)

            source_ppls.append(ppl)

        pred = np.asarray([ppl.predict_proba(df_target) for ppl in source_ppls])

        pred = np.average(pred, axis=0)
        pred = np.argmax(pred, axis=1)
        pred = le_.inverse_transform(pred)

#         pred = np.asarray([ppl.predict(df_target) for ppl in source_ppls])
#         pred = [mode(p)[0][0] for p in pred.T]
        
        accuracy = accuracy_score(df_target_labels['label'], pred)
        result = [num_source_datasets, len(source_device_list) / num_source_datasets, accuracy]
        print(result)
        results.append(result)
    return results

def get_results_with_2_sources(source_datasets, target_dataset, use_features, force_columns, use_columns, use_activities):
    results = []
    for _ in range(5):
        for num_devices_per_source in range(3):
            num_devices_per_source += 1
            ds1_combinations = combinations(dataset_devices(source_datasets[0]), num_devices_per_source)
            ds2_combinations = combinations(dataset_devices(source_datasets[1]), num_devices_per_source)
            two_source_combinations = [[c1, c2] for c1 in ds1_combinations for c2 in ds2_combinations]

            for combination in two_source_combinations:
                source_device_list = [[device, dataset] for i, dataset in enumerate(source_datasets) for device in combination[i]]
                results += test_ensemble(2,
                                         source_device_list,
                                         target_dataset,
                                         use_features,
                                         force_columns,
                                         use_columns,
                                         use_activities)

            for dataset in source_datasets:
                device_combinations = combinations(dataset_devices(dataset), num_devices_per_source)
                for combination in device_combinations:
                    source_device_list = [[device, dataset] for device in combination]
                    results += test_ensemble(1,
                                         source_device_list,
                                         target_dataset,
                                         use_features,
                                         force_columns,
                                         use_columns,
                                         use_activities)
                

    return results

results_with_2_sources = get_results_with_2_sources(source_datasets,
                                 target_dataset,
                                 use_features='microphone.*index_mass_quantile',
                                 force_columns=force_columns,
                                 use_columns=use_columns,
                                 use_activities=use_activities)

[2, 1.0, 0.087012987012987014]
[2, 1.0, 0.091968911917098439]
[2, 1.0, 0.080310880829015538]
[2, 1.0, 0.1012987012987013]
[2, 1.0, 0.08937823834196891]
[2, 1.0, 0.080310880829015538]
[2, 1.0, 0.092207792207792211]
[2, 1.0, 0.081606217616580309]
[2, 1.0, 0.09585492227979274]
[1, 1.0, 0.094805194805194809]
[1, 1.0, 0.066062176165803108]
[1, 1.0, 0.082901554404145081]
[1, 1.0, 0.075324675324675322]
[1, 1.0, 0.091968911917098439]
[1, 1.0, 0.091968911917098439]
[1, 1.0, 0.090909090909090912]
[1, 1.0, 0.11010362694300518]
[1, 1.0, 0.09585492227979274]
[1, 1.0, 0.087012987012987014]
[1, 1.0, 0.086787564766839381]
[1, 1.0, 0.098445595854922283]
[1, 1.0, 0.11168831168831168]
[1, 1.0, 0.09974093264248704]
[1, 1.0, 0.10103626943005181]
[1, 1.0, 0.084415584415584416]
[1, 1.0, 0.073834196891191708]
[1, 1.0, 0.10492227979274611]
[2, 2.0, 0.098701298701298706]
[2, 2.0, 0.094559585492227982]
[2, 2.0, 0.11139896373056994]
[2, 2.0, 0.083116883116883117]
[2, 2.0, 0.080310880829015538]
[2, 2.0, 0.09715025

[1, 1.0, 0.082901554404145081]
[1, 1.0, 0.090673575129533682]
[1, 1.0, 0.080519480519480519]
[1, 1.0, 0.084196891191709838]
[1, 1.0, 0.084196891191709838]
[1, 1.0, 0.097402597402597407]
[1, 1.0, 0.10880829015544041]
[1, 1.0, 0.091968911917098439]
[1, 1.0, 0.10000000000000001]
[1, 1.0, 0.08937823834196891]
[1, 1.0, 0.11010362694300518]
[2, 2.0, 0.09350649350649351]
[2, 2.0, 0.098445595854922283]
[2, 2.0, 0.098445595854922283]
[2, 2.0, 0.083116883116883117]
[2, 2.0, 0.084196891191709838]
[2, 2.0, 0.10362694300518134]
[2, 2.0, 0.084415584415584416]
[2, 2.0, 0.086787564766839381]
[2, 2.0, 0.08937823834196891]
[1, 2.0, 0.10779220779220779]
[1, 2.0, 0.082901554404145081]
[1, 2.0, 0.093264248704663211]
[1, 2.0, 0.085714285714285715]
[1, 2.0, 0.076424870466321237]
[1, 2.0, 0.09585492227979274]
[1, 2.0, 0.11168831168831168]
[1, 2.0, 0.084196891191709838]
[1, 2.0, 0.093264248704663211]
[1, 2.0, 0.090909090909090912]
[1, 2.0, 0.093264248704663211]
[1, 2.0, 0.097150259067357511]
[1, 2.0, 0.0844155

In [61]:
df = pd.DataFrame(results_with_2_sources)
df.columns = ['num_source_datasets', 'num_devices_per_source', 'accuracy']
df.groupby(['num_source_datasets', 'num_devices_per_source'])['accuracy'].mean()

num_source_datasets  num_devices_per_source
1                    1.0                       0.437722
                     2.0                       0.441797
                     3.0                       0.445240
2                    1.0                       0.452142
                     2.0                       0.455218
                     3.0                       0.462220
Name: accuracy, dtype: float64

In [165]:
df_source, df_source_labels = read_and_filter_dataset(
        'robotics-final',
        'xdk_1',
        use_features=use_features,
        force_columns=force_columns,
        use_columns=use_columns,
        use_activities=use_activities,
        scale=True,
        with_feature_selection=False)

df_target, df_target_labels = read_and_filter_dataset(
        'scott-final-iter1',
        'xdk_1',
        use_features=use_features,
        force_columns=force_columns,
        use_columns=use_columns,
        use_activities=use_activities,
        scale=True,
        with_feature_selection=False)

ppl = build_pipeline(df_source, df_source_labels, scale=True)
predicted = ppl.predict(df_target)
probs = np.max(ppl.predict_proba(df_target), axis=1)
df = pd.DataFrame({
    'predicted': predicted,
    'probs': probs,
    'actual': df_target_labels['label']
})
subdf = df.loc[df.probs == 1.0]
len(df.loc[df.actual == df.predicted]) / len(df)

0.45984455958549225

In [172]:
from scipy.spatial.distance import euclidean

labels = df_source_labels['label'].values

for i, row in enumerate(df_target.values):
    predicted = ppl.predict([row])[0]
    source_data = df_source.loc[df_source.index.isin(df_source_labels.loc[df_source_labels.label == predicted].index)]
    distances = [euclidean(row, row_t) for row_t in source_data.values]
    print(labels[i] == predicted,
          labels[i],
          predicted,
        np.mean(distances),
        np.var(distances)
         )

True 3 3 3.00254303469 8.72080081794
True 3 3 2.99301399604 8.7093622837
True 3 3 2.98484211291 8.69793704353
True 3 3 2.97621235216 8.6841336823
True 3 3 2.96833871893 8.66966521797
True 3 3 2.96011882103 8.65184163691
True 3 3 2.94935392859 8.62598098198
True 3 3 2.94241593148 8.60686276998
True 3 3 2.93812986563 8.59378894904
True 3 3 2.92878446403 8.562738228
True 3 3 2.92192809975 8.5361238235
True 3 3 2.91369547119 8.50151242634
True 3 3 2.90666410008 8.46843988312
True 3 3 2.90558847318 8.46285110505
True 3 3 2.89883787197 8.42514743941
True 3 3 2.89127126646 8.37626518298
True 3 3 2.88610575114 8.33985099146
True 3 3 2.88192797294 8.30763312576
True 3 3 2.88020426036 8.2926192791
True 3 3 2.87461892201 8.24106550847
True 3 3 2.87035981529 8.19344978152
True 3 3 2.86149619288 8.07668364497
True 3 3 2.86558853901 8.13481186285
True 3 3 2.86097196367 8.06843662192
True 3 3 2.85790414985 8.01429482853
True 3 3 2.8558231412 7.97054511836
True 3 3 2.85159526066 7.86261785483
True 3 3

True 9 9 3.30943315229 9.68685799989
True 9 9 3.29952696805 9.68114012577
False 9 16 2.27965243598 6.42892994729
True 9 9 3.23475742407 9.59108180478
True 9 9 3.1616308741 9.22566423675
True 9 9 3.18358834509 9.3863745192
True 9 9 3.25105904815 9.62471210095
True 9 9 3.20633483581 9.49796050991
True 9 9 3.19859275341 9.46445849166
True 9 9 3.19178230901 9.43118509085
True 9 9 3.18609661104 9.40101295471
True 9 9 3.21245835811 9.52220795878
True 9 9 3.14973778289 9.09846188942
True 9 9 3.16898692304 9.28706708927
True 9 9 3.16352934321 9.24271390662
True 9 9 3.15908647446 9.20173682021
True 9 9 3.18172450452 9.3755434791
True 9 9 3.13569147901 8.8440937178
True 9 9 3.14722019726 9.06581692725
True 9 9 3.14427566685 9.02307022378
True 9 9 3.14132235954 8.9750257694
True 9 9 3.15595700366 9.17052303771
True 9 9 3.12991426148 8.52448983545
True 9 9 3.13447418847 8.80661976582
False 9 1 2.38158093129 5.06862606854
False 9 8 2.98395980335 7.37214008104
False 9 8 2.86615995977 7.19694314355
F

True 3 3 3.28978314931 8.71511854167
True 3 3 3.19449611764 8.58386125025
True 3 3 3.20221694452 8.60373705415
True 3 3 3.2092117125 8.62053595861
True 3 3 3.21974785464 8.64268165026
True 3 3 3.22784450194 8.65718381072
True 3 3 3.23780021241 8.67285026715
True 3 3 3.24705742761 8.68463379512
True 3 3 3.25830193654 8.69620161248
True 3 3 3.27104985941 8.70625013652
True 3 3 3.27913260357 8.71078688712
False 2 3 3.29049198379 8.71542110518
True 2 2 2.34520264603 4.41127726798
True 2 2 2.34588272344 4.45403354474
True 2 2 2.34706897637 4.50364153458
True 2 2 2.34838975943 4.54184713574
True 2 2 2.35004450457 4.5795431204
True 2 2 2.35209330108 4.61811119534
True 2 2 2.35404584458 4.65167933409
True 2 2 2.35664091677 4.69080573059
True 2 2 2.35892182582 4.72375663751
False 2 20 1.62500802768 2.21848449751
True 2 2 2.3650160181 4.79446785392
True 2 2 2.36831649325 4.82729638372
True 2 2 2.37118689099 4.85526979334
True 2 2 2.37527142686 4.89532410025
True 2 2 2.37845443432 4.92371959205
T

False 9 8 2.76576769784 7.2254247769
True 9 9 3.22956714351 9.45334029915
True 9 9 3.13392096181 8.96799857754
True 9 9 3.13813443581 9.01907580875
True 9 9 3.1417474342 9.0580366041
False 9 8 2.80591183665 7.30106098088
True 9 9 3.45116600658 9.65906057568
True 9 9 3.15588387518 9.17682466678
True 9 9 3.16178696331 9.21355463301
True 9 9 3.16806760905 9.24899860039
True 9 9 3.17398676644 9.27894719541
True 9 9 3.11838967476 8.60138959398
True 9 9 3.71918446806 9.70652723588
True 9 9 3.52856173443 9.67372323642
True 9 9 3.20216914936 9.38330348456
False 9 2 10.6823413947 5.87251037423


In [14]:
def data_for_label(label, df, df_labels):
    return df.loc[df.index.isin(df_labels.loc[df_labels.label == label].index)]

for label in df_target_labels['label'].unique():
    target_data = data_for_label(label, df_target, df_target_labels)
    target_data = shuffle(target_data.values)
    print(len([1 for l in ppl.predict(target_data) if l == label]) / len(target_data))

ValueError: X has 3774 features per sample, expected 12

In [None]:
accuracy_score(df_target_labels['label'], ppl.predict(df_target))

In [59]:
use_features = 'microphone'
use_activities = [configuration['activities'].index(a) for a in configuration['activity_sets'][0]]

use_columns = [
    'microphone__mean_second_derivate_central',
    'microphone__mean_abs_change_quantiles__qh_0.8__ql_0.2',
    'microphone__symmetry_looking__r_0.45',
    'microphone__mean_abs_change_quantiles__qh_0.2__ql_0.6',
    'microphone__autocorrelation__lag_7',
    'microphone__cwt_coefficients__widths_(2, 5, 10, 20)__coeff_3__w_10',
    'microphone__autocorrelation__lag_5',
    'microphone__cwt_coefficients__widths_(2, 5, 10, 20)__coeff_0__w_5',
    'microphone__large_standard_deviation__r_0.35000000000000003',
    'microphone__cwt_coefficients__widths_(2, 5, 10, 20)__coeff_11__w_2',
    'microphone__symmetry_looking__r_0.8500000000000001',
    'microphone__augmented_dickey_fuller',
    'microphone__number_peaks__n_3',
    'microphone__mean_abs_change_quantiles__qh_0.4__ql_0.8',
    'microphone__number_peaks__n_1',
    'microphone__symmetry_looking__r_0.4',
    'microphone__standard_deviation',
    'microphone__symmetry_looking__r_0.5',
    'microphone__large_number_of_peaks__n_1',
    'microphone__longest_strike_above_mean',
    'microphone__ar_coefficient__k_10__coeff_4',
    'microphone__has_duplicate_min',
]

df_source, df_source_labels = read_and_filter_dataset(
        'synergy-final-iter1',
        '128.237.246.127',
        use_features=use_features,
        use_activities=use_activities,
        scale=True,
        with_feature_selection=False)

df_target, df_target_labels = read_and_filter_dataset(
        'scott-final-iter1',
        '128.237.248.186',
        use_features=use_features,
        use_activities=use_activities,
        scale=True,
        with_feature_selection=False)
df_target = df_target[df_source.columns]

ppl = build_pipeline(df_source, df_source_labels, scale=True, clf_name='LogisticRegression')
predicted = ppl.predict(df_target)
accuracy_score(df_target_labels['label'], predicted)

0.15025906735751296

In [60]:
cm = pd.DataFrame(confusion_matrix(labeled_true, labeled_pred))
cm.columns = configuration['activity_sets'][0]
cm.index = configuration['activity_sets'][0]
cm

Unnamed: 0,Dishes,Microwave,Coffee,Null,Faucet,Kettle,Chopping food,Conversation,Eating popcorn,Making popcorn in microwave,Phone vibrating
Dishes,34,10,0,22,0,0,2,0,0,1,4
Microwave,0,16,1,5,0,12,3,0,2,0,0
Coffee,2,0,15,3,33,35,2,2,2,0,0
Null,66,0,0,3,0,0,3,0,1,0,0
Faucet,1,1,9,0,25,27,6,2,4,0,0
Kettle,62,0,0,7,0,2,2,0,0,0,0
Chopping food,0,25,0,7,0,1,2,0,0,6,32
Conversation,0,0,11,2,27,5,0,6,12,0,0
Eating popcorn,0,32,0,4,0,0,2,0,1,5,17
Making popcorn in microwave,0,17,0,4,0,0,4,0,0,6,43


In [42]:
configuration['activity_sets'][0]

['Dishes',
 'Microwave',
 'Coffee',
 'Null',
 'Faucet',
 'Kettle',
 'Chopping food',
 'Conversation',
 'Eating popcorn',
 'Making popcorn in microwave',
 'Phone vibrating']