In [28]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler

**Load data**

In [29]:
# Directory holding the pickled feature frames and the activity label CSV.
dataset_path = '../../datasets/synergy-kitchen-mites-features/'

# File stems of the source-domain and target-domain feature pickles.
source = '128.237.253.157'
target = '128.237.242.0'

# NOTE: unpickling can execute arbitrary code; only load pickles from a trusted location.
df_source = pd.read_pickle(dataset_path + source + '.p')
df_target = pd.read_pickle(dataset_path + target + '.p')

# DataFrame.from_csv was deprecated and later removed from pandas. read_csv with
# index_col=0 and parse_dates=True reproduces the defaults from_csv used.
df_labels = pd.read_csv(dataset_path + 'activity_labels.csv', index_col=0, parse_dates=True)

**Filter features to be used**

In [30]:
# Keep only the columns whose label matches the accelerometer feature pattern,
# applying the same filter to both domains.
# NOTE(review): in a regex `_*` means "zero or more underscores", so this keeps
# every column whose label contains "ACCEL_sst" - confirm a glob was not intended.
df_source, df_target = (
    frame.filter(regex=("ACCEL_sst_*")) for frame in (df_source, df_target)
)

**Do the easy domain adaptation trick – for every feature column `X`, add a `source_X` and a `target_X` copy (zero-filled for the other domain)**

In [22]:
# Prefixes of the per-domain copies that this cell adds for every feature column.
DOMAIN_PREFIXES = ('source_', 'target_')


def _base_feature_columns(frame):
    """Return the column labels of `frame` that are not domain-prefixed copies.

    Restricting the loops below to these columns keeps the cell idempotent:
    re-running it overwrites the existing `source_*` / `target_*` columns
    instead of stacking prefixes (`source_source_*`, `target_source_*`, ...).
    NOTE(review): assumes no original feature label starts with `source_` or
    `target_` - confirm against the dataset.
    """
    return [column for column in frame.columns if not column.startswith(DOMAIN_PREFIXES)]


# Source-domain rows: the feature value is copied into the `source_` column and
# the matching `target_` column is zero-filled.
for column in _base_feature_columns(df_source):
    df_source['source_' + column] = df_source[column]
    df_source['target_' + column] = 0

# Target-domain rows: mirror image - zero in `source_`, feature value in `target_`.
for column in _base_feature_columns(df_target):
    df_target['source_' + column] = 0
    df_target['target_' + column] = df_target[column]

**Split into training and testing set**

In [61]:
# Fraction of the labelled indices used for training; the rest is held out for testing.
TRAINING_TEST_SPLIT = 0.7
# Fixed seed so that a fresh "Restart & Run All" reproduces the same split.
# Change the value to draw a different split.
RANDOM_SEED = 42

indexes = df_labels.index.tolist()
# A dedicated RandomState keeps the shuffle independent of any other use of
# numpy's global random state elsewhere in the session.
np.random.RandomState(RANDOM_SEED).shuffle(indexes)

# int(...) guarantees an integer slice bound even where round() returns a float.
split = int(round(TRAINING_TEST_SPLIT * len(indexes)))
all_training_i = indexes[:split]

testing_i = indexes[split:]

**Split training into source-domain training and target-domain training**

In [62]:
# Share of the training indices whose target-domain rows are added to the training set.
TARGET_TRAINING_DATA_TO_USE = 0.4

# The target-domain subset is the leading slice of the shuffled training indices.
target_training_len = round(TARGET_TRAINING_DATA_TO_USE * len(all_training_i))
target_training_i = all_training_i[:target_training_len]

# Target-domain rows are shifted past the largest label index so their index
# values do not coincide with those of the source-domain rows.
max_i = df_labels.index.max()

# Source domain: features and labels for every training index.
X_train_source = df_source[df_source.index.isin(all_training_i)]
y_train_source = df_labels[df_labels.index.isin(all_training_i)]

# Target domain: features and labels for the selected subset only, re-indexed by the offset.
X_train_target = df_target[df_target.index.isin(target_training_i)]
X_train_target.index = X_train_target.index + (max_i + 1)

y_train_target = df_labels[df_labels.index.isin(target_training_i)]
y_train_target.index = y_train_target.index + (max_i + 1)

# NOTE(review): features and labels are selected independently with isin(), so
# their rows line up only if the feature frames and df_labels share the same
# index values in the same order - confirm.
X_train = pd.concat([X_train_source, X_train_target])
y_train = pd.concat([y_train_source, y_train_target])

**Create the inputs for classifier**

In [63]:
# Evaluation uses target-domain rows only, restricted to the held-out indices.
X_test = df_target.loc[df_target.index.isin(testing_i)]
y_test = df_labels.loc[df_labels.index.isin(testing_i), 'label']

# Reduce the training labels to the 'label' column. The isinstance guard keeps
# the cell re-runnable: after the first run y_train is already a Series.
if isinstance(y_train, pd.DataFrame):
    y_train = y_train['label']

# Sort the columns so that train and test present the features in the same order.
# DataFrame.reindex_axis was deprecated and later removed from pandas;
# reindex(columns=...) is the supported equivalent.
X_train = X_train.reindex(columns=sorted(X_train.columns))
X_test = X_test.reindex(columns=sorted(X_test.columns))

**Classify and test**

In [64]:
# Impute missing values, standardise the features, then fit a random forest.
# NOTE(review): sklearn.preprocessing.Imputer was removed in scikit-learn 0.22 in
# favour of sklearn.impute.SimpleImputer; it is left unchanged here because the
# import cell is outside this block.
ppl = Pipeline([
    ('impute', Imputer()),
    ('scale', StandardScaler()),
    ('clf', RandomForestClassifier())
])

ppl.fit(X_train, y_train)
y_pred = ppl.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

# Results accumulate across re-runs of this cell (with different split settings),
# so the list is created only if an earlier run has not already created it.
if 'accuracies' not in locals():
    accuracies = []

accuracies.append([accuracy, TRAINING_TEST_SPLIT, TARGET_TRAINING_DATA_TO_USE])

# A distinct loop variable is used so that `accuracy` keeps holding this run's
# score instead of being rebound to the last history entry.
for run in accuracies:
    print('{} TS split: {} TST split {}'.format(run[0], run[1], run[2]))

0.1875 TS split: 0.7 TST split 0.8
0.975 TS split: 0.7 TST split 0.3
0.925 TS split: 0.7 TST split 0.1
0.9625 TS split: 0.7 TST split 0.9
0.95 TS split: 0.7 TST split 0.4


In [228]:
# Discard the accuracy history accumulated by earlier runs of the classification cell.
accuracies = list()