In [569]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from tsfresh import select_features

In [570]:
# Directory containing one pickled feature frame per device plus the label CSV.
dataset_path = '../datasets/synergy-10-min-features/'

# Device identifiers double as file names (without the '.p' extension).
# `source` supplies the training features and `target` the evaluation features;
# the commented-out assignments are alternative devices kept for quick switching.
target = 'Matrix b827eb96f31a'
# target = 'Matrix b827ebe6e0f8'
source = '128.237.248.186'
# target = '128.237.246.127'

# NOTE: unpickling executes code embedded in the file; only load trusted pickles.
df_train = pd.read_pickle(dataset_path + source + '.p')
df_test = pd.read_pickle(dataset_path + target + '.p')

# DataFrame.from_csv is deprecated (removed in pandas 1.0). read_csv with
# index_col=0 and parse_dates=True reproduces its defaults: the first column
# becomes the index and date parsing is attempted on it.
df_labels = pd.read_csv(dataset_path + 'activity_labels.csv',
                        index_col=0, parse_dates=True)

In [571]:
# Regular expression matched against column names; only matching columns are
# kept, in both the training frame and the evaluation frame.
features = "microphone"
df_train, df_test = (
    frame.filter(regex=features) for frame in (df_train, df_test)
)

In [572]:
# Let tsfresh pick the training columns it considers relevant to the labels,
# then restrict the evaluation frame to exactly those columns (same order).
# NOTE(review): the selection is given every label in df_labels, including the
# rows that test() later holds out for evaluation, so the reported accuracy may
# be optimistic - confirm whether that is acceptable for this experiment.
activity_labels = df_labels['label']
df_train = select_features(df_train, activity_labels)
selected_columns = df_train.columns
df_test = df_test[selected_columns]

Feature Selection: 100%|██████████| 222/222 [00:00<00:00, 2684.04it/s]


In [573]:
def test(training_data_ratio, train_fraction=0.7, random_state=None):
    """Fit on a random subset of df_train rows and score on held-out df_test rows.

    The index of ``df_labels`` is shuffled. The first ``train_fraction`` of it
    forms the training pool and the remainder the evaluation set. Only the
    leading ``training_data_ratio`` share of the pool is used for fitting, so
    the evaluation split does not depend on ``training_data_ratio``.

    Reads the module-level frames ``df_train``, ``df_test`` and ``df_labels``.

    Args:
        training_data_ratio: Share (0-1) of the training pool actually used
            for fitting.
        train_fraction: Share (0-1) of all labelled rows that forms the
            training pool. Defaults to 0.7.
        random_state: Optional seed for the shuffle and the random forest.
            ``None`` (default) draws from NumPy's global random state.

    Returns:
        The accuracy of the fitted pipeline on the evaluation rows.

    Raises:
        ValueError: If either split selects no feature rows.
    """
    # train test split
    indexes = df_labels.index.tolist()
    # With no seed, keep using NumPy's global RNG so an outer np.random.seed()
    # call still governs the shuffle.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    rng.shuffle(indexes)

    # int() keeps the slice bounds integral regardless of what round() returns.
    split = int(round(train_fraction * len(indexes)))
    training_data_split = int(round(training_data_ratio * split))

    training_i = indexes[:training_data_split]
    testing_i = indexes[split:]

    X_train = df_train.loc[df_train.index.isin(training_i)]
    X_test = df_test.loc[df_test.index.isin(testing_i)]

    if X_train.empty:
        raise ValueError(
            'No training rows selected for training_data_ratio=%r' % (training_data_ratio,))
    if X_test.empty:
        raise ValueError(
            'No evaluation rows selected for train_fraction=%r' % (train_fraction,))

    # Look the labels up through each feature frame's own index. Masking
    # df_labels independently only lines up with the features when both frames
    # hold exactly the same rows in exactly the same order.
    y_train = df_labels.loc[X_train.index, 'label']
    y_test = df_labels.loc[X_test.index, 'label']

    # One shared, sorted column order for both frames (reindex_axis is
    # deprecated; reindex(columns=...) is its replacement).
    feature_columns = sorted(X_train.columns)
    X_train = X_train.reindex(columns=feature_columns)
    X_test = X_test.reindex(columns=feature_columns)

    # NOTE: sklearn.preprocessing.Imputer was removed in scikit-learn 0.22 in
    # favour of sklearn.impute.SimpleImputer; switching requires changing the
    # notebook's import cell as well.
    ppl = Pipeline([
        ('impute', Imputer()),
        ('scale', StandardScaler()),
        ('clf', RandomForestClassifier(random_state=random_state))
    ])

    ppl.fit(X_train, y_train)
    y_pred = ppl.predict(X_test)

    return accuracy_score(y_test, y_pred)

In [574]:
# Learning-curve sweep: for each training-data ratio 0.1, 0.2, ..., 1.0, run
# 20 independent random splits and print the average accuracy (one line each).
for step in range(1, 11):
    training_data_ratio = step / 10.0
    accuracies = [test(training_data_ratio) for _ in range(20)]
    mean_accuracy = np.average(accuracies)
    print(str(mean_accuracy))

0.216904761905
0.206428571429
0.207142857143
0.221666666667
0.207857142857
0.197380952381
0.182619047619
0.199285714286
0.172619047619
0.187142857143
