In [35]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from tsfresh import select_features

**Configuration**

In [36]:
# --- Data source ------------------------------------------------------------
# Dataset folder and the device whose feature table is loaded from it.
dataset = 'synergy-final-iter2'
device = 'TI SensorTag 604'
# device = 'Matrix b827eb96f31a'

# --- Features ---------------------------------------------------------------
# Regular expression used to pick the feature columns to keep.
use_sensor_features = 'accel_'
# When True, tsfresh's `select_features` is applied before training.
should_select_features = False

# --- Evaluation protocol ----------------------------------------------------
# Number of random train/test repetitions to average over.
repeat_times = 20
# Fraction of the samples that goes into the training partition.
train_test_ratio = 0.7
# When True, fit on 10%, 20%, ..., 100% of the training partition instead of
# only on all of it.
test_all_training_split_ratios = False

# --- Activities -------------------------------------------------------------
# Label ids of the activities to keep. Comment a line in or out to toggle the
# corresponding activity.
activities_to_use = [
    0,   # Dishes
    1,   # Microwave
    2,   # Coffee
    3,   # Null
    4,   # Faucet
    5,   # Kettle
    # 6,   # Phone ringing
    7,   # Chopping food
    8,   # Conversation
    9,   # Eating popcorn
    # 10,  # Microwave door opened
    # 11,  # Microwave door closed
    # 12,  # Cupboard door opened
    # 13,  # Cupboard door closed
    # 14,  # Microwave button press
    # 15,  # Taking ice
    16,  # Making popcorn in microwave
    # 17,  # Room lights off
    # 18,  # Knocking
    # 19,  # Frequency sweep
    20,  # Phone vibrating
]

**Load the dataset**

In [37]:
# Folder holding the extracted features and the activity labels of the
# configured dataset.
dataset_path = '../datasets/' + dataset + '-features/'

# Feature table of the configured device.
# NOTE: unpickling executes arbitrary code embedded in the file, so only load
# pickles that come from a trusted source.
df = pd.read_pickle(dataset_path + device + '.p')

# Activity labels. `DataFrame.from_csv` is deprecated and was removed in
# pandas 1.0; `read_csv` with `index_col=0, parse_dates=True` reproduces the
# defaults it used.
df_labels = pd.read_csv(
    dataset_path + 'activity_labels.csv',
    index_col=0,
    parse_dates=True,
)

**Filter the sensor features to use**

In [38]:
# Keep only the entries of `df` whose label matches the regular expression
# configured in `use_sensor_features`.
df = df.filter(regex=use_sensor_features)

**Filter activities to use**

In [39]:
# Drop the label rows whose activity id is not enabled in `activities_to_use`.
df_labels = df_labels[df_labels['label'].isin(activities_to_use)]
# Then drop the feature rows that no longer have a label row.
df = df[df.index.isin(df_labels.index)]

**Do feature selection**

In [40]:
# Optionally reduce the feature table with tsfresh's `select_features`, using
# the activity label as the target; otherwise keep `df` as it is.
df = select_features(df, df_labels['label']) if should_select_features else df

**Compute the accuracies**

In [41]:
# Fractions of the training partition to fit on. By default the whole
# training partition is used; optionally sweep 10%, 20%, ..., 100%.
training_data_ratios = [1]

if test_all_training_split_ratios:
    training_data_ratios = [(i + 1) / 10 for i in range(10)]

# Seed for the shuffles and the random forests, so that restarting the kernel
# and running all cells reproduces the same accuracies. Set to None to draw a
# different random sequence on every run.
random_seed = 42
rng = np.random.RandomState(random_seed)

# Maps a label describing the training ratio to the accuracies collected for
# it, one entry per repetition.
results = {}

all_indexes = df_labels.index.tolist()
# Number of samples in the training partition; everything after it is the
# test partition. It does not depend on the loop variables, so compute it once.
split = round(train_test_ratio * len(all_indexes))

for _ in range(repeat_times):
    for training_data_ratio in training_data_ratios:
        # Draw a fresh random train/test partition for every run.
        indexes = list(all_indexes)
        rng.shuffle(indexes)

        # Only the first `training_data_ratio` share of the training
        # partition is used for fitting; the test partition is unaffected.
        training_data_split = round(training_data_ratio * split)

        training_i = indexes[:training_data_split]
        testing_i = indexes[split:]

        X_train = df.loc[df.index.isin(training_i)]
        X_test = df.loc[df.index.isin(testing_i)]

        y_train = df_labels.loc[df_labels.index.isin(training_i), 'label']
        y_test = df_labels.loc[df_labels.index.isin(testing_i), 'label']

        # Features and labels are selected independently, so they only line up
        # row by row if both tables share the same index in the same order.
        # Fail loudly instead of silently training on mismatched labels.
        if not (X_train.index.equals(y_train.index)
                and X_test.index.equals(y_test.index)):
            raise ValueError(
                'Feature rows and label rows are not aligned; '
                'df and df_labels must share the same index in the same order.'
            )

        ppl = Pipeline([
            ('impute', Imputer()),
            ('scale', StandardScaler()),
            ('clf', RandomForestClassifier(random_state=rng))
        ])

        ppl.fit(X_train, y_train)
        y_pred = ppl.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)

        # Round before formatting so float noise such as 30.000000000000004
        # does not leak into the label; a ratio of 1 still yields "100%".
        percent = '{:g}'.format(round(training_data_ratio * 100, 6))
        key = 'Train data used: ' + percent + '%'
        results.setdefault(key, []).append(accuracy)

**Print out the accuracies**

In [42]:
# Report the mean accuracy over all repetitions for each training ratio.
for label, accuracies in results.items():
    mean_accuracy = np.average(accuracies)
    print(label + ': ' + str(mean_accuracy))

Train data used: 100%: 0.88731884058
