Load up experiment & annotations

In [1]:
import pandas as pd
import numpy as np
from data_reading.phyphox import read_experiment
from preprocessing._interpolation import align_data
from file_handling import get_sub_directories

experiment_dir_path = "../../data/phyphox/short recordings/"
experiment_dirs = get_sub_directories(experiment_dir_path)

sample_rate = 50
chunks = {"right": [], "left": []}
y_columns = ["start", "end", "label", "hand"]
y = pd.DataFrame(columns=y_columns)
#del experiment_dirs[1]
for dir in experiment_dirs[1:3]:
    offsets = {}
    with open(dir + "/offset.txt") as f:
        for line in f:
           (key, val) = line.split(": ")
           offsets[key] = val

    data_frames = read_experiment(dir, offsets=offsets)
    data_frames = {key : align_data(data_frame, listening_rate=1000/sample_rate, reference_sensor=None) for key, data_frame in data_frames.items()}

    y_user = pd.read_csv(dir + "/annotations.tsv", delimiter="\t", header=None)
    hands = pd.read_csv(dir + "/hands.tsv", delimiter="\t", header=None)
    y_user = y_user.iloc[:, [3,5,8]]
    hands = hands.iloc[:, [8]]
    y_user = pd.concat([y_user, hands], axis=1)
    y_user.columns = y_columns
    y = pd.concat([y, y_user], axis=0)

    # iterate over the annotations and split the timeseries in chunks
    for key, df in data_frames.items():
        chunks[key] += [df.iloc[int(annotation["start"]*sample_rate):int(annotation["end"]*sample_rate)] for i, annotation in y_user.iterrows()]

  time_delta_index = pd.TimedeltaIndex(timestamp_to_date, unit=output_timestamp_unit)


Preprocess data

In [2]:
# append the activity label (as int) and the action id to the dataframe
# we need to do this to be able to extract time series features later

labels = y.loc[:, "label"].unique()
label_ids = { l: i for l,i in zip(labels, range(0,len(labels))) }

# list of tuples (left chunk, right chunk)
chunks_two_handed = []
# list of chunks
chunks_one_handed = []

y = y.replace(label_ids)


for i, cl in zip(range(len(y)), y.iterrows()):
    label_id = int(cl[1]["label"])
    action_id = i
    two_handed_chunk = []
    for hand, chunk_list in chunks.items():
        c = chunk_list[i]
        chunk_hand = cl[1]["hand"]
        #if chunk_hand == "both":
        #c["activity"] = label_id
        #TODO for 2 phase classification: modify the label list
        #else:
             # use an id that's not yet used for another activity to label "single handed" activities
        #    c["activity"] = len(y)
        #c["activity"] = label_id
        c["action_id"] = action_id
        two_handed_chunk.append(c)
        if chunk_hand == hand:
            #c["activity"] = label_id
            chunks_one_handed.append(c)
    left_chunk = two_handed_chunk[0].reset_index()
    right_chunk = two_handed_chunk[1].reset_index(drop=True)
    right_chunk.drop(columns=["action_id"], inplace=True)
    right_chunk.columns = [str(col) + '_right' for col in right_chunk.columns]
    two_handed_chunk = pd.concat([left_chunk, right_chunk], axis=1)
    two_handed_chunk.set_index('index', inplace=True)
    chunks_two_handed.append(two_handed_chunk)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Feature extraction for 2 handed activities

In [None]:
%load_ext autoreload
%autoreload 2

labels = y.loc[:, "label"].squeeze()
from features._timeseries_feature_extraction import extract_timeseries_features
features_two_handed = extract_timeseries_features(pd.concat(chunks_two_handed).reset_index(drop=True))

Feature Extraction:  55%|█████▌    | 11/20 [01:51<01:30, 10.07s/it]

In [None]:

features_two_handed.replace([np.inf, -np.inf], np.nan, inplace=True)
features_two_handed.dropna(inplace=True, axis=1)
features_two_handed

TODO: Feature selection & visualization

Train models and score results

In [None]:
labels

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix
import seaborn as sn
import matplotlib.pyplot as plt

models = [('Logistic Regression', LogisticRegression(solver='liblinear', multi_class='ovr')), ('LDA', LinearDiscriminantAnalysis()), ('LinearSVC', LinearSVC()), ('CART', DecisionTreeClassifier()), ('NB', GaussianNB())]


def classify_all(X, y):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    for name, model in models:
        scores = cross_val_score(model, X_scaled, y, cv=5)
        print(name, scores.mean())

        # confusion matrix
        y_pred = cross_val_predict(model, X_scaled, y, cv=5)
        conf_mat = confusion_matrix(y, y_pred)
        #print(conf_mat)
        df_cm = pd.DataFrame(conf_mat, index = label_ids.keys(),
                  columns = label_ids.keys())
        df_cm["sum"] = df_cm.sum(axis=1)
        df_cm = df_cm.loc[:,label_ids.keys()].div(df_cm["sum"], axis=0)
        plt.figure(figsize = (10,7))
        sn.heatmap(df_cm, annot=True)
        plt.show()


print("Two handed classification")
classify_all(features_two_handed, labels)

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features_two_handed,labels, test_size=0.2)
m = models[0][1]
m.fit(X_train, y_train)
m.score(X_test, y_test)

In [None]:
label_ids