Load up experiment & annotations

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute

import file_handling
from indoor_positioning import get_beacons_for_proximity_approach, get_file_as_data_frame

%load_ext autoreload
%autoreload 2

from classification.classification import classify_all
from data_reading.phyphox import read_experiment
from features import extract_timeseries_features
from file_handling import get_sub_directories
from preprocessing import align_data, segment_windows, merge_left_and_right_chunk, set_time_delta_as_index
from visualization import plot_duration_histogram


Bad key "text.kerning_factor" on line 4 in
/opt/anaconda3/envs/mp-mental-health/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.1.3/matplotlibrc.template
or from the matplotlib source distribution


In [2]:
# Load the phyphox recording sessions and cut every sensor stream into
# annotated activity chunks plus the un-annotated gaps between them.
experiment_dir_path = "../../data/phyphox/full recordings/"
experiment_dirs = get_sub_directories(experiment_dir_path)
# Kept for reference: restrict the run to hand-picked sessions.
# complete_experiments_indices = [1,2,3,5,7]
# experiment_dirs = [experiment_dirs[i] for i in complete_experiments_indices]

# Annotation times are multiplied by this value to obtain row positions, and
# 1000/sample_rate is handed to align_data as the listening rate.
sample_rate = 50
chunks = {"right": [], "left": [], "indoor": []}
null_chunks = {"right": [], "left": [], "indoor": []}
y_columns = ["start", "end", "label", "hand"]
y = pd.DataFrame(columns=y_columns)

for directory in experiment_dirs:
    # offset.txt holds one "<key>: <value>" pair per line; sessions that do
    # not ship the file are skipped.
    offsets = {}
    try:
        with open(directory + "/offset.txt") as offset_file:
            for line in offset_file:
                if not line.strip():
                    # tolerate blank / trailing empty lines
                    continue
                # maxsplit=1 keeps values that themselves contain ": " intact;
                # strip() removes the line break that iterating a file leaves
                # on every value.
                key, val = line.split(": ", 1)
                offsets[key.strip().lower()] = val.strip()
    except FileNotFoundError:
        continue

    data_frames = read_experiment(directory, offsets=offsets)
    data_frames = {
        key: align_data(data_frame, listening_rate=1000/sample_rate, reference_sensor=None)
        for key, data_frame in data_frames.items()
    }

    # Only the file lookup is guarded: a session without a *.json file has no
    # indoor recording and is skipped, but an IndexError raised while
    # processing an existing recording is no longer silently swallowed.
    try:
        indoor_file = file_handling.get_file_names_in_directory_for_pattern(directory, "*.json")[0]
    except IndexError:
        # we don't have an indoor recording for this recording session
        continue

    indoor_data_frame = get_file_as_data_frame(indoor_file)

    # filter out incorrectly placed beacons (minor ids 2 and 10)
    correctly_placed = (indoor_data_frame["minor"] != 2) & (indoor_data_frame["minor"] != 10)
    indoor_data_frame = indoor_data_frame[correctly_placed]

    indoor_data_frame = get_beacons_for_proximity_approach(indoor_data_frame)
    indoor_data_frame = set_time_delta_as_index(indoor_data_frame, origin_timestamp_unit='ms',
                                                output_timestamp_unit="milliseconds",
                                                timestamp_key="timestamp")
    indoor_data_frame = indoor_data_frame.sort_index()
    # TODO: align needs to be done on aggregated data
    # TODO: do we really need alignment -> for now yes
    data_frames["indoor"] = align_data(indoor_data_frame, interpolation_method="previous", listening_rate=1000/sample_rate, reference_sensor=None)
    del indoor_data_frame, indoor_file

    # Columns 3, 5 and 8 of annotations.tsv become start / end / label;
    # column 8 of hands.tsv becomes the hand column.
    y_user = pd.read_csv(directory + "/annotations.tsv", delimiter="\t", header=None)
    hands = pd.read_csv(directory + "/hands.tsv", delimiter="\t", header=None)
    y_user = y_user.iloc[:, [3,5,8]]
    hands = hands.iloc[:, [8]]
    y_user = pd.concat([y_user, hands], axis=1)
    y_user.columns = y_columns
    y = pd.concat([y, y_user], axis=0)

    # Convert the annotation boundaries to row positions once, as plain ints,
    # instead of calling int() on a one-element Series slice for every gap.
    start_rows = [int(start * sample_rate) for start in y_user["start"]]
    end_rows = [int(end * sample_rate) for end in y_user["end"]]

    # iterate over the annotations and split the timeseries in chunks
    for key, df in data_frames.items():
        if key in chunks:
            chunks[key] += [df.iloc[first:last] for first, last in zip(start_rows, end_rows)]
            # null chunks are everything in between two consecutive annotations
            null_chunks[key] += [df.iloc[last:next_first] for last, next_first in zip(end_rows, start_rows[1:])]

    # NOTE(review): this stops after the first session that has offsets, an
    # indoor recording and annotations, so later sessions are never loaded --
    # confirm that this is intended and not a debugging leftover.
    break

  time_delta_index = pd.TimedeltaIndex(timestamp_to_date, unit=output_timestamp_unit)


In [None]:
# Show the session directories that were discovered by the loading cell.
experiment_dirs

Preprocess data

In [None]:
# Duration histogram of the annotated right-hand activity chunks.
plot_duration_histogram(chunks["right"])

In [None]:
# Duration histogram of the right-hand gaps between annotations (null chunks).
plot_duration_histogram(null_chunks["right"])

In [3]:
# Attach a running action id to every annotated chunk and translate the
# activity names to integer label ids.
# We need to do this to be able to extract time series features later.

# Map every distinct activity name to an integer id (order of first appearance).
label_names = y.loc[:, "label"].unique()
label_ids = {name: idx for idx, name in enumerate(label_names)}

# list of merged left + right chunks, one entry per annotation
chunks_two_handed = []
# list of single-hand chunks (only the hand named in the annotation)
chunks_one_handed = []

# Restrict the replacement to the label column so that a value in another
# column that happens to equal an activity name is left untouched.
y = y.replace({"label": label_ids})

for action_id, (_, annotation) in enumerate(y.iterrows()):
    annotated_hand = annotation["hand"]
    chunk_by_hand = {}
    for hand, hand_chunks in chunks.items():

        # TODO: handle indoor here
        if hand == "indoor":
            continue

        # The stored chunks are iloc slices of the session frame; adding a
        # column to such a slice raises SettingWithCopyWarning, so the id is
        # written to an explicit copy and `chunks` itself stays unmodified.
        hand_chunk = hand_chunks[action_id].copy()
        hand_chunk["action_id"] = action_id
        chunk_by_hand[hand] = hand_chunk
        if annotated_hand == hand:
            chunks_one_handed.append(hand_chunk)

    # Pass the chunks by name in (left, right) order, the same order the
    # null-chunk cell uses. Picking them by position handed them over as
    # (right, left) because "right" is the first key of `chunks`.
    # NOTE(review): confirm the parameter order of merge_left_and_right_chunk.
    chunks_two_handed.append(
        merge_left_and_right_chunk(chunk_by_hand["left"], chunk_by_hand["right"], action_id)
    )

# integer label id per annotation, in annotation order
labels = y.loc[:, "label"].squeeze()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Feature extraction for 2 handed activities

In [4]:
# Window length handed to segment_windows below.
# presumably counted in samples (i.e. 2 s at sample_rate = 50) -- TODO confirm
window_size = 100

In [5]:
# Build the two-handed null-class chunks from the gaps between annotations.

# Null action ids continue right after the last activity action id.
# TODO: assert that this list is disjoint to the list of action ids from activities
first_null_id = len(chunks_two_handed)
null_action_ids = range(first_null_id, first_null_id + len(null_chunks["right"]))

# Gaps whose left-hand chunk is empty are left out.
null_class_chunks = [
    merge_left_and_right_chunk(left_gap, right_gap, null_id)
    for right_gap, left_gap, null_id in zip(null_chunks["right"], null_chunks["left"], null_action_ids)
    if len(left_gap)
]

In [None]:
# Spot check: the merged two-handed chunk of the second annotation.
chunks_two_handed[1]

## Classification step 1: OCD activities vs null class samples

In [6]:
# All OCD activities share one new label id for this step: two above the
# highest activity label id (one above is used for the null class).
ocd_label_id = labels.max() + 2
per_chunk_ocd_labels = pd.Series([ocd_label_id] * len(chunks_two_handed)).to_numpy()

# Cut every activity chunk into windows; the labels come back per window.
chunks_ocd_activities, labels_ocd_acts = segment_windows(chunks_two_handed, per_chunk_ocd_labels, window_size)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  c_new["action_id"] = [(action_id, i)] * len(c_new)


In [None]:
# Spot check: the tenth OCD activity window.
chunks_ocd_activities[9]

In [7]:
# TODO: add indoor to segment_windows
# The null class gets the id directly above the highest activity label id.
null_label_id = labels.max() + 1
per_chunk_null_labels = pd.Series([null_label_id] * len(null_class_chunks)).to_numpy()

# NOTE: null_class_chunks is overwritten with its windowed version, so running
# this cell twice without re-running the null-chunk preparation cell would
# segment the windows a second time.
null_class_chunks, null_labels = segment_windows(null_class_chunks, per_chunk_null_labels, window_size)


In [None]:
# Spot check: the tenth null-class window.
null_class_chunks[9]

# TODO: assert that every window in null_class_chunks has the same columns as the windows in chunks_ocd_activities (both are lists of data frames, so the check has to be done per window)

### Feature extraction for OCD activities vs non-OCD activities

In [9]:
# Stack the OCD activity windows and the null-class windows into one frame
# with a fresh integer index.
# pandas warned here that it was sorting the columns because they are not
# aligned across the windows and that this default is going to change.
# sort=True pins the column order this notebook was run with, independent of
# the installed pandas version.
# NOTE(review): misaligned columns mean the two window lists do not share the
# same column layout; columns missing on one side are filled with NaN.
null_classification_concat = pd.concat(
    chunks_ocd_activities + null_class_chunks, sort=True
).reset_index(drop=True)



of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [None]:
# Compute the time-series features for the stacked OCD and null-class windows.
features_two_handed_null_test = extract_timeseries_features(null_classification_concat)


### Feature selection for OCD activities vs non-OCD activities

In [None]:
# Window labels in the same order as the windows were stacked: OCD first, null class second.
labels_null_classification = pd.concat([labels_ocd_acts, null_labels])

In [None]:
# Inspect the combined label vector.
labels_null_classification

In [None]:
# tsfresh's impute modifies the frame in place (NaN / inf values are replaced
# column-wise), which is why its return value is not assigned.
impute(features_two_handed_null_test)
# Keep only the features tsfresh finds relevant for separating the two classes.
X_two_handed_selected_null_test = select_features(features_two_handed_null_test, labels_null_classification)
# TODO: add indoor
X_two_handed_selected_null_test

In [None]:
# TODO: feature visualization: scatter plot - explain what happens

In [None]:
# Standardize the selected features; fit_transform returns a plain ndarray,
# so the column names are not carried over into X_null_classification.
scaler = StandardScaler()
X_null_classification = scaler.fit_transform(X_two_handed_selected_null_test)

### Train models and score results

In [None]:
# Train and score the classifiers on OCD activity windows vs null-class windows.
print("Two handed classification")
classify_all(X_null_classification, labels_null_classification)

In [None]:
# TODO: plot feature importance (which feature has the most impact on the results)

# sns.pairplot only accepts a DataFrame, but the scaler returned a bare
# ndarray. Wrap the first ten scaled features in a frame and re-attach the
# names of the selected feature columns they were computed from.
pairplot_columns = X_two_handed_selected_null_test.columns[:10]
pairplot_frame = pd.DataFrame(X_null_classification[:, :10], columns=pairplot_columns)
sns.pairplot(pairplot_frame)


In [None]:
# Cut the two-handed activity chunks into windows for the activity-vs-activity step.
# NOTE: `labels` is overwritten with the labels returned by segment_windows, so
# from here on it no longer holds one entry per annotation; running this cell a
# second time would pair the already replaced labels with the unsegmented chunks.
chunks_two_handed_segmented, labels = segment_windows(chunks_two_handed, labels.to_numpy(), window_size)


### Feature extraction for 2 handed classifier of activities against each other

In [None]:
# Preview: all two-handed activity chunks stacked into one frame with a fresh integer index.
pd.concat(chunks_two_handed).reset_index(drop=True)

In [None]:
# Extract the features from the *windowed* chunks. `labels` was replaced by
# the labels segment_windows returned together with chunks_two_handed_segmented,
# so extracting from the unsegmented chunks_two_handed would pair those labels
# with features of a different set of samples in the selection step below.
# NOTE(review): assumes extract_timeseries_features yields one row per window,
# as in the OCD-vs-null step above -- confirm.
features_two_handed = extract_timeseries_features(
    pd.concat(chunks_two_handed_segmented).reset_index(drop=True)
)

In [None]:
# Treat infinite feature values as missing, then discard every feature column
# that contains a missing value.
features_two_handed = (
    features_two_handed
    .replace([np.inf, -np.inf], np.nan)
    .dropna(axis=1)
)
features_two_handed

### Feature selection for 2 handed classifier of activities against each other

In [None]:
# Off-the-shelf feature selection from tsfresh

# impute works in place, hence no assignment.
impute(features_two_handed)
# Keep the features tsfresh finds relevant for telling the activities apart.
X_two_handed_selected = select_features(features_two_handed, pd.Series(labels))
X_two_handed_selected

In [None]:
# Inspect the labels used for the activity-vs-activity classification.
labels

In [None]:
# Standardize the selected features; X is a plain ndarray afterwards.
scaler = StandardScaler()
X = scaler.fit_transform(X_two_handed_selected)

In [None]:
from sklearn.feature_selection import VarianceThreshold
# Drop features whose variance is below .8 * (1 - .8) = 0.16.
# NOTE(review): X comes out of StandardScaler, so this threshold presumably
# only removes constant columns -- confirm whether the filter was meant to run
# on the unscaled features. The result is also overwritten by the SelectKBest
# cell and the final classify_all call is given X, not X_two_handed_selected.
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
X_two_handed_selected = sel.fit_transform(X)

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# Keep the (up to) 2000 features with the highest ANOVA F-score.
# SelectKBest raises when k exceeds the number of available features, so k is
# capped at the width of X.
n_best_features = min(2000, X.shape[1])
X_two_handed_selected = SelectKBest(f_classif, k=n_best_features).fit_transform(X, labels)
X_two_handed_selected

Train models and score results

In [None]:
# Train and score the classifiers on the activities against each other.
# NOTE(review): this passes the scaled matrix X, not the X_two_handed_selected
# produced by the two feature-selection cells above -- confirm which one is intended.
print("Two handed classification")
classify_all(X, labels, label_ids)