### Importing libraries

In [1]:
import sklearn.datasets
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
import os
import platform
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt
import pathlib
import pickle
from itertools import product
from scipy.stats import skew, kurtosis, pearsonr, iqr, zscore
from scipy.signal import butter, welch, filtfilt, resample
import time
import re
import copy
import math
from sklearn import preprocessing
import scipy.io  # test matlab code

%matplotlib inline

### Functions

#### Selecting subject filepath

In [2]:
def select_subject(sub):

    path = os.path.join(
        r'//FS2.smpp.local\RTO\Inpatient Sensors -Stroke\MC10 Study\Data\biostamp_data\cva', sub)
    return path

#### Obtaining PT timestamps dataframe from annotations file

In [3]:
def read_annotations():

    df = pd.read_csv(os.path.join(path, 'annotations.csv'))

    del df['Timestamp (ms)']
    del df['AnnotationId']
    del df['AuthorId']
    del df['Value']

    df = df[df['EventType'].str.match('Physical Therapy')]
    df['Session'] = df.groupby('EventType')['Start Timestamp (ms)'].rank(
        ascending=True).astype(int)

    del df['EventType']

    df = df.reset_index(drop=True).set_index('Session')

    df.insert(0, 'date', pd.to_datetime(df['Start Timestamp (ms)'], unit='ms'))
    df.date = df.date.dt.date
    df = df.drop_duplicates(subset='Start Timestamp (ms)')

    return df

#### Reading data, extracting PT intervals, indexing by time (s)

In [4]:
def read_data(df):

    locations = [locs for locs in os.listdir(
        path) if os.path.isdir(os.path.join(path, locs))]

    accel = {locs: pd.DataFrame() for locs in locations}
    gyro = {locs: pd.DataFrame() for locs in locations}
    elec = {locs: pd.DataFrame() for locs in locations}

    for root, dirs, files in os.walk(path, topdown=True):
        for filenames in files:
            if filenames.endswith('accel.csv'):
                p = pathlib.Path(os.path.join(root, filenames))
                location = str(p.relative_to(path)).split("\\")[0]
                temp_df = pd.read_csv(p).set_index('Timestamp (ms)')
                accel[location] = accel[location].append(temp_df)

            elif filenames.endswith('gyro.csv'):
                p = pathlib.Path(os.path.join(root, filenames))
                location = str(p.relative_to(path)).split("\\")[0]
                temp_df = pd.read_csv(p).set_index('Timestamp (ms)')
                # To avoid issues when saving to mat file:
                temp_df = temp_df.rename(columns={'Gyro X (°/s)': 'Gyro X (degree/s)',
                                                  'Gyro Y (°/s)': 'Gyro Y (degree/s)', 'Gyro Z (°/s)': 'Gyro Z (degree/s)'})
                gyro[location] = gyro[location].append(temp_df)

            elif filenames.endswith('elec.csv'):
                p = pathlib.Path(os.path.join(root, filenames))
                location = str(p.relative_to(path)).split("\\")[0]
                temp_df = pd.read_csv(p).set_index('Timestamp (ms)')
                elec[location] = elec[location].append(temp_df)

    startSize = len(df.index)

    if startSize == 1:
        startTimestamp = df.loc[1, 'Start Timestamp (ms)']
        endTimestamp = df.loc[1, 'Stop Timestamp (ms)']
    else:
        startTimestamp = df.loc[1:startSize, 'Start Timestamp (ms)'].values
        endTimestamp = df.loc[1:startSize, 'Stop Timestamp (ms)'].values

    # Create trial dictionary with each key containing all sensor data related with each activity's trial
    trial_dict = {trials: pd.DataFrame()
                  for trials in range(0, np.size(startTimestamp))}

    # Populate trial directory keys
    for trials in range(0, np.size(startTimestamp)):

        if startSize == 1:
            startTime = startTimestamp
            endTime = endTimestamp
        else:
            startTime = startTimestamp[trials]
            endTime = endTimestamp[trials]

        # Create sensor location dictionary with each key corresponding to sensor locations
        sensor_dict = {locs: pd.DataFrame() for locs in locations}

        # Extract sensor data and populate sensor_dict with sensor data
        for location in locations:

            data = {'accel': pd.DataFrame(), 'gyro': pd.DataFrame(),
                    'elec': pd.DataFrame()}

            if not accel[location].empty:
                accelData = accel[location]
                data['accel'] = accelData[(accelData.index >= startTime) & (
                    accelData.index <= endTime)]

            if not gyro[location].empty:
                gyroData = gyro[location]
                data['gyro'] = gyroData[(gyroData.index >= startTime) & (
                    gyroData.index <= endTime)]

            if not elec[location].empty:
                elecData = elec[location]
                data['elec'] = elecData[(elecData.index >= startTime) & (
                    elecData.index <= endTime)]

            sensor_dict[location] = data

        trial_dict[trials] = sensor_dict

    for trials in range(0, np.size(startTimestamp)):
        for location in locations:
            if not trial_dict[trials][location]['accel'].empty:
                trial_dict[trials][location]['accel'].index = (
                    (trial_dict[trials][location]['accel'].index-(trial_dict[trials][location]['accel'].index.values[0]))/1000)
                trial_dict[trials][location]['accel'].index.names = [
                    'Timestamp (s)']

            if not trial_dict[trials][location]['gyro'].empty:
                trial_dict[trials][location]['gyro'].index = (
                    (trial_dict[trials][location]['gyro'].index-(trial_dict[trials][location]['gyro'].index.values[0]))/1000)
                trial_dict[trials][location]['gyro'].index.names = [
                    'Timestamp (s)']

            if not trial_dict[trials][location]['elec'].empty:
                trial_dict[trials][location]['elec'].index = (
                    (trial_dict[trials][location]['elec'].index-(trial_dict[trials][location]['elec'].index.values[0]))/1000)
                trial_dict[trials][location]['elec'].index.names = [
                    'Timestamp (s)']

    return trial_dict

#### Creating a raw data & metadata dataframe

In [5]:
def raw_and_meta(data):

    info = pd.DataFrame([(k1, k2, k3, v) for k1, k23v in data.items()
                         for k2, k3v in k23v.items()
                         for k3, v in k3v.items()])

    info.columns = ['session', 'location', 'sensor', 'rawdata']
    info['session'] = info['session']+1

    info.insert(0, 'subject', subject)
    info.insert(1, 'date', '')

    for i in range(0, len(info.index)):
        for j in range(0, len(timestamps.index)):
            if info['session'][i] == timestamps.index[j]:
                info['date'][i] = timestamps['date'][j+1]

    return info

#### Reading PT CSVs for specific activity timestamps

In [6]:
def read_activities():

    to_delete = list()
    iteration = list()

    fr = pd.read_csv(os.path.join(act_path, subject + '.csv'))
#     fr = fr.dropna(axis=1, how='all')
    fr = fr.dropna(axis=0, how='all').reset_index()
    del fr['index']
    fr['Session'] = fr['Session'].astype(int)
    fr = fr.rename(columns={'Running Time': 'Start Timestamp (s)', 'Running time': 'Start Timestamp (s)',
                            'Stop Time': 'Stop Timestamp (s)', 'Stop time': 'Stop Timestamp (s)', 'Activities': 'Activity'})

    act_count = {entry: 0 for entry in fr.Activity.unique()}
    past_steps = 0

    for item in range(0, len(fr)):
        if item != 0 and fr['Session'][item] != fr['Session'][item-1]:
            act_count = {entry: 0 for entry in fr.Activity.unique()}
            past_steps = 0

        if not math.isnan(fr['Steps'][item]):
            step_temp = fr['Steps'][item]
            fr['Steps'][item] = fr['Steps'][item] - past_steps
            past_steps = step_temp
        if (sum([a*b for a, b in zip([3600, 60, 1], map(int, fr['Stop Timestamp (s)'][item].split(':')))]) -
            sum([a*b for a, b in zip([3600, 60, 1], map(int, fr['Start Timestamp (s)'][item].split(':')))]) <= 20):
            to_delete.append(item)
        else:
            act_count[fr['Activity'][item]] += 1

        iteration.append(act_count[fr['Activity'][item]])

        fr.loc[item, 'Activity'] = fr.loc[item, 'Activity'].replace(" ", "_")
        fr.loc[item, 'Activity'] = fr.loc[item, 'Activity'].replace("&", "and")
        fr.loc[item, 'Activity'] = fr.loc[item, 'Activity'].replace("_-_", "_")
        fr.loc[item, 'Activity'] = fr.loc[item, 'Activity'].replace("-", "_")

        seg_date = fr.Date[item].split('/')
        if (len(seg_date[1]) < 2):
            seg_date[1] = '0' + seg_date[1]
        if (len(seg_date[0]) < 2):
            seg_date[0] = '0' + seg_date[0]
        fr.Date[item] = seg_date[2] + '/' + seg_date[0] + '/' + seg_date[1]

    fr['Iteration'] = iteration
    fr = fr.drop(to_delete)
    fr = fr.reset_index(drop=True)
    return fr

#### Segmenting by activity and formatting for Matlab

In [7]:
def segment_activities(fr):

    sessions = fr.Session.unique()

    locations = [locs for locs in os.listdir(
        path) if os.path.isdir(os.path.join(path, locs))]
    segmented = {('ymd' + fr[(fr['Session'] == session)]['Date'].iloc[0].replace(
        "/", "_")): pd.DataFrame() for session in sessions}

    for session in sessions:
        acts = fr[(fr.Session == session)]['Activity'].unique()
        activities = {activity: pd.DataFrame() for activity in acts}

        for activity in acts:
            count = fr[(fr.Session == session) & (
                fr['Activity'] == activity)]['Iteration'].values
            iterations = {('event_' + str(iteration)): pd.DataFrame for iteration in count}

            for iteration in count:

                event = fr.index[(fr['Session'] == session) & (
                    fr['Activity'] == activity) & (fr['Iteration'] == iteration)][0]

                startTime = fr[(fr['Session'] == session)
                               ]['Start Timestamp (s)']
                startTime = sum(
                    [a*b for a, b in zip([3600, 60, 1], map(int, startTime[event].split(':')))])

                endTime = fr[(fr['Session'] == session)]['Stop Timestamp (s)']
                endTime = sum(
                    [a*b for a, b in zip([3600, 60, 1], map(int, endTime[event].split(':')))])

                sensor_dict = {locs: pd.DataFrame() for locs in locations}

                for location in locations:

                    mode = {'accel': pd.DataFrame(), 'gyro': pd.DataFrame(),
                            'elec': pd.DataFrame()}

                    if not data[session-1][location]['accel'].empty:
                        accelData = data[session-1][location]['accel'][(data[session-1][location]['accel'].index >= startTime) & (
                            data[session-1][location]['accel'].index <= endTime)]

                        if not accelData.empty:
                            acx = np.transpose(np.column_stack(
                                (k, v) for k, v in accelData['Accel X (g)'].items()))
                            acxy = np.column_stack(
                                (acx, accelData['Accel Y (g)'].values))
                            mode['accel'] = np.column_stack(
                                (acxy, accelData['Accel Z (g)'].values))

                    if not data[session-1][location]['gyro'].empty:
                        gyroData = data[session-1][location]['gyro'][(data[session-1][location]['gyro'].index >= startTime) & (
                            data[session-1][location]['gyro'].index <= endTime)]

                        if not gyroData.empty:
                            gyx = np.transpose(np.column_stack(
                                (k, v) for k, v in gyroData['Gyro X (degree/s)'].items()))
                            gyxy = np.column_stack(
                                (gyx, gyroData['Gyro Y (degree/s)'].values))
                            mode['gyro'] = np.column_stack(
                                (gyxy, gyroData['Gyro Z (degree/s)'].values))

                    if not data[session-1][location]['elec'].empty:
                        elecData = data[session-1][location]['elec'][(data[session-1][location]['elec'].index >= startTime) & (
                            data[session-1][location]['elec'].index <= endTime)]

                        if not elecData.empty:
                            mode['elec'] = np.transpose(np.column_stack(
                                (k, v) for k, v in elecData['Sample (V)'].items()))

                    sensor_dict[location] = mode

                sensor_dict["steps"] = fr[(fr['Session'] == session) & (fr['Activity'] == activity) & (
                    fr['Iteration'] == iteration)]['Steps'].values[0]
                iterations[('event_' + str(iteration))] = sensor_dict

            activities[activity] = iterations

        segmented[('ymd' + fr[(fr['Session'] == session)]
                   ['Date'].iloc[0].replace("/", "_"))] = activities

    return segmented

### Running the functions

In [8]:
act_path = r'//FS2.smpp.local\RTO\Inpatient Sensors -Stroke\MC10 Study\Outcome Measures\CVA_PT_activities'
subs = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12', '13', '14',
           '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28',
           '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42',
           '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55']

# subs = ['12']

for n in range(0, len(subs)):
    subject = 'CVA'+subs[n]
    path = select_subject(subject)
    timestamps = read_annotations()
    data = read_data(timestamps)  # this one takes a while
    # raw = raw_and_meta(data)
    df = read_activities()
    segmented = segment_activities(df)
#     physical_therapy[subject] = segmented
    scipy.io.savemat(subject, segmented, long_field_names=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a