In [None]:
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

%% Constants and environment

%#ok<*NBRAK>
dataBaseDir = 'D:\temp\exercise_data_release';

% This file has *only* data during exercises, already separated out by exercise.  
% The "singleonly" in the filename means that it only contains "single-activity" traces.
% So this is useful for the "which exercise?" (recognition) and counting problems, not for 
% the "when is there exercise?" problem.
dataFileSingleActivity = fullfile(dataBaseDir,'exercise_data.50.0000_singleonly.mat');
% This file has data from complete exercise sessions, so it includes lots of time
% where there's no exercise happening.  The "multionly" in the filename means that it
% only contains "multi-activity" traces.  So this is useful for the "when are you 
% exercising?"  problem (segmentation).
dataFileMultiActivity = fullfile(dataBaseDir,'exercise_data.50.0000_multionly.mat');

% We separate these into two files to make it a little easier to work on
% one problem or the other.  You can completely reconstruct the
% "single-activity" file *from* the "multi-activity" file, because it also
% includes all the times at which each exercise started and stopped.

% In this file, we're going to play around with data from a particular 
% subject doing a particular exercise.  Not every subject did every
% exercise; I'm picking a combination that I happen to know isn't empty.
% In fact I'm also picking a subject that I happen to know participated 
% more than once.
iSubject = 52;
exerciseName = 'Two-arm Dumbbell Curl (both arms, not alternating)';


%% Load data... this will take a couple minutes; it's about 2.5GB

fprintf(1,'Loading single-activity data...\n');
exerciseDataSingleActivity = load(dataFileSingleActivity);

fprintf(1,'Loading multi-activity data...\n');
exerciseDataMultiActivity = load(dataFileMultiActivity);

fprintf(1,'Finished loading data\n');

% Both of these files have similar formats... the data lives in a cell
% matrix called "subject_data", where each row is a subject, and each column
% is a type of exercise.  
%
% In the "multi-activity" data, there's only one column, since we haven't separated
% data out into exercises yet.

% The column names for the single-activity data are here:
activities = exerciseDataSingleActivity.exerciseConstants.activities';
nActivityTypes = length(activities);

% Sanity-check our data matrices
assert(nActivityTypes == size(exerciseDataSingleActivity.subject_data,2));

assert(1 == size(exerciseDataMultiActivity.subject_data,2));

% Both files should have the same number of subjects
nParticipants = size(exerciseDataSingleActivity.subject_data,1);
assert(nParticipants == size(exerciseDataMultiActivity.subject_data,1));


%% Find the column for a particular exercise we want to look at

exerciseIndex = find(strcmp(exerciseDataSingleActivity.exerciseConstants.activities,...
    exerciseName));


%% This matrix has one row per person, one column per exercise

% Any given cell in "subject_data" is a struct array, with one element for
% each time that subject (this row) performed that exercise (this column)
%
% So for example, the following cell contains a struct array, with one element per 
% record, of every time this subject came in to our lab and did the exercise we picked 
% above, which may span multiple visits.
recordings = exerciseDataSingleActivity.subject_data{iSubject,exerciseIndex};

% How many times did this subject do this particular exercise?
nRecordings = length(recordings);


%% Now let's look at one instance of this particular subject doing this particular exercise

% Arbitrarily grab the first instance to plot
recording = recordings(1);
assert(strcmp(recording.activityName,exerciseName));

% Plot the raw accelerometer and gyro data (at 50Hz)
accelT = recording.data.accelDataMatrix(:,1);
accelXYZ = recording.data.accelDataMatrix(:,[2:4]);
gyroT = recording.data.gyroDataMatrix(:,1);
gyroXYZ = recording.data.gyroDataMatrix(:,[2:4]);

subplot(2,1,1);
plot(accelT,accelXYZ);
xlabel('Time (seconds)');
ylabel('Accelerometer output (g)');
legend({'X','Y','Z'});

subplot(2,1,2);
plot(gyroT,gyroXYZ);
xlabel('Time (seconds)');
ylabel('Gyro output (dps)');
legend({'X','Y','Z'});


%% Now let's play around with the multi-activity data

% Remember, in this file, we haven't separated out the periods of exercise
% and non-exercise, everything is one big long trace per subject, with
% labels to tell us where exercises started and stopped.

recordings = exerciseDataMultiActivity.subject_data{iSubject,1};

% Each instance here represents a visit to our lab
nVisits = length(recordings);

% Arbitrarily grab the first visit
recording = recordings(1);

% recording.activityStartMatrix is a cell matrix that tells us when exercises
% started and stopped, and how many repetitions the subject did for each
% exercise.  The columns are:
%
% [exercise name],[start time],[end time],[notes],[number of repetitions]

% So let's plot this subject's accelerometer data (just one axis, so the plot
% doesn't get too complex), with vertical lines to indicate where exercises
% started (green) and stopped (red), with labels for each exercise.
nActivities = size(recording.activityStartMatrix,1);

accelT = recording.data.accelDataMatrix(:,1);
accelZ = recording.data.accelDataMatrix(:,4);

% Open a fresh figure.  Without this, plot() draws into the current axes,
% which is still subplot(2,1,2) from the single-activity section, and the
% gyro plot drawn there is replaced.
figure;
plot(accelT,accelZ);
hold on;
xlabel('Time (seconds)');
ylabel('Accelerometer output (g)');

for(iActivity=1:nActivities)
    activityName = recording.activityStartMatrix{iActivity,1};
    % Only annotate actual exercises, not the gaps between them
    if (strcmpi(activityName,'non-exercise'))
        continue;
    end
    activityCount = recording.activityStartMatrix{iActivity,5};
    activityStartTime = recording.activityStartMatrix{iActivity,2};
    activityEndTime = recording.activityStartMatrix{iActivity,3};
    % Green vertical line at the start of the exercise
    lineHandle = line([activityStartTime activityStartTime],[-0.5 0.5]);
    lineHandle.Color = [0 1 0];
    % Red vertical line at the end of the exercise
    lineHandle = line([activityEndTime activityEndTime],[-0.5 0.5]);
    lineHandle.Color = [1 0 0];
    % Random vertical offset in [-0.5, 0.5] so neighbouring labels are less
    % likely to overlap
    yValue = -0.5 + rand();
    tHandle = text(activityStartTime,yValue,sprintf('%s x %d',activityName,activityCount));
    tHandle.Rotation = 45;
end % ...for each activity

hold off; zoom on;
xlim([100 300])


Import Libraries

In [None]:
import scipy.io
import matplotlib.pyplot as plt
import numpy as np
import random
import scipy.signal as signal
import pandas as pd
from sklearn.decomposition import PCA
import math
import warnings
warnings.filterwarnings('ignore')

## Questions
- What does "<Initial Activity>" mean as an activity name?

## Load & Explore Data

### Download the 'exercise_data.50.0000_singleonly.mat' file from the link below and save it locally in the same folder as this notebook file.

https://msropendata.com/datasets/799c1167-2c8f-44c4-929c-227bf04e2b9a

In [None]:
# Read the single-activity MATLAB file.  struct_as_record=False makes MATLAB
# structs come back as objects whose fields are read with attribute access.
exercise_dataset = scipy.io.loadmat(
    '/Users/mani/Downloads/P/exerciserecognitionfromwearablesensors/exercise_data.50.0000_singleonly.mat',
    struct_as_record=False,
)

# Keep handles on the activity-name table and on the subject x activity cell matrix
exercise_constants = exercise_dataset['exerciseConstants'][0][0].activities
subject_data = exercise_dataset['subject_data']

# Unwrap each activity name from its array wrapper into a flat list
all_activities = [name_cell[0] for name_cell in exercise_constants[0]]



In [None]:
all_activities

In [None]:
# Process every activity in the dataset (no sub-sampling of exercises happens here)
activities_to_process = all_activities
print(activities_to_process)

In [None]:
# Per-activity containers: each activity name maps to a list of raw sensor
# matrices.  Every entry of activities_to_process gets a key in BOTH
# dictionaries so the appends below can never hit an undefined dictionary or
# a missing key.
activities_accelerometer_data_dict = {activity: [] for activity in activities_to_process}
activities_gyroscope_data_dict = {activity: [] for activity in activities_to_process}

# Ground-truth repetition counts, appended in the order the recordings are
# visited below (i.e. across all activities, not grouped per activity).
rep_counts_actual = []

# Walk the subject x activity cell matrix and keep the first recording of
# every non-empty cell.
for data_item in subject_data:
    for x in data_item:
        if len(x) > 0:
            if x[0] is not None and len(x[0]) > 0:
                first_recording = x[0,0]
                data_activity_name = first_recording.activityName[0]
                data_activity_reps = first_recording.activityReps[0]
                data_item_accelDataMatrix = first_recording.data[0,0].accelDataMatrix
                data_item_gyroDataMatrix = first_recording.data[0,0].gyroDataMatrix
                if data_activity_name in activities_to_process:
                    activities_accelerometer_data_dict[data_activity_name].append(data_item_accelDataMatrix)
                    activities_gyroscope_data_dict[data_activity_name].append(data_item_gyroDataMatrix)
                    rep_counts_actual.append(data_activity_reps)

In [None]:
data_item_accelDataMatrix.shape

In [None]:
data_item_gyroDataMatrix.shape

#### Deeper look at files
Below cells are just to see what fields are in the matlab files. Turning them into dictionaries gives a better view of the mat_struct objects

In [None]:
for x in exercise_dataset:
    print (x)

In [None]:
type(exercise_dataset['exerciseConstants'][0][0])

In [None]:
def loadmat(filename):
    '''
    Load a .mat file and convert its top-level MATLAB structs to dictionaries.

    Use this instead of calling scipy.io.loadmat directly: the file is read
    with struct_as_record=False and squeeze_me=True, and the resulting
    dictionary is passed through _check_keys so that entries which are still
    mat-objects become nested Python dictionaries.
    '''
    raw_contents = scipy.io.loadmat(filename, struct_as_record=False, squeeze_me=True)
    converted = _check_keys(raw_contents)
    return converted

def _check_keys(dict):
    '''
    checks if entries in dictionary are mat-objects. If yes
    todict is called to change them to nested dictionaries
    '''
    for key in dict:
        if isinstance(dict[key], scipy.io.matlab.mio5_params.mat_struct):
            dict[key] = _todict(dict[key])
    return dict        

def _todict(matobj):
    '''
    A recursive function which constructs from matobjects nested dictionaries
    '''
    dict = {}
    for strg in matobj._fieldnames:
        elem = matobj.__dict__[strg]
        if isinstance(elem, scipy.io.matlab.mio5_params.mat_struct):
            dict[strg] = _todict(elem)
        else:
            dict[strg] = elem
    return dict

exercise_dataset2 = loadmat('/Users/mani/Downloads/P/exerciserecognitionfromwearablesensors/exercise_data.50.0000_singleonly.mat')
exercise_dataset2

In [None]:
exercise_dataset2['exerciseConstants'].keys()

In [None]:
exercise_dataset2['subject_data'].shape

In [None]:
# NOTE(review): this re-declares `_todict`, which is already defined in the
# cell that defines `loadmat`.  The later definition shadows the earlier one;
# consider deleting this copy and keeping only the call that follows it.
def _todict(matobj):
    '''
    A recursive function which constructs nested dictionaries from matobjects.

    Every name in matobj._fieldnames becomes a key; values that are themselves
    mat_struct objects are converted recursively, all others are stored as-is.
    '''
    dict = {}
    for strg in matobj._fieldnames:
        elem = matobj.__dict__[strg]
        if isinstance(elem, scipy.io.matlab.mio5_params.mat_struct):
            dict[strg] = _todict(elem)
        else:
            dict[strg] = elem
    return dict

_todict(exercise_dataset2['subject_data'][0][3][0])

# TODO: what is boundingWindow?
# activityReps = ground truth for reps
# activityName = ground truth for name of activity/exercise


### EDA

In [None]:
print(len(all_activities))

In [None]:
activities_accelerometer_data_dict[activities_to_process[0]][1].shape

Below, we see that each recorded activity set has a varying duration.

In [None]:
# Summarise, per activity, how many recordings were collected from each sensor
# and the shape of every recording matrix.
for activity in activities_to_process:
    print("Activity: " + activity)
    print("Accelerometer data shape: " + str(len(activities_accelerometer_data_dict[activity])))
    print("Gyroscope data shape: " + str(len(activities_gyroscope_data_dict[activity])))
    print("")
    for i in activities_accelerometer_data_dict[activity]:
        print(activity + "accelerometer data_ray_shape: " + str(i.shape))
    # Report the gyroscope recordings from the gyroscope dictionary (this loop
    # previously re-read the accelerometer data under a gyroscope label).
    for i in activities_gyroscope_data_dict[activity]:
        print(activity + "gyroscope data_ray_shape: " + str(i.shape))

### Visualizations

#### Accelerometer Measurements chart (only the first result of exercises per activity has been taken)

In [None]:
# One figure per activity, drawn from the first accelerometer recording only
for activity in activities_to_process:
    first_recording = activities_accelerometer_data_dict[activity][0]

    # Split the rows into time and per-axis series
    t = [sample[0] for sample in first_recording]
    x = [sample[1] for sample in first_recording]
    y = [sample[2] for sample in first_recording]
    z = [sample[3] for sample in first_recording]

    fig, ax = plt.subplots()
    for axis_label, series in (('X', x), ('Y', y), ('Z', z)):
        ax.plot(t, series, label = axis_label)

    ax.set(xlabel='Time (seconds)', ylabel='Acceleration output (g)', title=activity)
    ax.grid()

    fig.tight_layout()
    fig.set_size_inches(25, 5)

    plt.legend()
    plt.show()

#### Gyroscope Measurements chart (only the first result of exercises per activity has been taken)

In [None]:
# One figure per activity, drawn from the first gyroscope recording only
for activity in activities_to_process:
    first_recording = activities_gyroscope_data_dict[activity][0]

    # Split the rows into time and per-axis series
    t = [sample[0] for sample in first_recording]
    x = [sample[1] for sample in first_recording]
    y = [sample[2] for sample in first_recording]
    z = [sample[3] for sample in first_recording]

    fig, ax = plt.subplots()
    for axis_label, series in (('X', x), ('Y', y), ('Z', z)):
        ax.plot(t, series, label = axis_label)

    # The gyroscope reports angular rate; the MATLAB reference cell at the top
    # of this notebook labels it in dps, not g.
    ax.set(xlabel='Time (seconds)', ylabel='Gyroscope output (dps)', title=activity)
    ax.grid()

    fig.tight_layout()
    fig.set_size_inches(25, 5)

    plt.legend()
    plt.show()

## Segmentation Pre-Processing
Given data points containing x, y, z, and time, how would you smooth this data with a Butterworth low-pass filter (-60 dB at 20 Hz) and then window it into 5-second windows sliding at 200 ms (i.e., each 5 s window shares 4.8 s of data with the previous window)?

In [None]:
def apply_butterworth_lowpass(data, sampling_rate, cutoff_frequency, filter_order):
    """Low-pass filter a 1-D signal with a digital Butterworth filter.

    Parameters
    ----------
    data : sequence of float
        Samples to filter.
    sampling_rate : float
        Sampling rate of `data`, in the same unit as `cutoff_frequency`.
    cutoff_frequency : float
        Cutoff frequency of the filter.
    filter_order : int
        Order of the Butterworth filter.

    Returns
    -------
    The filtered samples, as returned by `signal.lfilter` (a single forward
    pass over the data).
    """
    # butter() takes the cutoff as a fraction of the Nyquist frequency,
    # i.e. of half the sampling rate.
    normalized_cutoff = cutoff_frequency / (0.5 * sampling_rate)
    numerator, denominator = signal.butter(
        filter_order, normalized_cutoff, btype='low', analog=False, output='ba'
    )
    return signal.lfilter(numerator, denominator, data)

# Dictionary with one empty list per activity in activities_to_process
windowed_smoothed_data = {activity: [] for activity in activities_to_process}

# Butterworth filter settings read by preprocess() below.
# NOTE(review): the MATLAB reference cell says the raw data is sampled at 50 Hz
# and preprocess() treats 250 points as 5 seconds.  With sampling_rate=1.0 and
# cutoff_frequency=0.2 the normalized cutoff is 0.4 of Nyquist, which would be
# 10 Hz for 50 Hz data - confirm this is the intended cutoff.
N = 4  # Filter order
sampling_rate = 1.0  # Sampling rate passed to the filter
cutoff_frequency = 0.2  # Cutoff frequency, in the same unit as sampling_rate
filter_order = N  # Filter order actually passed to the filter

# create embeddings
def preprocess(data_source, source_name, window_size=250, step=1):
    """Smooth every recording and cut it into fixed-length sliding windows.

    Parameters
    ----------
    data_source : dict
        Maps each activity in `activities_to_process` to a list of recording
        matrices whose columns 0..3 are time, x, y and z.
    source_name : str
        Value stored in the `source` column of the result.
    window_size : int, optional
        Number of samples per window (250 by default).
    step : int, optional
        Number of samples the window advances between consecutive windows.
        The default of 1 keeps the row count produced by earlier versions of
        this function.

    Returns
    -------
    pandas.DataFrame
        One row per window with columns x0.., y0.., z0.., t0.. (one per sample
        in the window) followed by `activity`, `set_num` and `source`.

    Notes
    -----
    * x, y and z are divided by the largest absolute value found across the
      three axes of the recording, so they lie in [-1, 1].  The time column
      is not part of that scale (it used to be, which let the timestamps
      dictate the scaling) and is stored unfiltered.
    * Recordings shorter than `window_size` samples produce no windows and
      are skipped; `set_num` still refers to the recording's position in
      `data_source[activity]`.
    """
    axis_names = ['x', 'y', 'z', 't']
    # Column order: every x sample, then every y, z and t sample
    feature_cols = [axis + str(i) for axis in axis_names for i in range(window_size)]

    window_blocks = []
    activity_labels = []
    set_labels = []

    for activity in activities_to_process:
        for set_num, data_activity_set in enumerate(data_source[activity]):
            recording = np.asarray(data_activity_set, dtype=float)
            n_samples = recording.shape[0]
            # Too short for a single window: skip instead of failing when the
            # empty window array is indexed.
            if n_samples < window_size:
                continue

            t = recording[:, 0]
            xyz = recording[:, 1:4]

            # Normalise by the largest magnitude; leave the data untouched if
            # it is all zeros or contains no finite value.
            peak = np.nanmax(np.abs(xyz))
            if np.isfinite(peak) and peak > 0:
                xyz = xyz / peak

            # Apply the Butterworth filter (reduces noise) to each axis
            smoothed_axes = [
                apply_butterworth_lowpass(xyz[:, axis], sampling_rate, cutoff_frequency, filter_order)
                for axis in range(3)
            ]

            # Slide the window over the smoothed axes and the raw time column
            starts = range(0, n_samples - window_size + 1, step)
            per_axis_windows = [
                np.array([series[start:start + window_size] for start in starts])
                for series in smoothed_axes + [t]
            ]
            # Side by side: (n_windows, 4 * window_size) in feature_cols order
            window_blocks.append(np.hstack(per_axis_windows))
            activity_labels.extend([activity] * len(starts))
            set_labels.extend([set_num] * len(starts))

    if window_blocks:
        values = np.vstack(window_blocks)
    else:
        values = np.empty((0, len(feature_cols)))

    df = pd.DataFrame(values, columns=feature_cols)
    df['activity'] = activity_labels
    df['set_num'] = set_labels
    df['source'] = source_name
    return df


In [None]:
df_gyroscope = preprocess(activities_gyroscope_data_dict, 'gyroscope')
df_accelerometer = preprocess(activities_accelerometer_data_dict, 'accelerometer')

In [None]:
assert(len([col for col in df_gyroscope.columns if col.startswith('x')]) 
    == len([col for col in df_gyroscope.columns if col.startswith('y')]) 
    == len([col for col in df_gyroscope.columns if col.startswith('z')]) 
    == len([col for col in df_gyroscope.columns if col.startswith('t')]))
print("number of time points in each window ", len([col for col in df_gyroscope.columns if col.startswith('t')]))

In [None]:
# Sanity check: the largest accelerometer set_num must equal the number of
# distinct gyroscope set_num values minus one.
assert(df_accelerometer['set_num'].max() == (len(df_gyroscope['set_num'].unique())-1))
# Attach the ground-truth repetition count to every window by using set_num as
# a position in rep_counts_actual.
# NOTE(review): set_num comes from enumerate() over each activity's recordings
# in preprocess(), so it restarts at 0 for every activity, whereas
# rep_counts_actual is one flat list filled while iterating subject_data across
# all activities.  This lookup may therefore pair windows with another
# recording's count - confirm, and consider keying by (activity, set_num).
df_accelerometer['rep_counts_actual'] = df_accelerometer['set_num'].apply(lambda x: rep_counts_actual[x])
df_gyroscope['rep_counts_actual'] = df_gyroscope['set_num'].apply(lambda x: rep_counts_actual[x])


In [None]:
df_gyroscope.info()

In [None]:
# set the source, set_num, and activity columns as the index
df_gyroscope.set_index(['source', 'set_num', 'activity'], inplace=True)

## Segmentation Feature Computation
1) aX: the X-axis accelerometer signal

2) aXmag: the magnitude of the accelerometer signal at each sample, i.e. $\sqrt{a_x^2 + a_y^2 + a_z^2}$.

3) aPC1: the projection of the three-dimensional accelerometer signal onto its first principal component. This is the movement along the axis that demonstrates the most variance within this window, or – anecdotally – the most “interesting” rotation of the window.

4) aYZPC1: the projection of only the Y and Z axes onto the first principal component of those two axes. This captures movement perpendicular to the arm, which allows us to derive information from the Y and Z axes despite the unknown rotation of the armband. 

In [None]:
def get_pc1(df):
    """Project every window's (x, y, z) samples onto their first principal component.

    Each row of `df` is one window whose columns starting with 'x', 'y' and
    'z' hold the per-sample values of the three axes.  A separate PCA is
    fitted per window.

    Returns a list with one 1-D array per row of `df`; each array holds the
    first-principal-component score of every sample in that window.
    """
    dim_cols = [col for col in df.columns if col.startswith('x') or col.startswith('y') or col.startswith('z') or col.startswith('t')]
    x_cols = [col for col in dim_cols if col.startswith('x')]
    y_cols = [col for col in dim_cols if col.startswith('y')]
    z_cols = [col for col in dim_cols if col.startswith('z')]

    # Slice the window columns once; re-slicing the whole frame for every row
    # made this loop quadratic in the number of windows.
    windows = df[dim_cols]
    pc1 = []

    for w in range(len(windows)):
        window = windows.iloc[w]
        # (n_samples, 3) matrix: one row per time point, one column per axis
        samples = np.column_stack(
            [window[x_cols].values, window[y_cols].values, window[z_cols].values]
        ).astype(float)

        # Only keep the first principal component
        pca = PCA(n_components=1)
        principal_components = pca.fit_transform(samples)
        pc1.append(principal_components[:, 0])
    assert(len(pc1) == len(df))
    return pc1

In [None]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

def get_pc1(df):
    """Project every window's (x, y, z) samples onto their first principal component.

    Each row of `df` is one window whose columns starting with 'x', 'y' and
    'z' hold the per-sample values of the three axes.  A separate PCA is
    fitted per window.

    Returns a list with one 1-D array per row of `df`; each array holds the
    first-principal-component score of every sample in that window.
    """
    dim_cols = [col for col in df.columns if col.startswith('x') or col.startswith('y') or col.startswith('z') or col.startswith('t')]
    # Pick each axis by its own prefix rather than by position in dim_cols
    x_cols = [col for col in dim_cols if col.startswith('x')]
    y_cols = [col for col in dim_cols if col.startswith('y')]
    z_cols = [col for col in dim_cols if col.startswith('z')]

    pc1 = []

    for _, row in df[dim_cols].iterrows():
        # (n_samples, 3) matrix: one row per time point, one column per axis
        samples = np.column_stack(
            [row[x_cols].values, row[y_cols].values, row[z_cols].values]
        ).astype(float)

        pca = PCA(n_components=1)
        principal_components = pca.fit_transform(samples)
        # Keep the score of every sample, not just the first one, so the
        # result has one value per time point in the window.
        pc1.append(principal_components[:, 0])

    assert len(pc1) == len(df)
    return pc1


In [None]:
# calculate percentage of nan in each column of df_gyroscope where percentage is greater than 0
nan_percentage = df_gyroscope.isna().sum() / len(df_gyroscope)
nan_percentage[nan_percentage > 0]


In [None]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from joblib import Parallel, delayed

def calculate_pca(window):
    """Fit a one-component PCA to a single window's (x, y, z) samples.

    Parameters
    ----------
    window : pandas.Series
        One row of a windowed frame.  The per-sample values are read from the
        entries labelled with an axis letter followed by a sample number
        (x0, x1, ..., y0, ..., z0, ...).

    Returns
    -------
    The array returned by PCA.fit_transform for that window: one row per
    sample and a single column with the first-principal-component score.
    """
    def axis_values(axis):
        # Exact "<axis><digits>" match, so derived columns that merely share
        # the prefix (e.g. 'x_0', 'xmag0', 'yzpc_0') are not mistaken for
        # raw samples when cells are re-run.
        cols = [col for col in window.index if col[:1] == axis and col[1:].isdigit()]
        return np.asarray(window[cols].values, dtype=float)

    # (n_samples, 3) matrix: one row per time point, one column per axis
    samples = np.column_stack([axis_values('x'), axis_values('y'), axis_values('z')])

    pca = PCA(n_components=1)
    return pca.fit_transform(samples)

def get_pc1(df):
    """Run calculate_pca on every row of `df` in parallel.

    Only the columns whose names start with 'x', 'y', 'z' or 't' are handed
    to calculate_pca.  Returns a list with one result per row, in row order.
    """
    axis_prefixes = ('x', 'y', 'z', 't')
    dim_cols = [name for name in df.columns if name.startswith(axis_prefixes)]
    # n_jobs=-1 lets joblib use every available core
    jobs = (delayed(calculate_pca)(row[dim_cols]) for _, row in df.iterrows())
    pc1 = Parallel(n_jobs=-1)(jobs)
    assert len(pc1) == len(df)
    return pc1

def add_pc_cols(df, pc, pc_name):
    """Add one numeric column per window sample of a principal-component projection.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend in place; it must have one row per entry of `pc`.
    pc : sequence of array-like
        One projection per row of `df`.  Each projection may be 1-D
        (n_samples,) or a column vector (n_samples, 1); both are flattened so
        the new columns hold plain floats rather than 1-element arrays.
    pc_name : str
        Prefix of the new columns, which are named `<pc_name>_<i>`.

    Nothing is added when `pc` is empty.
    """
    if len(pc) == 0:
        return

    # (n_windows, n_samples) matrix of scores
    projections = np.vstack([np.ravel(window_projection) for window_projection in pc])
    for i in range(projections.shape[1]):
        df[pc_name+'_'+str(i)] = projections[:, i]
    assert(len([col for col in df.columns if col.startswith(pc_name)]) == projections.shape[1])

In [None]:
pc1 = get_pc1(df_gyroscope)

In [None]:
print(len(pc1[0]))
print(len(df_gyroscope))
print(len(pc1))

add_pc_cols(df_gyroscope, pc1, 'pc1')

In [None]:
pc1 = get_pc1(df_accelerometer)
print(len(pc1[0]))
print(len(df_accelerometer))
print(len(pc1))

add_pc_cols(df_accelerometer, pc1, 'pc1')


In [None]:
# to do delete this version of the function as it's too slow
def get_pc2(df):
    """Project every window's (y, z) samples onto their first principal component.

    Each row of `df` is one window whose columns starting with 'y' and 'z'
    hold the per-sample values of those two axes.  A separate one-component
    PCA is fitted per window.

    Returns a list with one 1-D array per row of `df`; each array holds the
    first-principal-component score of every sample in that window.
    """
    dim_cols = [col for col in df.columns if col.startswith('x') or col.startswith('y') or col.startswith('z') or col.startswith('t')]
    y_cols = [col for col in dim_cols if col.startswith('y')]
    z_cols = [col for col in dim_cols if col.startswith('z')]

    # Slice the window columns once; re-slicing the whole frame for every row
    # is what made this version so slow.
    windows = df[dim_cols]
    pc2 = []
    for w in range(len(windows)):
        window = windows.iloc[w]
        # (n_samples, 2) matrix: one row per time point, columns y and z
        samples = np.column_stack([window[y_cols].values, window[z_cols].values]).astype(float)

        # Keep only the first principal component
        pca = PCA(n_components=1)
        principal_components = pca.fit_transform(samples)

        pc2.append(principal_components[:, 0])

    assert(len(pc2) == len(df))
    return pc2


In [None]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from joblib import Parallel, delayed

def calculate_pcayz(window):
    """Fit a one-component PCA to a single window's (y, z) samples.

    Parameters
    ----------
    window : pandas.Series
        One row of a windowed frame.  The per-sample values are read from the
        entries labelled with an axis letter followed by a sample number
        (y0, y1, ..., z0, ...).

    Returns
    -------
    The array returned by PCA.fit_transform for that window: one row per
    sample and a single column with the first-principal-component score.
    """
    def axis_values(axis):
        # Exact "<axis><digits>" match, so derived columns that merely share
        # the prefix (e.g. 'yzpc_0') are not mistaken for raw samples when
        # cells are re-run.
        cols = [col for col in window.index if col[:1] == axis and col[1:].isdigit()]
        return np.asarray(window[cols].values, dtype=float)

    # (n_samples, 2) matrix: one row per time point, columns y and z
    samples = np.column_stack([axis_values('y'), axis_values('z')])

    pca = PCA(n_components=1)
    return pca.fit_transform(samples)

def get_pc2(df):
    """Run calculate_pcayz on every row of `df` in parallel.

    Only the columns whose names start with 'y', 'z' or 't' are handed to
    calculate_pcayz.  Returns a list with one result per row, in row order.
    """
    axis_prefixes = ('y', 'z', 't')
    dim_cols = [name for name in df.columns if name.startswith(axis_prefixes)]
    # n_jobs=-1 lets joblib use every available core
    jobs = (delayed(calculate_pcayz)(row[dim_cols]) for _, row in df.iterrows())
    pc1 = Parallel(n_jobs=-1)(jobs)
    assert len(pc1) == len(df)
    return pc1


In [None]:
yzpc = get_pc2(df_gyroscope)
print(len(yzpc[0]))
print(len(df_gyroscope))
print(len(yzpc))

add_pc_cols(df_gyroscope, yzpc, 'yzpc')

In [None]:
yzpc = get_pc2(df_accelerometer)
print(len(yzpc[0]))
print(len(df_accelerometer))
print(len(yzpc))

add_pc_cols(df_accelerometer, yzpc, 'yzpc')

In [None]:
# create embeddings
def get_raw_x(data_source, window_size=250, step=1):
    """Window the raw (unfiltered) x-axis signal of every recording.

    Parameters
    ----------
    data_source : dict
        Maps each activity in `activities_to_process` to a list of recording
        matrices whose column 1 is the x axis.
    window_size : int, optional
        Number of samples per window (250 by default).
    step : int, optional
        Number of samples the window advances between consecutive windows
        (1 by default).

    Returns
    -------
    dict
        Keys 'x_0' .. 'x_<window_size-1>'; each value lists, across all
        windows in iteration order, the sample at that position in the window.

    Recordings shorter than `window_size` samples produce no windows and are
    skipped instead of raising on the empty window array.
    """
    temp_dict = { 'x_'+str(i): [] for i in range(window_size) }

    for activity in activities_to_process:
        for data_activity_set in data_source[activity]:
            x = data_activity_set[:,1]
            if len(x) < window_size:
                continue
            windowed_x = np.array([x[i:i+window_size] for i in range(0, len(x) - window_size + 1, step)])

            # Column i of windowed_x holds the i-th sample of every window
            for i in range(window_size):
                temp_dict['x_'+str(i)].extend(windowed_x[:, i])
    return temp_dict


In [None]:
gyroscope_x = get_raw_x(activities_gyroscope_data_dict)
accelerometer_x = get_raw_x(activities_accelerometer_data_dict)
assert(len(gyroscope_x['x_0']) == len(df_gyroscope))

In [None]:
# add the raw x columns to df_gyroscope
for i in range(len(gyroscope_x.keys())):
    df_gyroscope['x_'+str(i)] = gyroscope_x['x_'+str(i)]
    df_accelerometer['x_'+str(i)] = accelerometer_x['x_'+str(i)]
assert(len(gyroscope_x.keys()) == len([col for col in df_gyroscope.columns if col.startswith('x_')]))

In [None]:
len(df_accelerometer.columns) == len(df_gyroscope.columns)
print(len(df_accelerometer.columns))

In [None]:
print(len(df_gyroscope.columns))

In [None]:
[col for col in df_accelerometer.columns if col not in df_gyroscope.columns]

In [None]:
# make df_gyroscope index into columns
df_gyroscope.reset_index(inplace=True)
assert(len(df_accelerometer.columns) == len(df_gyroscope.columns))

In [None]:
# TODO: should square root here instead of later
# Squared magnitude (x^2 + y^2 + z^2) for each of the 250 samples of a window
for i in range(250):
    sample_idx = str(i)
    squared_axes = [df_gyroscope[axis + sample_idx] ** 2 for axis in ('x', 'y', 'z')]
    df_gyroscope['xmag' + sample_idx] = squared_axes[0] + squared_axes[1] + squared_axes[2]

In [None]:
# TODO: should square root here instead of later
# Squared magnitude (x^2 + y^2 + z^2) for each of the 250 samples of a window
for i in range(250):
    sample_idx = str(i)
    squared_axes = [df_accelerometer[axis + sample_idx] ** 2 for axis in ('x', 'y', 'z')]
    df_accelerometer['axmag' + sample_idx] = squared_axes[0] + squared_axes[1] + squared_axes[2]

In [None]:
print(df_gyroscope.columns)

In [None]:
print(df_accelerometer.columns)

In [None]:
# write all column names to file
with open('gyroscope_column_names.txt', 'w') as f:
    for col in df_gyroscope.columns:
        f.write(col + '\n')

with open('accelerometer_column_names.txt', 'w') as f:
    for col in df_accelerometer.columns:
        f.write(col + '\n')


In [None]:
print(len(df_gyroscope.columns))
print(len(df_accelerometer.columns))
assert(len(df_accelerometer.columns) == len(df_gyroscope.columns))

In [None]:
# Drop a two-character prefix from the metadata columns so both frames share
# the same metadata names.
# NOTE(review): presumably the prefixes are 'g_' / 'a_' (a later cell looks for
# 'g_gxmag' / 'a_axmag'), but no visible cell adds them - confirm.
# A name is only shortened when what remains is exactly a metadata name, so an
# unprefixed 'source' is no longer mangled into 'urce'.
metadata_cols = ('source', 'set_num', 'activity', 'rep_counts_actual')
df_gyroscope.columns = [
    col[2:] if col[2:] in metadata_cols else col
    for col in df_gyroscope.columns
]
# Rename the accelerometer frame from its OWN columns; it used to be relabelled
# with the gyroscope frame's names, whose column order is different.
df_accelerometer.columns = [
    col[2:] if col[2:] in metadata_cols else col
    for col in df_accelerometer.columns
]

In [None]:
mutual_cols = [col for col in df_gyroscope.columns if col.startswith('t')]
mutual_cols.extend(['rep_counts_actual'])
len(mutual_cols)

In [None]:
df_gyroscope.to_pickle('df_gyroscope0611.pkl')
df_accelerometer.to_pickle('df_accelerometer0611.pkl')

In [None]:
# join accelerometer and gyroscope dataframes on index
df = df_gyroscope.join(df_accelerometer, how='inner', lsuffix='_gyroscope', rsuffix='_accelerometer')
assert(len(df_gyroscope) == len(df) == len(df_accelerometer))

In [None]:
df.head()

In [None]:
df.info()

In [None]:
# TODO: should've done this earlier
# The magnitude columns hold x^2 + y^2 + z^2; take the square root to get the
# actual magnitude.  Every column whose name contains "xmag" is converted,
# whatever prefix or join suffix it carries.
# Run this cell only once - re-running it takes the root again.
for col in df.columns:
    if 'xmag' in col:
        df[col] = np.sqrt(df[col])

## Segmentation features (computed for each signal)


In [None]:
df.to_pickle("mergeddf_0611.pkl")

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *

# turn activities_gyroscope_data_dict and activities_accelerometer_data_dict into a spark dataframe
ss = SparkSession.builder.getOrCreate()
sc = ss.sparkContext

# create schema for data: one row per sensor sample
schema = StructType([
    StructField("overall_set_num", IntegerType(), True),
    StructField("activity_name", StringType(), True),
    StructField("activity_set_num", IntegerType(), True),
    StructField("time", DoubleType(), True),
    StructField("x", DoubleType(), True),
    StructField("y", DoubleType(), True),
    StructField("z", DoubleType(), True)
])

# create empty dataframes
gyroscope_df = ss.createDataFrame(sc.emptyRDD(), schema)
accelerometer_df = ss.createDataFrame(sc.emptyRDD(), schema)

# iterate over activities and append data to dataframes
overall_set_num = 0
for activity in activities_to_process:
    gyroscope_sets = activities_gyroscope_data_dict[activity]
    accelerometer_sets = activities_accelerometer_data_dict[activity]

    # Both sensors number their sets from the same starting id, so set k of an
    # activity gets the same overall_set_num in both frames.  (Rewinding the
    # counter by the last loop index left the accelerometer ids one too high.)
    for activity_set_num, activity_set in enumerate(gyroscope_sets):
        set_rows = [
            (overall_set_num + activity_set_num, str(activity), activity_set_num,
             float(time_point[0]), float(time_point[1]), float(time_point[2]), float(time_point[3]))
            for time_point in activity_set
        ]
        # One union per recording instead of one per sample
        if set_rows:
            gyroscope_df = gyroscope_df.union(ss.createDataFrame(set_rows, schema))

    for activity_set_num, activity_set in enumerate(accelerometer_sets):
        set_rows = [
            (overall_set_num + activity_set_num, str(activity), activity_set_num,
             float(time_point[0]), float(time_point[1]), float(time_point[2]), float(time_point[3]))
            for time_point in activity_set
        ]
        if set_rows:
            accelerometer_df = accelerometer_df.union(ss.createDataFrame(set_rows, schema))

    # Advance past every set id used by this activity
    overall_set_num += max(len(gyroscope_sets), len(accelerometer_sets))


# show dataframes
gyroscope_df.show()
accelerometer_df.show()

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *

# Flatten activities_gyroscope_data_dict and activities_accelerometer_data_dict
# into row lists, then build one Spark DataFrame per sensor from those lists.
ss = SparkSession.builder.getOrCreate()
sc = ss.sparkContext  # kept so any cell that still refers to `sc` keeps working

# Row layout: the set identifiers followed by the four values read from each
# time point (index 0 -> time, indices 1..3 -> x, y, z).
schema = StructType([
    StructField("overall_set_num", IntegerType(), True),
    StructField("activity_name", StringType(), True),
    StructField("activity_set_num", IntegerType(), True),
    StructField("time", DoubleType(), True),
    StructField("x", DoubleType(), True),
    StructField("y", DoubleType(), True),
    StructField("z", DoubleType(), True)
])

# No placeholder DataFrames are created here: both DataFrames are built in one
# step from the finished row lists below.
gyroscope_data = []
accelerometer_data = []

overall_set_num = 0
for activity in activities_to_process:
    # Both sensors number this activity's sets from the same starting id, so
    # set i of the gyroscope and set i of the accelerometer share one
    # overall_set_num. (Rewinding by the last enumerate index instead of the
    # set count left the accelerometer ids one ahead of the gyroscope ids.)
    first_set_num = overall_set_num
    sets_in_activity = 0
    for sensor_sets, sensor_rows in (
        (activities_gyroscope_data_dict[activity], gyroscope_data),
        (activities_accelerometer_data_dict[activity], accelerometer_data),
    ):
        for activity_set_num, activity_set in enumerate(sensor_sets):
            for time_point in activity_set:
                sensor_rows.append((
                    first_set_num + activity_set_num,
                    str(activity),
                    activity_set_num,
                    float(time_point[0]),
                    float(time_point[1]),
                    float(time_point[2]),
                    float(time_point[3]),
                ))
            # Count sets while iterating so an activity with no sets is
            # handled and the next activity never reuses an id.
            sets_in_activity = max(sets_in_activity, activity_set_num + 1)
    overall_set_num = first_set_num + sets_in_activity

# Create dataframes directly from the lists of data rows
gyroscope_df = ss.createDataFrame(gyroscope_data, schema)
accelerometer_df = ss.createDataFrame(accelerometer_data, schema)

# Show dataframes
gyroscope_df.show()
accelerometer_df.show()

In [None]:
# Collect one tuple per sensor sample:
# (overall_set_num, activity_name, activity_set_num, time, x, y, z)
gyroscope_data = []
accelerometer_data = []

overall_set_num = 0
for activity in activities_to_process:
    # Both sensors number this activity's sets from the same starting id, so
    # set i of the gyroscope and set i of the accelerometer share one
    # overall_set_num. (Rewinding by the last enumerate index instead of the
    # set count left the accelerometer ids one ahead of the gyroscope ids.)
    first_set_num = overall_set_num
    sets_in_activity = 0
    for sensor_sets, sensor_rows in (
        (activities_gyroscope_data_dict[activity], gyroscope_data),
        (activities_accelerometer_data_dict[activity], accelerometer_data),
    ):
        for activity_set_num, activity_set in enumerate(sensor_sets):
            for time_point in activity_set:
                sensor_rows.append((
                    first_set_num + activity_set_num,
                    str(activity),
                    activity_set_num,
                    float(time_point[0]),
                    float(time_point[1]),
                    float(time_point[2]),
                    float(time_point[3]),
                ))
            # Count sets while iterating so an activity with no sets is
            # handled and the next activity never reuses an id.
            sets_in_activity = max(sets_in_activity, activity_set_num + 1)
    overall_set_num = first_set_num + sets_in_activity


In [None]:
# Peek at the first five collected gyroscope rows.
gyroscope_data[:5]

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *

# Obtain the SparkSession (getOrCreate reuses an existing session when there
# is one). This cell only sets up the session handle `ss`; it does not build
# any DataFrame itself.
ss = SparkSession.builder.getOrCreate()
# Currently disabled alternatives:
# sc = ss.sparkContext

# spark = SparkSession.builder.getOrCreate()


In [None]:
# Hand the collected gyroscope rows to Spark as an RDD, using the SparkContext
# that belongs to the session `ss`.
data_rdd = ss.sparkContext.parallelize(gyroscope_data)


In [None]:
# Column layout for the flattened sensor rows: two integer set counters, the
# activity name, then time and the x/y/z readings as doubles. Every column is
# declared nullable.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("overall_set_num", IntegerType()),
        ("activity_name", StringType()),
        ("activity_set_num", IntegerType()),
        ("time", DoubleType()),
        ("x", DoubleType()),
        ("y", DoubleType()),
        ("z", DoubleType()),
    )
])

In [None]:
# Build the DataFrame through `ss`, the SparkSession handle this notebook
# assigns and whose SparkContext produced `data_rdd`. The only nearby
# assignment of a `spark` name is commented out, so referring to `spark` here
# would fail on a fresh kernel unless the environment predefines it.
# NOTE(review): this rebinds `df`, which an earlier cell saves with
# `to_pickle` -- consider a distinct name for the Spark DataFrame.
df = ss.createDataFrame(data_rdd, schema)

In [None]:
# Print a preview of the DataFrame's leading rows as a text table.
df.show()

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *

# Create a SparkSession
ss = SparkSession.builder.getOrCreate()

# Schema for the flattened per-sample gyroscope and accelerometer data.
# The time value is declared as a double, matching the float(time_point[0])
# conversion used by the other cells of this notebook; with schema
# verification on, a LongType column rejects Python floats.
schema = StructType([
    StructField("timestamp", DoubleType(), nullable=False),
    StructField("x", DoubleType(), nullable=False),
    StructField("y", DoubleType(), nullable=False),
    StructField("z", DoubleType(), nullable=False)
])

# The sensor dicts are nested as activity -> sets -> time points, where each
# time point is indexed 0..3 (time, x, y, z) by the other cells. Rows therefore
# have to be pulled out of that nesting; using the dict items directly would
# put the activity key in the timestamp column and whole sets in x/y/z, which
# the schema cannot accept.
gyroscope_rows = [
    (float(time_point[0]), float(time_point[1]), float(time_point[2]), float(time_point[3]))
    for activity_sets in activities_gyroscope_data_dict.values()
    for activity_set in activity_sets
    for time_point in activity_set
]
accelerometer_rows = [
    (float(time_point[0]), float(time_point[1]), float(time_point[2]), float(time_point[3]))
    for activity_sets in activities_accelerometer_data_dict.values()
    for activity_set in activity_sets
    for time_point in activity_set
]

# Create DataFrames from the collected data
gyroscope_df = ss.createDataFrame(gyroscope_rows, schema)
accelerometer_df = ss.createDataFrame(accelerometer_rows, schema)

# Show the DataFrames
gyroscope_df.show()
accelerometer_df.show()
