In [11]:
import scipy.io
#import matplotlib.pyplot as plt
import numpy as np
import random
#import scipy.signal as signal
import pandas as pd
#from sklearn.decomposition import PCA
import math
import os
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import *
from pyspark.sql.types import *

import warnings
warnings.filterwarnings('ignore')

## Load & Explore Data

#### Download and save the 'exercise_data.50.0000_singleonly.mat' file from the below link and save it locally in the same folder as this notebook file. 

https://msropendata.com/datasets/799c1167-2c8f-44c4-929c-227bf04e2b9a

In [12]:
dataBaseDir = "data/exercise_recognition_from_wearable_sensors"
single_activity_file_path = os.path.join(dataBaseDir, "exercise_data.50.0000_singleonly.mat")
# Load exercise dataset 
exercise_dataset = scipy.io.loadmat(single_activity_file_path, struct_as_record=False)

In [3]:
for k in exercise_dataset.keys():
    print(f"{k}: {len(exercise_dataset[k])}")

__header__: 76
__version__: 3
__globals__: 0
subject_data: 94
exerciseConstants: 1
Fs: 1


In [13]:
# Load activities and data full objects
exercise_constants = exercise_dataset['exerciseConstants'][0][0].activities
subject_data = exercise_dataset['subject_data']

# extract activities names into an array
all_activities = []
for act in exercise_constants[0]:
    all_activities.append(act[0])
    print (act[0]) # print values for reference

# only take data from 3 random exercises
activities_to_process = random.choices(all_activities, k=3)

<Initial Activity>
Arm Band Adjustment
Arm straight up
Band Pull-Down Row
Bicep Curl
Biceps Curl (band)
Box Jump (on bench)
Burpee
Butterfly Sit-up
Chest Press (rack)
Crunch
Device on Table
Dip
Dumbbell Deadlift Row
Dumbbell Row (knee on bench) (label spans both arms)
Dumbbell Row (knee on bench) (left arm)
Dumbbell Row (knee on bench) (right arm)
Dumbbell Squat (hands at side)
Dynamic Stretch (at your own pace)
Elliptical machine
Fast Alternating Punches
Invalid
Jump Rope
Jumping Jacks
Kettlebell Swing
Lateral Raise
Lawnmower (label spans both arms)
Lawnmower (left arm)
Lawnmower (right arm)
Lunge (alternating both legs, weight optional)
Medicine Ball Slam
Non-Exercise
Note
Overhead Triceps Extension
Overhead Triceps Extension (label spans both arms)
Plank
Power Boat pose
Pushup (knee or foot variation)
Pushups
Repetitive Stretching
Rest
Rowing machine
Running (treadmill)
Russian Twist
Seated Back Fly
Shoulder Press (dumbbell)
Side Plank Left side
Side Plank Right side
Sit-up (hands p

In [14]:
activities_to_process

['Pushup (knee or foot variation)', 'Bicep Curl', 'Unlisted Exercise']

In [15]:
len(all_activities)

75

In [16]:
# remove invalid activities from all_activities list
invalid_activities = ['Arm Band Adjustment', 'Arm straight up', '<Initial Activity>', 'Note', 'Rest', 'Invalid',
                      'Tap IMU Device', 'Unlisted Exercise', 'Device on Table', 'Tap Left Device', 'Tap Right Device',
                      'Non-Exercise', 'Triceps extension (lying down) (left arm)',
                      'Triceps extension (lying down) (right arm)', 'Alternating Dumbbell Curl']
for act in invalid_activities:
    all_activities.remove(act)
len(all_activities)

60

In [17]:
# define dictionaries for accelerometer and gyroscope data
activities_accelerometer_data_dict = {activity: [] for activity in all_activities}
activities_gyroscope_data_dict = {activity: [] for activity in all_activities}

# iterate over subject data to search for those activities and save the data related
for data_item in subject_data:
    for x in data_item:
        if len(x) > 0:
            if x[0] is not None and len(x[0]) > 0:
                data_activity_name = x[0,0].activityName[0]
                data_item_accelDataMatrix = x[0,0].data[0,0].accelDataMatrix
                data_item_gyroDataMatrix = x[0,0].data[0,0].gyroDataMatrix
                if data_activity_name in all_activities:
                    activities_accelerometer_data_dict[data_activity_name].append(data_item_accelDataMatrix)
                    activities_gyroscope_data_dict[data_activity_name].append(data_item_gyroDataMatrix)

## Segmentation Pre-Processing
Given data points containing x,y,z, and time, how would you smooth this data with a Butterworth low-pass filter (-60dB at 20Hz), then windowed into 5-second windows sliding at 200ms (i.e., each 5s window shares 4.8s of data with the previous window)

## Segmentation Feature Computation
1) aX: the X-axis accelerometer signal

2) aXmag: the magnitude of the accelerometer signal at each sample, i.e. sqrt(ax2+ay2+az2).

3) aPC1: the projection of the three-dimensional accelerometer signal onto its first principal component. This is the movement along the axis that demonstrates the most variance within this window, or – anecdotally – themost “interesting” rotation of the window.

4) aYZPC1: the projection of only the Y and Z axes onto the first principal component of those two axes. This captures movement perpendicular to the arm, which allows us to derive information from the Y and Z axes despite the unknown rotation of the armband.

In [18]:
# Increase memory allocation
spark = SparkSession.builder.config("spark.driver.memory", "16g").config("spark.executor.memory", "16g").getOrCreate()

# turn activities_gyroscope_data_dict and activities_accelerometer_data_dict into a spark dataframe
ss = SparkSession.builder.getOrCreate()
sc = ss.sparkContext

# create schema for data
schema = StructType([
    StructField('overall_set_num', IntegerType(), True),
    StructField('activity_name', StringType(), True),
    StructField('activity_set_num', IntegerType(), True),
    StructField('time', DoubleType(), True),
    StructField('x', DoubleType(), True),
    StructField('y', DoubleType(), True),
    StructField('z', DoubleType(), True)
])

# create empty dataframes
gyroscope_df = ss.createDataFrame(sc.emptyRDD(), schema)
accelerometer_df = ss.createDataFrame(sc.emptyRDD(), schema)
gyroscope_data = []
accelerometer_data = []

# Iterate over activities and append data to dataframes
overall_set_num = 0
for activity in all_activities:
    for activity_set_num, activity_set in enumerate(activities_gyroscope_data_dict[activity]):
        for time_point in activity_set:
            data_row = (overall_set_num, str(activity), activity_set_num, float(time_point[0]), float(time_point[1]), float(time_point[2]), float(time_point[3]))
            gyroscope_data.append(data_row)
        overall_set_num += 1
    overall_set_num -= activity_set_num
    for activity_set_num, activity_set in enumerate(activities_accelerometer_data_dict[activity]):
        for time_point in activity_set:
            data_row = (overall_set_num, str(activity), activity_set_num, float(time_point[0]), float(time_point[1]), float(time_point[2]), float(time_point[3]))
            accelerometer_data.append(data_row)
        overall_set_num += 1
# Create dataframes directly from the lists of data rows
gyroscope_df = ss.createDataFrame(gyroscope_data, schema).persist()
accelerometer_df = ss.createDataFrame(accelerometer_data, schema).persist()

# Show dataframes
gyroscope_df.show()
accelerometer_df.show()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/06/26 15:10:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


[Stage 0:>                                                          (0 + 0) / 1]

23/06/26 15:11:33 WARN TaskSetManager: Stage 0 contains a task of very large size (44569 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

+---------------+------------------+----------------+-----------------+-------------------+-------------------+-------------------+
|overall_set_num|     activity_name|activity_set_num|             time|                  x|                  y|                  z|
+---------------+------------------+----------------+-----------------+-------------------+-------------------+-------------------+
|              0|Band Pull-Down Row|               0|674.1154982887862| -75.42962715593848|-23.459580529916664| -7.287340262552914|
|              0|Band Pull-Down Row|               0|674.1354981552279|  -37.8058679867162| -11.59542816826023|-16.031327760189498|
|              0|Band Pull-Down Row|               0|674.1554980216698|-30.373596887806915|-0.7170596262406321|-22.053160641018366|
|              0|Band Pull-Down Row|               0|674.1754978881116| -25.18043401739627|  9.136661226161516| -16.78585471129029|
|              0|Band Pull-Down Row|               0|674.1954977545535| -13.

[Stage 1:>                                                          (0 + 1) / 1]

+---------------+------------------+----------------+-----------------+-------------------+--------------------+--------------------+
|overall_set_num|     activity_name|activity_set_num|             time|                  x|                   y|                   z|
+---------------+------------------+----------------+-----------------+-------------------+--------------------+--------------------+
|              1|Band Pull-Down Row|               0|674.1154982887862| 0.1276259925697181| -0.6812928678115924|-0.14625478637439904|
|              1|Band Pull-Down Row|               0|674.1354981552279|0.07272240139603864|-0.46817981683837834|6.388127509546412E-4|
|              1|Band Pull-Down Row|               0|674.1554980216698| 0.0442798019759728| -0.4179801589818037| 0.13054040138988118|
|              1|Band Pull-Down Row|               0|674.1754978881116|0.18147223442236618|-0.49043102365636226| 0.14143943077158078|
|              1|Band Pull-Down Row|               0|674.19549

                                                                                

In [10]:
assert(gyroscope_df.count() == accelerometer_df.count())
print(gyroscope_df.count())

23/06/26 14:53:03 WARN TaskSetManager: Stage 2 contains a task of very large size (44569 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

23/06/26 14:53:08 WARN TaskSetManager: Stage 5 contains a task of very large size (44572 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

23/06/26 14:53:12 WARN TaskSetManager: Stage 8 contains a task of very large size (44569 KiB). The maximum recommended task size is 1000 KiB.


[Stage 8:>                                                          (0 + 8) / 8]

6134191


                                                                                

In [19]:
from pyspark.sql.functions import col

# Join gyroscope_df and accelerometer_df on overall_set_num, activity_set_num, activity_name, and time
df = gyroscope_df.alias('gyro').join(
    accelerometer_df.alias('acc'),
    (col('gyro.activity_set_num') == col('acc.activity_set_num')) &
    (col('gyro.activity_name') == col('acc.activity_name')) &
    (col('gyro.time') == col('acc.time')),
    'inner'
).select(
    col('gyro.activity_set_num').alias('set_num'),
    col('gyro.activity_name').alias('activity'),
    col('gyro.time'),
    col('gyro.x').alias('x_gyro'),
    col('gyro.y').alias('y_gyro'),
    col('gyro.z').alias('z_gyro'),
    col('acc.x').alias('x_acc'),
    col('acc.y').alias('y_acc'),
    col('acc.z').alias('z_acc')
).persist()

df.show()

23/06/26 15:11:38 WARN TaskSetManager: Stage 2 contains a task of very large size (44572 KiB). The maximum recommended task size is 1000 KiB.


[Stage 2:>                  (0 + 8) / 8][Stage 3:>                  (0 + 0) / 8]

23/06/26 15:11:44 WARN TaskSetManager: Stage 3 contains a task of very large size (44569 KiB). The maximum recommended task size is 1000 KiB.




+-------+------------------+------------------+-------------------+-------------------+-------------------+-------------------+--------------------+--------------------+
|set_num|          activity|              time|             x_gyro|             y_gyro|             z_gyro|              x_acc|               y_acc|               z_acc|
+-------+------------------+------------------+-------------------+-------------------+-------------------+-------------------+--------------------+--------------------+
|      0|Band Pull-Down Row| 676.1154849329704| -15.14763447507412| 23.954241105837042| 14.705304192571623| 0.3569856777362959| -0.8234100781781484| -0.0311893407993077|
|      0|Band Pull-Down Row| 679.0554652999215|-148.99338894204254|-1.7270497479444022|  -8.24358923776973|0.01903238626232883| -0.8876219858839315|-0.21587195539957008|
|      0|Band Pull-Down Row| 679.6154615602932| 147.98847345454536| 3.7689166045538247|   95.4527350396289| 0.5575600651872523| -1.1305198541798316|  

                                                                                

In [20]:
df.count()

                                                                                

6134191

In [21]:
# Convert activity names to numeric labels
labelIndexer = StringIndexer(inputCol="activity", outputCol="label")
df = labelIndexer.fit(df).transform(df)

# Define the feature columns
featureCols = ["x_gyro", "y_gyro", "z_gyro", "x_acc", "y_acc", "z_acc"]

# Convert the PySpark DataFrames to Pandas DataFrames
df_pd = df.toPandas()
df_pd.shape

                                                                                

(6134191, 10)

In [22]:
df_pd.head()

Unnamed: 0,set_num,activity,time,x_gyro,y_gyro,z_gyro,x_acc,y_acc,z_acc,label
0,0,Band Pull-Down Row,676.115485,-15.147634,23.954241,14.705304,0.356986,-0.82341,-0.031189,46.0
1,0,Band Pull-Down Row,679.055465,-148.993389,-1.72705,-8.243589,0.019032,-0.887622,-0.215872,46.0
2,0,Band Pull-Down Row,679.615462,147.988473,3.768917,95.452735,0.55756,-1.13052,0.6881,46.0
3,0,Band Pull-Down Row,681.575448,-140.093738,-29.372863,32.133631,0.448862,-0.73518,0.236496,46.0
4,0,Band Pull-Down Row,684.775427,-21.686398,-1.344477,-0.517016,0.367619,-0.557327,0.755909,46.0


In [23]:
# save the processed dataframe to csv file
df_pd.to_csv('exercise_recognition_data.csv', index=False)

In [24]:
# Clear the cache after converting pyspark dataframe into pandas df
gyroscope_df.unpersist()
accelerometer_df.unpersist()
df.unpersist()

DataFrame[set_num: int, activity: string, time: double, x_gyro: double, y_gyro: double, z_gyro: double, x_acc: double, y_acc: double, z_acc: double, label: double]

In [25]:
# shut down spark to save memory
spark.stop()