In [1]:
import matplotlib.pyplot as plt
import multiprocessing
import numpy as np
import pandas as pd
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import PCA
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lag, udf
from pyspark.sql.types import *
import random
import scipy.signal as signal
import scipy.io
import warnings

warnings.filterwarnings('ignore')

In [15]:
!ls ../bigfiles

[34maccelerometer_allsets.csv[m[m            [34mgyroscope_data0620.csv[m[m
[34maccelerometer_data0620.csv[m[m           [34mgyroscope_data0624.csv[m[m
[34maccelerometer_data0624.csv[m[m           [34mgyroscope_features0624.csv[m[m
exercise_data.50.0000_singleonly.mat [34mjoined_data0620.csv[m[m


In [16]:
# Load dataframes from csv
ss = SparkSession.builder.getOrCreate()
sc = ss.sparkContext
sc.setLogLevel("ERROR")

gyroscope_df = ss.read.csv("../bigfiles/gyroscope_data0624.csv", header=True, inferSchema=True)


                                                                                

AnalysisException: Path does not exist: file:/Users/patricianatalyornelasjauregui/Library/CloudStorage/GoogleDrive-pnornelasjauregui@dons.usfca.edu/Other computers/My_MacBook_Pro/USF/DeepLearning/repo/bigfiles/accelerometer_data0624

In [19]:
accelerometer_df = ss.read.csv("../bigfiles/accelerometer_data0624.csv", header=True, inferSchema=True)


                                                                                

In [20]:
def apply_butterworth_lowpass(data):
    sampling_rate = 50 # sampling_rate of 50 Hz
    cutoff_freq = 20 # 20 Hz as specified by the paper (TODO try different values)
    filter_order = 4  # Specify the desired filter order (TODO try different values)
    nyquist_frequency = 0.5 * sampling_rate  
    normalized_cutoff_frequency = cutoff_freq / nyquist_frequency
    b, a = signal.butter(filter_order, normalized_cutoff_frequency, btype='low', analog=False, output='ba')
    smoothed_data = signal.lfilter(b, a, data)
    return smoothed_data#.flatten()


In [21]:
def process_set(data_source, source_name, overall_set_num):
    window_duration = 5  # Window duration in seconds
    sampling_rate = 50  # Sampling rate in Hz
    window_size = int(window_duration * sampling_rate) # 250
    overlap = 10 # distinct 10 points 200ms, shared 4.8 seconds => 240 overlap

    # initialize schemas for accelerometer and gyroscope pyspark dataframes
    # accelerometer features: aYZPC and aX
    # gyroscope features: gPC
    aX_cols = [f"x{i}" for i in range(window_size)]
    aYZPC1_cols = [f"aYZPC{i}" for i in range(window_size)]
    gPC1_cols = [f"gPC{i}" for i in range(window_size)]

    common_fields = [StructField("window_index", IntegerType(), True),
        StructField("overall_set_num", IntegerType(), True),
        StructField("activity_name", StringType(), True),
        StructField("activity_set_num", IntegerType(), True)]

    yz_signal_cols = ['y','z']
    xyz_signal_cols = ['x'] + yz_signal_cols
    yzfields = [StructField(f"y", FloatType(), True), 
        StructField(f"z", FloatType(), True)]
    
    # set the correct schema depending on the source_name
    if source_name == "accelerometer":
        pca_schema = StructType(yzfields)
        input_cols = yz_signal_cols
        output_schema = StructType(common_fields + [StructField(col, FloatType(), True) for col in aX_cols + aYZPC1_cols])
    elif source_name == "gyroscope":
        pca_schema = StructType(yzfields + \
            [StructField(f"x", FloatType(), True)])
        input_cols = xyz_signal_cols
        output_schema = StructType(common_fields + [StructField(col, FloatType(), True) for col in gPC1_cols])
        
    # get current set information
    set_data = data_source.filter(data_source.overall_set_num == overall_set_num)
    activity_name = set_data.select('activity_name').first()[0]
    activity_set_num = set_data.select('activity_set_num').first()[0]

    # apply butterworth lowpass filter to x, y, z columns
    x = list(set_data.select(set_data['x']).toPandas()['x'])
    y = list(set_data.select(set_data['y']).toPandas()['y'])
    z = list(set_data.select(set_data['z']).toPandas()['z'])
    smoothed_x = apply_butterworth_lowpass(x)
    smoothed_y = apply_butterworth_lowpass(y)
    smoothed_z = apply_butterworth_lowpass(z)

    # Slide the window over to compute features for each 5 second interval/window
    result_table = []
    for i in range(0, len(smoothed_x) - window_size + 1, overlap):
        # Get the windowed data that we'll use to perform PCA
        individual_window_table = {col: [] for col in input_cols} # initialize empty table
        if source_name == "gyroscope": # to compute gPC1 with all 3 dimensions (x,y,z)
            individual_window_table["x"] = smoothed_x[i:i+window_size]
        individual_window_table["y"] = smoothed_y[i:i+window_size]
        individual_window_table["z"] = smoothed_z[i:i+window_size]

        # Create dict individual_window_table to pandas DF
        individual_window_table = pd.DataFrame.from_dict(individual_window_table)
        # Convert the pandas DataFrame to a PySpark DataFrame
        pyspark_window_df = ss.createDataFrame(individual_window_table, pca_schema)

        # Combine the input columns into a single vector column
        assembler = VectorAssembler(inputCols=input_cols, outputCol='features_vect')
        assembled_df = assembler.transform(pyspark_window_df)

        # Perform PCA and keep only the first principal component
        pca = PCA(k=1, inputCol='features_vect', outputCol='pca_features')
        pca_model = pca.fit(assembled_df)
        pca_result = pca_model.transform(assembled_df).select('pca_features')

        # Flatten the pca_result DataFrame so that each element is a floatType and then transpose it
        pca_result = pca_result.rdd.map(lambda x: [float(element) for element in x.pca_features.toArray()]) \
            .flatMap(lambda x: x).collect()

        # Append the pca_row and other columns to the result_table
        # TODO: maybe using smoothed_x would be better than raw x, try this to see if it improves performance
        if source_name == "accelerometer":
            result_table.append([i, overall_set_num, activity_name, activity_set_num] + x[i:i+window_size] + pca_result)
        elif source_name == "gyroscope":
            result_table.append([i, overall_set_num, activity_name, activity_set_num] + pca_result)

    # Write result_table to a file
    filename = f'bigfiles/{source_name}/result_table_{overall_set_num}_0625.csv'
    with open(filename, 'w') as file:
        writer = csv.writer(file)
        writer.writerows(result_table)

    return filename


def preprocess(data_source, source_name):
    # Iterate over all distinct set_nums in the data table
    # So that we can process 1 exercise set's signal in all 4 dimensions (x, y, z, t) at a time
    # Use all of the available CPU cores to preprocess all of the sets in parallel
    overall_set_nums = data_source.select('overall_set_num').distinct().rdd.flatMap(lambda x: x).collect()
    num_processes = multiprocessing.cpu_count()  
    pool = multiprocessing.Pool(processes=num_processes)
    output_filenames = pool.map(process_set, overall_set_nums)
    return output_filenames

In [22]:
accelerometer_features = preprocess(accelerometer_df, 'accelerometer')

Process SpawnPoolWorker-1:                                                      
Traceback (most recent call last):
  File "/Users/patricianatalyornelasjauregui/opt/anaconda3/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/patricianatalyornelasjauregui/opt/anaconda3/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/patricianatalyornelasjauregui/opt/anaconda3/lib/python3.9/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/Users/patricianatalyornelasjauregui/opt/anaconda3/lib/python3.9/multiprocessing/queues.py", line 367, in get
    return _ForkingPickler.loads(res)
AttributeError: Can't get attribute 'process_set' on <module '__main__' (built-in)>
Process SpawnPoolWorker-2:
Traceback (most recent call last):
  File "/Users/patricianatalyornelasjauregui/opt/anaconda3/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()

In [23]:
# save accelerometer_features to csv
accelerometer_features.coalesce(1).write.format("com.databricks.spark.csv").option("header", "true").save("bigfiles/accelerometer_features0624.csv")

NameError: name 'accelerometer_features' is not defined