In [None]:
import os
import pickle
import warnings
import pandas as pd
import numpy as np
from tqdm import tqdm
from joblib import Parallel, delayed
from datetime import timedelta

##### Processing PPG Data from Parquet Files

<ul style="font-size: 0.8em;">
The following code snippet processes Photoplethysmography (PPG) data stored in Parquet files. It extracts the relevant data columns from each file, consolidates them per user, and saves the structured data as pickle files. The key steps are:

  <li><strong>Reading Parquet Files:</strong> The script reads specific columns from each Parquet file to optimize performance.</li>
  <li><strong>Data Consolidation:</strong> Data from multiple files is concatenated into a single DataFrame, ensuring consistency and adding user identifiers.</li>
  <li><strong>Data Cleaning:</strong> The DataFrame is cleaned by removing duplicates and sorting by local time.</li>
  <li><strong>Grouping and Saving:</strong> The cleaned data is grouped by day and start hour, then each group is saved as a separate pickle file.</li>

This process is executed for each user directory in parallel, leveraging multiple cores for efficient processing.  
</ul>

In [None]:
from tqdm.notebook import tqdm  
from tqdm.auto import tqdm  # For automatic selection of appropriate tqdm implementation


def extract_ppg_df(udir: str, input_dir: str, output_dir: str) -> None:
    """
    Extracts PPG (Photoplethysmography) data from the parquet files within a user directory,
    cleans it, and saves one pickle file per (day, start_hour) group.

    Parameters:
    - udir (str): User directory name of the form '<key>=<user id>'; the text after the
      first '=' is used as the user identifier.
    - input_dir (str): Path to the input directory containing user directories.
    - output_dir (str): Path to the output directory where processed files will be saved.
      It is created if it does not exist yet.

    Returns:
    - None. Files named '<user>_<day>_<start_hour>.pickle' are written to output_dir.
      Nothing is written when the user directory contains no parquet files.
    """
    udir_path = os.path.join(input_dir, udir)
    user = udir.split('=', 1)[1]  # Extract user identifier from directory name

    # List all parquet files within the user directory (sorted for a reproducible read order)
    files = sorted(file for file in os.listdir(udir_path) if file.endswith('parquet'))
    if not files:
        # pd.concat raises ValueError on an empty list; a user without data is simply skipped
        return

    # Read only the needed columns from each parquet file for efficiency
    columns = ['localtime', 'ppg1', 'day', 'start_hour', 'end_hour']
    all_data = [
        pd.read_parquet(os.path.join(udir_path, file), columns=columns)
        for file in tqdm(files, desc=f"Processing parquet files for {udir}", leave=True)
    ]

    # Concatenate all data into a single DataFrame and tag it with the user identifier
    ppg_df = pd.concat(all_data, ignore_index=True)
    ppg_df['user'] = user

    # Clean and organize data by removing duplicates and sorting
    ppg_df = (ppg_df
              .drop_duplicates()
              .sort_values(by='localtime')
              .reset_index(drop=True))

    # Group data by day and start hour, then save each group as a pickle file
    os.makedirs(output_dir, exist_ok=True)
    for (day, st_hr), group_df in ppg_df.groupby(['day', 'start_hour']):
        filename = f"{user}_{day}_{st_hr}.pickle"
        with open(os.path.join(output_dir, filename), 'wb') as f:
            pickle.dump(group_df, f)

# Define input and output directories
input_dir = os.path.join(os.getcwd(), 'SSL_ppg_100_days')
output_dir = os.path.join(os.getcwd(), 'dfs_ppg_100_days')
os.makedirs(output_dir, exist_ok=True)  # Create output directory if it doesn't exist

# Get list of user directories
user_dirs = [udir for udir in os.listdir(input_dir) if udir.startswith('user')]

# Process each user directory in parallel.
# The inner bar tracks how many tasks have been handed to joblib, the outer bar how many
# have finished. The descriptions are static: a per-user label is not possible here because
# the description is evaluated once, before the loop variable exists.
list(
    tqdm(
        Parallel(return_as="generator", n_jobs=10)(
            delayed(extract_ppg_df)(udir, input_dir, output_dir)
            for udir in tqdm(user_dirs, desc="Dispatching user directories")
        ),
        total=len(user_dirs),
        desc="Completed user directories",
    )
)

##### Create a dataframe of file names of stored pickle files. For consistency and seamless transition to foundation models research with multi-modality data (PPG + ACC), match PPG files with corresponding ACC files for the same day and hour of a user. For efficient matching, parquet files were converted to pickle files rather than directly to numpy files. 

In [None]:
output_dir = os.path.join(os.getcwd(), 'dfs_ppg_100_days')

# Stored files are named '<user>_<day>_<start_hour>.pickle'.
# Only '.pickle' entries are parsed so that stray entries (e.g. '.ipynb_checkpoints') cannot
# break the three-column frame; splitting from the right keeps user ids containing '_' intact.
PPG_available = [
    os.path.splitext(filename)[0].rsplit('_', 2)
    for filename in os.listdir(output_dir)
    if filename.endswith('.pickle')
]

PPG_available_df = (pd.DataFrame(PPG_available, columns=['user', 'day', 'hour'])
                    .astype({'user': str,
                             'day': 'datetime64[ns]',  # convert to datetime from object type
                             'hour': 'datetime64[ns]'}) # convert to datetime from object type
                    .sort_values(by=['user', 'day', 'hour']))

PPG_available_df.head()

Unnamed: 0,user,day,hour
24782,00222a15-7274-34b7-990a-81ac7d742220,2022-09-20,2022-09-20 11:00:00
18461,00222a15-7274-34b7-990a-81ac7d742220,2022-09-20,2022-09-20 12:00:00
21814,00222a15-7274-34b7-990a-81ac7d742220,2022-09-20,2022-09-20 13:00:00
31237,00222a15-7274-34b7-990a-81ac7d742220,2022-09-20,2022-09-20 14:00:00
27271,00222a15-7274-34b7-990a-81ac7d742220,2022-09-20,2022-09-20 15:00:00


##### Match available PPG and ACC files

In [5]:
# Load the table of available ACC files (one row per user/day/hour) from its pickle.
# NOTE: pickle.load executes arbitrary code; only load files produced by this pipeline.
with open("ACC_available_df.pickle", 'rb') as acc_file:
    ACC_available_df = pickle.load(acc_file)

# Report how many users and hourly files each modality provides
print(f"{ACC_available_df['user'].nunique()} users with {len(ACC_available_df)} available ACC files")
print(f"{PPG_available_df['user'].nunique()} users with {len(PPG_available_df)} available PPG files")

120 users with 72609 available ACC files
120 users with 72649 available PPG files


In [6]:
# Keep only the (user, day, hour) combinations for which both a PPG and an ACC file exist
PPG_ACC_available = PPG_available_df.merge(
    ACC_available_df,
    how='inner',
    on=['user', 'day', 'hour'],
)
print(f"{PPG_ACC_available['user'].nunique()} users with {len(PPG_ACC_available)} matching PPG, ACC files")

120 users with 72590 matching PPG, ACC files


##### Combining PPG and ACC Data for Processing

<ul style="font-size: 0.8em;">
This code snippet efficiently combines Photoplethysmography (PPG) and Accelerometer (ACC) data for specified time windows and saves the results as numpy arrays. The process involves:

  <li><strong>Reading and Sorting Data:</strong> PPG and ACC data are read from pickled DataFrames and sorted by local time.</li>
  <li><strong>Windowed Processing:</strong> Data is processed in time windows, ensuring sufficient data points and performing necessary downsampling or interpolation.</li>
  <li><strong>Data Combination:</strong> The PPG and ACC data are combined into a single array for each window.</li>
  <li><strong>Parallel Execution:</strong> The operation is executed in parallel using joblib to enhance performance, especially when processing large datasets.</li>

This approach allows for the efficient handling and analysis of large-scale PPG and ACC datasets with configurable window sizes. The same code snippet can be used to prepare the 1-min and 2-min training data by setting the window size and the output directory name accordingly.
</ul>

In [None]:
from datetime import timedelta

def _resample_acc_window(acc_win: pd.DataFrame, win_start: pd.Timestamp, win_end: pd.Timestamp) -> np.ndarray:
    """Linearly interpolate the x/y/z columns of one ACC window onto a regular 20 ms grid."""
    grid = pd.date_range(start=win_start, end=win_end, freq='20ms', inclusive='left')
    orig_ts = pd.to_datetime(acc_win['localtime'].values)

    # Seconds relative to the window start (instead of the epoch) keep the float arithmetic
    # well conditioned; the interpolation itself is unchanged.
    orig_sec = np.asarray((orig_ts - win_start).total_seconds(), dtype=float)
    grid_sec = np.asarray((grid - win_start).total_seconds(), dtype=float)

    # np.interp is piecewise linear between neighbouring samples and holds the first/last
    # sample value for grid points outside the observed range.
    return np.column_stack([
        np.interp(grid_sec, orig_sec, acc_win[axis].to_numpy(dtype=float))
        for axis in ('x', 'y', 'z')
    ])


def combined_ppg_acc(row: pd.Series, PPG_df_directory: str, ACC_df_directory: str, output_np_dir: str, window_size: int) -> None:
    """
    Combines PPG and ACC data of one user-hour into fixed-length windows and saves each
    window as a numpy file with columns [ppg1, x, y, z].

    A window is kept only if it holds at least window_size * 100 PPG samples and at least
    95% of window_size * 50 ACC samples. PPG is reduced to every second sample; ACC windows
    that are short of samples are linearly interpolated onto a 20 ms grid.

    Parameters:
    - row (pd.Series): A row from a DataFrame containing 'user', 'day', and 'hour' information.
    - PPG_df_directory (str): Directory path where PPG DataFrames are stored.
    - ACC_df_directory (str): Directory path where ACC DataFrames are stored.
    - output_np_dir (str): Directory path to save the combined numpy arrays.
    - window_size (int): Size of the time window in seconds.

    Returns:
    - None. Arrays are written to '<output_np_dir>/<user>/<day>/<hour>/<hour>:<minute>[:<second>].npy'.
    """
    ppg_per_window = window_size * 100  # PPG samples expected per window (100 per second)
    acc_per_window = window_size * 50   # ACC samples expected per window (50 per second)

    user = row['user']
    day = row['day'].date()
    day_hr = row['hour']

    file = f"{user}_{day}_{day_hr}.pickle"

    # Load the PPG and ACC data.
    # NOTE: pickle.load executes arbitrary code; only load files produced by this pipeline.
    with open(os.path.join(PPG_df_directory, file), "rb") as f:
        PPG_df = pickle.load(f)
    with open(os.path.join(ACC_df_directory, file), "rb") as f:
        ACC_df = pickle.load(f)

    # Nothing to window if either modality is empty (iloc[0] below would raise IndexError)
    if PPG_df.empty or ACC_df.empty:
        return

    # Sort data by local time
    PPG_df = PPG_df.sort_values(by=['localtime'])
    ACC_df = ACC_df.sort_values(by=['localtime'])

    ppg_time = PPG_df['localtime']
    acc_time = ACC_df['localtime']
    ppg_last = ppg_time.iloc[-1]
    acc_last = acc_time.iloc[-1]
    step = timedelta(seconds=window_size)

    # NOTE(review): each stream is windowed from its own first timestamp, so the PPG and ACC
    # windows are only aligned if both recordings start at the same instant - confirm.
    st_ppg = ppg_time.iloc[0]
    st_acc = acc_time.iloc[0]

    # Process each time window
    while st_ppg + step <= ppg_last and st_acc + step <= acc_last:
        win_st_ppg, win_et_ppg = st_ppg, st_ppg + step
        win_st_acc, win_et_acc = st_acc, st_acc + step

        # Advance immediately so that skipping a window below can never stall the loop
        st_ppg, st_acc = win_et_ppg, win_et_acc

        # Extract data for the current window
        ppg_win_data = PPG_df.loc[(ppg_time >= win_st_ppg) & (ppg_time < win_et_ppg), ['ppg1']]
        acc_win_data = ACC_df.loc[(acc_time >= win_st_acc) & (acc_time < win_et_acc)]

        # Skip windows that do not meet the minimum length requirements
        if len(ppg_win_data) < ppg_per_window or (len(acc_win_data) / acc_per_window) < 0.95:
            continue

        # Downsample PPG data to 50 Hz to match ACC data frequency
        ppg_values = ppg_win_data.values[:ppg_per_window][::2]

        if len(acc_win_data) >= acc_per_window:
            acc_values = acc_win_data[['x', 'y', 'z']].values[:acc_per_window]
        else:
            # Fill the missing ACC samples by linear interpolation
            acc_values = _resample_acc_window(acc_win_data, win_st_acc, win_et_acc)

        # Combine PPG and ACC data
        win_combined = np.hstack((ppg_values, acc_values))

        # For windows shorter than 60 seconds several windows start within the same minute,
        # so the start second is added to the file name to avoid overwriting files.
        win_min = win_st_ppg.minute
        if window_size < 60:
            win_sec = win_st_ppg.second
            file_name = f"{user}/{day}/{day_hr}/{day_hr}:{win_min}:{win_sec}.npy"
        else:
            file_name = f"{user}/{day}/{day_hr}/{day_hr}:{win_min}.npy"
        save_path = os.path.join(output_np_dir, file_name)

        # Ensure the directory exists and save the combined data
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        np.save(save_path, win_combined)

# Input directories with the hourly PPG / ACC DataFrames and the output directory for windows
cwd = os.getcwd()
PPG_df_directory = os.path.join(cwd, 'dfs_ppg_100_days')
ACC_df_directory = os.path.join(cwd, 'dfs_acc_100_days')
output_np_dir = os.path.join(cwd, "PpgAcc_resampled_10sec_combined_100_days")
os.makedirs(output_np_dir, exist_ok=True)

# Window length in seconds; one task per matched (user, day, hour) row
window_size = 10
rows = [matched_row for _, matched_row in PPG_ACC_available.iterrows()]

# Build the joblib tasks lazily; the inner progress bar advances as tasks are dispatched
window_jobs = (
    delayed(combined_ppg_acc)(row, PPG_df_directory, ACC_df_directory, output_np_dir, window_size)
    for row in tqdm(rows, desc="Processing ppg and acc hours")
)

# Consuming the result generator is what drives the parallel run to completion
list(
    tqdm(
        Parallel(return_as="generator", n_jobs=50)(window_jobs),
        total=len(rows)
    )
)

##### Split 120 participants for train, val and test sets with a ratio of 70:15:15

In [7]:
# Fixed seed so that the participant split is reproducible
np.random.seed(42)

# Participants are taken from PpgAcc_resampled_combined_100_days, which holds the 4-min windows.
# The listing is sorted before shuffling so the permutation does not depend on os.listdir order.
all_ptcs = sorted(os.listdir(os.path.join(os.getcwd(), 'PpgAcc_resampled_combined_100_days')))
all_ptcs_shf = np.random.permutation(all_ptcs).tolist()

n = len(all_ptcs_shf)
train_ratio, val_ratio = 0.7, 0.15

# Slice boundaries: [0, train_end) -> train, [train_end, val_end) -> val, the rest -> test
train_end = int(n * train_ratio)
val_end = train_end + int(n * val_ratio)

train_ptcs, val_ptcs, test_ptcs = (
    all_ptcs_shf[:train_end],
    all_ptcs_shf[train_end:val_end],
    all_ptcs_shf[val_end:],
)

print(len(train_ptcs), len(val_ptcs), len(test_ptcs))

84 18 18


##### Divide participants' directories corresponding to train, test and val splits

In [None]:
from typing import Union

def create_data_directories(base_dir: Union[str, os.PathLike]) -> None:
    """
    Set up the directory layout for a dataset: a base directory holding the
    'train', 'val' and 'test' subdirectories.

    Parameters:
    - base_dir (str or os.PathLike): Base directory to create. It is joined onto the
      current working directory, so a relative path is resolved against it.

    Directories that already exist are left untouched.
    """
    root = os.path.join(os.getcwd(), base_dir)

    # Base directory first, then one subdirectory per split
    os.makedirs(root, exist_ok=True)
    for split in ('train', 'val', 'test'):
        os.makedirs(os.path.join(root, split), exist_ok=True)

In [None]:
import shutil

def copy_folder_recursively(user: str, src: Union[str, os.PathLike], dst: Union[str, os.PathLike]) -> None:
    """
    Copy one user's folder, including all of its contents, from src to dst.

    Args:
    - user (str): The user identifier; names the subfolder below both src and dst.
    - src (Union[str, os.PathLike]): Directory that contains the user's source folder.
    - dst (Union[str, os.PathLike]): Directory that receives the copy.

    An existing destination folder is never overwritten, and OS-level copy errors are
    reported with print instead of being raised.
    """
    src = os.path.join(src, f"{user}")
    dst = os.path.join(dst, f"{user}")
    try:
        # Guard clause: leave an already copied user untouched
        if os.path.exists(dst):
            print(f"Destination folder {dst} already exists.")
            return
        shutil.copytree(src, dst)
        print(f"Successfully copied {src} to {dst}")
    except OSError as e:
        print(f"Error copying {src} to {dst}: {e}")

In [None]:
from tqdm.notebook import tqdm
from typing import List

def copy_data(src: str, dst: str, participants: List[str], data_type: str, n_jobs: int = 50) -> None:
    """
    Copy the folders of the given participants into one split directory, in parallel.

    Parameters:
    - src (str): Source directory containing one folder per participant.
    - dst (str): Destination directory of the current split.
    - participants (List[str]): Participant identifiers to copy.
    - data_type (str): Split name ('train', 'val', 'test'); only used as the progress-bar label.
    - n_jobs (int): Number of parallel jobs to run. Default is 50.
    """
    # One copy task per participant; this bar advances as tasks are handed to joblib
    copy_tasks = (
        delayed(copy_folder_recursively)(user, src, dst)
        for user in tqdm(participants, desc=f"Creating {data_type} data")
    )
    finished = Parallel(return_as="generator", n_jobs=n_jobs)(copy_tasks)

    # Draining the generator waits for every copy; this bar counts completed tasks
    for _ in tqdm(finished, total=len(participants)):
        pass


In [None]:
# Distribute the 10-second windows into train / val / test directories
src = os.path.join(os.getcwd(), "PpgAcc_resampled_10sec_combined_100_days")
base_dst = 'ppg_acc_np_10sec'

# Create the base directory and its three split subdirectories once
create_data_directories(base_dst)

# Destination path of each split
train_dst, val_dst, test_dst = (
    os.path.join(os.getcwd(), base_dst, split) for split in ('train', 'val', 'test')
)

# Copy every participant's folder into the split it was assigned to
for split_dst, split_ptcs, split_name in (
    (train_dst, train_ptcs, 'train'),
    (val_dst, val_ptcs, 'val'),
    (test_dst, test_ptcs, 'test'),
):
    copy_data(src, split_dst, split_ptcs, split_name)

##### Before running each of the cells below, first execute the combined_ppg_acc function for all users' data with the matching window size and destination folder.

In [None]:
# Distribute the 1-minute windows into train / val / test directories
src = os.path.join(os.getcwd(), "PpgAcc_resampled_1min_combined_100_days")
base_dst = 'ppg_acc_np_1min'

# Create the base directory and its three split subdirectories once
create_data_directories(base_dst)

# Destination path of each split
train_dst, val_dst, test_dst = (
    os.path.join(os.getcwd(), base_dst, split) for split in ('train', 'val', 'test')
)

# Copy every participant's folder into the split it was assigned to
for split_dst, split_ptcs, split_name in (
    (train_dst, train_ptcs, 'train'),
    (val_dst, val_ptcs, 'val'),
    (test_dst, test_ptcs, 'test'),
):
    copy_data(src, split_dst, split_ptcs, split_name)

In [None]:
# Distribute the 2-minute windows into train / val / test directories
src = os.path.join(os.getcwd(), "PpgAcc_resampled_2min_combined_100_days")
base_dst = 'ppg_acc_np_2min'

# Create the base directory and its three split subdirectories once
create_data_directories(base_dst)

# Destination path of each split
train_dst, val_dst, test_dst = (
    os.path.join(os.getcwd(), base_dst, split) for split in ('train', 'val', 'test')
)

# Copy every participant's folder into the split it was assigned to
for split_dst, split_ptcs, split_name in (
    (train_dst, train_ptcs, 'train'),
    (val_dst, val_ptcs, 'val'),
    (test_dst, test_ptcs, 'test'),
):
    copy_data(src, split_dst, split_ptcs, split_name)

In [None]:
# Distribute the 4-minute windows into train / val / test directories
src = os.path.join(os.getcwd(), "PpgAcc_resampled_combined_100_days")
base_dst = 'ppg_acc_np'

# Create the base directory and its three split subdirectories once
create_data_directories(base_dst)

# Destination path of each split
train_dst, val_dst, test_dst = (
    os.path.join(os.getcwd(), base_dst, split) for split in ('train', 'val', 'test')
)

# Copy every participant's folder into the split it was assigned to
for split_dst, split_ptcs, split_name in (
    (train_dst, train_ptcs, 'train'),
    (val_dst, val_ptcs, 'val'),
    (test_dst, test_ptcs, 'test'),
):
    copy_data(src, split_dst, split_ptcs, split_name)