In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import os
import glob

%matplotlib inline
#%matplotlib qt
mpl.rcParams['lines.linewidth'] = 0.91
plt.style.use('seaborn-v0_8-whitegrid')
%matplotlib qt

from avro.datafile import DataFileReader
from avro.io import DatumReader

In [2]:
import numpy as np
import pandas as pd
import os

# Define path
save_data_path = "/Users/marcellosicbaldi/Library/CloudStorage/OneDrive-AlmaMaterStudiorumUniversitàdiBologna/tesi_Sara/Empatica/data/parquet/"

# Define subject and device
sub_ID = "00007"
device_ID = "3YK3J151VJ"

# Get all days (excluding hidden files)
days = sorted([day for day in os.listdir(save_data_path) if day[0] != "."])

# List to store processed data per day
day_data = []

for i, day in enumerate(days):
    print(f"Processing day: {day}")

    # Load data (assuming timestamps are already in the index)
    ppg = pd.read_parquet(save_data_path + day + "/ppg.parquet").sort_index()
    acc = pd.read_parquet(save_data_path + day + "/acc.parquet").sort_index()
    sys_peaks = pd.read_parquet(save_data_path + day + "/sys_peaks.parquet")
    sys_peaks.index = pd.to_datetime(sys_peaks["SysPeakTime"].astype(str))
    temp = pd.read_parquet(save_data_path + day + "/temp.parquet").sort_index()

    # Find the first timestamp for this day's data
    start_time = min(ppg.index.min(), acc.index.min(), sys_peaks["SysPeakTime"].min(), temp.index.min())

    # Define end time for this segment (24-hour window from start)
    end_time = start_time + pd.Timedelta(hours=24)

    # Append to list for processing
    day_data.append({'start': start_time, 'end': end_time, 'day': day, 
                     'ppg': ppg, 'acc': acc, 'sys_peaks': sys_peaks, 'temp': temp})

Processing day: 2024-05-20
Processing day: 2024-05-21
Processing day: 2024-05-22
Processing day: 2024-05-23
Processing day: 2024-05-24
Processing day: 2024-05-25
Processing day: 2024-05-26
Processing day: 2024-05-27
Processing day: day_1


KeyboardInterrupt: 

In [3]:
# Adjust last day's end time to cover till the final available timestamp
day_data[-1]['end'] = max(day_data[-1]['ppg'].index.max(),
                          day_data[-1]['acc'].index.max(),
                          day_data[-1]['sys_peaks'].values.max(),
                          day_data[-1]['temp'].index.max())

# Process and align data into 24-hour segments
aligned_days = []
for i, data in enumerate(day_data):
    start, end = pd.to_datetime(data['start']), pd.to_datetime(data['end'])

    # Filter data using index (since timestamps are already in the index)
    ppg_filtered = data['ppg'].loc[start:end]
    acc_filtered = data['acc'].loc[start:end]
    sys_peaks_filtered = data['sys_peaks'].loc[start:end]
    temp_filtered = data['temp'].loc[start:end]

    aligned_days.append({'start': start, 'end': end, 'day': data['day'], 
                         'ppg': ppg_filtered, 'acc': acc_filtered, 'sys_peaks': sys_peaks_filtered,
                         'temp': temp_filtered})

# Print results
for d in aligned_days:
    print(f"Day {d['day']} - Start: {d['start']}, End: {d['end']}, "
          f"PPG samples: {len(d['ppg'])}, ACC samples: {len(d['acc'])}, SYS_Peaks samples: {len(d['sys_peaks'])}")

Day 2024-05-20 - Start: 2024-05-20 13:02:55.523192, End: 2024-05-21 13:02:55.523192, PPG samples: 2998976, ACC samples: 2999168, SYS_Peaks samples: 52188
Day 2024-05-21 - Start: 2024-05-21 02:03:49.754983750, End: 2024-05-22 02:03:49.754983750, PPG samples: 5529303, ACC samples: 5529111, SYS_Peaks samples: 95325
Day 2024-05-22 - Start: 2024-05-22 02:05:41.135601500, End: 2024-05-23 02:05:41.135601500, PPG samples: 4556562, ACC samples: 4556690, SYS_Peaks samples: 76882
Day 2024-05-23 - Start: 2024-05-23 02:16:15.441449750, End: 2024-05-24 02:16:15.441449750, PPG samples: 5529132, ACC samples: 5529132, SYS_Peaks samples: 97243
Day 2024-05-24 - Start: 2024-05-24 02:18:41.385549500, End: 2024-05-25 02:18:41.385549500, PPG samples: 5529189, ACC samples: 5529203, SYS_Peaks samples: 83471
Day 2024-05-25 - Start: 2024-05-25 02:21:49.608278500, End: 2024-05-26 02:21:49.608278500, PPG samples: 3620480, ACC samples: 3621088, SYS_Peaks samples: 53169
Day 2024-05-26 - Start: 2024-05-26 02:13:50.86

In [6]:
aligned_days[0]["ppg"]

Unnamed: 0,ppg
2024-05-20 13:02:55.531981,0.000000
2024-05-20 13:02:55.547606,0.000008
2024-05-20 13:02:55.563231,0.000052
2024-05-20 13:02:55.578856,0.000184
2024-05-20 13:02:55.594481,0.000457
...,...
2024-05-21 02:03:54.339079,0.051028
2024-05-21 02:03:54.354704,0.046407
2024-05-21 02:03:54.370329,0.041619
2024-05-21 02:03:54.385954,0.036553


In [5]:
# Save days
for i, d in enumerate(aligned_days):
    os.makedirs(save_data_path + "day_" + str(i+1), exist_ok=True)
    # d['ppg'].to_parquet(save_data_path + "day_" + str(i+1) + "/ppg.parquet")
    # d['acc'].to_parquet(save_data_path + "day_" + str(i+1) + "/acc.parquet")
    # d['sys_peaks'].to_parquet(save_data_path + "day_" + str(i+1) + "/sys_peaks.parquet")
    d['temp'].to_parquet(save_data_path + "day_" + str(i+1) + "/temp.parquet")