## Reading AVRO files and non-wear detection

In [1]:
import numpy as np
import pandas as pd
import os
import glob

import matplotlib.pyplot as plt
import matplotlib as mpl

mpl.rcParams['lines.linewidth'] = 0.91
plt.style.use('seaborn-v0_8-whitegrid')

%matplotlib qt

### Read data
~ 2 min for reading and concatenating into a list for 7 days of data

**With Pandas**

- ~ 2.30 min for transforming it into a pd.DataFrame
- ~ 1.30 min for saving to accelerometer to csv
- ~ 3 sec for saving ppg, acc, and temp to parquet

**With Polars**

- TODO

In [2]:
from data_io.embraceplus.read_avro_mod import ReadEmpaticaAvro
empatica_reader = ReadEmpaticaAvro()

In [3]:
#### Change the paths below to the location of the data on your machine ####
data_path = "/Users/augenpro/Documents/Empatica/data_sara/data/participant_data/"

#### Change the subject ID and device ID below to the subject and device you want to process ####
sub_ID = "00007"
device_ID = "3YK3J151VJ"

days = sorted(os.listdir(data_path))
days = [day for day in days if day[0] != "."] # remove hidden files (needed for MacOS users)

acc = []
ppg = []
temp = []
time = []
time_temp = []
tags = []

for i, day in enumerate(days):
    
    print(f"Processing day {i+1}/{len(days)}")

    folder_day = data_path + day + "/" + sub_ID + "-" + device_ID + "/raw_data/v6"

    avro_files = sorted(glob.glob(folder_day + "/*.avro"))

    for avro_file in avro_files:
        
        data = empatica_reader.read(file=avro_file)

        acc.extend(data["acc"])

        ppg.extend(data["bvp"])

        time.extend(data["time"]) # Same for acc and ppg

        temp.extend(data["temp"])
        time_temp.extend(data["time_temp"])

        tags.extend(data["tags"])

    break

Processing day 1/8


In [4]:
tags

[1716210262.022092, 1716253853.037992]

In [4]:
acc_df = pd.DataFrame(acc, columns=["x", "y", "z"], index = pd.to_datetime(time, unit="s")).sort_index()
ppg_df = pd.DataFrame(ppg, columns=["ppg"], index = pd.to_datetime(time, unit="s")).sort_index()
temp_df = pd.DataFrame(temp, columns=["temp"], index = pd.to_datetime(time_temp, unit="s")).sort_index()

### Detect non-wear

In [10]:
# I need to divide it into portions when the device was in charge

t_charge_end = acc_df.index[acc_df.index.to_series().diff().dt.total_seconds() > 60*10] # if the difference between two consecutive timestamps is more than 10 minutes (**), it means the device was in charge
t_charge_start = acc_df.index[np.where(acc_df.index.to_series().diff().dt.total_seconds() > 60*10)[0]-1] # the start of the charge is the timestamp before the end of the charge
t_charge = pd.DataFrame({"start": t_charge_start, "end": t_charge_end})

good_portions = pd.DataFrame(columns=["start", "end"]) # I will store the portions when the device was not in charge here
good_portions["start"] = t_charge["end"].iloc[:-1].reset_index(drop=True)
good_portions["end"] = t_charge["start"].iloc[1:].reset_index(drop=True)
start_first_charge = t_charge["start"].iloc[0]
end_last_charge = t_charge["end"].iloc[-1]

# Segment the data into portions when the device was not in charge and perform nonwear detection
# Add the first portion
acc_df_portions = [acc_df[:start_first_charge]]
ppg_df_portions = [ppg_df[:start_first_charge]]
temp_df_portions = [temp_df[:start_first_charge]]

for i, row in good_portions.iterrows():

    if row["end"] - row["start"] < pd.Timedelta("10 min"): # if the portion is less than 10 minutes (**), skip it
        continue

    acc_df_portions.append(acc_df[row["start"]:row["end"]])
    ppg_df_portions.append(ppg_df[row["start"]:row["end"]])
    temp_df_portions.append(temp_df[row["start"]:row["end"]])

# Add the last portion
acc_df_portions.append(acc_df[end_last_charge:])
ppg_df_portions.append(ppg_df[end_last_charge:])
temp_df_portions.append(temp_df[end_last_charge:])

In [11]:
from nonwear.DETACH import nimbaldetach

In [12]:
# for each of them, perform NW detection
acc_df_cleaned = []
temp_df_cleaned = []
ppg_df_cleaned = []
for i, (acc, temp, ppg) in enumerate(zip(acc_df_portions, temp_df_portions, ppg_df_portions)):

    start_stop_nw, _ = nimbaldetach(acc['x'].values, acc['y'].values, acc['z'].values, temp["temp"].values, accel_freq=64, temperature_freq=1, quiet=True)

    # Remove non-wear periods
    for i, row in start_stop_nw.iterrows():
        datetime_start_nw = acc.index[row["Start Datapoint"]]
        datetime_end_nw = acc.index[row["End Datapoint"]]
        acc.loc[datetime_start_nw:datetime_end_nw] = np.nan
        temp.loc[datetime_start_nw:datetime_end_nw] = np.nan
        ppg.loc[datetime_start_nw:datetime_end_nw] = np.nan
    acc_portion = acc.dropna()
    temp_portion = temp.dropna()
    ppg_portion = ppg.dropna()

    acc_df_cleaned.append(acc_portion)
    temp_df_cleaned.append(temp_portion)
    ppg_df_cleaned.append(ppg_portion)

acc_df_cleaned = pd.concat(acc_df_cleaned)
temp_df_cleaned = pd.concat(temp_df_cleaned)
ppg_df_cleaned = pd.concat(ppg_df_cleaned)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  acc.loc[datetime_start_nw:datetime_end_nw] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp.loc[datetime_start_nw:datetime_end_nw] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ppg.loc[datetime_start_nw:datetime_end_nw] = np.nan


In [9]:
temp_df.shape, acc_df.shape

((553410, 1), (35417280, 3))

In [13]:
save_data_path = "/Users/augenpro/Documents/Empatica/data_sara/data/GGIR_input"
acc_df_parquet = pd.read_parquet(save_data_path + "/acc.parquet") * 1000
input_GGIR_path = "/Users/augenpro/Documents/Empatica/data_sara/data/GGIR_input_new"

acc_df_GGIR = acc_df_parquet.copy()
t_gaps_start = acc_df_GGIR.index[acc_df_GGIR.index.to_series().diff().dt.total_seconds() > 60*1] # if the difference between two consecutive timestamps is more than 1 minute, it means there is a gap
t_gaps_end = acc_df_GGIR.index[np.where(acc_df_GGIR.index.to_series().diff().dt.total_seconds() > 60*1)[0]-1] # the start of the gap is the timestamp before the end of the gap
t_gaps = pd.DataFrame({"start": t_gaps_end, "end": t_gaps_start})
for i, row in t_gaps.iterrows():
    acc_df_GGIR = acc_df_GGIR.loc[acc_df_GGIR.index < row["start"]]
    acc_df_GGIR = pd.concat([acc_df_GGIR, pd.DataFrame(index=pd.date_range(start=row["start"], end=row["end"], freq=f"{1/64} s"))])
    acc_df_GGIR = pd.concat([acc_df_GGIR, acc_df_GGIR.loc[acc_df_GGIR.index > row["end"]]])
acc_df_GGIR.to_csv(input_GGIR_path + "/acc_GGIR.csv")

In [None]:
save_data_path = "/Users/augenpro/Documents/Empatica/data_sara/data/GGIR_input"
input_GGIR_path = "/Users/augenpro/Documents/Empatica/data_sara/data/GGIR_input_new"

# Save to csv for GGIR - I need to fill gaps with NaNs
acc_df_GGIR = acc_df.copy() * 1000 # convert to mg
t_gaps_start = acc_df_GGIR.index[acc_df_GGIR.index.to_series().diff().dt.total_seconds() > 60*1] # if the difference between two consecutive timestamps is more than 1 minute, it means there is a gap
t_gaps_end = acc_df_GGIR.index[np.where(acc_df_GGIR.index.to_series().diff().dt.total_seconds() > 60*1)[0]-1] # the start of the gap is the timestamp before the end of the gap
t_gaps = pd.DataFrame({"start": t_gaps_end, "end": t_gaps_start})
for i, row in t_gaps.iterrows():
    acc_df_GGIR = acc_df_GGIR.loc[acc_df_GGIR.index < row["start"]]
    acc_df_GGIR = pd.concat([acc_df_GGIR, pd.DataFrame(index=pd.date_range(start=row["start"], end=row["end"], freq=f"{1/64} s"))])
    acc_df_GGIR = pd.concat([acc_df_GGIR, acc_df_GGIR.loc[acc_df_GGIR.index > row["end"]]])
acc_df_GGIR.to_csv(input_GGIR_path + "/acc_GGIR.csv")

# Save to parquet for further analysis
# acc_df_cleaned.to_parquet(save_data_path + "/acc.parquet")
# temp_df_cleaned.to_parquet(save_data_path + "/temp.parquet")
# ppg_df_cleaned.to_parquet(save_data_path + "/ppg.parquet")

In [4]:
input_GGIR_path = "/Users/augenpro/Documents/Empatica/data_sara/data/GGIR_input_new"
acc_df_GGIR = pd.read_csv(input_GGIR_path + "/acc_GGIR.csv")
acc_df_GGIR

Unnamed: 0.1,Unnamed: 0,x,y,z
0,2024-05-20 13:02:55.531980990,-68.359375,408.691406,931.152344
1,2024-05-20 13:02:55.547605991,-68.359375,403.320312,930.175781
2,2024-05-20 13:02:55.563230991,-71.289062,397.949219,919.433594
3,2024-05-20 13:02:55.578855991,-69.824219,400.878906,925.781250
4,2024-05-20 13:02:55.594480991,-64.453125,409.667969,923.339844
...,...,...,...,...
38267333,2024-05-27 11:08:55.250720977,0.006348,-0.012207,1.010254
38267334,2024-05-27 11:08:55.266345977,0.009277,-0.011719,1.010254
38267335,2024-05-27 11:08:55.281970978,0.016602,-0.010254,1.009277
38267336,2024-05-27 11:08:55.297595978,0.011719,-0.010254,1.010254


In [14]:
plt.figure()
plt.plot(acc_df_GGIR[["x", "y", "z"]])

[<matplotlib.lines.Line2D at 0x14c83e510>,
 <matplotlib.lines.Line2D at 0x17c8a5f40>,
 <matplotlib.lines.Line2D at 0x17c8a6120>]

In [6]:
save_data_path = "/Users/augenpro/Documents/Empatica/data_sara/data/GGIR_input_new"
acc_df = pd.read_csv(save_data_path + "/acc_new.csv", index_col=0)

In [None]:
from visualization.check_accelerometer_temp import plot_acc_temp

# Decide whether to plot in notebook or in a separate window
from bokeh.plotting import output_notebook
output_notebook()

processed_data_path = save_data_path
plot_acc_temp(processed_data_path, accSMV = False)

### Load GGIR output - work in progress

In [1]:
output_GGIR_path = "/Users/augenpro/Documents/Empatica/data_sara/data/GGIR_output_new/output_GGIR_input_new/results/QC/"

output_GGIR = pd.read_csv(output_GGIR_path + "part4_nightsummary_sleep_full.csv")

SPT = []

for i, day_row in output_GGIR.iterrows():
    # Stupid thing to get the correct datetime for segmenting signals into day and night (but no alternatives I guess)
    if output_GGIR["sleeponset_ts"].iloc[0][0] == '0':
        sleep_onset = pd.to_datetime(str(pd.to_datetime(day_row["calendar_date"]).date() + pd.Timedelta("1d")) + " " + day_row["sleeponset_ts"])
    else:
        sleep_onset = pd.to_datetime(pd.to_datetime(day_row["calendar_date"]).date() + " " + day_row["sleeponset_ts"])

    wake_onset = pd.to_datetime(str(pd.to_datetime(day_row["calendar_date"]).date() + pd.Timedelta("1d")) + " " + day_row["wakeup_ts"])

    SPT.append((sleep_onset, wake_onset))

start_end_sleep = np.array(SPT).reshape(-1, 2)
SPT_GGIR = pd.DataFrame(start_end_sleep, columns=["start", "end"])
SPT_GGIR

NameError: name 'pd' is not defined