# AGE-it - Exploratory Data Analysis

- Summary of the recordings - how many hours/days for each sensor?

### Import libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import os
import glob

%matplotlib inline
#%matplotlib qt
mpl.rcParams['lines.linewidth'] = 0.91
plt.style.use('seaborn-v0_8-whitegrid')
%matplotlib qt

from avro.datafile import DataFileReader
from avro.io import DatumReader

from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource, HoverTool, Slider, Select
from bokeh.layouts import gridplot
from bokeh.models import Range1d
from bokeh.io import export_png
from bokeh.models import DatetimeTickFormatter

## GENEActiv
Files are already converted into .bin with the bin2parquet notebook

In [None]:
data_path = "/Users/augenpro/Documents/Age-IT/data/Gold/" # path to the folder containing the subjects
participants = sorted([p for p in os.listdir(data_path) if not p.startswith(".")]) # list of the participants
visit = "T0 (baseline)" # T0 (baseline), T1 (follow-up @ 6 months), T2 (follow-up @ 12 months)

# participants = participants[5:6] # select the first participant

sensors = ["GeneActivPolso", "GeneActivCaviglia"]#, "RootiRx"]

geneactiv = {}

for participant in participants:
    print(participant)
    for sensor in sensors:
        # print(sensor)
        # Load the data
        path = os.path.join(data_path, participant, visit, sensor)
        files = os.listdir(os.path.join(data_path, participant, visit, sensor))
        # if there are already parquet files, skip
        for f in files:
            if f.endswith(".parquet"):
                # print(f)
                acc_df = pd.read_parquet(os.path.join(path, f))
                # geneactiv[sensor] = acc_df
                if len(acc_df) == 0:
                    print(f"{sensor}: is empty")
                else:
                    print(f"{sensor}:  {acc_df.index[-1] - acc_df.index[0]}".split(".")[0])


            break
        break
    break

    print("")
# # Plot the data
# p = figure(plot_width=800, plot_height=400, x_axis_type="datetime", title=f"{participant} - {sensor} - {visit}")
# p.line(data["timestamp"], data["x"], line_width=2, legend_label="x", color="blue")
# p.line(data["timestamp"], data["y"], line_width=2, legend_label="y", color="green")
# p.line(data["timestamp"], data["z"], line_width=2, legend_label="z", color="red")
# p.xaxis.formatter = DatetimeTickFormatter(days="%d/%m %H:%M", hours="%H:%M")
# p.legend.location = "top_left"
# p.legend.click_policy="hide"
# show(p)
# export_png(p, filename=f"{participant}_{sensor}_{visit}.png")
# print(f"{participant}_{sensor}_{visit}.png")
# print("")

08623
GeneActivPolso:  7 days 01:20:20
GeneActivCaviglia:  7 days 01:19:44

08667

14219
GeneActivPolso:  6 days 22:27:29
GeneActivCaviglia:  6 days 22:27:26

20603

23483
GeneActivPolso:  6 days 23:38:14
GeneActivCaviglia:  6 days 23:38:20

36644
GeneActivPolso:  6 days 22:46:32
GeneActivCaviglia:  6 days 22:47:14

36765

36920
GeneActivPolso:  5 days 23:55:05
GeneActivCaviglia:  5 days 23:54:05

58319
GeneActivPolso:  0 days 18:23:38
GeneActivCaviglia:  7 days 01:19:05

59794


: 

In [None]:
# Recording duration
for sensor, data in geneactiv.items():
    print(f"{sensor}: {data.index[-1] - data.index[0]}".split(".")[0])

GeneActivPolso: 6 days 22:46:32
GeneActivCaviglia: 6 days 22:47:14


In [None]:
# geneactiv["GeneActivPolso"]["acc_SMV"] = np.sqrt(geneactiv["GeneActivPolso"]["x"]**2 + geneactiv["GeneActivPolso"]["y"]**2 + geneactiv["GeneActivPolso"]["z"]**2)
# geneactiv["GeneActivCaviglia"]["acc_SMV"] = np.sqrt(geneactiv["GeneActivCaviglia"]["x"]**2 + geneactiv["GeneActivCaviglia"]["y"]**2 + geneactiv["GeneActivCaviglia"]["z"]**2)

plt.figure(figsize = (19,11))
plt.subplot(2, 1, 1)
plt.plot(geneactiv["GeneActivPolso"]["acc_SMV"].iloc[:1000000], label = "GeneActivPolso")
plt.plot(geneactiv["GeneActivCaviglia"]["acc_SMV"].iloc[:1000000], label = "GeneActivCaviglia")
plt.title("Recording Start", fontsize = 21)
plt.legend()
plt.subplot(2, 1, 2)
plt.plot(geneactiv["GeneActivPolso"]["acc_SMV"].iloc[-1000000:], label = "GeneActivPolso")
plt.plot(geneactiv["GeneActivCaviglia"]["acc_SMV"].iloc[-1000000:], label = "GeneActivCaviglia")
plt.title("Recording End", fontsize = 21)
plt.legend()

<matplotlib.legend.Legend at 0x163814170>

## Verity Sense

In [None]:
from data_io.veritysense.convert_polar import process_polar

In [None]:
import fastavro

def read_avro_veritysense(file_path, offset_vs = 946684800000000000+ 3600 * 1e9):
    """
    Reads an Avro file and returns the data as a pandas DataFrame.
    """
    with open(file_path, "rb") as avro_file:
        # Use fastavro.reader to read the Avro file
        reader = fastavro.reader(avro_file)
        # Convert the records to a list
        records = [record for record in reader]
       # to dataframe
        df = pd.DataFrame(records)
        df.index = pd.to_datetime(df["timestamp"] + offset_vs, unit="ns") 
        df.drop("timestamp", axis=1, inplace=True)
    return df / 1000

In [None]:
import re
from datetime import datetime

# Function to extract full datetime (including time) for sorting
def extract_datetime(filename):
    match = re.search(r'(\w{3}) (\w{3}) (\d{2}) (\d{2})-(\d{2})-(\d{2}) .* (\d{4})', filename)
    if match:
        weekday, month, day, hour, minute, second, year = match.groups()
        date_str = f"{day} {month} {year} {hour}:{minute}:{second}"
        date_obj = datetime.strptime(date_str, "%d %b %Y %H:%M:%S")  # Convert to datetime object
        return date_obj
    return datetime.max  # Default to a max value if parsing fails

In [None]:
data_path = "/Users/augenpro/Documents/Age-IT/data/Gold/" # path to the folder containing the subjects
participants = sorted([p for p in os.listdir(data_path) if not p.startswith(".")]) # list of the participants
visit = "T0 (baseline)" # T0 (baseline), T1 (follow-up @ 6 months), T2 (follow-up @ 12 months)

# participants = participants[0:1] # select the first participant

participants = ["23483"]

sensor = "VeritySense"#, "GeneActivPolso", "GeneActivCaviglia", "RootiRx"]

for participant in participants:
    acc_df = pd.DataFrame()
    ppg_df = pd.DataFrame()
    # print(sensor)
    path = os.path.join(data_path, participant, visit, sensor)
    files_in_path = [f for f in os.listdir(path) if not f.startswith(".")]
    if len(files_in_path) <= 2: # "_" and "AVRO"
        continue

    print(f"**************** {participant} ****************")
    acc_path = os.path.join(path, "AVRO/acc")
    ppg_path = os.path.join(path, "AVRO/ppg")
    for f in sorted(os.listdir(acc_path), key=extract_datetime):
        current_acc = read_avro_veritysense(os.path.join(acc_path, f))
        acc_df = pd.concat([acc_df, current_acc])

    for f in sorted(os.listdir(ppg_path), key=extract_datetime):
        current_ppg = read_avro_veritysense(os.path.join(ppg_path, f))
        ppg_df = pd.concat([ppg_df, current_ppg])

    t_acc_rec = acc_df.index[acc_df.index.to_series().diff().dt.total_seconds() > 0.5]
    t_acc_disc = acc_df.index[np.where(acc_df.index.to_series().diff().dt.total_seconds() > 0.5)[0]-1]
    t_disc_rec_acc = pd.DataFrame({"disc": t_acc_disc, "rec": t_acc_rec})
    total_duration_acc = acc_df.index[-1] - acc_df.index[0]
    disconnetions_duration_acc = t_disc_rec_acc["rec"] - t_disc_rec_acc["disc"]
    effective_duration_acc = total_duration_acc - disconnetions_duration_acc.sum()
    if len(t_acc_disc) > 0:
        print("\n ***** Disconnections in ACC *****")
        # for i in range(len(t_acc_disc)):
        #     print(f"Disconnection {i+1}:")
        #     print(f"Start: {t_acc_disc[i].strftime('%Y-%m-%d %H:%M:%S')}")
        #     print(f"End: {t_acc_rec[i].strftime('%Y-%m-%d %H:%M:%S')}")
        #     print(f"Duration: {str(t_acc_rec[i] - t_acc_disc[i]).split(' ')[2][3:5]} min and {str(t_acc_rec[i] - t_acc_disc[i]).split(' ')[2][6:8]} s")
        print("Number of disconnections: " + str(len(t_acc_disc)))
        print("Total duration of disconnections: " + str(np.sum([t_acc_rec[i] - t_acc_disc[i] for i in range(len(t_acc_disc))])).split(' ')[2][:8] + " (hours, minutes and seconds)")
        print(f"ACC effective duration: {effective_duration_acc}".split(".")[0])
    # Fill disconnection portions with NaNs and linearly interpolate
    for start, end in zip(t_acc_disc, t_acc_rec):
        acc_df.loc[start:end] = np.nan
    acc_df = acc_df.interpolate(method='time')

    t_ppg_rec = ppg_df.index[ppg_df.index.to_series().diff().dt.total_seconds() > 1]
    t_ppg_disc = ppg_df.index[np.where(ppg_df.index.to_series().diff().dt.total_seconds() > 1)[0]-1]
    t_disc_rec_ppg = pd.DataFrame({"disc": t_ppg_disc, "rec": t_ppg_rec})
    total_duration_ppg = ppg_df.index[-1] - ppg_df.index[0]
    disconnetions_duration_ppg = t_disc_rec_ppg["rec"] - t_disc_rec_ppg["disc"]
    effective_duration_ppg = total_duration_ppg - disconnetions_duration_ppg.sum()
    if len(t_ppg_disc) > 0:
        print("\n ***** Disconnections in PPG *****")
        # for i in range(len(t_ppg_disc)):
        #     print(f"Disconnection {i+1}:")
        #     print(f"Start: {t_ppg_disc[i].strftime('%Y-%m-%d %H:%M:%S')}")
        #     print(f"End: {t_ppg_rec[i].strftime('%Y-%m-%d %H:%M:%S')}")
        #     print(f"Duration: {str(t_ppg_rec[i] - t_ppg_disc[i]).split(' ')[2][3:5]} min and {str(t_ppg_rec[i] - t_ppg_disc[i]).split(' ')[2][6:8]} s")
        print("Number of disconnections: " + str(len(t_ppg_disc)))
        print("Total duration of disconnections: " + str(np.sum([t_ppg_rec[i] - t_ppg_disc[i] for i in range(len(t_ppg_disc))])).split(' ')[2][:8] + " (hours, minutes and seconds)")
        print(f"PPG effective duration: {effective_duration_ppg}".split(".")[0])
        print("")
    # Fill disconnection portions with NaNs and linearly interpolate
    for start, end in zip(t_ppg_disc, t_ppg_rec):
        ppg_df.loc[start:end] = np.nan
    ppg_df = ppg_df.interpolate(method='time')

**************** 23483 ****************

 ***** Disconnections in ACC *****
Number of disconnections: 134
Total duration of disconnections: 06:41:26 (hours, minutes and seconds)
ACC effective duration: 4 days 16:50:17

 ***** Disconnections in PPG *****
Number of disconnections: 137
Total duration of disconnections: 06:11:44 (hours, minutes and seconds)
PPG effective duration: 4 days 17:19:59



##### Soggetto 97060 ------- weird timestamps starting at 1970

In [None]:
total_dur = acc_df.index[-1] - acc_df.index[0]
disconnetions_dur = t_disc_rec_acc["rec"] - t_disc_rec_acc["disc"]
effective_dur = total_dur - disconnetions_dur.sum()
total_dur, disconnetions_dur.sum(), effective_dur

(Timedelta('6 days 23:31:43.876258560'),
 Timedelta('2 days 06:41:26.693904128'),
 Timedelta('4 days 16:50:17.182354432'))

In [None]:
for start, end in zip(t_acc_disc, t_acc_rec):
    print(start,end)
    break

ppg_df.loc[start:end]

2025-01-21 16:17:29.975070208 2025-01-21 16:20:03.052938240


Unnamed: 0_level_0,ppg1,ppg2,ppg3,ambient
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


In [None]:
ppg_df.loc[pd.Timestamp("2025-01-21 16:20:03.052938240"):pd.Timestamp("2025-01-22 16:20:03.052938240")].to_csv("ppg_df.csv")

In [None]:
ppg = pd.read_pickle("ppg_df.pkl")
ppg

Unnamed: 0_level_0,ppg1,ppg2,ppg3,ambient
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2025-01-21 16:20:03.344035328,115.751091,19.832478,75.259832,-291.321995
2025-01-21 16:20:03.362216960,115.752000,19.834000,75.260000,-291.322000
2025-01-21 16:20:03.380399104,115.551000,19.657000,75.209000,-291.231000
2025-01-21 16:20:03.398580736,115.589000,19.481000,75.181000,-291.230000
2025-01-21 16:20:03.416762624,115.991000,19.511000,75.189000,-291.263000
...,...,...,...,...
2025-01-22 16:20:02.974105088,63.964000,22.219000,25.433000,-291.274000
2025-01-22 16:20:02.992358400,65.248000,23.506000,26.564000,-291.263000
2025-01-22 16:20:03.010611712,66.445000,24.733000,27.541000,-291.279000
2025-01-22 16:20:03.028865024,67.564000,25.856000,28.451000,-291.254000


In [None]:
import neurokit2 as nk
ppg_df["ppg_filt"] = -nk.signal_filter(ppg_df["ppg1"], lowcut=0.5, highcut=8, sampling_rate=55)

In [None]:
plt.figure(figsize = (19,11))
plt.subplot(2, 1, 1)
plt.plot(acc_df)
plt.title("Acceleration")
# plt.legend(acc_df.columns)
plt.subplot(2, 1, 2, sharex = plt.subplot(2, 1, 1))
plt.plot(ppg_df["ppg1"])
plt.title("PPG")
# plt.legend(["ppg_filt"])

Text(0.5, 1.0, 'PPG')

In [None]:
# Calculate the actual recording duration by summing up the differences between consecutive timestamps
time_diffs = acc_df.index.to_series().diff().dropna()
overall_duration = time_diffs.sum()
print(f"Overall duration of the recording (excluding gaps): {overall_duration}")

Overall duration of the recording (excluding gaps): 0 days 05:29:49.749643520


In [None]:
ppg_df.index[-1] - ppg_df.index[0]

Timedelta('0 days 05:29:53.340454144')

## Rooti

In [None]:
data_path = "/Users/augenpro/Documents/Age-IT/data/Gold/" # path to the folder containing the subjects
participants = sorted([p for p in os.listdir(data_path) if not p.startswith(".")]) # list of the participants
visit = "T0 (baseline)" # T0 (baseline), T1 (follow-up @ 6 months), T2 (follow-up @ 12 months)
participants = ["14219"]
# participants = participants[5:6] # select the first participant

sensors = ["RootiRx"]#, "RootiRx"]

sensor = sensors[0]

rootirx = {}

for participant in participants:
    path = os.path.join(data_path, participant, visit, sensor)
    files = os.listdir(os.path.join(data_path, participant, visit, sensor))

    files = [f for f in files if f == participant]

    if len(files) == 0:
        # print("No RootiRx data")
        continue
    
    print("***** " + participant + " *****")
    rooti_path = os.path.join(data_path, participant, visit, sensor, files[0], "measure")

    raw_acc_path = [x[0] for x in os.walk(rooti_path) if x[0].endswith("/GSENSOR")][0]

    # print(raw_acc_path)

    acc_files = sorted(os.listdir(raw_acc_path))

    # print(len(acc_files))

    acc = pd.DataFrame()

    acc_first = pd.read_csv(raw_acc_path + '/' + acc_files[0], compression='zip', index_col = 0, header = None, names = ["packet", "acc_x", "acc_y", "acc_z", "idk", "idk2"])
    acc_first_start = pd.to_datetime(acc_first.index[0], unit = "s")

    acc_last = pd.read_csv(raw_acc_path + '/' + acc_files[-1], compression='zip', index_col = 0, header = None, names = ["packet", "acc_x", "acc_y", "acc_z", "idk", "idk2"])
    acc_last_end = pd.to_datetime(acc_last.index[-1], unit = "s")

    print(f"Total duration: {acc_last_end - acc_first_start}")

    for i, acc_file in enumerate(acc_files):
        acc1 = pd.read_csv(raw_acc_path + '/' + acc_file, compression='zip', index_col = 0, header = None, names = ["packet", "acc_x", "acc_y", "acc_z", "idk", "idk2"])
        acc1 = acc1[["acc_x", "acc_y", "acc_z"]]
        acc1.index = pd.date_range(start = pd.to_datetime(acc1.index[0], unit = "s"), periods = len(acc1), freq = "0.032s")
        acc = pd.concat([acc, acc1])

    # print(path)

    print("")
    break

    # print(files)

***** 14219 *****
Total duration: 6 days 23:49:20



In [20]:
plt.figure(figsize = (19,11))
plt.plot(np.diff(acc.index).astype('timedelta64[s]'))

[<matplotlib.lines.Line2D at 0x1585d9730>]

In [None]:
acc1.head()

Unnamed: 0,packet,acc_x,acc_y,acc_z,idk,idk2
2025-01-17 10:34:21.000000,0,176,-191,1288,18,166
2025-01-17 10:34:21.003200,1,150,-183,1324,18,166
2025-01-17 10:34:21.006400,2,150,119,1187,18,166
2025-01-17 10:34:21.009600,3,115,-164,1379,18,166
2025-01-17 10:34:21.012800,4,60,-287,1468,18,166


# Synch between GENEActiv and Rooti

In [2]:
data_path = "/Users/augenpro/Documents/Age-IT/data/Gold" # path to the folder containing the subjects
participants = sorted([p for p in os.listdir(data_path) if not p.startswith(".")]) # list of the participants
visit = "T0 (baseline)" # T0 (baseline), T1 (follow-up @ 6 months), T2 (follow-up @ 12 months)

# participants = participants[5:6] # select the first participant

sensors = ["GeneActivPolso", "GeneActivCaviglia"]#, "RootiRx"]

geneactiv = {}

participants = ["14219"]

for participant in participants:
    print(participant)
    for sensor in sensors:
        path = os.path.join(data_path, participant, visit, sensor)
        files = os.listdir(os.path.join(data_path, participant, visit, sensor))
        # if there are already parquet files, skip
        for f in files:
            if f.endswith(".parquet"):
                # print(f)
                acc_gen = pd.read_parquet(os.path.join(path, f))
                # geneactiv[sensor] = acc_gen
                if len(acc_gen) == 0:
                    print(f"{sensor}: is empty")
                else:
                    print(f"{sensor}:  {acc_gen.index[-1] - acc_gen.index[0]}".split(".")[0])
                if sensor == "GeneActivPolso":
                    geneactiv["wrist"] = acc_gen
                elif sensor == "GeneActivCaviglia":
                    geneactiv["ankle"] = acc_gen
                del acc_gen
    print("")

# # Rooti
# sensor = "RootiRx"
# for participant in participants:
#     path = os.path.join(data_path, participant, visit, sensor)
#     files = os.listdir(os.path.join(data_path, participant, visit, sensor))

#     files = [f for f in files if f == participant]

#     if len(files) == 0:
#         # print("No RootiRx data")
#         continue
    
#     print("***** " + participant + " *****")
#     rooti_path = os.path.join(data_path, participant, visit, sensor, files[0], "measure")

#     raw_acc_path = [x[0] for x in os.walk(rooti_path) if x[0].endswith("/GSENSOR")][0]
#     acc_files = sorted(os.listdir(raw_acc_path))
#     acc_rooti = pd.DataFrame()
#     t_acc_rooti = pd.Series()
#     first_acc_file = pd.read_csv(raw_acc_path + '/' + acc_files[0], compression='zip', index_col = 0, header = None)
#     t_start_rooti = pd.to_datetime(first_acc_file.index[0], unit = "s")
#     for i, acc_file in enumerate(acc_files):
#         acc1 = pd.read_csv(raw_acc_path + '/' + acc_files[i], compression='zip', index_col = 0, header = None) / 1000 # convert to g
#         t_acc_start = pd.to_datetime(acc1.index[0], unit = "s")
#         t_acc_end = pd.to_datetime(acc1.index[-1]+1, unit = "s")
#         acc1 = acc1.iloc[:, [1, 2, 3]].reset_index(drop = True)
#         acc1.columns = ["acc_x", "acc_y", "acc_z"]
#         t_acc1 = pd.date_range(start = t_acc_start, end = t_acc_end, periods=len(acc1)+1)[:-1]
#         acc_rooti = pd.concat([acc_rooti, acc1])
#         t_acc_rooti = pd.concat([t_acc_rooti, t_acc1.to_series()])

#     ecg_filtered_path = [x[0] for x in os.walk(rooti_path) if x[0].endswith("/FilteredECG")][0] + "/250"
#     ecg_filtered_files = sorted(os.listdir(ecg_filtered_path))
#     ecg_dataframes = []

#     # Iterate through ecg filtered files
#     for i, ecg_file in enumerate(ecg_filtered_files):
#         ecg_filtered_df = pd.read_csv(
#             ecg_filtered_path + '/' + ecg_file, 
#             compression='zip', 
#             header=None, 
#             names=["ecg", "Rpeaks"]
#         )
#         # t_start = pd.to_datetime(ecg_filtered_files[0].split(".")[0], unit = "s")
#         # ecg_filtered_df.index = pd.date_range(start = t_start, periods = len(ecg_filtered_df), freq = "0.004s")
#         ecg_dataframes.append(ecg_filtered_df)
#     t_start_ecg = pd.to_datetime(int(ecg_filtered_files[0].split(".")[0]), unit = "s")
#     ecg_filtered = pd.concat(ecg_dataframes, axis=0)
#     ecg_filtered.index = pd.date_range(start = t_start_ecg, periods = len(ecg_filtered), freq = "0.004s")

#     # print(path)

#     print("")

14219
GeneActivPolso:  6 days 22:27:29
GeneActivCaviglia:  6 days 22:27:26



In [3]:
from utils.compute_acc_SMV import compute_acc_SMV
geneactiv["wrist"]["acc_SMV"] = compute_acc_SMV(geneactiv["wrist"])

In [5]:
plt.figure(figsize=(19,11))
plt.plot(geneactiv["wrist"]["acc_SMV"].resample("1min").mean(), label = "Wrist")

[<matplotlib.lines.Line2D at 0x16a2900b0>]

In [None]:
ecg_filtered.to_parquet(ecg_filtered_path + "/ecg_filtered.parquet")

In [None]:
# plt.figure()
# plt.plot(t_acc_rooti.diff().dropna().dt.total_seconds())

plt.figure()
plt.plot(ecg_filtered.index.diff().dropna().total_seconds())

In [4]:
ecg_filtered.head()

Unnamed: 0,ecg,Rpeaks
2025-01-17 10:34:21.000,16.445,
2025-01-17 10:34:21.004,5.801,
2025-01-17 10:34:21.008,-1.034,
2025-01-17 10:34:21.012,-2.867,
2025-01-17 10:34:21.016,-1.026,


In [5]:
ecg_filtered["ecg"].index[0], t_acc_rooti.iloc[0]

(Timestamp('2025-01-24 10:14:21'), Timestamp('2025-01-17 10:34:21'))

In [10]:
plt.figure(figsize = (19,11))
plt.subplot(2, 1, 1)
plt.plot(ecg_filtered["ecg"].loc[pd.Timestamp("2025-01-17 11:34:21"):pd.Timestamp("2025-01-18 11:34:21") + pd.Timedelta(hours = 1)])
plt.subplot(2, 1, 2, sharex = plt.subplot(2, 1, 1))
plt.plot(acc_rooti.loc[pd.Timestamp("2025-01-17 11:34:21"):pd.Timestamp("2025-01-18 11:34:21") + pd.Timedelta(hours = 1)])

[<matplotlib.lines.Line2D at 0x138a5ba70>,
 <matplotlib.lines.Line2D at 0x12df75bb0>,
 <matplotlib.lines.Line2D at 0x14cb63290>]

In [8]:
acc_rooti.index = t_acc_rooti.index #+ pd.Timedelta("1h")

In [38]:
from utils.compute_acc_SMV import compute_acc_SMV

acc_rooti["acc_SMV"] = compute_acc_SMV(acc_rooti)
acc_gen["acc_SMV"] = compute_acc_SMV(acc_gen)

In [39]:
import seaborn as sns
sns.set_context("talk")

In [48]:
plt.figure(figsize = (19,11))
plt.plot(acc_gen["acc_SMV"].iloc[:500000], label = "GENEActiv", linewidth = 0.91)
plt.plot(acc_rooti["acc_SMV"].iloc[:500000], label = "RootiRx", linewidth = 0.91)
# plt.legend()

[<matplotlib.lines.Line2D at 0x13e3a8b00>]

## Script that check for cross-correlations between different sensors, and adjust the start time accordingly

Assumption: the rotations are done within the first hour of recording

1. Resample the first hour of the signals to the same number of samples (for geneactiv 100 Hz)
2. Segment signal in 20 second windows with an overlap of 10s (the rotations last 10 seconds)
3. Compute the CCF for each of the windows 
4. Find the maximum and plot?

In [22]:
# 1. Resample the overall Rooti signal to the same number of samples (for geneactiv 100 Hz)

from utils.resample_signal import apply_resample

fs_res = 100

t_acc_rooti_resampled, acc_rooti_resampled = apply_resample(time = acc_rooti.index.astype(np.int64).to_numpy(), time_rs = acc_gen.index.astype(np.int64).to_numpy(), 
                                     data = acc_rooti["acc_SMV"].values)

acc_rooti_resampled = pd.Series(acc_rooti_resampled[0], index = pd.to_datetime(t_acc_rooti_resampled))

In [41]:
# 1. Resample the first hour of the signals to the same number of samples (for geneactiv 100 Hz)

from utils.resample_signal import apply_resample

fs_res = 100

acc_rooti_first_hour = acc_rooti.loc[:t_start_rooti + pd.Timedelta("2h")]["acc_SMV"]
acc_gen_first_hour = acc_gen.loc[:acc_gen.index[0] + pd.Timedelta("1h")]["acc_SMV"]

t_acc_rooti_resampled, acc_rooti_resampled = apply_resample(time = acc_rooti_first_hour.index.astype(np.int64).to_numpy(), time_rs = acc_gen_first_hour.index.astype(np.int64).to_numpy(), 
                                     data = acc_rooti_first_hour.values)

acc_rooti_resampled = pd.Series(acc_rooti_resampled[0], index = pd.to_datetime(t_acc_rooti_resampled))

In [None]:
# 2. Segment the signals into 20s windows with 50% overlap

from utils.segment_signal import segment_signal

acc_gen_segments = segment_signal(acc_gen_first_hour, window_size = 20*fs_res, overlap = 0.5)
# acc_rooti_segments = segment_signal(acc_rooti_resampled, window_size = 20*fs_res, overlap = 0.5)

In [43]:
# 3. Compute the crosscorrelation between the two signals for each window
from utils.crosscorr import crosscorr

offsets = []
max_corr = []
lags = np.arange(-400, 401, 1) # 400 samples = 4s, resolution of 1 samples = 0.01s
for i, (gen, rooti) in enumerate(zip(acc_gen_segments, acc_rooti_segments)):
    if len(gen) != len(rooti):
        continue
    ccf = [crosscorr(gen, rooti, lag) for lag in lags]
    max_corr.append(np.max(ccf))
    offset = np.argmax(ccf) - lags[-1]
    offset_s = offset / fs_res
    offsets.append(offset_s)

In [44]:
plt.figure(figsize = (11,5))
plt.subplot(2, 1, 1)
plt.plot(max_corr, '-*', label = "Max correlation")
plt.xlabel("Window")
plt.ylabel("Correlation")
plt.legend()
plt.subplot(2, 1, 2, sharex = plt.subplot(2, 1, 1))
plt.plot(offsets, '-*', label = "Offset (s)")
plt.xlabel("Window")
plt.ylabel("Offset (s)")
plt.legend()

<matplotlib.legend.Legend at 0x13daacaa0>

In [46]:
# Find the maximum correlation and the corresponding offset
max_corr = np.array(max_corr)
offsets = np.array(offsets)
max_corr_idx = np.argmax(max_corr)
max_corr_val = max_corr[max_corr_idx]

# Apply the offset to the RootiRx signal
offset = offsets[max_corr_idx]

acc_rooti_synched = acc_rooti.copy()
acc_rooti_synched.index = acc_rooti_synched.index - pd.Timedelta(f"{offset}s")
# Plot the signals
plt.figure(figsize = (19,11))
plt.plot(acc_gen["acc_SMV"].iloc[:500000], label = "GENEActiv ACC SMV")
plt.plot(acc_rooti_synched["acc_SMV"].iloc[:500000], label = "RootiRx ACC SMV")
plt.ylabel("Acceleration (g)")
plt.legend(loc = "upper left")

<matplotlib.legend.Legend at 0x13dc872f0>

## Do it for all GENEActiv

In [12]:
geneactiv.keys()

dict_keys(['wrist', 'ankle'])

In [14]:
from utils.compute_acc_SMV import compute_acc_SMV

# Compute SMV for the GENEActiv signals
geneactiv["wrist"]["acc_SMV"] = compute_acc_SMV(geneactiv["wrist"])
geneactiv["ankle"]["acc_SMV"] = compute_acc_SMV(geneactiv["ankle"])

In [32]:
acc_gen_wirst_first_hour = geneactiv["wrist"].loc[:geneactiv["wrist"].index[0] + pd.Timedelta("5min")]["acc_SMV"]
acc_gen_ankle_first_hour = geneactiv["ankle"].loc[:geneactiv["ankle"].index[0] + pd.Timedelta("5min")]["acc_SMV"]

# 1. Resample the first hour of the signals to the same number of samples (for geneactiv 100 Hz)
fs_res = 100

from utils.resample_signal import apply_resample

t_acc_gen_ankle_resampled, acc_gen_ankle_resampled = apply_resample(time = acc_gen_ankle_first_hour.index.astype(np.int64).to_numpy(), time_rs = acc_gen_wirst_first_hour.index.astype(np.int64).to_numpy(), 
                                     data = acc_gen_ankle_first_hour.values)

acc_gen_ankle_resampled = pd.Series(acc_gen_ankle_resampled[0], index = pd.to_datetime(t_acc_gen_ankle_resampled))

# 2. Segment the signals into 20s windows with 50% overlap

from utils.segment_signal import segment_signal

acc_gen_wrist_segments = segment_signal(acc_gen_wirst_first_hour, window_size = 20*fs_res, overlap = 0.5)
acc_gen_ankle_segments = segment_signal(acc_gen_ankle_resampled, window_size = 20*fs_res, overlap = 0.5)

In [33]:
acc_gen_wirst_first_hour

2025-01-17 11:38:44.000000000    0.987872
2025-01-17 11:38:44.009999990    1.025177
2025-01-17 11:38:44.019999981    1.045466
2025-01-17 11:38:44.029999971    1.076971
2025-01-17 11:38:44.039999962    1.043428
                                   ...   
2025-01-17 11:43:43.960000038    1.005571
2025-01-17 11:43:43.970000029    1.005720
2025-01-17 11:43:43.980000019    1.012752
2025-01-17 11:43:43.990000010    1.011449
2025-01-17 11:43:44.000000000    1.015252
Name: acc_SMV, Length: 30001, dtype: float64

In [35]:
# 3. Compute the crosscorrelation between the two signals for each window

from utils.crosscorr import crosscorr

offsets = []
max_corr = []
lags = np.arange(-400, 401, 1) # 400 samples = 4s, resolution of 1 samples = 0.01s
for i, (wrist, ankle) in enumerate(zip(acc_gen_wrist_segments, acc_gen_ankle_segments)):
    if len(wrist) != len(ankle):
        continue
    ccf = [crosscorr(wrist, ankle, lag) for lag in lags]
    max_corr.append(np.max(ccf))
    offset = np.argmax(ccf) - lags[-1]
    offset_s = offset / fs_res
    offsets.append(offset_s)

plt.figure(figsize = (11,5))
plt.subplot(2, 1, 1)
plt.plot(max_corr, '-*', label = "Max correlation")
plt.xlabel("Window")
plt.ylabel("Correlation")
plt.legend()
plt.subplot(2, 1, 2, sharex = plt.subplot(2, 1, 1))
plt.plot(offsets, '-*', label = "Offset (s)")
plt.xlabel("Window")
plt.ylabel("Offset (s)")
plt.legend()

<matplotlib.legend.Legend at 0x3c7dac740>

In [None]:
# Find the maximum correlation and the corresponding offset
max_corr = np.array(max_corr)
offsets = np.array(offsets)
max_corr_idx = np.argmax(max_corr)
max_corr_val = max_corr[max_corr_idx]
offset = offsets[max_corr_idx]

# Apply the offset to the GENEActiv ankle signal
acc_gen_ankle_synched = geneactiv["ankle"].copy()
acc_gen_ankle_synched.index = acc_gen_ankle_synched.index - pd.Timedelta(f"{offset}s")

# Plot the signals
plt.figure(figsize = (19,11))
plt.plot(geneactiv["wrist"]["acc_SMV"].iloc[:500000], label = "GENEActiv wrist ACC SMV")
plt.plot(acc_gen_ankle_synched["acc_SMV"].iloc[:500000], label = "GENEActiv ankle ACC SMV")
plt.ylabel("Acceleration (g)")
plt.legend(loc = "upper left")

<matplotlib.legend.Legend at 0x38d5156a0>

In [37]:
geneactiv["wrist"].head()

Unnamed: 0,x,y,z,acc_SMV
2025-01-17 11:38:44.000000000,-0.14322,-0.975911,0.054555,0.987872
2025-01-17 11:38:44.009999990,-0.131305,-1.015662,0.046677,1.025177
2025-01-17 11:38:44.019999981,-0.131305,-1.035537,0.058495,1.045466
2025-01-17 11:38:44.029999971,-0.127333,-1.063362,0.113641,1.076971
2025-01-17 11:38:44.039999962,-0.11939,-1.031562,0.101824,1.043428


In [None]:
from nonwear.DETACH import nimbaldetach

# Apply the non-wear detection algorithm to the GENEActiv wrist signal
start_stop_nw, _ = nimbaldetach(acc['x'].values, acc['y'].values, acc['z'].values, temp["temp"].values, accel_freq=64, temperature_freq=1, quiet=True)

# Create the structure in the silver layer

In [None]:
silver_layer_path = "/Users/augenpro/Documents/Age-IT/data/Silver/"

# Create the same folder structure for the Silver data
data_path = "/Users/augenpro/Documents/Age-IT/data/Gold/" # path to the folder containing the subjects

participants = sorted([p for p in os.listdir(data_path) if not p.startswith(".")]) # list of the participants
timeline = ["T0 (baseline)", "T1 (6 mesi)", "T2 (12 mesi)"]

for participant in participants:
    for visit in timeline:
        path = os.path.join(silver_layer_path, participant, visit)
        os.makedirs(path, exist_ok=True)
        os.makedirs(os.path.join(path, "GeneActivPolso"), exist_ok=True)
        os.makedirs(os.path.join(path, "GeneActivCaviglia"), exist_ok=True)
        os.makedirs(os.path.join(path, "RootiRx"), exist_ok=True)
        os.makedirs(os.path.join(path, "VeritySense"), exist_ok=True)
        os.makedirs(os.path.join(path, "Diario"), exist_ok=True)

# Process GGIR output

In [None]:
silver_layer_path = "/Users/augenpro/Documents/Age-IT/data/Silver/"
bronze_layer_path = "/Users/augenpro/Documents/Age-IT/data/Bronze/"

participants = sorted([p for p in os.listdir(silver_layer_path) if not p.startswith(".")]) # list of the participants
timeline = ["T0 (baseline)", "T1 (6 mesi)", "T2 (12 mesi)"]
visit = "T0 (baseline)"
sensors = ["GeneActivPolso"]#, "GeneActivCaviglia"]
sensor = sensors[0]

participants = ["08623"]

GGIR_output_dir = "output_icareit"

for participant in participants:

    ###### Load the GENEActiv data (for debugging purposes only) ######
    # files = os.listdir(os.path.join(bronze_layer_path, participant, visit, sensor))
    # for f in files:
    #     if f.endswith(".parquet"):
    #         acc_gen = pd.read_parquet(os.path.join(bronze_layer_path, participant, visit, sensor, f))

    ###### Load the GGIR output data ######
    HDCZA_SPT = [] # start and end of SPT based on HDCZA (vanhees2018) 
    ggir_output_path = os.path.join(silver_layer_path, participant, visit, sensor, GGIR_output_dir)
    if not os.path.exists(ggir_output_path): # Skip participants with no Data
        continue
    ggir_part4_output = pd.read_csv(ggir_output_path + "/results/QC/part4_nightsummary_sleep_full.csv")
    for i, day_row in ggir_part4_output.iterrows():
        # Stupid thing to get the correct datetime for segmenting signals into day and night (but no alternatives I guess)
        if day_row["sleeponset_ts"][0] == "0": # if the first digit is after midnight (00, 01, 02, ...)
            sleep_onset = pd.to_datetime(str(pd.to_datetime(day_row["calendar_date"]).date() + pd.Timedelta("1d")) + " " + day_row["sleeponset_ts"])
        else:
            sleep_onset = pd.to_datetime(str(pd.to_datetime(day_row["calendar_date"]).date()) + " " + day_row["sleeponset_ts"])
        wake_onset = pd.to_datetime(str(pd.to_datetime(day_row["calendar_date"]).date() + pd.Timedelta("1d")) + " " + day_row["wakeup_ts"])
        day = pd.to_datetime(day_row["calendar_date"]).date()
        HDCZA_SPT.append((sleep_onset, wake_onset, day))
    HDCZA_SPT = pd.DataFrame(HDCZA_SPT, columns = ["sleep_onset_HDCZA", "wake_onset_HDCZA", "calendar_day"])

    ###### Load the diary data ######
    diary_raw_path = os.path.join(bronze_layer_path, participant, visit, "Diario", f"{participant}_{visit.split(" ")[0]}_Diario.xlsx")
    if not os.path.exists(diary_raw_path): # Skip participants with no Data
        continue
    diary_raw = pd.read_excel(diary_raw_path, sheet_name="Ore")
    diary_raw = diary_raw[(diary_raw["Evento"] == "1) A letto") | (diary_raw["Evento"] == "2) Risveglio finale")].reset_index(drop = True)
    diary_raw["Data"] = diary_raw["Data"].apply(lambda x: str(x).split(" ")[0])
    diary_raw["Ora inizio"] = diary_raw["Ora inizio"].apply(lambda x: str(x))
    sleep_onset_diary = pd.to_datetime(diary_raw[diary_raw["Evento"] == "1) A letto"]["Data"] + " " + diary_raw[diary_raw["Evento"] == "1) A letto"]["Ora inizio"]).dropna().reset_index(drop = True)
    wake_onset_diary = pd.to_datetime(diary_raw[diary_raw["Evento"] == "2) Risveglio finale"]["Data"] + " " + diary_raw[diary_raw["Evento"] == "2) Risveglio finale"]["Ora inizio"]).dropna().reset_index(drop = True)
    diary_SPT = pd.DataFrame({"sleep_onset_diary": sleep_onset_diary, "wake_onset_diary": wake_onset_diary})
    diary_SPT["calendar_day"] = diary_SPT["sleep_onset_diary"].apply(lambda x: (x - pd.Timedelta(days=1)).date() if x.hour < 12 else x.date())
    
    # Fill the missing diary data
    for i, row in HDCZA_SPT.iterrows():
        if row["calendar_day"] not in diary_SPT["calendar_day"].values:
            diary_SPT = pd.concat([diary_SPT, pd.DataFrame({"sleep_onset_diary": [np.nan], "wake_onset_diary": [np.nan], "calendar_day": [row["calendar_day"]]})], ignore_index = True)
    diary_SPT = diary_SPT.sort_values(by = "calendar_day").reset_index(drop = True)
    HDCZA_SPT = HDCZA_SPT.sort_values(by = "calendar_day").reset_index(drop = True)

diary_SPT

  diary_SPT = pd.concat([diary_SPT, pd.DataFrame({"sleep_onset_diary": [np.nan], "wake_onset_diary": [np.nan], "calendar_day": [row["calendar_day"]]})], ignore_index = True)


Unnamed: 0,sleep_onset_diary,wake_onset_diary,calendar_day
0,2025-01-28 22:30:00,2025-01-29 07:00:00,2025-01-28
1,2025-01-29 23:45:00,2025-01-30 07:30:00,2025-01-29
2,2025-01-31 00:00:00,2025-01-31 07:50:00,2025-01-30
3,2025-01-31 23:30:00,2025-02-01 07:00:00,2025-01-31
4,NaT,NaT,2025-02-01
5,NaT,NaT,2025-02-02
6,NaT,NaT,2025-02-03


In [50]:
# Check the overlap between the HDCZA and diary SPT
overlap = []
for i, row in HDCZA_SPT.iterrows():
    if row["calendar_day"] in diary_SPT["calendar_day"].values:
        diary_row = diary_SPT[diary_SPT["calendar_day"] == row["calendar_day"]].iloc[0]
        overlap.append((row["sleep_onset_HDCZA"], row["wake_onset_HDCZA"], diary_row["sleep_onset_diary"], diary_row["wake_onset_diary"], row["calendar_day"]))
SPT_HDCZA_and_diary = pd.DataFrame(overlap, columns = ["sleep_onset_HDCZA", "wake_onset_HDCZA", "sleep_onset_diary", "wake_onset_diary", "calendar_day"])
SPT_HDCZA_and_diary["diff_sleep_onset"] = (SPT_HDCZA_and_diary["sleep_onset_HDCZA"] - SPT_HDCZA_and_diary["sleep_onset_diary"]).dt.total_seconds() / 60
SPT_HDCZA_and_diary["diff_wake_onset"] = (SPT_HDCZA_and_diary["wake_onset_HDCZA"] - SPT_HDCZA_and_diary["wake_onset_diary"]).dt.total_seconds() / 60

# Flag if the difference between the HDCZA and diary SPT is more than 30 minutes
SPT_HDCZA_and_diary["flag_sleep_onset"] = np.where(np.abs(SPT_HDCZA_and_diary["diff_sleep_onset"]) > 30, 1, 0)
SPT_HDCZA_and_diary["flag_wake_onset"] = np.where(np.abs(SPT_HDCZA_and_diary["diff_wake_onset"]) > 30, 1, 0)

# Plot the discrepancies between the HDCZA and diary SPT
import seaborn as sns
sns.set_context("talk")
plt.figure(figsize = (19,11))
sns.histplot(SPT_HDCZA_and_diary["diff_sleep_onset"]/60, bins = 50, color = "blue", kde = True, label = "Sleep onset")


<Axes: xlabel='diff_sleep_onset', ylabel='Count'>

In [51]:
SPT_HDCZA_and_diary["diff_sleep_onset"]

0   -124.333333
1     24.333333
2     30.000000
3   -250.333333
4           NaN
5           NaN
6           NaN
Name: diff_sleep_onset, dtype: float64

In [32]:
HDCZA_SPT.to_csv("/Users/augenpro/Documents/data4gpt/HDCZA_SPT.csv")

In [31]:
diary_SPT.to_csv("/Users/augenpro/Documents/data4gpt/diary_SPT.csv")

In [42]:
HDCZA_SPT["sleep_onset_HDCZA"]

0   2025-01-28 20:25:40
1   2025-01-29 00:09:20
2   2025-01-30 00:30:00
3   2025-01-31 19:19:40
4   2025-02-01 19:30:45
5   2025-02-02 21:36:25
6   2025-02-03 20:51:00
Name: sleep_onset_HDCZA, dtype: datetime64[ns]

0    2025-01-28
1    2025-01-29
2    2025-01-30
3    2025-01-31
Name: calendar_day, dtype: object

In [27]:
plt.figure(figsize=(19,11))
plt.plot(acc_gen[["x", "y", "z"]].resample("1min").mean(), label = "GeneActiv")

[<matplotlib.lines.Line2D at 0x12147aff0>,
 <matplotlib.lines.Line2D at 0x121415dc0>,
 <matplotlib.lines.Line2D at 0x1217176b0>]

## Tidy Sleep Diary

Look for discrepancies > 30 min.

In [23]:
# 1. Look for those nights not reported in the diary

HDCZA_SPT["date"] = HDCZA_SPT["sleep_onset_HDCZA"].dt.date
diary_SPT["date"] = diary_SPT["sleep_onset_diary"].dt.date

missing_nights = HDCZA_SPT[~HDCZA_SPT["date"].isin(diary_SPT["date"])]

missing_nights

Unnamed: 0,sleep_onset_HDCZA,wake_onset_HDCZA,date
2,2025-01-30 00:30:00,2025-01-31 10:26:00,2025-01-30
4,2025-02-01 19:30:45,2025-02-02 06:53:40,2025-02-01
5,2025-02-02 21:36:25,2025-02-03 07:09:50,2025-02-02
6,2025-02-03 20:51:00,2025-02-04 07:06:10,2025-02-03


Unnamed: 0,sleep_onset_HDCZA,wake_onset_HDCZA,sleep_onset_diary,wake_onset_diary,sleep_diff,wake_diff
0,2025-01-28 20:25:40,2025-01-29 07:00:00,2025-01-28 22:30:00,2025-01-29 07:00:00,-124.333333,0.0
1,2025-01-29 00:09:20,2025-01-30 07:27:40,2025-01-29 23:45:00,2025-01-30 07:30:00,-1415.666667,-2.333333
2,2025-01-30 00:30:00,2025-01-31 10:26:00,2025-01-31 00:00:00,2025-01-31 07:50:00,-1410.0,156.0
3,2025-01-31 19:19:40,2025-02-01 06:16:15,2025-01-31 23:30:00,2025-02-01 07:00:00,-250.333333,-43.75
4,2025-02-01 19:30:45,2025-02-02 06:53:40,NaT,NaT,,
5,2025-02-02 21:36:25,2025-02-03 07:09:50,NaT,NaT,,
6,2025-02-03 20:51:00,2025-02-04 07:06:10,NaT,NaT,,
