# AGE-it - Exploratory Data Analysis

- Summary of the recordings - how many hours/days for each sensor?

### Import libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import os
import glob

%matplotlib inline
#%matplotlib qt
mpl.rcParams['lines.linewidth'] = 0.91
plt.style.use('seaborn-v0_8-whitegrid')
%matplotlib qt

from avro.datafile import DataFileReader
from avro.io import DatumReader

from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource, HoverTool, Slider, Select
from bokeh.layouts import gridplot
from bokeh.models import Range1d
from bokeh.io import export_png
from bokeh.models import DatetimeTickFormatter

PermissionError: [Errno 1] Operation not permitted

In [4]:
import sys
import os

# Add the project root directory to Python's search path
sys.path.append(os.path.abspath(".."))

## GENEActiv
Files are already converted into .bin with the bin2parquet notebook

In [67]:
data_path = "/Users/augenpro/Documents/Age-IT/data/" # path to the folder containing the subjects
participants = sorted([p for p in os.listdir(data_path) if not p.startswith(".")]) # list of the participants
visit = "T0 (baseline)" # T0 (baseline), T1 (follow-up @ 6 months), T2 (follow-up @ 12 months)

# participants = participants[5:6] # select the first participant

sensors = ["GeneActivPolso", "GeneActivCaviglia"]#, "RootiRx"]

geneactiv = {}

for participant in participants:
    print(participant)
    for sensor in sensors:
        # print(sensor)
        # Load the data
        path = os.path.join(data_path, participant, visit, sensor)
        files = os.listdir(os.path.join(data_path, participant, visit, sensor))
        # if there are already parquet files, skip
        for f in files:
            if f.endswith(".parquet"):
                # print(f)
                acc_df = pd.read_parquet(os.path.join(path, f))
                # geneactiv[sensor] = acc_df
                if len(acc_df) == 0:
                    print(f"{sensor}: is empty")
                else:
                    print(f"{sensor}:  {acc_df.index[-1] - acc_df.index[0]}".split(".")[0])

    print("")
# # Plot the data
# p = figure(plot_width=800, plot_height=400, x_axis_type="datetime", title=f"{participant} - {sensor} - {visit}")
# p.line(data["timestamp"], data["x"], line_width=2, legend_label="x", color="blue")
# p.line(data["timestamp"], data["y"], line_width=2, legend_label="y", color="green")
# p.line(data["timestamp"], data["z"], line_width=2, legend_label="z", color="red")
# p.xaxis.formatter = DatetimeTickFormatter(days="%d/%m %H:%M", hours="%H:%M")
# p.legend.location = "top_left"
# p.legend.click_policy="hide"
# show(p)
# export_png(p, filename=f"{participant}_{sensor}_{visit}.png")
# print(f"{participant}_{sensor}_{visit}.png")
# print("")

08623
GeneActivPolso:  7 days 01:20:20
GeneActivCaviglia:  7 days 01:19:44

08667

14219
GeneActivPolso:  6 days 22:27:29
GeneActivCaviglia:  6 days 22:27:26

20603

23483
GeneActivPolso:  6 days 23:38:14
GeneActivCaviglia:  6 days 23:38:20

36644
GeneActivPolso:  6 days 22:46:32
GeneActivCaviglia:  6 days 22:47:14

36765

36920
GeneActivPolso:  5 days 23:55:05
GeneActivCaviglia:  5 days 23:54:05

58319
GeneActivPolso:  0 days 18:23:38
GeneActivCaviglia:  7 days 01:19:05

59794
GeneActivPolso:  6 days 00:47:26
GeneActivCaviglia:  6 days 00:32:56

65381
GeneActivPolso:  7 days 01:47:23
GeneActivCaviglia: is empty

68503
GeneActivPolso:  6 days 01:14:53
GeneActivCaviglia:  6 days 01:15:05

73496
GeneActivPolso:  6 days 22:59:50
GeneActivCaviglia:  6 days 22:59:38

74003
GeneActivPolso: is empty
GeneActivCaviglia:  6 days 23:37:05

74913
GeneActivPolso:  7 days 01:57:35
GeneActivCaviglia:  7 days 01:57:35

78936
GeneActivPolso:  6 days 00:23:11
GeneActivCaviglia:  6 days 00:22:47

86693
G

In [60]:
# Recording duration
for sensor, data in geneactiv.items():
    print(f"{sensor}: {data.index[-1] - data.index[0]}".split(".")[0])

GeneActivPolso: 6 days 22:46:32
GeneActivCaviglia: 6 days 22:47:14


In [62]:
# geneactiv["GeneActivPolso"]["acc_SMV"] = np.sqrt(geneactiv["GeneActivPolso"]["x"]**2 + geneactiv["GeneActivPolso"]["y"]**2 + geneactiv["GeneActivPolso"]["z"]**2)
# geneactiv["GeneActivCaviglia"]["acc_SMV"] = np.sqrt(geneactiv["GeneActivCaviglia"]["x"]**2 + geneactiv["GeneActivCaviglia"]["y"]**2 + geneactiv["GeneActivCaviglia"]["z"]**2)

plt.figure(figsize = (19,11))
plt.subplot(2, 1, 1)
plt.plot(geneactiv["GeneActivPolso"]["acc_SMV"].iloc[:1000000], label = "GeneActivPolso")
plt.plot(geneactiv["GeneActivCaviglia"]["acc_SMV"].iloc[:1000000], label = "GeneActivCaviglia")
plt.title("Recording Start", fontsize = 21)
plt.legend()
plt.subplot(2, 1, 2)
plt.plot(geneactiv["GeneActivPolso"]["acc_SMV"].iloc[-1000000:], label = "GeneActivPolso")
plt.plot(geneactiv["GeneActivCaviglia"]["acc_SMV"].iloc[-1000000:], label = "GeneActivCaviglia")
plt.title("Recording End", fontsize = 21)
plt.legend()

<matplotlib.legend.Legend at 0x163814170>

## Verity Sense

In [4]:
from data_io.veritysense.convert_polar import process_polar

In [8]:
import fastavro

def read_avro_veritysense(file_path, offset_vs = 946684800000000000+ 3600 * 1e9):
    """
    Reads an Avro file and returns the data as a pandas DataFrame.
    """
    with open(file_path, "rb") as avro_file:
        # Use fastavro.reader to read the Avro file
        reader = fastavro.reader(avro_file)
        # Convert the records to a list
        records = [record for record in reader]
       # to dataframe
        df = pd.DataFrame(records)
        df.index = pd.to_datetime(df["timestamp"] + offset_vs, unit="ns") 
        df.drop("timestamp", axis=1, inplace=True)
    return df / 1000

In [19]:
import re
from datetime import datetime

# Function to extract full datetime (including time) for sorting
def extract_datetime(filename):
    match = re.search(r'(\w{3}) (\w{3}) (\d{2}) (\d{2})-(\d{2})-(\d{2}) .* (\d{4})', filename)
    if match:
        weekday, month, day, hour, minute, second, year = match.groups()
        date_str = f"{day} {month} {year} {hour}:{minute}:{second}"
        date_obj = datetime.strptime(date_str, "%d %b %Y %H:%M:%S")  # Convert to datetime object
        return date_obj
    return datetime.max  # Default to a max value if parsing fails

In [36]:
data_path = "/Users/augenpro/Documents/Age-IT/data" # path to the folder containing the subjects
participants = sorted([p for p in os.listdir(data_path) if not p.startswith(".")]) # list of the participants
visit = "T0 (baseline)" # T0 (baseline), T1 (follow-up @ 6 months), T2 (follow-up @ 12 months)

# participants = participants[0:1] # select the first participant

participants = ["23483"]

sensor = "VeritySense"#, "GeneActivPolso", "GeneActivCaviglia", "RootiRx"]

for participant in participants:
    acc_df = pd.DataFrame()
    ppg_df = pd.DataFrame()
    # print(sensor)
    path = os.path.join(data_path, participant, visit, sensor)
    files_in_path = [f for f in os.listdir(path) if not f.startswith(".")]
    if len(files_in_path) <= 2: # "_" and "AVRO"
        continue

    print(f"**************** {participant} ****************")
    acc_path = os.path.join(path, "AVRO/acc")
    ppg_path = os.path.join(path, "AVRO/ppg")
    for f in sorted(os.listdir(acc_path), key=extract_datetime):
        current_acc = read_avro_veritysense(os.path.join(acc_path, f))
        acc_df = pd.concat([acc_df, current_acc])

    for f in sorted(os.listdir(ppg_path), key=extract_datetime):
        current_ppg = read_avro_veritysense(os.path.join(ppg_path, f))
        ppg_df = pd.concat([ppg_df, current_ppg])

    t_acc_rec = acc_df.index[acc_df.index.to_series().diff().dt.total_seconds() > 0.5]
    t_acc_disc = acc_df.index[np.where(acc_df.index.to_series().diff().dt.total_seconds() > 0.5)[0]-1]
    t_disc_rec_acc = pd.DataFrame({"disc": t_acc_disc, "rec": t_acc_rec})
    total_duration_acc = acc_df.index[-1] - acc_df.index[0]
    disconnetions_duration_acc = t_disc_rec_acc["rec"] - t_disc_rec_acc["disc"]
    effective_duration_acc = total_duration_acc - disconnetions_duration_acc.sum()
    if len(t_acc_disc) > 0:
        print("\n ***** Disconnections in ACC *****")
        # for i in range(len(t_acc_disc)):
        #     print(f"Disconnection {i+1}:")
        #     print(f"Start: {t_acc_disc[i].strftime('%Y-%m-%d %H:%M:%S')}")
        #     print(f"End: {t_acc_rec[i].strftime('%Y-%m-%d %H:%M:%S')}")
        #     print(f"Duration: {str(t_acc_rec[i] - t_acc_disc[i]).split(' ')[2][3:5]} min and {str(t_acc_rec[i] - t_acc_disc[i]).split(' ')[2][6:8]} s")
        print("Number of disconnections: " + str(len(t_acc_disc)))
        print("Total duration of disconnections: " + str(np.sum([t_acc_rec[i] - t_acc_disc[i] for i in range(len(t_acc_disc))])).split(' ')[2][:8] + " (hours, minutes and seconds)")
        print(f"ACC effective duration: {effective_duration_acc}".split(".")[0])
    # Fill disconnection portions with NaNs and linearly interpolate
    for start, end in zip(t_acc_disc, t_acc_rec):
        acc_df.loc[start:end] = np.nan
    acc_df = acc_df.interpolate(method='time')

    t_ppg_rec = ppg_df.index[ppg_df.index.to_series().diff().dt.total_seconds() > 1]
    t_ppg_disc = ppg_df.index[np.where(ppg_df.index.to_series().diff().dt.total_seconds() > 1)[0]-1]
    t_disc_rec_ppg = pd.DataFrame({"disc": t_ppg_disc, "rec": t_ppg_rec})
    total_duration_ppg = ppg_df.index[-1] - ppg_df.index[0]
    disconnetions_duration_ppg = t_disc_rec_ppg["rec"] - t_disc_rec_ppg["disc"]
    effective_duration_ppg = total_duration_ppg - disconnetions_duration_ppg.sum()
    if len(t_ppg_disc) > 0:
        print("\n ***** Disconnections in PPG *****")
        # for i in range(len(t_ppg_disc)):
        #     print(f"Disconnection {i+1}:")
        #     print(f"Start: {t_ppg_disc[i].strftime('%Y-%m-%d %H:%M:%S')}")
        #     print(f"End: {t_ppg_rec[i].strftime('%Y-%m-%d %H:%M:%S')}")
        #     print(f"Duration: {str(t_ppg_rec[i] - t_ppg_disc[i]).split(' ')[2][3:5]} min and {str(t_ppg_rec[i] - t_ppg_disc[i]).split(' ')[2][6:8]} s")
        print("Number of disconnections: " + str(len(t_ppg_disc)))
        print("Total duration of disconnections: " + str(np.sum([t_ppg_rec[i] - t_ppg_disc[i] for i in range(len(t_ppg_disc))])).split(' ')[2][:8] + " (hours, minutes and seconds)")
        print(f"PPG effective duration: {effective_duration_ppg}".split(".")[0])
        print("")
    # Fill disconnection portions with NaNs and linearly interpolate
    for start, end in zip(t_ppg_disc, t_ppg_rec):
        ppg_df.loc[start:end] = np.nan
    ppg_df = ppg_df.interpolate(method='time')

**************** 23483 ****************

 ***** Disconnections in ACC *****
Number of disconnections: 134
Total duration of disconnections: 06:41:26 (hours, minutes and seconds)
ACC effective duration: 4 days 16:50:17

 ***** Disconnections in PPG *****
Number of disconnections: 137
Total duration of disconnections: 06:11:44 (hours, minutes and seconds)
PPG effective duration: 4 days 17:19:59



##### Soggetto 97060 ------- weird timestamps starting at 1970

In [37]:
total_dur = acc_df.index[-1] - acc_df.index[0]
disconnetions_dur = t_disc_rec_acc["rec"] - t_disc_rec_acc["disc"]
effective_dur = total_dur - disconnetions_dur.sum()
total_dur, disconnetions_dur.sum(), effective_dur

(Timedelta('6 days 23:31:43.876258560'),
 Timedelta('2 days 06:41:26.693904128'),
 Timedelta('4 days 16:50:17.182354432'))

In [41]:
for start, end in zip(t_acc_disc, t_acc_rec):
    print(start,end)
    break

ppg_df.loc[start:end]

2025-01-21 16:17:29.975070208 2025-01-21 16:20:03.052938240


Unnamed: 0_level_0,ppg1,ppg2,ppg3,ambient
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


In [48]:
ppg_df.loc[pd.Timestamp("2025-01-21 16:20:03.052938240"):pd.Timestamp("2025-01-22 16:20:03.052938240")].to_csv("ppg_df.csv")

In [46]:
ppg = pd.read_pickle("ppg_df.pkl")
ppg

Unnamed: 0_level_0,ppg1,ppg2,ppg3,ambient
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2025-01-21 16:20:03.344035328,115.751091,19.832478,75.259832,-291.321995
2025-01-21 16:20:03.362216960,115.752000,19.834000,75.260000,-291.322000
2025-01-21 16:20:03.380399104,115.551000,19.657000,75.209000,-291.231000
2025-01-21 16:20:03.398580736,115.589000,19.481000,75.181000,-291.230000
2025-01-21 16:20:03.416762624,115.991000,19.511000,75.189000,-291.263000
...,...,...,...,...
2025-01-22 16:20:02.974105088,63.964000,22.219000,25.433000,-291.274000
2025-01-22 16:20:02.992358400,65.248000,23.506000,26.564000,-291.263000
2025-01-22 16:20:03.010611712,66.445000,24.733000,27.541000,-291.279000
2025-01-22 16:20:03.028865024,67.564000,25.856000,28.451000,-291.254000


In [33]:
import neurokit2 as nk
ppg_df["ppg_filt"] = -nk.signal_filter(ppg_df["ppg1"], lowcut=0.5, highcut=8, sampling_rate=55)

In [38]:
plt.figure(figsize = (19,11))
plt.subplot(2, 1, 1)
plt.plot(acc_df)
plt.title("Acceleration")
# plt.legend(acc_df.columns)
plt.subplot(2, 1, 2, sharex = plt.subplot(2, 1, 1))
plt.plot(ppg_df["ppg1"])
plt.title("PPG")
# plt.legend(["ppg_filt"])

Text(0.5, 1.0, 'PPG')

In [95]:
# Calculate the actual recording duration by summing up the differences between consecutive timestamps
time_diffs = acc_df.index.to_series().diff().dropna()
overall_duration = time_diffs.sum()
print(f"Overall duration of the recording (excluding gaps): {overall_duration}")

Overall duration of the recording (excluding gaps): 0 days 05:29:49.749643520


In [98]:
ppg_df.index[-1] - ppg_df.index[0]

Timedelta('0 days 05:29:53.340454144')

## Rooti

In [33]:
data_path = "/Users/augenpro/Documents/Age-IT/data/" # path to the folder containing the subjects
participants = sorted([p for p in os.listdir(data_path) if not p.startswith(".")]) # list of the participants
visit = "T0 (baseline)" # T0 (baseline), T1 (follow-up @ 6 months), T2 (follow-up @ 12 months)
# participants = ["14219"]
# participants = participants[5:6] # select the first participant

sensors = ["RootiRx"]#, "RootiRx"]

sensor = sensors[0]

rootirx = {}

for participant in participants:
    path = os.path.join(data_path, participant, visit, sensor)
    files = os.listdir(os.path.join(data_path, participant, visit, sensor))

    files = [f for f in files if f == participant]

    if len(files) == 0:
        # print("No RootiRx data")
        continue
    
    print("***** " + participant + " *****")
    rooti_path = os.path.join(data_path, participant, visit, sensor, files[0], "measure")

    raw_acc_path = [x[0] for x in os.walk(rooti_path) if x[0].endswith("/GSENSOR")][0]

    # print(raw_acc_path)

    acc_files = sorted(os.listdir(raw_acc_path))

    # print(len(acc_files))

    # acc = pd.DataFrame()

    acc_first = pd.read_csv(raw_acc_path + '/' + acc_files[0], compression='zip', index_col = 0, header = None, names = ["packet", "acc_x", "acc_y", "acc_z", "idk", "idk2"])
    acc_first_start = pd.to_datetime(acc_first.index[0], unit = "s")

    acc_last = pd.read_csv(raw_acc_path + '/' + acc_files[-1], compression='zip', index_col = 0, header = None, names = ["packet", "acc_x", "acc_y", "acc_z", "idk", "idk2"])
    acc_last_end = pd.to_datetime(acc_last.index[-1], unit = "s")

    print(f"Total duration: {acc_last_end - acc_first_start}")

    # for i, acc_file in enumerate(acc_files):
    #     acc1 = pd.read_csv(raw_acc_path + '/' + acc_file, compression='zip', index_col = 0, header = None, names = ["packet", "acc_x", "acc_y", "acc_z", "idk", "idk2"])
    #     acc1 = acc1[["acc_x", "acc_y", "acc_z"]]
    #     acc1.index = pd.date_range(start = pd.to_datetime(acc1.index[0], unit = "s"), periods = len(acc1), freq = "0.0032s")
    #     acc = pd.concat([acc, acc1])
    # break

    # print(path)

    print("")

    # print(files)

***** 14219 *****
Total duration: 6 days 23:49:20

***** 36644 *****
Total duration: 6 days 22:15:09

***** 59794 *****
Total duration: 5 days 23:36:26

***** 73496 *****
Total duration: 2 days 22:04:56

***** 74003 *****
Total duration: 6 days 16:23:18



In [31]:
acc_files

['1737458741.zip',
 '1737462341.zip',
 '1737465941.zip',
 '1737469541.zip',
 '1737473141.zip',
 '1737476741.zip',
 '1737480341.zip',
 '1737483941.zip',
 '1737487541.zip',
 '1737491141.zip',
 '1737494741.zip',
 '1737498341.zip',
 '1737501941.zip',
 '1737505541.zip',
 '1737509141.zip',
 '1737512741.zip',
 '1737516341.zip',
 '1737519941.zip',
 '1737523541.zip',
 '1737527141.zip',
 '1737530741.zip',
 '1737534341.zip',
 '1737537941.zip',
 '1737541541.zip',
 '1737545141.zip',
 '1737548741.zip',
 '1737552341.zip',
 '1737555941.zip',
 '1737559541.zip',
 '1737563141.zip',
 '1737566741.zip',
 '1737570341.zip',
 '1737573941.zip',
 '1737577541.zip',
 '1737581141.zip',
 '1737584741.zip',
 '1737588341.zip',
 '1737591941.zip',
 '1737595541.zip',
 '1737599141.zip',
 '1737602741.zip',
 '1737606341.zip',
 '1737609941.zip',
 '1737613541.zip',
 '1737617141.zip',
 '1737620741.zip',
 '1737624341.zip',
 '1737627941.zip',
 '1737631541.zip',
 '1737635141.zip',
 '1737638741.zip',
 '1737642341.zip',
 '1737645941

In [22]:
plt.figure(figsize = (19,11))
plt.plot(acc.values)

[<matplotlib.lines.Line2D at 0x15f8400e0>,
 <matplotlib.lines.Line2D at 0x15f842960>,
 <matplotlib.lines.Line2D at 0x15f842a80>]

In [16]:
acc1.head()

Unnamed: 0,packet,acc_x,acc_y,acc_z,idk,idk2
2025-01-17 10:34:21.000000,0,176,-191,1288,18,166
2025-01-17 10:34:21.003200,1,150,-183,1324,18,166
2025-01-17 10:34:21.006400,2,150,119,1187,18,166
2025-01-17 10:34:21.009600,3,115,-164,1379,18,166
2025-01-17 10:34:21.012800,4,60,-287,1468,18,166
