In [0]:
%pip install scikit-digital-health

In [0]:
import mlflow
# Explicitly switch MLflow autologging off for this notebook.
mlflow.autolog(disable=True)

In [0]:
"""
Read .bin files coming from GENEActiv devices and save them as a pd.DataFrame in parquet format.

This module is based on code from the scikit-digital-health Python package (https://github.com/pfizer-opensource/scikit-digital-health).

IMPORTANT: scikit-digital-health creates conflicts with the versions required by other packages. It depends on many other modules (some of them implemented in C), and it is not clear how to separate them without breaking the code.
Solution: the scikit-digital-health package is installed as a notebook-scoped library, for this notebook only.

"""

"""
IMPORTANT: Occasionally the accelerometer data is not read correctly, for reasons that are not yet understood. When this happens, it affects all the recordings of all participants.
If the message "Not enough still periods found for calibration" is printed, it means that the accelerometer data was not read correctly.
The cause is unknown, but restarting the PC and running the same code again solves the problem.
"""

import numpy as np
import pandas as pd
import os
from skdh.io import ReadBin
from skdh.preprocessing import CalibrateAccelerometer

from nonwear.DETACH import nimbaldetach

def _mask_nonwear(frame, nonwear_bouts):
    """Set every row of `frame` that falls inside a non-wear bout to NaN (in place)."""
    for _, bout in nonwear_bouts.iterrows():
        # iterrows() does not preserve column dtypes, so the datapoint positions
        # are cast back to int before being used as positional indices.
        first_ts = frame.index[int(bout["Start Datapoint"])]
        last_ts = frame.index[int(bout["End Datapoint"])]
        # Label-based slice: both boundary timestamps are included.
        frame.loc[first_ts:last_ts] = np.nan


def bin2parquet(file_path, save_path, calibrate = True):
    """
    Read a .bin file from a GENEActiv device and save it as a parquet file.

    The output table has the columns "x", "y", "z", "temperature" and "light",
    indexed by the recording timestamps. Samples inside the non-wear periods
    detected by `nimbaldetach` are set to NaN in both the calibrated and the
    non-calibrated case.

    Parameters
    ----------
    file_path : str
        Path of the .bin file to read.
    save_path : str
        Output path; its ".bin" part is replaced by "_preprocessed.parquet"
        when the accelerometer was calibrated, and by ".parquet" otherwise.
    calibrate : bool, optional
        Whether to calibrate the accelerometer. Calibration is only attempted
        when the recording holds more than 72 hours of samples at 100 Hz.

    Returns
    -------
    None
        Nothing is written if the calibration step raises a KeyError.
    """
    sampling_freq = 100  # Hz; also the rate passed to the non-wear detection below
    min_samples_for_calibration = 72 * 60 * 60 * sampling_freq  # 72 hours of data

    recording = ReadBin().predict(file = file_path)
    accel = recording["accel"]
    temperature = recording["temperature"]

    if calibrate and accel.shape[0] > min_samples_for_calibration:
        print("Calibrating accelerometer...")
        try:
            calibrated = CalibrateAccelerometer().predict(time = recording["time"],
                                                          accel = accel,
                                                          temperature = temperature)
        except KeyError:
            print("Not enough data to calibrate accelerometer.")
            return
        # NOTE(review): if predict() returns a result without an "accel" key, this
        # lookup raises KeyError to the caller - confirm whether that case should
        # also be reported as "not enough data".
        accel_out = calibrated["accel"]
        suffix = "_preprocessed.parquet"
    else:  # do not calibrate
        accel_out = accel
        suffix = ".parquet"

    # Non-wear detection always runs on the raw (uncalibrated) acceleration.
    nonwear_bouts, _ = nimbaldetach(accel[:, 0], accel[:, 1], accel[:, 2], temperature,
                                    accel_freq=sampling_freq, temperature_freq=sampling_freq,
                                    quiet=True)

    values = np.concatenate((accel_out,
                             temperature.reshape(-1, 1),
                             recording["light"].reshape(-1, 1)), axis = 1)
    frame = pd.DataFrame(values,
                         columns = ["x", "y", "z", "temperature", "light"],
                         index = pd.to_datetime(recording["time"], unit = "s"))

    # Remove non-wear periods
    _mask_nonwear(frame, nonwear_bouts)

    frame.to_parquet(save_path.replace(".bin", suffix))

if __name__ == "__main__":
    # Batch-convert the GENEActiv .bin recordings found in the bronze layer
    # into parquet files in the silver layer, mirroring the folder layout
    # <participant>/<visit>/<sensor>.
    data_path = "/dbfs/mnt/storageageittest/sensori-bronze/" # path to the bronze layer containing the subjects
    silver_path = "/dbfs/mnt/storageageittest/sensori-silver/" # path to the silver layer
    # Entries whose name starts with "." are ignored.
    participants = sorted(p for p in os.listdir(data_path) if not p.startswith(".")) # list of the participants
    # NOTE(review): only the first participant is processed - this looks like a
    # debugging leftover; confirm before running on the whole cohort.
    participants = participants[:1]

    visit = "T0 (baseline)" # T0 (baseline), T1 (follow-up @ 6 months), T2 (follow-up @ 12 months)

    sensors = ["GeneActivPolso", "GeneActivCaviglia"]

    for participant in participants:
        print(participant)
        for sensor in sensors:
            path = os.path.join(data_path, participant, visit, sensor)
            save_path = os.path.join(silver_path, participant, visit, sensor)

            # A participant may lack a visit or sensor folder; skip it instead of
            # letting os.listdir abort the whole batch.
            if not os.path.isdir(path):
                print(f"Folder not found, skipping: {path}")
                continue

            bin_files = [f for f in os.listdir(path) if f.endswith(".bin")]
            if not bin_files:
                continue

            # Make sure the destination folder exists before anything is written.
            os.makedirs(save_path, exist_ok=True)

            for f in bin_files:
                try:
                    bin2parquet(os.path.join(path, f), os.path.join(save_path, f), calibrate = True)
                except Exception as e:
                    # Best effort: report the failure and move on to the next file.
                    print(f"Error with {f}: {e}")
                    continue