In [1]:
# Importando as bibliotecas
import pandas as pd
import os
import numpy as np
import matplotlib

try:
    # Select the Agg backend; this runs before matplotlib.pyplot is imported below.
    matplotlib.use("Agg")
except Exception:
    # Best effort: report the problem and continue with the default backend.
    print("Error: could not use Agg as backend")
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import struct
from bitstring import BitArray
from scipy import signal
from shutil import copy, rmtree

sns.set_theme() # apply seaborn's default theme to the plots created afterwards
pd.options.mode.chained_assignment = None # turn off pandas' warnings about chained assignment
__all__ = ["SleepPy", "ColeKripke", "band_pass_filter", "activity_index", "bin2df"] # names exported when the module is imported with `from module import *`

O código abaixo foi retirado do pacote SleepPy (https://github.com/elyiorgos/sleeppy/blob/master/sleeppy/sleep.py). Serão utilizadas as funções para processamento dos dados armazenados no .csv extraído do arquivo .bin gerado pelo actígrafo. A extração foi feita no aplicativo do GENEActiv.

<font color=red>O código abaixo pode ter sido modificado para atender as necessidades do projeto.</font>

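<font color=red>**ATENÇÃO:** todas as células do notebook devem ser executadas antes de instanciar a classe SleepPy, pois o construtor já executa todo o processamento (método run), e este depende de funções e classes definidas em células posteriores (por exemplo, band_pass_filter, activity_index e ColeKripke).</font>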

### Cópia completa da classe

Alterações feitas:

***.ix*** foi descontinuado (*deprecated*) a partir da versão 0.20.0 do Pandas e removido definitivamente na versão 1.0.0. Para corrigir o código, .ix foi substituído pelos métodos de indexação equivalentes:

- .loc (para rótulos)
- .iloc (para índices numéricos)

**ticks.label** foi substituído por **plt.xticks** para evitar o erro: *AttributeError: 'XTick' object has no attribute 'label'*

In [28]:
class SleepPy:
    
    def __init__(
        self,
        input_file,
        results_directory,
        sampling_frequency,
        start_buffer="0s",
        stop_buffer="0s",
        start_time="",
        stop_time="",
        run_config=0,
        temperature_threshold=25.0,
        minimum_rest_block=30,
        allowed_rest_break=60,
        minimum_rest_threshold=0.0,
        maximum_rest_threshold=1000.0,
        minimum_hours=6,
        clear_intermediate_data=False,
        aws_object=None,
        verbose=False,
    ):
        
        if aws_object is not None:
            self.src = aws_object
        else:
            self.src = input_file  # save input location
        self.extension = input_file.split(".")[-1]
        self.dst = results_directory  # save output location
        self.src_name = input_file.split("/")[-1][0:-4]  # save naming convention
        self.sub_dst = (
            results_directory + "/" + self.src_name
        )  # create output directory
        self.fs = sampling_frequency  # save sampling frequency
        self.window_size = 60  # define window size in seconds
        self.band_pass_cutoff = (
            0.25,
            12.0,
        )  # define the cutoffs for the band pass filter
        self.major_rest_periods = []  # initialize a list to save the major rest periods
        self.start_buffer = start_buffer
        self.stop_buffer = stop_buffer
        self.start_time = start_time
        self.stop_time = stop_time
        self.run_config = run_config
        self.min_t = temperature_threshold
        self.minimum_rest_block = minimum_rest_block
        self.allowed_rest_break = allowed_rest_break
        self.minimum_rest_threshold = minimum_rest_threshold
        self.maximum_rest_threshold = maximum_rest_threshold
        self.minimum_hours = minimum_hours
        self.clear = clear_intermediate_data
        self.verbose = verbose
        self.run()  # run the package

    def run(self):
        """
        Runs the full package on the provided file.

        """
        try:
            os.mkdir(self.sub_dst)  # set up output directory
        except OSError:
            pass
        if self.run_config <= 0:
            # split the data into 24 hour periods
            if self.verbose:
                print("Loading data...")
            if ".bin" in self.src:
                self.split_days_geneactiv_bin()
            elif ".csv" in self.src:
                self.split_days_geneactiv_csv()
        
        if self.run_config <= 1:
            # extract the activity index feature
            if self.verbose:
                print("Extracting activity index...")
            self.extract_activity_index()
        if self.run_config <= 2:
            # run wear/on-body detection
            if self.verbose:
                print("Running off-body detection...")
            self.wear_detection()
        if self.run_config <= 3:
            # run major rest period detection
            if self.verbose:
                print("Detecting major rest period...")
            self.major_rest_period()
        if self.run_config <= 4:
            # run sleep wake predictions on the major rest period
            if self.verbose:
                print("Running sleep/wake predictions...")
            self.sleep_wake_predict()
        if self.run_config <= 5:
            # calculate endpoints based on the above predictions
            if self.verbose:
                print("Calculating endpoints...")
            self.calculate_endpoints()
        
        """ 
            if self.run_config == 6:
                # generates visual reports
                if self.verbose:
                    print("Generating visual reports...")
                self.visualize_results()
           
            # aggregate results
            if self.verbose:
                print("Aggregating results...")
            self.aggregate_results()

            # clear data
            if self.clear:
                if self.verbose:
                    print("Clearing intermediate data...")
                self.clear_data()
        """
        
    def split_days_geneactiv_csv(self):

        try:
            os.mkdir(self.sub_dst + "/raw_days")  # set up output directory
        except OSError:
            pass
        # load data and fix time_stamps
        data = pd.read_csv(
            self.src, # arquivo csv
            index_col=0,
            skiprows=100,
            header=None,
            names=["Time", "X", "Y", "Z", "LUX", "Button", "T"],
            usecols=["Time", "X", "Y", "Z", "LUX", "T"],
            dtype={
                "Time": object,
                "X": np.float64,
                "Y": np.float64,
                "Z": np.float64,
                "LUX": np.int64,
                "Button": bool,
                "T": np.float64,
            },
            low_memory=False,
        )
        data.index = pd.to_datetime(data.index, format="%Y-%m-%d %H:%M:%S:%f").values

        # remove any specified time periods from the beginning and end of the file
        data = data.loc[
            data.index[0]
            + pd.Timedelta(self.start_buffer) : data.index[-1]
            - pd.Timedelta(self.stop_buffer)
        ]

        # cut to defined start and end times if specified
        if self.start_time and self.stop_time:
            self.start_time = pd.to_datetime(
                self.start_time, format="%Y-%m-%d %H:%M:%S:%f"
            )
            self.stop_time = pd.to_datetime(
                self.stop_time, format="%Y-%m-%d %H:%M:%S:%f"
            )
            data = data.loc[self.start_time : self.stop_time]
        elif self.start_time:
            self.start_time = pd.to_datetime(
                self.start_time, format="%Y-%m-%d %H:%M:%S:%f"
            )
            data = data.loc[self.start_time :]
        elif self.stop_time:
            self.stop_time = pd.to_datetime(
                self.stop_time, format="%Y-%m-%d %H:%M:%S:%f"
            )
            data = data.loc[: self.stop_time]


        # Salvando dados em um csv unico contendo todos os dias

        raw_data = data[['X', 'Y', 'Z', 'T']]

        raw_data['Sleep'] = 0

        # transformar datetime pra hora:min
        #raw_data.index = pd.to_datetime(data.index, format="%Y-%m-%d %H:%M%f").values # convertendo datetime H:M

        # ler arquivo sleep_sum_total
        # converter sleeponset_ts e wakeup_ts para datetime hora:min
        # bater id da paciente com o id da tabela (transformar patient_id_csv para int se necessario)
            # se o id for igual, pega o valor da coluna sleeponset_ts e wakeup_ts
        # percorrer o df ate que o id do raw_data seja igual a sleeponset_ts 
            # condicao de parada: id = wakeup_ts

        a = os.path.splitext(os.path.basename(input_file_path))
        patient_id_csv = a[0].split('_')
        patient_id_csv[0] # utilizar a primeira posicao

        raw_data.to_csv(self.sub_dst + f"/raw_days/{patient_id_csv[0]}_raw_data.csv") 

        # split data into days from noon to noon
        days = data.groupby(pd.Grouper(level=0, freq="24h", offset="12h"))

        # iterate through days keeping track of the day
        count = 0
        for day in days:
            # save each 24 hour day separately if there's enough data to analyze
            df = day[1].copy()
            available_hours = (len(df) / float(self.fs)) / 3600.0
            if available_hours >= self.minimum_hours:
                count += 1
                dst = "/raw_days/{}_day_{}.h5".format(
                    self.src_name, str(count).zfill(2)
                )
                df.to_hdf(self.sub_dst + dst, key="raw_geneactiv_data_24hr", mode="w")
        return

    def split_days_geneactiv_bin(self):
        """
        Load a .bin recording, trim it and split it into noon-to-noon days.

        The recording is read with ``bin2df``. Every 24-hour group that holds
        at least ``minimum_hours`` hours of samples is written to
        ``<sub_dst>/raw_days/<src_name>_day_NN.h5``.
        """
        os.makedirs(self.sub_dst + "/raw_days", exist_ok=True)  # set up output directory

        # load data and fix time_stamps
        data = bin2df(self.src)

        # remove any specified time periods from the beginning and end of the file
        data = data.loc[
            data.index[0]
            + pd.Timedelta(self.start_buffer) : data.index[-1]
            - pd.Timedelta(self.stop_buffer)
        ]

        # cut to defined start and end times if specified (empty string = not set)
        time_format = "%Y-%m-%d %H:%M:%S:%f"
        if self.start_time:
            self.start_time = pd.to_datetime(self.start_time, format=time_format)
        if self.stop_time:
            self.stop_time = pd.to_datetime(self.stop_time, format=time_format)
        start = self.start_time if self.start_time else None
        stop = self.stop_time if self.stop_time else None
        if start is not None or stop is not None:
            data = data.loc[start:stop]

        # split data into days from noon to noon; `offset` replaces the `base`
        # argument, which newer pandas versions no longer accept, and matches
        # the grouping used by the .csv loader
        days = data.groupby(pd.Grouper(level=0, freq="24h", offset="12h"))

        # iterate through days keeping track of the number of saved days
        count = 0
        for _, day_frame in days:
            # save each 24 hour day separately if there's enough data to analyze
            df = day_frame.copy()
            available_hours = (len(df) / float(self.fs)) / 3600.0
            if available_hours >= self.minimum_hours:
                count += 1
                dst = "/raw_days/{}_day_{}.h5".format(
                    self.src_name, str(count).zfill(2)
                )
                df.to_hdf(self.sub_dst + dst, key="raw_geneactiv_data_24hr", mode="w")

    def extract_activity_index(self):
        """
        Compute the activity index of every raw day file.

        Each ``.h5`` file in ``<sub_dst>/raw_days`` is processed in consecutive,
        non-overlapping windows of ``window_size`` seconds. The X, Y and Z
        columns of a window are passed through ``band_pass_filter`` and the
        resulting "bp" columns through ``activity_index``; the value is stored
        against the timestamp of the window's first sample.

        Outputs, under ``<sub_dst>/activity_index_days``:

        * ``<src_name>_activity_index_day_NN.h5`` for each day;
        * ``<patient id>_activity_index.csv`` with all days concatenated, where
          the patient id is the part of the recording name before the first "_".
        """
        os.makedirs(self.sub_dst + "/activity_index_days", exist_ok=True)

        # Collect the per-day files: every ".h5" entry of raw_days (macOS
        # ".DS_Store" entries excluded), as full paths in alphabetical order.
        raw_dir = self.sub_dst + "/raw_days/"
        days = sorted(
            raw_dir + name
            for name in os.listdir(raw_dir)
            if name.endswith(".h5") and ".DS_Store" not in name
        )

        header = ["Time", "activity_index"]
        # Number of samples per window; the window also advances by this
        # amount, so consecutive windows do not overlap.
        window = int(self.window_size * self.fs)

        all_activity_dataframes = []
        for count, day in enumerate(days, start=1):
            # load data
            df = pd.read_hdf(day)
            activity = []  # one [start time, activity index] pair per window

            # iterate through windows; the loop stops once fewer than `window`
            # samples remain after the current start position
            idx = 0
            while idx < len(df) - window:
                # preprocessing: take the window and remember when it starts
                temp = df[["X", "Y", "Z"]].iloc[idx : idx + window]
                start_time = temp.index[0]
                temp = temp.reset_index(drop=True)  # filter works on a 0..n-1 index

                # band pass filter with the cutoffs in self.band_pass_cutoff
                temp = band_pass_filter(
                    temp, self.fs, bp_cutoff=self.band_pass_cutoff, order=3
                )

                # activity index extraction on the band pass filtered channels
                bp_channels = [i for i in temp.columns.values[1:] if "bp" in i]
                activity.append(
                    [
                        start_time,
                        activity_index(temp, channels=bp_channels).values[0][0],
                    ]
                )
                idx += window  # advance to the next window

            # save data; the file name carries the source name and the
            # zero-padded day number
            activity = pd.DataFrame(activity, columns=header).set_index("Time")
            dst = "/activity_index_days/{}_activity_index_day_{}.h5".format(
                self.src_name, str(count).zfill(2)
            )
            activity.to_hdf(
                self.sub_dst + dst, key="activity_index_data_24hr", mode="w"
            )
            all_activity_dataframes.append(activity)

        # Single csv with the activity index of every day. The patient id
        # comes from the instance rather than from a notebook-level variable.
        patient_id = os.path.basename(self.src_name).split("_")[0]
        all_activity_data = pd.concat(all_activity_dataframes)
        all_activity_data.to_csv(
            self.sub_dst
            + "/activity_index_days/{}_activity_index.csv".format(patient_id)
        )

    def wear_detection(self):
        """
        Classify 15-minute steps of every raw day as worn (1) or not worn (0).

        For each ``.h5`` file in ``<sub_dst>/raw_days`` the per-axis standard
        deviation and value range are computed by ``roll_std_60_minute`` and
        ``roll_max_range_60_minute``. A step is marked as not worn when at most
        one axis reaches the standard deviation limit (0.013) or at most one
        axis reaches the range limit (0.15).

        Outputs, under ``<sub_dst>/wear_detection``:

        * ``wear_detection_day_NN.h5``: classification before rescoring;
        * ``wear_detection_rescored_day_NN.h5``: classification after three
          ``rescore`` passes (plus ``rescore_last_day`` on the last day);
        * ``<patient id>_wear_detection.csv``: the pre-rescoring classification
          of all days concatenated.
        """
        os.makedirs(self.sub_dst + "/wear_detection", exist_ok=True)

        # get days
        raw_dir = self.sub_dst + "/raw_days/"
        days = sorted(
            raw_dir + name
            for name in os.listdir(raw_dir)
            if name.endswith(".h5") and ".DS_Store" not in name
        )

        # Created once, before the loop, so that every day is kept; creating
        # it inside the loop left only the last day in the combined csv.
        all_wear_dataframes = []

        for count, day in enumerate(days, start=1):  # count identifies the day
            df = pd.read_hdf(day)[["X", "Y", "Z"]]

            # get std based classification criteria: flag each axis whose
            # standard deviation reaches 0.013, then count flagged axes per step
            df_std = self.roll_std_60_minute(df)
            df_std[df_std >= 0.013] = 1
            df_std[df_std < 0.013] = 0
            df_std = df_std.sum(axis=1)

            # get range based classification criteria: same idea with the
            # max-min range and a limit of 0.15
            df_range = self.roll_max_range_60_minute(df)
            df_range[df_range >= 0.15] = 1
            df_range[df_range < 0.15] = 0
            df_range = df_range.sum(axis=1)

            # classify: every step starts as worn (1) and becomes not worn (0)
            # when EITHER criterion is met by at most one axis
            df_wear = pd.DataFrame(df_std.copy()) * 0 + 1
            df_wear.columns = ["wear"]
            not_worn = (df_range.values <= 1) | (df_std.values <= 1)
            df_wear.loc[not_worn, "wear"] = 0

            # Keep an independent snapshot of the classification before
            # rescoring; rescore is not visible here and may modify the frame
            # it receives, which would otherwise leak into the combined csv.
            df_wear_csv = df_wear.copy()

            # save before rescoring
            df_wear.to_hdf(
                self.sub_dst
                + "/wear_detection/wear_detection_day_{}.h5".format(
                    str(count).zfill(2)
                ),
                key="wear_detection_24hr",
                mode="w",
            )

            # apply rescoring
            df_wear = self.rescore(df_wear)
            df_wear = self.rescore(df_wear)
            df_wear = self.rescore(df_wear)
            if count == len(days):
                # the last day gets an additional, dedicated rescoring pass
                df_wear = self.rescore_last_day(df_wear)

            # save post rescoring
            df_wear.to_hdf(
                self.sub_dst
                + "/wear_detection/wear_detection_rescored_day_{}.h5".format(
                    str(count).zfill(2)
                ),
                key="wear_detection_rescored_24hr",
                mode="w",
            )

            all_wear_dataframes.append(df_wear_csv)

        # Single csv with the classification of every day. The patient id
        # comes from the instance rather than from a notebook-level variable.
        patient_id = os.path.basename(self.src_name).split("_")[0]
        all_wear_data = pd.concat(all_wear_dataframes)
        all_wear_data.to_csv(
            self.sub_dst + "/wear_detection/{}_wear_detection.csv".format(patient_id)
        )
    
    def major_rest_period(self):

        try:
            os.mkdir(self.sub_dst + "/major_rest_period")  # set up output directory
        except OSError:
            pass
        count = 0
        mrps = []
        header = ["day", "major_rest_period", "available_hours"]

        # get days
        days = sorted(
            [
                self.sub_dst + "/raw_days/" + i
                for i in os.listdir(self.sub_dst + "/raw_days/")
                #if ".DS_Store" not in i
                if i.endswith(".h5") and ".DS_Store" not in i
            ]
        )
        for day in days:
            # DATAFRAME DF CRIADO AQUI --------
            df = pd.read_hdf(day)
            df = df[["X", "Y", "Z", "T"]]
            available_hours = (len(df) / float(self.fs)) / 3600.0 # numero de horas disponíveis de dados para o dia
            count += 1

            # process data
            """
                O código aplica uma mediana móvel de 5 segundos nos dados para suavizar os sinais.
                O ângulo do sensor é calculado usando os valores de aceleração nas direções X, Y e Z.
                Os ângulos são reamostrados para calcular a média a cada 5 segundos. 
            """

            df = df.rolling(int(5 * self.fs)).median()  # run rolling median 5 second
            
            df["angle"] = np.arctan(
                df["Z"] / ((df["X"] ** 2 + df["Y"] ** 2) ** 0.5)
            ) * (
                180.0 / np.pi
            )  # get angle

            df = (
                df[["angle", "T"]].resample("5s").mean().fillna(0) # reamostrando para calcular a media de 5s
            )  # get 5 second average

            # save intermediate data for plotting
            df["angle"].to_hdf(
                self.sub_dst
                + "/major_rest_period/5_second_average_arm_angle_day_{}.h5".format(
                    str(count).zfill(2)
                ),
                key="arm_angle_data_24hr",
                mode="w",
            )

            """ 
                O código calcula a diferença absoluta entre o ângulo atual e o anterior. Esse valor reflete mudanças de posição.
                Aplica uma mediana móvel de 60 minutos nos dados do ângulo e da temperatura.
            """

            df["angle"] = np.abs(
                df["angle"] - df["angle"].shift(1) # indica mudança de posicao
            )  # get absolute difference
            df_angle = self.roll_med(df["angle"], 60)  # run rolling median 5 minute
            df_temp = self.roll_med(df["T"], 60)  # run rolling median 5 minute

            # calculate and apply threshold
            thresh = np.min(
                [
                    np.max(
                        [
                            np.percentile(df_angle.Data.dropna().values, 10) * 15.0,
                            self.minimum_rest_threshold,
                        ]
                    ),
                    self.maximum_rest_threshold,
                ]
            )
            df_angle.Data[df_angle.Data < thresh] = 0  # apply threshold
            df_angle.Data[df_angle.Data >= thresh] = 1  # apply threshold

            # drop rest periods where temperature is below the temp threshold
            df_angle.Data[df_temp.Data <= self.min_t] = 1
            df = df_angle

            # BLOCOS DE DESCANSO
            # drop rest blocks < minimum_rest_block minutes (except first and last)
            """     
                Identifica os "blocos" de dados contínuos em df.Data. Ela faz isso calculando a 
                diferença entre os valores consecutivos em df.Data. Se a diferença não for zero (ne(0)), 
                significa que há uma transição entre um bloco de dados ativo e inativo ou vice-versa. 
                cumsum() acumula essas transições, rotulando cada bloco com um número diferente.
            """
            df["block"] = (df.Data.diff().ne(0)).cumsum()
            groups, iter_count = df.groupby(by="block"), 0 # agrupa com base nos grupos identificados
            for group in groups:
                iter_count += 1
                if iter_count == 1 or iter_count == len(groups): # bloco inicial e final sao ignorados
                    continue
                if (
                    group[1]["Data"].sum() == 0
                    and len(group[1]) < 12 * self.minimum_rest_block
                ):
                    df.Data[group[1].index[0] : group[1].index[-1]] = 1 # descanso muito curto passa a ser atividade 

           # BLOCOS DE ATIVIDADE
            # drop active blocks < allowed_rest_break minutes (except first and last)
            df["block"] = (df.Data.diff().ne(0)).cumsum()
            groups, iter_count = df.groupby(by="block"), 0
            for group in groups:
                iter_count += 1
                if iter_count == 1 or iter_count == len(groups):
                    continue
                if (
                    len(group[1]) == group[1]["Data"].sum()
                    and len(group[1]) < 12 * self.allowed_rest_break
                ):
                    df.Data[group[1].index[0] : group[1].index[-1]] = 0

            # get longest block
            df["block"] = (df.Data.diff().ne(0)).cumsum()
            best = 0 # armazena o comprimento do maior periodo encontrado ate o momento
            mrp = [] # armazena os horarios de inicio e fim do maior bloco
            for group in df.groupby(by="block"):
                # verifica se o bloco é um período de descanso e se o comprimento do bloco atual é maior que o maior bloco de descanso encontrado até o momento
                if group[1]["Data"].sum() == 0 and len(group[1]) > best:
                    best = len(group[1])
                    mrp = [group[1].index[0], group[1].index[-1] + pd.Timedelta("5m")] # add 5 min no ultimo registro para garantir que esta completamente representado

            # save predictions
            df.drop(columns=["block"], inplace=True)
            df.to_hdf(
                self.sub_dst
                + "/major_rest_period/rest_periods_day_{}.h5".format(
                    str(count).zfill(2)
                ),
                key="rest_period_data_24hr",
                mode="w"
            )

            mrps.append([count, mrp, available_hours])

        # aggregate and save the major rest period for each day
        mrps = pd.DataFrame(mrps)
        mrps.columns = header
        mrps.set_index("day", inplace=True)
        dst = "/major_rest_period/{}_major_rest_periods.csv".format(self.src_name)
        mrps.to_csv(self.sub_dst + dst)

    def sleep_wake_predict(self):
        """
        Run the ColeKripke sleep/wake prediction on every activity index day.

        Each ``.h5`` file in ``<sub_dst>/activity_index_days`` is loaded, its
        ``activity_index`` column is passed to ``ColeKripke`` and replaced by
        the resulting ``sleep_predictions`` column.

        Outputs, under ``<sub_dst>/sleep_wake_predictions``:

        * ``sleep_wake_day_NN.h5`` for each day;
        * ``<patient id>_ck_predictions.csv`` with all days concatenated, where
          the patient id is the part of the recording name before the first "_".
        """
        os.makedirs(self.sub_dst + "/sleep_wake_predictions", exist_ok=True)

        # get days
        activity_dir = self.sub_dst + "/activity_index_days/"
        days = sorted(
            activity_dir + name
            for name in os.listdir(activity_dir)
            if name.endswith(".h5") and ".DS_Store" not in name
        )

        all_predictions_dataframes = []
        for count, day in enumerate(days, start=1):
            df = pd.read_hdf(day)

            # run the sleep wake predictions and store them as a new column
            ck = ColeKripke(df.activity_index)
            df["sleep_predictions"] = ck.predict()

            # save predictions (the activity index itself is not kept)
            df.drop(inplace=True, columns=["activity_index"])
            df.to_hdf(
                self.sub_dst
                + "/sleep_wake_predictions/sleep_wake_day_{}.h5".format(
                    str(count).zfill(2)
                ),
                key="sleep_wake_data_24hr",
                mode="w",
            )

            all_predictions_dataframes.append(df)

        # Single csv with the predictions of every day. The patient id comes
        # from the instance rather than from a notebook-level variable.
        patient_id = os.path.basename(self.src_name).split("_")[0]
        all_predictions_data = pd.concat(all_predictions_dataframes)
        all_predictions_data.to_csv(
            self.sub_dst
            + "/sleep_wake_predictions/{}_ck_predictions.csv".format(patient_id)
        )

    def calculate_endpoints(self):

        try:
            os.mkdir(self.sub_dst + "/sleep_endpoints")  # set up output directory
        except OSError:
            pass
        count = 0

        # get days
        days = sorted(
            [
                self.sub_dst + "/sleep_wake_predictions/" + i
                for i in os.listdir(self.sub_dst + "/sleep_wake_predictions/")
                #if ".DS_Store" not in i
                if i.endswith(".h5") and ".DS_Store" not in i
            ]
        )

        # get major rest periods for each day
        mrps = pd.read_csv(
            self.sub_dst
            + "/major_rest_period/{}_major_rest_periods.csv".format(self.src_name),
            parse_dates=True,
            index_col="day",
        )
        endpoints = []
        for day in days:
            count += 1
            df = pd.read_hdf(day) # hdf das predicoes
            # get and format times
            times = mrps.loc[count].major_rest_period # extrai o MRP para o dia atual

            """    
                Este bloco tenta formatar o período de descanso (armazenado como uma string) para 
                um formato de tempo válido para indexação do DataFrame df. Ele usa o método eval() para 
                avaliar a string e converte os tempos em valores de índice. Em seguida, seleciona os 
                dados de predições de sono e vigília apenas dentro do período de maior descanso.
            """
            try:
                idt = times.index("[T")
                times = times[: idt + 1] + "pd." + times[idt + 1 :]
                idt = times.index(", ")
                times = times[: idt + 2] + "pd." + times[idt + 2 :]
                times = eval(times)
                df = df.loc[times[0] : times[1]]
            except ValueError:
                pass

            # PARAMETROS IMPORTANTES PARA AVALIACAO DO SONO  -----------------------------------------
            # get total sleep time
            tst = len(df) - sum(df.values)

            # get percent time asleep
            pct_time_sleep = 100.0 * (len(df) - sum(df.values)) / float(len(df))

            # get wake after sleep onset
            waso = df.loc[df.idxmin()[0] :]
            waso = waso.sum()[0]

            # get sleep onset latency
            sleep_onset_lat = (df.idxmin()[0] - df.index[0]).total_seconds() / 60.0

            # number of wake bouts
            num_wake_bouts = 0
            wake_bout_df = df.copy()
            wake_bout_df["block"] = (
                wake_bout_df.sleep_predictions.diff().ne(0)
            ).cumsum()
            for group in wake_bout_df.groupby(by="block"):
                if group[1]["sleep_predictions"].sum() > 0:
                    num_wake_bouts += 1
            endpoints.append(
                [
                    int(count),
                    int(tst[0]),
                    int(np.round(pct_time_sleep[0])),
                    int(waso),
                    int(sleep_onset_lat),
                    int(num_wake_bouts),
                ]
            )

        # build and save output dataframe
        hdr = [
            "day",
            "total_sleep_time",
            "percent_time_asleep",
            "waso",
            "sleep_onset_latency",
            "number_wake_bouts",
        ]
        endpoints = pd.DataFrame(endpoints)
        endpoints.columns = hdr
        endpoints.set_index(endpoints.day, inplace=True)
        endpoints.drop(columns="day", inplace=True)
        endpoints.to_csv(self.sub_dst + "/sleep_endpoints/sleep_endpoints_summary.csv")

    def roll_med(self, df, num_samples):
        """
        Forward-looking rolling median.

        Every output row is labelled with the index of the first sample of its window and holds
        the median of the ``num_samples`` rows starting there. Windows start at positions
        ``0 .. len(df) - num_samples - 1``; nothing is produced when ``df`` is not longer than
        ``num_samples``.

        :param df: indexed pandas object to take the medians of
        :param num_samples: number of rows in each window
        :return: dataframe indexed by "Time" with a single "Data" column
        """
        # one [window start label, window median] pair per window
        medians = [
            [df.index[start], df.iloc[start : start + num_samples].median()]
            for start in range(len(df) - num_samples)
        ]

        # format data frame
        rolled = pd.DataFrame(medians, columns=["Time", "Data"])
        return rolled.set_index("Time")

    def roll_std_60_minute(self, df):
        """
        Standard deviation over 60 minute windows that advance in 15 minute steps.

        Window and step lengths are converted to samples with ``self.fs``. Each output row is
        labelled with the index of the first sample of its window; windows near the end of the
        data are shorter because the slice is simply cut at the last row.

        :param df: dataframe whose first three columns are reported as X, Y and Z
        :return: dataframe indexed by "Time" with columns X, Y, Z
        """
        step = int(900 * self.fs)  # 900 s = 15 minutes
        span = int(3600 * self.fs)  # 3600 s = 60 minutes

        records = []
        for start in range(0, len(df) - step, step):
            spread = df.iloc[start : start + span].std().values
            # window start label followed by the std of the first three columns
            records.append([df.index[start], spread[0], spread[1], spread[2]])

        # format dataframe
        rolled = pd.DataFrame(records, columns=["Time", "X", "Y", "Z"])
        return rolled.set_index("Time")

    def roll_max_range_60_minute(self, df):
        """
        Range (max - min) over 60 minute windows that advance in 15 minute steps.

        Window and step lengths are converted to samples with ``self.fs``. Each output row is
        labelled with the index of the first sample of its window; windows near the end of the
        data are shorter because the slice is simply cut at the last row.

        :param df: dataframe whose first three columns are reported as X, Y and Z
        :return: dataframe indexed by "Time" with columns X, Y, Z
        """
        step = int(900 * self.fs)  # 900 s = 15 minutes
        span = int(3600 * self.fs)  # 3600 s = 60 minutes

        records = []
        for start in range(0, len(df) - step, step):
            window = df.iloc[start : start + span]
            extent = window.max().values - window.min().values
            # window start label followed by the range of the first three columns
            records.append([df.index[start], extent[0], extent[1], extent[2]])

        # format dataframe
        rolled = pd.DataFrame(records, columns=["Time", "X", "Y", "Z"])
        return rolled.set_index("Time")

    def visualize_results(self):
        """
        Generates reports to visualize endpoint summary and day to day endpoint behaviors.

        Inputs are read from ``self.sub_dst``: the per-day files in ``raw_days``,
        ``wear_detection``, ``major_rest_period``, ``activity_index_days`` and
        ``sleep_wake_predictions``, plus ``sleep_endpoints/sleep_endpoints_summary.csv``.
        The per-day file lists are sorted and then paired by position, so each folder is
        expected to hold one matching file per day.

        Outputs are written to ``self.sub_dst + "/reports"``:

        - ``Visual_Results_Day_<n>.pdf``: one figure per day with the endpoint table, the raw
          X/Y/Z signals, temperature (with the ``self.min_t`` threshold line), light, activity
          index, arm angle, and the wake / rest period / on-body streams drawn as lines. All
          streams are resampled to 60 s and placed on a 1440 minute grid that starts at 12:00
          on the date of the first raw sample.
        - ``Summary_Report.pdf``: one bar chart per endpoint across days.
        """

        def _day_files(folder, keep=None):
            # Sorted full paths of the entries of <sub_dst>/<folder>/, skipping ".DS_Store"
            # entries and any name rejected by the optional `keep` predicate.
            base = self.sub_dst + "/" + folder + "/"
            return sorted(
                base + name
                for name in os.listdir(base)
                if ".DS_Store" not in name and (keep is None or keep(name))
            )

        try:
            os.mkdir(self.sub_dst + "/reports")  # set up output directory
        except OSError:
            pass

        # raw (all 3 axes, temp, light)
        rdays = _day_files("raw_days")
        # wear (no rescoring)
        wdays = _day_files(
            "wear_detection", lambda name: "rescored" not in name and ".h5" in name
        )
        # wear (with rescoring)
        wdays_re = _day_files("wear_detection", lambda name: "rescored" in name)
        # major rest (arm angle)
        mrdays_aa = _day_files("major_rest_period", lambda name: "angle" in name)
        # major rest (periods)
        mrdays_rp = _day_files(
            "major_rest_period", lambda name: "angle" not in name and "h5" in name
        )
        # activity index (full 24 hours)
        aidays = _day_files("activity_index_days")
        # sleep wake (full 24 hours)
        swdays = _day_files("sleep_wake_predictions")
        # endpoints (graphs/charts per day)
        endpoints = pd.read_csv(
            self.sub_dst + "/sleep_endpoints/sleep_endpoints_summary.csv",
            index_col="day",
        )

        # column headers of the per-day endpoint table
        t_labels = (
            "Total Sleep Time(minutes)",
            "Percent Time Asleep",
            "Wake After Sleep Onset(minutes)",
            "Sleep Onset Latency(minutes)",
            "Number of Wake Bouts",
        )

        for day, raw_path in enumerate(rdays):
            # read the raw data, downsample for plotting
            raw = pd.read_hdf(raw_path)
            raw = raw.resample("60s").median()

            # get shared index
            idx = pd.date_range(
                start=raw.index[0].replace(hour=12, minute=0, second=0, microsecond=0),
                periods=1440,
                freq="60s",
            )
            raw = raw.reindex(idx, fill_value=float("nan"))

            # read the wear data, resample and match index with the raw data
            wear = pd.read_hdf(wdays[day])  # 15 minute period
            wear[wear == 0] = float("nan")
            wear = wear.resample("60s").ffill()
            wear = wear.reindex(idx, fill_value=float("nan"))

            # read the wear data with rescoring, resample and match the raw index
            wear_re = pd.read_hdf(wdays_re[day])  # 15 minute period
            wear_re[wear_re == 0] = float("nan")
            wear_re = wear_re.resample("60s").ffill()
            wear_re = wear_re.reindex(idx, fill_value=float("nan"))

            # read the arm angle data, resample and match the raw index
            angle = pd.read_hdf(mrdays_aa[day])  # 5 second period
            angle = angle.resample("60s").max()
            angle = angle.reindex(idx, fill_value=float("nan"))

            # read the major rest period data, resample and match the raw index
            # (1 is blanked out and 0 becomes 1, so only the 0-valued samples are drawn)
            periods = pd.read_hdf(mrdays_rp[day])  # 5 second period
            periods[periods == 1] = float("nan")
            periods[periods == 0] = 1
            periods = periods.resample("60s").max()
            periods = periods.reindex(idx, fill_value=float("nan"))

            # read the activity index data, resample and match the raw index
            aindex = pd.read_hdf(aidays[day])  # 1 minute period
            aindex = aindex.resample("60s").max()
            aindex = aindex.reindex(idx, fill_value=float("nan"))

            # read the sleep wake predictions, resample and match the raw index
            swake = pd.read_hdf(swdays[day])  # 1 minute period
            swake[swake == 0] = float("nan")
            swake = swake.resample("60s").max()
            swake = swake.reindex(idx, fill_value=float("nan"))

            # build a dataframe for plotting certain data streams as straight lines;
            # the small offsets keep the four lines from overlapping
            df = swake.copy()
            df.columns = ["wake"]
            df["rest periods"] = periods.values - 0.05
            df["on body"] = wear.values - 0.1
            df["on body(rescore)"] = wear_re.values - 0.15
            del swake, wear, wear_re, periods  # no longer needed once copied into df

            # get day endpoints for plotting of table
            t_vals = [endpoints.loc[day + 1].values]

            # plotting
            fig, (axt, ax0, ax1, ax2, ax3, ax4, ax5) = plt.subplots(
                7, 1, figsize=(30, 15)
            )
            fig.suptitle(
                "Visual Report for Source: {}\nDay: {}\nDate: {}".format(
                    self.src_name, day + 1, idx[0].date()
                ),
                fontsize=25,
            )
            hours = mdates.HourLocator(interval=1)
            h_fmt = mdates.DateFormatter("%H:%M")
            all_axes = (ax0, ax1, ax2, ax3, ax4, ax5)

            # plot table
            tbl = axt.table(
                cellText=t_vals,
                colLabels=t_labels,
                cellLoc="center",
                rowLoc="center",
                loc="center",
                fontsize=20,
            )
            tbl.auto_set_font_size(False)
            tbl.set_fontsize(24)
            tbl.scale(1.1, 2.4)
            axt.axis("off")

            # plot raw
            raw = raw.rename(columns={"T": "Temperature", "LUX": "Light"})
            raw[["X", "Y", "Z"]].plot(ax=ax0, lw=1).legend(
                bbox_to_anchor=(0, 1), fontsize=20
            )
            ax0.set_ylabel("")
            ax0.set_xlabel("")

            # plot temperature
            raw[["Temperature"]].plot(
                ax=ax1, lw=1, color=sns.xkcd_rgb["pale red"]
            ).legend(bbox_to_anchor=(0, 1), fontsize=20)
            ax1.axhline(y=self.min_t, color="r", linestyle="--", lw=2)
            props = dict(boxstyle="round", facecolor="lavender", alpha=0.35)
            textstr = u"max: {}\xb0C\nmin: {}\xb0C\nthresh: {}\xb0C".format(
                raw[["Temperature"]].max().values[0],
                raw[["Temperature"]].min().values[0],
                self.min_t,
            )
            ax1.text(
                0.005,
                0.95,
                textstr,
                transform=ax1.transAxes,
                fontsize=14,
                verticalalignment="top",
                bbox=props,
            )
            ax1.set_ylabel("")
            ax1.set_xlabel("")

            # plot light
            raw[["Light"]].plot(ax=ax2, lw=1, color=sns.xkcd_rgb["pale orange"]).legend(
                bbox_to_anchor=(0, 1), fontsize=20
            )
            ax2.set_ylabel("")
            ax2.set_xlabel("")

            # plot activity index
            aindex.plot(ax=ax3, lw=1, color="#6fc276").legend(
                labels=["activity"], bbox_to_anchor=(0, 0.75), fontsize=20
            )
            ax3.set_ylabel("")
            ax3.set_xlabel("")

            # plot arm angle
            angle.plot(ax=ax4, lw=1, color="#b36ff6").legend(
                labels=["arm angle"], bbox_to_anchor=(0, 0.75), fontsize=20
            )
            ax4.set_ylabel("")
            ax4.set_xlabel("")

            # plot dataframe of 4 streams
            df.plot(ax=ax5, lw=8, x_compat=True).legend(
                bbox_to_anchor=(0, 1.3), fontsize=20
            )
            ax5.set_ylabel("")
            ax5.set_xlabel("")

            # plot formatting
            plt.draw()
            for ax in all_axes:
                for side in ("top", "right", "bottom", "left"):
                    ax.spines[side].set_visible(False)
                ax.grid(False)
                if ax is not ax5:  # only the bottom axis keeps its time ticks
                    ax.get_xaxis().set_ticks([])
                ax.get_yaxis().set_ticks([])
            ax5.xaxis.set_major_locator(hours)
            ax5.xaxis.set_major_formatter(h_fmt)
            fig.subplots_adjust(wspace=0, hspace=0)
            fig.autofmt_xdate()
            # set the hour-label size once, on the bottom axis explicitly, instead of calling
            # plt.xticks() once per tick through the implicit "current axes"
            ax5.tick_params(axis="x", labelsize=16)
            fig.savefig(
                self.sub_dst + "/reports/Visual_Results_Day_{}.pdf".format(day + 1)
            )
            plt.close(fig)

        # generate a summary plot from endpoint data
        fig, (ax0, ax1, ax2, ax3, ax4) = plt.subplots(5, 1, figsize=(12, 12))
        fig.suptitle("Summary Report for Source: {}".format(self.src_name), fontsize=16)
        all_axes = (ax0, ax1, ax2, ax3, ax4)
        ylabels = [
            "Total Sleep\nTime(min)\nMean: {}".format(
                int(np.round(endpoints.total_sleep_time.mean()))
            ),
            "Percent Time\nAsleep\nMean: {}".format(
                int(np.round(endpoints.percent_time_asleep.mean()))
            ),
            "Wake After\nSleep Onset(min)\nMean: {}".format(
                int(np.round(endpoints.waso.mean()))
            ),
            "Sleep Onset\nLatency(min)\nMean: {}".format(
                int(np.round(endpoints.sleep_onset_latency.mean()))
            ),
            "Number of\nWake Bouts\nMean: {}".format(
                int(np.round(endpoints.number_wake_bouts.mean()))
            ),
        ]

        # plot total sleep time
        endpoints.total_sleep_time.plot.bar(ax=ax0, title="")
        # plot percent time asleep
        endpoints.percent_time_asleep.plot.bar(ax=ax1, title="")
        # plot wake after sleep onset
        endpoints.waso.plot.bar(ax=ax2, title="")
        # plot sleep onset latency
        endpoints.sleep_onset_latency.plot.bar(ax=ax3, title="")
        # plot the number of wake bouts
        endpoints.number_wake_bouts.plot.bar(ax=ax4, title="")

        # plot formatting
        for position, ax in enumerate(all_axes):
            for side in ("top", "right", "bottom", "left"):
                ax.spines[side].set_visible(False)
            ax.grid(False)
            ax.set_ylabel(ylabels[position], rotation=0, fontsize=12, labelpad=50)
            if ax is not ax4:
                ax.set_xlabel("")
                ax.get_xaxis().set_ticks([])
            else:  # only the bottom chart shows the day axis
                ax.set_xlabel("Day", fontsize=20)
            ax.get_yaxis().set_ticks([])
            plt.setp(ax.xaxis.get_majorticklabels(), rotation=0)
            # write each bar's value at the base of the bar
            for p in ax.patches:
                ax.annotate(
                    np.round(p.get_height(), decimals=2),
                    (p.get_x() + p.get_width() / 2.0, 0),
                    ha="center",
                    va="center",
                    xytext=(0, 10),
                    textcoords="offset points",
                    fontweight="bold",
                )
        fig.subplots_adjust(wspace=0, hspace=0.01)
        plt.xticks(fontsize=20)
        plt.draw()
        fig.savefig(self.sub_dst + "/reports/Summary_Report.pdf")
        plt.close(fig)

    def rescore(self, df):
        """
        Rescore short, isolated wear periods as non-wear.

        Consecutive rows with the same ``wear`` value are grouped into blocks. Every row is
        counted as 0.25 hours (15 minutes). A wear block that has a block on each side is set to
        non-wear (0) when either

        - it is shorter than 3 hours and shorter than 80% of its two neighbours combined, or
        - it is shorter than 6 hours and shorter than 30% of its two neighbours combined.

        Block lengths are taken from the classification as it was on entry, so a rescored block
        does not change how later blocks are judged. The first and last blocks are never
        rescored.

        :param df: dataframe with a ``wear`` column; it is modified in place
        :return: the same dataframe with the rescored ``wear`` column
        """
        # group classifications into wear and nonwear blocks: diff().ne(0) marks every change
        # of the wear value and cumsum() turns those marks into a running block number
        df["block"] = (df.wear.diff().ne(0)).cumsum()
        blocks = list(df.groupby("block"))

        # iterate through the blocks that have both a previous and a next block
        for i in range(1, len(blocks) - 1):
            block_id, block = blocks[i]
            if not block["wear"].values[0]:  # only wear blocks are candidates
                continue

            # hour lengths of the previous, current, and next blocks
            prev = len(blocks[i - 1][1]) * 0.25
            current = len(block) * 0.25
            post = len(blocks[i + 1][1]) * 0.25
            ratio = current / (prev + post)

            if (current < 3 and ratio < 0.8) or (current < 6 and ratio < 0.3):
                # Single .loc assignment instead of df["wear"][mask] = 0: the chained form
                # writes to a temporary and is a no-op under pandas Copy-on-Write.
                df.loc[df.block == block_id, "wear"] = 0
        df.drop(columns=["block"], inplace=True)
        return df

    def rescore_last_day(self, df):
        """
        Rescore short wear periods that start within the last 24 hours of the data.

        Consecutive rows with the same ``wear`` value are grouped into blocks and every row is
        counted as 0.25 hours (15 minutes). A wear block that starts later than 24 hours before
        the last index value is set to non-wear (0) when it is shorter than 3 hours and the
        block before it is at least 1 hour long. The first block is never rescored.

        :param df: dataframe with a ``wear`` column and a datetime index; modified in place
        :return: the same dataframe with the rescored ``wear`` column
        """
        # group classifications into wear and nonwear blocks
        df["block"] = (df.wear.diff().ne(0)).cumsum()
        blocks = list(df.groupby("block"))

        # get the start index of the last day
        last_day_index = df.index[-1] - pd.to_timedelta("24h")

        # iterate through the blocks that have a previous block
        for i in range(1, len(blocks)):
            block_id, block = blocks[i]
            # only wear blocks that begin within the last day are candidates
            if not (block["wear"].values[0] and block.index[0] > last_day_index):
                continue

            # hour lengths of the previous and current blocks
            prev = len(blocks[i - 1][1]) * 0.25
            current = len(block) * 0.25

            if current < 3 and prev >= 1:
                # Single .loc assignment instead of df["wear"][mask] = 0: the chained form
                # writes to a temporary and is a no-op under pandas Copy-on-Write.
                df.loc[df.block == block_id, "wear"] = 0
        df.drop(columns=["block"], inplace=True)
        return df

    def aggregate_results(self):
        """
        Aggregates all results in a single folder.

        Copies into ``self.sub_dst + "/results"`` every entry of ``reports`` and
        ``sleep_endpoints`` and the ``.csv`` entries of ``major_rest_period``. Names containing
        ".DS_Store" are skipped. The results folder is created when it does not exist yet.
        """
        destination = self.sub_dst + "/results"
        try:
            os.mkdir(destination)  # set up output directory
        except OSError:
            pass

        # (sub-folder, extra condition an entry name must satisfy), in copy order
        selections = (
            ("/reports", lambda name: True),
            ("/major_rest_period", lambda name: ".csv" in name),
            ("/sleep_endpoints", lambda name: True),
        )

        # collect results files; every folder is listed before anything is copied
        to_copy = []
        for folder, wanted in selections:
            for entry in os.listdir(self.sub_dst + folder):
                if wanted(entry) and ".DS_Store" not in entry:
                    to_copy.append(self.sub_dst + folder + "/" + entry)

        # aggregate
        for path in to_copy:
            copy(path, destination)

    def clear_data(self):
        """
        Clears all intermediate data, keeping only results.

        Removes every sub-directory of ``self.sub_dst`` whose name contains neither "results"
        nor ".DS_Store". Entries that are not directories are left untouched: ``rmtree`` only
        works on directories and raises when given a regular file.
        """
        for entry in os.listdir(self.sub_dst):
            if "results" in entry or ".DS_Store" in entry:
                continue
            path = os.path.join(self.sub_dst, entry)
            if os.path.isdir(path):
                rmtree(path)


As seguintes funções são globais, utilizadas por mais de uma classe.

In [3]:

def band_pass_filter(
    data_df, sampling_rate, bp_cutoff, order, channels=["X", "Y", "Z"]
):
    """
    Band-pass filter a given sensor signal.

    A Butterworth band-pass filter is applied forwards and backwards (``filtfilt``) to every
    requested channel. The filtered values are added to ``data_df`` as new columns named
    ``<channel>_bp_filt_<bp_cutoff>`` and are written row by row, so the dataframe may carry
    any index (for example a DatetimeIndex).

    :param data_df: dataframe housing sensor signals (modified in place)
    :param sampling_rate: sampling rate of signal (Hz)
    :param bp_cutoff: filter cutoffs as [low, high] (Hz)
    :param order: filter order
    :param channels: channels of signal to filter
    :return: dataframe of raw and filtered data
    """
    channels = list(channels)  # a list (not a tuple) is needed to select several columns
    data = data_df[channels].values

    # Express the cutoffs (Hz) as fractions of the Nyquist frequency (sampling_rate / 2),
    # which is the unit signal.butter expects for a digital filter
    critical_frequency = [
        bp_cutoff[0] * 2.0 / sampling_rate,
        bp_cutoff[1] * 2.0 / sampling_rate,
    ]

    # Get the numerator (b) and denominator (a) of the IIR filter
    b, a = signal.butter(
        N=order, Wn=critical_frequency, btype="bandpass", analog=False
    )

    # Apply filter to raw data (along the rows, each column independently)
    bp_filtered_data = signal.filtfilt(b, a, data, padlen=10, axis=0)

    new_channel_labels = [ax + "_bp_filt_" + str(bp_cutoff) for ax in channels]

    # Assign plain arrays column by column. Wrapping the result in a fresh DataFrame gave it a
    # 0..n-1 index, and the assignment then aligned on index labels, which produced all-NaN
    # columns whenever data_df was not indexed 0..n-1.
    for label, column in zip(new_channel_labels, bp_filtered_data.T):
        data_df[label] = column

    return data_df


def activity_index(signal_df, channels=["X", "Y", "Z"]):
    """
    Compute activity index of sensor signals.

    The index is the square root of the mean, over the requested channels, of each channel's
    variance.

    :param signal_df: dataframe housing desired sensor signals
    :param channels: channels of signal to compute activity index
    :return: one-row dataframe with a single "activity_index" column
    """
    # variance of every channel, computed down the rows of its column (ddof=0)
    channel_variances = signal_df[channels].var(axis=0, ddof=0)

    # square root of the mean per-channel variance
    index_value = channel_variances.mean() ** 0.5

    # store the activity index value
    result = pd.DataFrame()
    result["activity_index"] = [index_value]
    return result


def bin2df(full_path):
    """

    Reads geneactiv .bin files into a pandas dataframe

    The file is read line by line. Calibration values (x/y/z gain and offset, Volts, Lux) are
    taken from the first lines of the file, the measurement frequency from the
    "Measurement Frequency:" line, and the page time and temperature from the
    "Page Time:" / "Temperature:" lines that precede each data page. A data page is a line of
    exactly 3600 hexadecimal characters: 300 samples of 12 characters (48 bits) each, laid out
    as X (12 bits, signed), Y (12, signed), Z (12, signed), light (10), button (1), unused (1).

    :param full_path: full path to geneactiv .bin file

    :return decode: pandas dataframe of GA data (columns X, Y, Z, LUX, T; index named "Time")

    :raises ValueError: if a data page appears before the header values needed to decode it,
        or if the file holds no data page

    """
    calibration_keys = (
        "x gain",
        "x offset",
        "y gain",
        "y offset",
        "z gain",
        "z offset",
        "Volts",
        "Lux",
    )
    calibration = {}  # calibration parameters read from the header
    fs = ""  # sampling frequency
    delta = None  # time offsets of the 300 samples of a page
    page_time = None  # start time of the current page
    temp = None  # temperature of the current page
    pages = []  # one decoded dataframe per data page

    def _calibrated_axis(words, shift, axis):
        # Extract the 12-bit two's-complement field found `shift` bits from the right of every
        # 48-bit sample and convert it with the gain and offset of the given axis.
        gain = calibration[axis + " gain"]
        offset = calibration[axis + " offset"]
        values = []
        for word in words:
            raw = (word >> shift) & 0xFFF
            if raw >= 2048:  # sign bit set
                raw -= 4096
            values.append(round((raw * 100.0 - offset) / gain, 4))
        return values

    with open(full_path, "rb") as in_file:
        in_file.readline()  # the first line of the file is not parsed
        for count, raw_line in enumerate(in_file, start=1):
            # The file is opened in binary mode, so every line is decoded to text before any
            # string search (mixing bytes and str raises TypeError on Python 3).
            line = raw_line.decode("latin-1").rstrip("\r\n")

            if count < 60:  # calibration parameters
                for name in calibration_keys:
                    if name in line:
                        calibration[name] = int(line.split(":")[-1])

            if "Page Time:" in line:  # time of the page
                # NOTE(review): the last two characters of the timestamp are dropped before
                # parsing, as in the original code; confirm this matches the file format.
                page_time = pd.to_datetime(
                    ":".join(line.split(":")[1:])[0:-2], format="%Y-%m-%d %H:%M:%S:%f"
                )

            if "Temperature:" in line:
                temp = float(line.split(":")[-1])

            if not fs:
                if "Measurement Frequency:" in line:  # sampling frequency
                    fs = float(line.split(":")[-1].split(" ")[0])
                    offset = np.array([1 / fs] * 300) * np.arange(0, 300)
                    delta = pd.to_timedelta(offset, unit="s")

            if len(line) == 3600:  # data page
                if (
                    delta is None
                    or page_time is None
                    or temp is None
                    or len(calibration) < len(calibration_keys)
                ):
                    raise ValueError(
                        "data page found before the header values needed to decode it"
                    )

                # hex to integer, one 48-bit word per sample
                words = [int(line[pos : pos + 12], 16) for pos in range(0, 3600, 12)]

                # bit fields to calibrated values
                decode = pd.DataFrame(
                    {
                        "X": _calibrated_axis(words, 36, "x"),
                        "Y": _calibrated_axis(words, 24, "y"),
                        "Z": _calibrated_axis(words, 12, "z"),
                        "LUX": [
                            ((word >> 2) & 0x3FF)
                            * calibration["Lux"]
                            / calibration["Volts"]
                            for word in words
                        ],
                    },
                    index=pd.DatetimeIndex([page_time] * 300) + delta,
                )
                decode["T"] = temp
                pages.append(decode)

    if not pages:
        raise ValueError("no data pages found in {}".format(full_path))

    df = pd.concat(pages, axis=0)
    df.index.name = "Time"
    return df[["X", "Y", "Z", "LUX", "T"]]

Implementação da classe ColeKripke para utilização do algoritmo de mesmo nome.

In [4]:
class ColeKripke:
    """
    Sleep/wake detection on epoch level activity data. Epochs are 1 minute long and the activity
    of each epoch is represented by an activity index. Predictions use 1 for wake and 0 for sleep.
    """

    def __init__(self, activity_index):
        """
        Store the activity data; no prediction is made yet.

        :param activity_index: epoch level activity index values (one per epoch)
        """
        self.activity_index = activity_index
        self.predictions = None

    def predict(self, sf=np.array(0.193125)):
        """
        Predict sleep/wake states from the activity index data.

        The activity values are convolved with the scaled weights, thresholded at 0.5
        (at or above is wake) and then passed through :meth:`rescore`.

        :param sf: scale factor to use for the predictions (default corresponds to scale factor optimized for use with
        the activity index, if other activity measures are desired the scale factor can be modified or optimized.)
        The recommended range for the scale factor is between 0.1 and 0.25 depending on the sensitivity to activity
        desired, and possibly the population being observed.

        :return: rescored predictions
        """
        weights = np.array([4.64, 6.87, 3.75, 5.07, 16.19, 5.84, 4.024, 0.00, 0.00])
        kernel = sf * weights[::-1]
        scores = np.convolve(self.activity_index, kernel, "same")

        # threshold: both masks are taken from the raw scores before anything is overwritten
        is_wake = scores >= 0.5
        is_sleep = scores < 0.5
        scores[is_wake] = 1
        scores[is_sleep] = 0

        # rescore the original predictions
        self.rescore(scores)
        return self.predictions

    def rescore(self, predictions):
        """
        Apply Webster's rescoring rules as described in the Cole-Kripke paper and store the
        result in ``self.predictions``.

        :param predictions: array of predictions (1 = wake, 0 = sleep); it is not modified
        :return: None
        """
        rescored = predictions.copy()

        # rules a through c: after a run of wake, the first minutes of sleep become wake.
        # Values are read back from `rescored`, so minutes rescored here count towards the
        # next wake run.
        wake_run = 0
        for t in range(len(rescored)):
            if rescored[t] == 1:
                wake_run += 1
                continue
            if wake_run >= 15:  # rule c: at least 15 minutes of wake -> next 4 minutes
                extend = 4
            elif wake_run >= 10:  # rule b: at least 10 minutes of wake -> next 3 minutes
                extend = 3
            elif wake_run >= 4:  # rule a: at least 4 minutes of wake -> next 1 minute
                extend = 1
            else:
                extend = 0
            if extend:
                rescored[t : t + extend] = 1.0
            wake_run = 0

        # rule d: 6 minutes or less of sleep surrounded by at least 10 minutes of wake on each
        # side gets rescored
        sleep_run = 0
        run_start = 0
        for t in range(10, len(rescored) - 10):
            if rescored[t] == 0:
                if sleep_run == 0:
                    run_start = t  # first minute of this sleep run
                sleep_run += 1
                continue
            if 0 < sleep_run <= 6:
                wake_before = sum(rescored[run_start - 10 : run_start]) == 10.0
                if wake_before and sum(rescored[t : t + 10]) == 10.0:
                    rescored[run_start:t] = 1.0
            sleep_run = 0

        self.predictions = rescored

Atribuindo os valores para os atributos da classe SleepPy.


<font color=blue> Descrição das variáveis informadas pelo usuário: </font>
- input_file: caminho completo para o arquivo a ser processado  
            
        Ex: 'BIN/1379-38_left wrist_046456_2018-08-03 10-23-38.csv'
- results_directory = caminho completo para o diretório onde os resultados devem ser salvos
- sampling_frequency = frequência de amostragem em que os dados foram gravados
- start_buffer: número de segundos a serem ignorados no início da gravação 
- stop_buffer: número de segundos a serem ignorados no final da gravação
- start_time: data e hora a partir da qual o processamento dos dados deve começar (string) no formato: "%Y-%m-%d %H:%M:%S:%f"
- stop_time: data e hora em que o processamento dos dados deve parar (string) no formato: "%Y-%m-%d %H:%M:%S:%f"
- run_config: valor inteiro entre 0 e 6, que define qual etapa do processamento será executada 

        - self.run_config <= 0: split the data into 24 hour periods  
        - self.run_config <= 1: extract the activity index feature 
        - self.run_config <= 2: run wear/on-body detection  
        - self.run_config <= 3: run major rest period detection  
        - self.run_config <= 4: run sleep wake predictions on the major rest period  
        - self.run_config <= 5: calculate endpoints based on the above predictions  
        - self.run_config <= 6: generates visual reports
        
- temperature_threshold: temperatura mínima aceitável para considerar um período de repouso como candidato válido
- minimum_rest_block: número mínimo de minutos necessário para considerar um período de repouso válido (inteiro)
- allowed_rest_break: número de minutos permitidos para interromper um período de repouso maior (inteiro)
- minimum_rest_threshold: limite mínimo permitido para determinar um período de repouso principal (valor decimal)
- maximum_rest_threshold: limite máximo permitido para determinar um período de repouso principal (valor decimal)
- minimum_hours: número mínimo de horas necessárias para considerar um dia como utilizável (inteiro)
- clear_intermediate_data: indicador booleano para limpar todos os dados intermediários
- aws_object: objeto de dados a ser processado a partir do AWS (em substituição ao caminho do arquivo de origem)
- verbose: booleano que define se o status deve ser impresso durante o processamento




In [2]:
# Settings the user must provide

# header field looked up in every CSV file
key = "Measurement Frequency"

# folder holding the CSV files to process, and the folder where results are saved
folder_name = "patient_act_data_CSV"
folder_path = "C:/Users/marim/Documents/Faculdade/TCC/patient_act_data_CSV"
results_directory = 'C:/Users/marim/Documents/Faculdade/TCC/' + folder_name

# sleep summary file passed to the merge step
sleep_sum_path = 'C:/Users/marim/Documents/Faculdade/TCC/HEATMAP/sleep_sum_TOTAL.csv'
# run_config

In [None]:
# Automatizando a leitura de arquivos

import csv_format as cs
from pathlib import Path

# Lendo os arquivos CSV 1379-38_left wrist_046456_2018-08-03 10-23-38
# patient_act_data -> pasta com os csv extraidos no app do GENEActiv
# Itera sobre todos os arquivos na pasta


# Settings shared by every recording. They are passed positionally to SleepPy below, in this
# order, after the input file, the results directory and the sampling frequency.
start_buffer = "0s"
stop_buffer = "0s"
start_time = ""
stop_time = ""
run_config = 0
temperature_threshold = 25.0
minimum_rest_block = 30
allowed_rest_break = 60
minimum_rest_threshold = 0.0
maximum_rest_threshold = 1000.0
minimum_hours = 6
clear_intermediate_data = False
aws_object = None
verbose = False

# Process every CSV file of the folder, in sorted order so that runs are reproducible
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".csv"):  # only CSV files are processed
        continue

    # full path of the file, always written with forward slashes
    input_file_path = str(Path(folder_path) / filename).replace("\\", "/")

    # value returned for `key` by the project helper
    # NOTE(review): presumably read from the header of the CSV file; confirm in csv_format
    var = cs.split_100_csv(input_file_path, key)

    if key == "Measurement Frequency":
        # NOTE(review): with any other key, sampling_frequency keeps whatever value it had
        # before (or is undefined on a fresh kernel)
        var = int(var)
        sampling_frequency = var

    input_file = input_file_path

    # progress log: file being processed, where results go, sampling frequency used
    print(input_file_path)
    print(results_directory)

    print(sampling_frequency)

    patient = SleepPy(
        input_file,
        results_directory,
        sampling_frequency,
        start_buffer,
        stop_buffer,
        start_time,
        stop_time,
        run_config,
        temperature_threshold,
        minimum_rest_block,
        allowed_rest_break,
        minimum_rest_threshold,
        maximum_rest_threshold,
        minimum_hours,
        clear_intermediate_data,
        aws_object,
        verbose,
    )
        

In [3]:
import merge_data as mgd

In [4]:
# Process the CSV files found in the sub-directories of folder_path together
# with the sleep summary at sleep_sum_path.
# NOTE(review): behaviour inferred from the function name and the log printed
# below (activity/raw_data files being read, one table per patient) -- confirm
# in merge_data.
mgd.read_csv_from_subdirs(folder_path, sleep_sum_path)

Lendo arquivo activity: C:/Users/marim/Documents/Faculdade/TCC/patient_act_data_CSV\1379-38_left wrist_046456_2018-08-03 10-23-38\1379-38_activity_index_day.csv
Lendo arquivo activity: C:/Users/marim/Documents/Faculdade/TCC/patient_act_data_CSV\1379-38_left wrist_046456_2018-08-03 10-23-38\1379-38_ck_predictions.csv
Lendo arquivo raw_data: C:/Users/marim/Documents/Faculdade/TCC/patient_act_data_CSV\1379-38_left wrist_046456_2018-08-03 10-23-38\1379-38_raw_data.csv
Lendo arquivo activity: C:/Users/marim/Documents/Faculdade/TCC/patient_act_data_CSV\1379-38_left wrist_046456_2018-08-03 10-23-38\1379-38_wear_detection.csv


  sleep_sum_cut.calendar_date = pd.to_datetime(sleep_sum_cut.calendar_date)
  merged_df['activity_index'] = merged_df['activity_index'].fillna(method='ffill')




Tabela: 1379-38
         Date      Time       X       Y       Z     T  Sleep  activity_index
0  2018-07-27  15:55:55 -0.0359 -0.2505 -1.0169  36.2      0        0.157358
1  2018-07-27  15:55:55 -0.1655 -0.1098 -0.8518  36.2      0        0.157358
2  2018-07-27  15:55:55 -0.1026 -0.1528 -0.9203  36.2      0        0.157358
3  2018-07-27  15:55:55 -0.0084 -0.1958 -0.9806  36.2      0        0.157358
4  2018-07-27  15:55:55 -0.0045 -0.2154 -0.9243  36.2      0        0.157358


In [5]:
# Load the "1379-38_data_final.csv" table and display it.
# NOTE(review): the "Unnamed: 0" and "Unnamed: 0.1" columns in the output look
# like row indexes that were written to the CSV when it was saved -- confirm
# and, if so, drop them where the file is written.
data_final = pd.read_csv('C:/Users/marim/Documents/Faculdade/TCC/patient_act_data_CSV/1379-38_data_final.csv')
data_final

Unnamed: 0.1,Unnamed: 0,Date,Time,X,Y,Z,T,Sleep,activity_index
0,0,2018-07-27,15:55:55,-0.0359,-0.2505,-1.0169,36.2,0,0.157358
1,1,2018-07-27,15:55:55,-0.1655,-0.1098,-0.8518,36.2,0,0.157358
2,2,2018-07-27,15:55:55,-0.1026,-0.1528,-0.9203,36.2,0,0.157358
3,3,2018-07-27,15:55:55,-0.0084,-0.1958,-0.9806,36.2,0,0.157358
4,4,2018-07-27,15:55:55,-0.0045,-0.2154,-0.9243,36.2,0,0.157358
...,...,...,...,...,...,...,...,...,...
29242795,29242795,2018-08-03,10:23:30,-0.0869,0.3985,-0.8478,22.9,0,0.120275
29242796,29242796,2018-08-03,10:23:30,-0.0673,0.3398,-0.8679,22.9,0,0.120275
29242797,29242797,2018-08-03,10:23:30,-0.0595,0.3516,-0.9122,22.9,0,0.120275
29242798,29242798,2018-08-03,10:23:30,-0.0477,0.4415,-0.9525,22.9,0,0.120275


## Testes

Instância de um objeto da classe SleepPy para teste do pacote.

Tempo aproximado de execução: 15 min

Aqui está rodando normalmente: 01/10/24 - 17 min

In [17]:
# Settings for one SleepPy run: a single CSV, i.e. one watch worn by one patient.

# Recording to process, output location and sampling frequency.
input_file = 'patient_act_data_CSV/1839-39_left wrist_046896_2020-12-18 19-35-29.csv'
results_directory = 'C:/Users/marim/Documents/Faculdade/TCC/patient_act_data_CSV'
sampling_frequency = 50

# Buffer and start/stop settings.
start_buffer, stop_buffer = "0s", "0s"
start_time, stop_time = "", ""

# Run mode and thresholds.
run_config = 0
temperature_threshold = 25.0
minimum_rest_block, allowed_rest_break = 30, 60
minimum_rest_threshold, maximum_rest_threshold = 0.0, 1000.0
minimum_hours = 6

# Housekeeping flags.
clear_intermediate_data = False
aws_object = None
verbose = False

In [None]:
# Instantiate SleepPy with the single-recording settings. All arguments are
# passed positionally, in the order of SleepPy.__init__.
teste = SleepPy(
    input_file, results_directory, sampling_frequency,
    start_buffer, stop_buffer,
    start_time, stop_time,
    run_config,
    temperature_threshold,
    minimum_rest_block, allowed_rest_break,
    minimum_rest_threshold, maximum_rest_threshold,
    minimum_hours,
    clear_intermediate_data,
    aws_object,
    verbose,
)

## Merge data - Excluir

In [None]:
# Load a 100-row sample of each table found under folder_path
# (raw data, activity index, wear detection and ck predictions).

# Substring that identifies each table in the file name. The order matters:
# the first substring found in the name decides the kind of the file.
FILE_KINDS = ("raw_data", "activity_index", "wear_detection", "ck_predictions")
sample_frames = {}

for dirpath, _dirnames, filenames in os.walk(folder_path):
    for filename in filenames:
        if not filename.endswith('.csv'):
            continue
        file_kind = next((kind for kind in FILE_KINDS if kind in filename), None)
        if file_kind is None:
            continue  # CSV that is not one of the expected tables

        csv_file_path = os.path.join(dirpath, filename)
        # If several files of the same kind exist (e.g. one per patient
        # sub-directory), the last one visited replaces the previous ones.
        sample_frames[file_kind] = pd.read_csv(csv_file_path, nrows=100)
        print(f"Arquivo CSV lido: {filename}")

        if file_kind == "raw_data":
            # File name without extension, split on "_"; the first piece is
            # used later as the patient ID (e.g. "1379-38_raw_data.csv").
            patient_id_csv = os.path.splitext(filename)[0].split('_')

# Fail with a clear message instead of a NameError (or stale kernel state)
# when one of the expected tables was not found.
missing_kinds = [kind for kind in FILE_KINDS if kind not in sample_frames]
if missing_kinds:
    raise FileNotFoundError(
        f"Arquivos CSV não encontrados em {folder_path}: {', '.join(missing_kinds)}"
    )

df_raw = sample_frames["raw_data"]
df_activity_index = sample_frames["activity_index"]
df_wear_detection = sample_frames["wear_detection"]
df_ck_predictions = sample_frames["ck_predictions"]

df_raw_f, df_activity_index_f, df_wear_detection_f, df_ck_predictions_f = mgd.df_format(df_raw, df_activity_index, df_wear_detection, df_ck_predictions)


In [None]:
# Parse the original "Time" column as a timestamp, then split it into a
# calendar date and a time-of-day string truncated to whole seconds.
df_ck_predictions = (
    df_ck_predictions
    .rename(columns={'Time': 'Datetime'})
    .assign(Datetime=lambda frame: pd.to_datetime(frame["Datetime"]))
    .assign(
        Date=lambda frame: frame["Datetime"].dt.date,
        Time=lambda frame: frame["Datetime"].dt.strftime('%H:%M:%S'),
    )
    # Keep only the columns needed downstream (drops "Datetime").
    .loc[:, ['Date', 'Time', 'sleep_predictions']]
)

df_ck_predictions.head()

In [None]:
# Parse the original "Time" column as a timestamp, then split it into a
# calendar date and a time-of-day string truncated to whole seconds.
df_activity_index = (
    df_activity_index
    .rename(columns={'Time': 'Datetime'})
    .assign(Datetime=lambda frame: pd.to_datetime(frame["Datetime"]))
    .assign(
        Date=lambda frame: frame["Datetime"].dt.date,
        Time=lambda frame: frame["Datetime"].dt.strftime('%H:%M:%S'),
    )
    # Keep only the columns needed downstream (drops "Datetime").
    .loc[:, ['Date', 'Time', 'activity_index']]
)

df_activity_index.head()

In [None]:
# Preview the first rows of the wear-detection table.
df_wear_detection.head()

In [None]:
# Keep only the columns needed to label sleep periods and parse the calendar
# date. The column is replaced with .assign on the selected copy instead of
# being set through attribute access on a slice of sleep_sum.
sleep_sum_cut = (
    sleep_sum[['ID', 'calendar_date', 'sleeponset_ts', 'wakeup_ts']]
    .assign(calendar_date=lambda frame: pd.to_datetime(frame['calendar_date']))
)
sleep_sum_cut.head()

In [None]:
# Parse the "Unnamed: 0" column as timestamps and derive from it a calendar
# date and a time-of-day string truncated to whole seconds.
raw_timestamps = pd.to_datetime(df_raw["Unnamed: 0"])
df_raw["Unnamed: 0"] = raw_timestamps
df_raw['Date'] = raw_timestamps.dt.date
df_raw['Time'] = raw_timestamps.dt.strftime('%H:%M:%S')

# Drop the timestamp column and put the remaining columns in a fixed order.
df_raw = (
    df_raw
    .drop(columns=["Unnamed: 0"])
    .loc[:, ['Date', 'Time', 'X', 'Y', 'Z', 'T', 'Sleep']]
)
df_raw.head()

In [None]:
# Sleep-summary rows whose ID equals the first piece of the raw-data file name.
# TODO: this should be repeated for every patient.
patient_sleep_sum = sleep_sum_cut.loc[sleep_sum_cut['ID'].eq(patient_id_csv[0])]
patient_sleep_sum

In [None]:
# Iterando sobre cada linha da tabela result
for _, row in patient_sleep_sum.iterrows():
    calendar_date = row['calendar_date']
    sleeponset_ts = row['sleeponset_ts']
    wakeup_ts = row['wakeup_ts']
    
    # Filtrando a segunda tabela pela data correspondente
    mask = df_raw['Date'] == calendar_date
    date_filtered_df = df_raw[mask]

    # Encontrar o indice 
    sleeponset_idx = date_filtered_df[date_filtered_df['Time'] == sleeponset_ts].index
    wakeup_idx = date_filtered_df[date_filtered_df['Time'] == wakeup_ts].index

    if not sleeponset_idx.empty and not wakeup_idx.empty:
        df_raw.loc[sleeponset_idx[0]:wakeup_idx[0], 'Sleep'] = 1

# Exibindo o resultado final
print(df_raw)


In [None]:
# Attach the activity index to every raw sample with the same date and time.
# The left join keeps all raw samples; those without a match get NaN.
merged_df = pd.merge(df_raw, df_activity_index, how='left', on=['Date', 'Time'])

# Carry each activity_index value forward until the next available value.
# Series.ffill() replaces the deprecated fillna(method='ffill').
merged_df['activity_index'] = merged_df['activity_index'].ffill()

merged_df