# Imports and config

In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
# Root directory that holds every dataset folder.
DATA_FOLDER = "data"
# Dataset folders available under DATA_FOLDER.
FOLDERS = [
    "folder_1",
    "folder_2",
]
# Folder processed by the rest of the notebook (first entry of FOLDERS).
SEL_FOLDER = FOLDERS[0]

# Data import

In [3]:
# Base names (extension stripped) of the CSV files in the selected folder.
# - os.path.splitext only removes the final extension, so names containing
#   extra dots are kept intact (f.split(".")[0] would truncate them).
# - Non-CSV entries are skipped because later cells read every name back as
#   "<name>.csv".
# - os.listdir returns entries in arbitrary order; sorting makes the notebook
#   deterministic from one run to the next.
files = sorted(
    os.path.splitext(f)[0]
    for f in os.listdir(f"{DATA_FOLDER}/{SEL_FOLDER}")
    if f.endswith(".csv")
)
files

['1_1', '1_2', '2_1', '2_2', '3_1', '3_2', '3_3', 'labels']

In [4]:
# Base name of the CSV file that holds the labels; every other file is a trace.
LABEL_FILENAME = "labels"
labels = pd.read_csv(
    f"{DATA_FOLDER}/{SEL_FOLDER}/{LABEL_FILENAME}.csv",
    index_col=0,
)
# Every listed file except the label file itself.
train_files = [name for name in files if name != LABEL_FILENAME]

In [20]:
# Slices of each trace, one per label row, keyed by "<file>_<label row index>".
refs = {}
anos = {}

for file in train_files:
    train_file = pd.read_csv(f"{DATA_FOLDER}/{SEL_FOLDER}/{file}.csv", index_col=0)
    train_file["original_file"] = file
    # Mirror the index into a regular column: the original index is dropped
    # by droplevel(1) after the concatenation below.
    train_file["timestamp"] = train_file.index
    stamps = train_file["timestamp"]

    # Label rows that describe the current trace.
    label_file = labels[labels["trace_id"] == file]
    for i in label_file.index:
        ano_id = label_file.loc[i, "ano_id"]
        key = f"{file}_{i}"

        # Reference window: start inclusive, end exclusive.
        in_ref = (stamps >= label_file.loc[i, "ref_start"]) & (stamps < label_file.loc[i, "ref_end"])
        # Anomaly window: both bounds inclusive.
        in_ano = (stamps >= label_file.loc[i, "ano_start"]) & (stamps <= label_file.loc[i, "ano_end"])

        refs[key] = train_file.loc[in_ref].assign(ano_id=ano_id, type_data="ref")
        anos[key] = train_file.loc[in_ano].assign(ano_id=ano_id, type_data="ano")

assert refs.keys() == anos.keys()
# Stack all slices; keep only the "<file>_<i>" key as index.
refs = pd.concat(refs).droplevel(1)
anos = pd.concat(anos).droplevel(1)
        

In [6]:
# Display the (rows, columns) shape of the concatenated reference and anomaly frames.
refs.shape, anos.shape

((56766, 15), (28411, 15))

In [16]:
def class_entropy(ts_a:list, ts_r:list) -> float:
    """Calculate the class entropy of a feature, which is the information
    needed to describe the class distribution between two time series.

    Only the lengths of the inputs are used: each class probability is the
    share of points belonging to that class, and the result is the Shannon
    entropy (in bits) of that two-class distribution.

    Parameters
    ----------
    ts_a : list
        A time series belonging to the abnormal class.
    ts_r : list
        A time series belonging to the reference class.

    Returns
    -------
    float
        The class entropy, between 0 (exclusive) and 1 (inclusive); it is 1
        when both series have the same length.

    Raises
    ------
    ValueError
        If either time series is empty.
    """
    nb_ts_a = len(ts_a)
    nb_ts_r = len(ts_r)
    if nb_ts_a == 0 or nb_ts_r == 0:
        raise ValueError(f"One of the time series is empty. Len of TSA is {nb_ts_a} and len of TSR is {nb_ts_r}.")
    nb_total = nb_ts_a + nb_ts_r
    p_a = nb_ts_a / nb_total
    p_r = nb_ts_r / nb_total
    # Shannon entropy is -sum(p * log2(p)); the leading minus sign is required
    # because log2(p) <= 0 for any probability p, otherwise the result would
    # be negative.
    h_class = -(p_a * np.log2(p_a) + p_r * np.log2(p_r))
    return h_class

In [None]:
def segmentation_entropy(data:pd.DataFrame) -> float:
    """Calculate the segmentation entropy of a feature (not implemented yet).

    Placeholder: the original definition had no body, which is a syntax
    error. The computation still has to be written.

    Parameters
    ----------
    data : pd.DataFrame
        Input data for the computation. The expected layout is not defined
        yet -- TODO confirm (presumably the values of one feature together
        with their "type_data" class, sorted by value).

    Returns
    -------
    float
        The segmentation entropy, once implemented.

    Raises
    ------
    NotImplementedError
        Always, until the computation is implemented.
    """
    raise NotImplementedError("segmentation_entropy is not implemented yet.")


In [24]:
# Bookkeeping columns appended to every trace during data import; they are not
# features. Excluding them by name avoids relying on their position: four such
# columns are added, so the previous columns[:-3] slice still let
# "original_file" through as a feature.
metadata_columns = {"original_file", "timestamp", "ano_id", "type_data"}

for file in train_files:
    ano_ids = list(labels.loc[labels["trace_id"]==file, "ano_id"])
    for ano_id in ano_ids:
        # Reference and anomaly rows for this (trace, anomaly) pair.
        selected_ref = refs.loc[(refs["ano_id"]==ano_id) & (refs["original_file"]==file), :]
        selected_ano = anos.loc[(anos["ano_id"]==ano_id) & (anos["original_file"]==file), :]
        features = [col for col in selected_ref.columns if col not in metadata_columns]
        for feature in features:
            # Computed inside the loop so that `feature` is defined when it is
            # used. class_entropy expects the abnormal series first, then the
            # reference series.
            class_ent = class_entropy(selected_ano[feature], selected_ref[feature])
            all_values = pd.concat([selected_ref[[feature, "type_data"]], selected_ano[[feature, "type_data"]]])
            sorted_values = all_values.sort_values(by=feature)
            # TODO: compute the segmentation entropy here.
            # Work in progress: stop after the first feature so that
            # sorted_values can be inspected in the next cell.
            break
        

In [27]:
# Class tag ("ref" / "ano") of each row, in ascending order of the feature value.
sorted_values["type_data"].tolist()

['ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
 'ref',
