Notebook corresponding to the "Approach-1" presented in the paper.

This is the same approach used in the ["Tinyml anomaly detection for industrial machines with periodic duty cycles" (Sensor Application Symposium 2024)](https://ieeexplore.ieee.org/abstract/document/10636584/), and serves as the baseline experiment.

Two experiments are carried out:
1) As in SAS2024, the performance is evaluated with leave-one-month-out cross-validation (CV) on the original 4 months (called DS1).
2) The generalization is evaluated using the whole of DS1 for training and the whole of DS2 for testing.

In [None]:
from custom_functions import *

import datetime
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Import data, extract features and preprocess

different functions used in the notebook

input data

In [None]:
# Folder holding the raw monthly CSV exports.
directory = "../../../data/"
# Column order of the DS1 files; the DS2 files are re-ordered to match it.
desired_order = ["High-pressure", "Low-pressure", "Speed"]

# first 4 months of data (DS1)
data_csv_jun21 = read_month_data(directory + 'Confidential_Drive_data_Jun2021.csv', 1)
data_csv_okt21 = read_month_data(directory + 'Confidential_Drive_data_Okt2021.csv', 1)
data_csv_jan22 = read_month_data(directory + 'Confidential_Drive_data_Jan2022.csv', 1)
data_csv_april22 = read_month_data(directory + 'Confidential_Drive_data_April2022.csv', 1)

# new 4 months (DS2), with the columns put in the same order as the DS1 files
data_csv_jun23 = read_month_data(directory + 'Confidential_Drive_data_June2023_Drift20.csv')[desired_order]
data_csv_aug23 = read_month_data(directory + 'Confidential_Drive_data_Aug2023_Drift20.csv')[desired_order]
data_csv_okt23 = read_month_data(directory + 'Confidential_Drive_data_Oct2023_Drift20.csv')[desired_order]
data_csv_dec23 = read_month_data(directory + 'Confidential_Drive_data_Dec2023_Drift20.csv')[desired_order]

# The October 2023 data has duplicated index entries: keep the first of each.
data_csv_okt23 = data_csv_okt23[~data_csv_okt23.index.duplicated(keep='first')]

# Apply the same three cleaning steps to every month, then rebind the names.
cleaned_months = []
for month_data in (data_csv_jun21, data_csv_okt21, data_csv_jan22, data_csv_april22,
                   data_csv_jun23, data_csv_aug23, data_csv_okt23, data_csv_dec23):
    # set negative speed readings to zero
    month_data.loc[month_data['Speed'] < 0, 'Speed'] = 0

    # complete the dataset: one row per minute between the first and the last
    # sample; timestamps that were missing become NaN rows
    full_timestamp = pd.date_range(start=month_data.index[0], end=month_data.index[-1],
                                   inclusive="both", freq="1min")
    month_data = month_data.reindex(full_timestamp)

    # fill the NaN rows (linear interpolation, per the original note; the
    # helper is called for its effect on the frame, its result is not used)
    interpolate_values(month_data)
    cleaned_months.append(month_data)

(data_csv_jun21, data_csv_okt21, data_csv_jan22, data_csv_april22,
 data_csv_jun23, data_csv_aug23, data_csv_okt23, data_csv_dec23) = cleaned_months

del desired_order, directory, full_timestamp, cleaned_months, month_data

compute features

In [None]:
# All monthly frames: the four DS1 months followed by the four DS2 months.
list_data_csv = (
    [data_csv_jun21, data_csv_okt21, data_csv_jan22, data_csv_april22]
    + [data_csv_jun23, data_csv_aug23, data_csv_okt23, data_csv_dec23]
)

# extract_features is called once per frame; its return value is not used,
# so any result must be stored on the frame itself.
for data in list_data_csv:
    extract_features(data)

ground truth reference

In [None]:
# Read the label files exported from Imagimob for the four DS1 months.
directory = "../../data/"
column_interest = ['Time(Seconds)', 'Length(Seconds)', "Label(string)"]

# State labels: each file is read and converted by df_timestamps directly.
timestamps_april2022 = df_timestamps(
    pd.read_csv(directory + "April_2022/Label.label", usecols=column_interest))
timestamps_jan2022 = df_timestamps(
    pd.read_csv(directory + "Jan_2022/Label.label", usecols=column_interest))
timestamps_jun2021 = df_timestamps(
    pd.read_csv(directory + "Jun_2021/Label.label", usecols=column_interest))
timestamps_okt2021 = df_timestamps(
    pd.read_csv(directory + "Okt_2021/Label.label", usecols=column_interest))

# Duty-cycle labels: the raw frames stay bound to file_imagimob_1..4
# (a later cell deletes these names, so they have to exist).
file_imagimob_1 = pd.read_csv(directory + "April_2022/Label_cycle.label", usecols=column_interest)
timestamps_cycle_april2022 = df_timestamps(file_imagimob_1)

file_imagimob_2 = pd.read_csv(directory + "Jan_2022/Label_cycle.label", usecols=column_interest)
timestamps_cycle_jan2022 = df_timestamps(file_imagimob_2)

file_imagimob_3 = pd.read_csv(directory + "Jun_2021/Label_cycle.label", usecols=column_interest)
timestamps_cycle_jun2021 = df_timestamps(file_imagimob_3)

file_imagimob_4 = pd.read_csv(directory + "Okt_2021/Label_cycle.label", usecols=column_interest)
timestamps_cycle_okt2021 = df_timestamps(file_imagimob_4)

In [None]:
# Build, for every DS1 month, the vectors of reference labels (states and
# duty-cycles) from the Imagimob timestamps.

# '1min' is the current pandas spelling of the one-minute frequency; the
# former '1T' alias is deprecated. It also matches the "1min" grid used when
# the sensor data is re-indexed.
# NOTE(review): assumes ndarray_labels hands this string to pandas - confirm.
downsampled_freq = '1min'

# Labelled period (start, end) of each DS1 recording, shared by the state and
# the duty-cycle label vectors so both always cover the same span.
period_april22 = (datetime.datetime(2022, 4, 1), datetime.datetime(2022, 5, 1))
period_jan22 = (datetime.datetime(2021, 12, 21), datetime.datetime(2022, 1, 21))
period_jun21 = (datetime.datetime(2021, 6, 1), datetime.datetime(2021, 7, 1))
period_okt21 = (datetime.datetime(2021, 10, 1), datetime.datetime(2021, 11, 1))

# generate vector with the labels of reference (states)
true_label_april22 = ndarray_labels(*period_april22, timestamps_april2022, downsampled_freq)
true_label_jan22 = ndarray_labels(*period_jan22, timestamps_jan2022, downsampled_freq)
true_label_jun21 = ndarray_labels(*period_jun21, timestamps_jun2021, downsampled_freq)
true_label_okt21 = ndarray_labels(*period_okt21, timestamps_okt2021, downsampled_freq)

# generate vector with the labels of reference (duty-cycle)
true_label_cycle_april22 = ndarray_labels(*period_april22, timestamps_cycle_april2022, downsampled_freq)
true_label_cycle_jan22 = ndarray_labels(*period_jan22, timestamps_cycle_jan2022, downsampled_freq)
true_label_cycle_jun21 = ndarray_labels(*period_jun21, timestamps_cycle_jun2021, downsampled_freq)
true_label_cycle_okt21 = ndarray_labels(*period_okt21, timestamps_cycle_okt2021, downsampled_freq)

# Entries equal to None become the explicit 'No_cycle' class. The `== None`
# comparison is intentional: it is evaluated element-wise on the label
# vector, which `is None` would not do.
true_label_cycle_april22 = np.where(true_label_cycle_april22 == None, 'No_cycle', true_label_cycle_april22)  # noqa: E711
true_label_cycle_jan22 = np.where(true_label_cycle_jan22 == None, 'No_cycle', true_label_cycle_jan22)  # noqa: E711
true_label_cycle_jun21 = np.where(true_label_cycle_jun21 == None, 'No_cycle', true_label_cycle_jun21)  # noqa: E711
true_label_cycle_okt21 = np.where(true_label_cycle_okt21 == None, 'No_cycle', true_label_cycle_okt21)  # noqa: E711

Input ground-truth duty-cycle labels

In [None]:
# Import the duty-cycle label files exported from Imagimob.
directory = "../../data/"

# DS1 months
labels_jun21 = import_cycle_labels(f"{directory}Jun_2021/Label_cycle.label")
labels_okt21 = import_cycle_labels(f"{directory}Okt_2021/Label_cycle.label")
labels_jan22 = import_cycle_labels(f"{directory}Jan_2022/Label_cycle.label")
labels_april22 = import_cycle_labels(f"{directory}April_2022/Label_cycle.label")

# DS2 months
labels_jun23 = import_cycle_labels(f"{directory}June_23/Label_cycle.label")
labels_aug23 = import_cycle_labels(f"{directory}Aug_23/Label_cycle.label")
labels_okt23 = import_cycle_labels(f"{directory}Okt_23/Label_cycle.label")
labels_dec23 = import_cycle_labels(f"{directory}Dec_23/Label_cycle.label")

# Only the DS2 labels go through replace_labels_cycles; the call is made for
# its effect on each object (the return value is not used).
for data in (labels_jun23, labels_aug23, labels_okt23, labels_dec23):
    replace_labels_cycles(data)

 Data preparation and pre-processing

In [None]:
# Monthly feature frames grouped per dataset.
data_DS1 = [data_csv_jun21, data_csv_okt21, data_csv_jan22, data_csv_april22]
data_DS2 = [data_csv_jun23, data_csv_aug23, data_csv_okt23, data_csv_dec23]

# Reference state labels of DS1, in the same month order as data_DS1.
true_state_labels_DS1 = [true_label_jun21, true_label_okt21, true_label_jan22, true_label_april22]

# State 'E' is merged into state 'B'. The replacement is done in place, so
# the individual true_label_* vectors are updated as well.
for state_labels in true_state_labels_DS1:
    state_labels[state_labels == 'E'] = 'B'
del state_labels

df_testset_DS2 = pd.concat(data_DS2)

# Output file names of Experiment 1, one per DS1 month (same order as data_DS1).
file_name_states_DS1 = ["jun2021_state.txt", "okt2021_state.txt", "jan2022_state.txt", "april2022_state.txt"]
file_name_cycles_DS1 = ["jun2021_cycle.txt", "okt2021_cycle.txt", "jan2022_cycle.txt", "april2022_cycle.txt"]

# Feature scaler shared by the experiment cells.
scaler = MinMaxScaler()

# Result folders of the two experiments.
dir_exp1 = "./results/approach1/DS1/"
dir_exp2 = "./results/approach1/DS2/"

# When True, the recognized states/cycles are written to the result folders.
flag_save_results = True

Delete variables that are no longer required

In [None]:
# Drop the intermediate objects of the label-import cells: the state
# timestamp tables, the raw label frames and the path/column helpers.
del (
    timestamps_april2022,
    timestamps_jan2022,
    timestamps_jun2021,
    timestamps_okt2021,
    file_imagimob_1,
    file_imagimob_2,
    file_imagimob_3,
    file_imagimob_4,
    column_interest,
    directory,
)

# Experiment 1
Train/test on DS1 using leave-one-month-out CV

In [None]:
# Experiment 1, seeded classifiers: leave-one-month-out CV on DS1.
# For every held-out month, each classifier is trained with each seed on the
# three remaining months, and the recognized states / duty-cycles of the
# held-out month are written to dir_exp1/<classifier>/<seed>/.
seeds = list(range(0, 10))
classifiers = ["rf", "dt", "xtree", "mlp"]

for i, df_test_month in enumerate(data_DS1):
    # train set: every DS1 month except the held-out one, with its labels
    df_dataset = pd.concat([data for j, data in enumerate(data_DS1) if j != i])
    df_dataset["ref_label"] = np.concatenate(
        [labels for j, labels in enumerate(true_state_labels_DS1) if j != i])

    # samples without a reference label cannot be used for training
    df_dataset = df_dataset[df_dataset['ref_label'].notnull()]
    df_dataset = df_dataset.reset_index()

    # after reset_index: column 0 is the former index, the last column is the
    # label, everything in between is a feature
    x_train = df_dataset[df_dataset.columns[1:-1]]
    y_train = df_dataset[df_dataset.columns[-1]]
    x_train_balanced, y_train_balanced, le = balance_dataset(x_train, y_train)

    # The scaler is fitted on the training data only ...
    x_train_balanced = pd.DataFrame(scaler.fit_transform(np.asarray(x_train_balanced)),
                                    columns=x_train.columns)

    # ... and the held-out month is mapped with those same training ranges.
    # Re-fitting the scaler on the test month (as was done before) scales the
    # test features differently from what the classifier was trained on, and
    # differs from Experiment 2, which uses scaler.transform on its test set.
    x_test = pd.DataFrame(scaler.transform(df_test_month[x_train.columns].to_numpy()),
                          columns=x_train.columns)

    for seed in seeds:
        for classifier in classifiers:
            # train/test states
            clf = train_state_supervised_classifier(classifier, x_train_balanced, y_train_balanced, seed)
            y_predict = clf.predict(x_test)

            # smooth the predicted sequence (median filter, argument 3, per
            # the original note) and map the encoded classes back to labels
            y_pred_smoothed = smooth_labels(y_predict, 3)
            y_recognized = le.inverse_transform(y_pred_smoothed.astype(int))

            # work on a copy so the monthly frame itself stays untouched
            df_temp = df_test_month.copy()
            df_temp["recognized_label"] = y_recognized
            df_temp = df_temp.reset_index()

            # classify duty-cycle
            df_recognized_states = create_segments_state(
                df_test_month.index[0], df_test_month.index[-1], df_temp)
            df_recognized_cycles = create_segments_cycles(df_recognized_states)

            # save the results in files
            if flag_save_results:
                folder_path = dir_exp1 + classifier + "/" + str(seed) + "/"
                os.makedirs(folder_path, exist_ok=True)

                create_reference_label_file(folder_path + file_name_states_DS1[i], df_recognized_states)
                create_reference_label_file(folder_path + file_name_cycles_DS1[i], df_recognized_cycles)

In [None]:
# Experiment 1, single-run classifiers: leave-one-month-out CV on DS1.
# Same procedure as for the seeded classifiers, but each classifier is trained
# once (seed 0) and the results go to dir_exp1/<classifier>/.
seed = 0
classifiers = ["xgboost", "nb"]

for i, df_test_month in enumerate(data_DS1):
    # train set: every DS1 month except the held-out one, with its labels
    df_dataset = pd.concat([data for j, data in enumerate(data_DS1) if j != i])
    df_dataset["ref_label"] = np.concatenate(
        [labels for j, labels in enumerate(true_state_labels_DS1) if j != i])

    # samples without a reference label cannot be used for training
    df_dataset = df_dataset[df_dataset['ref_label'].notnull()]
    df_dataset = df_dataset.reset_index()

    # after reset_index: column 0 is the former index, the last column is the
    # label, everything in between is a feature
    x_train = df_dataset[df_dataset.columns[1:-1]]
    y_train = df_dataset[df_dataset.columns[-1]]
    x_train_balanced, y_train_balanced, le = balance_dataset(x_train, y_train)

    # The scaler is fitted on the training data only ...
    x_train_balanced = pd.DataFrame(scaler.fit_transform(np.asarray(x_train_balanced)),
                                    columns=x_train.columns)

    # ... and the held-out month is mapped with those same training ranges.
    # Re-fitting the scaler on the test month (as was done before) scales the
    # test features differently from what the classifier was trained on, and
    # differs from Experiment 2, which uses scaler.transform on its test set.
    x_test = pd.DataFrame(scaler.transform(df_test_month[x_train.columns].to_numpy()),
                          columns=x_train.columns)

    for classifier in classifiers:
        # train/test states
        clf = train_state_supervised_classifier(classifier, x_train_balanced, y_train_balanced, seed)
        y_predict = clf.predict(x_test)

        # smooth the predicted sequence (median filter, argument 3, per the
        # original note) and map the encoded classes back to labels
        y_pred_smoothed = smooth_labels(y_predict, 3)
        y_recognized = le.inverse_transform(y_pred_smoothed.astype(int))

        # work on a copy so the monthly frame itself stays untouched
        df_temp = df_test_month.copy()
        df_temp["recognized_label"] = y_recognized
        df_temp = df_temp.reset_index()

        # classify duty-cycle
        df_recognized_states = create_segments_state(
            df_test_month.index[0], df_test_month.index[-1], df_temp)
        df_recognized_cycles = create_segments_cycles(df_recognized_states)

        # save the results in files
        if flag_save_results:
            folder_path = dir_exp1 + classifier + "/"
            os.makedirs(folder_path, exist_ok=True)

            create_reference_label_file(folder_path + file_name_states_DS1[i], df_recognized_states)
            create_reference_label_file(folder_path + file_name_cycles_DS1[i], df_recognized_cycles)

## performance

In [None]:
# Score Experiment 1 against the reference duty-cycle labels, print a report
# per classifier and save the summary table as CSV.
#
# The summary rows are collected in a list and turned into a DataFrame once,
# instead of concatenating onto an empty frame inside the loop (pandas
# deprecates the dtype handling of concatenation with empty/all-NA frames).

seeds = list(range(0, 10))

reference_path = './results/reference_cycle_labels/'
# Collar value handed to both sed_eval helpers.
collar = 202.75

result_columns = ['state_classifier', "detection mean", 'detection std',
                  'Abnormal mean F1-score', 'Abnormal std F1-score',
                  'Normal mean F1-score', 'Normal std F1-score',
                  'Overall mean F1-score', 'Overall std F1-score']
result_rows = []


def _print_mean_std(title, values):
    """Print ``title: mean - std`` of *values*, both multiplied by 100."""
    print(title + ": " + str(np.mean(values)*100) + " - " + str(np.std(values)*100))


def _print_percent(title, value):
    """Print ``title: value`` with *value* multiplied by 100."""
    print(title + ": " + str(value*100))


# ---- classifiers evaluated over several seeds: report mean and std ----
classifiers = ["rf", "dt", "xtree", "mlp"]

for classifier_state in classifiers:
    classification_per_seed = []   # one 9-value result per seed
    detection_files_overall = []

    for seed_cycle in seeds:
        result_path = dir_exp1 + classifier_state + "/" + str(seed_cycle) + "/"
        classification_per_seed.append(
            compute_classification_sedeval(reference_path, result_path, collar))
        detection_files_overall.append(
            compute_detection_sedeval(reference_path, result_path, collar))

    # Transpose "per seed" into "per metric"; the order is the return order of
    # compute_classification_sedeval: overall, abnormal, normal (F1, P, R each).
    (f1score_files_overall, precision_files_overall, recall_files_overall,
     f1score_files_abnormal, precision_files_abnormal, recall_files_abnormal,
     f1score_files_normal, precision_files_normal, recall_files_normal) = zip(*classification_per_seed)

    print("---------- "+classifier_state+" ----------")
    _print_mean_std("DETECTION", detection_files_overall)
    print("ABNORMAL:")
    _print_mean_std("F1-score", f1score_files_abnormal)
    _print_mean_std("Precision", precision_files_abnormal)
    _print_mean_std("Recall", recall_files_abnormal)
    print("NORMAL:")
    _print_mean_std("F1-score", f1score_files_normal)
    _print_mean_std("Precision", precision_files_normal)
    _print_mean_std("Recall", recall_files_normal)
    print("OVERALL:")
    _print_mean_std("F1-score", f1score_files_overall)
    _print_mean_std("Precision", precision_files_overall)
    _print_mean_std("Recall", recall_files_overall)

    result_rows.append({'state_classifier': classifier_state,
                        "detection mean": np.mean(detection_files_overall)*100,
                        'detection std': np.std(detection_files_overall)*100,
                        'Abnormal mean F1-score': np.mean(f1score_files_abnormal)*100,
                        'Abnormal std F1-score': np.std(f1score_files_abnormal)*100,
                        'Normal mean F1-score': np.mean(f1score_files_normal)*100,
                        'Normal std F1-score': np.std(f1score_files_normal)*100,
                        'Overall mean F1-score': np.mean(f1score_files_overall)*100,
                        'Overall std F1-score': np.std(f1score_files_overall)*100})

# ---- classifiers evaluated once: single values, the std columns stay empty ----
classifiers = ["xgboost", "nb"]

for classifier_state in classifiers:
    result_path = dir_exp1 + classifier_state + "/"
    (f1score_file, precision_file, recall_file,
     f1score_abnormal, precision_abnormal, recall_abnormal,
     f1score_normal, precision_normal, recall_normal) = compute_classification_sedeval(
        reference_path, result_path, collar)
    detection_file_overall = compute_detection_sedeval(reference_path, result_path, collar)

    print("---------- "+classifier_state+" ----------")
    _print_percent("DETECTION", detection_file_overall)
    print("ABNORMAL:")
    _print_percent("F1-score", f1score_abnormal)
    _print_percent("Precision", precision_abnormal)
    _print_percent("Recall", recall_abnormal)
    print("NORMAL:")
    _print_percent("F1-score", f1score_normal)
    _print_percent("Precision", precision_normal)
    _print_percent("Recall", recall_normal)
    print("OVERALL:")
    _print_percent("F1-score", f1score_file)
    _print_percent("Precision", precision_file)
    _print_percent("Recall", recall_file)

    # np.mean is kept on the single values so these cells are computed the
    # same way as in the seeded rows above.
    result_rows.append({'state_classifier': classifier_state,
                        "detection mean": np.mean(detection_file_overall)*100,
                        'Abnormal mean F1-score': np.mean(f1score_abnormal)*100,
                        'Normal mean F1-score': np.mean(f1score_normal)*100,
                        'Overall mean F1-score': np.mean(f1score_file)*100})

# Keys missing from a row (the std columns of the single-run classifiers)
# become NaN and are written as empty CSV fields.
df_results_exp1 = pd.DataFrame(result_rows, columns=result_columns)
df_results_exp1.to_csv(dir_exp1 + 'experiment1_results.csv', index=False)

# Experiment 2
Train on DS1 and test on DS2

Data preparation and pre-processing

In [None]:
# Experiment 2 training data: the whole of DS1 with its state and duty-cycle
# reference labels.
df_dataset = pd.concat([data_csv_jun21, data_csv_okt21, data_csv_jan22, data_csv_april22])
df_dataset["ref_label"] = np.concatenate(
    (true_label_jun21, true_label_okt21, true_label_jan22, true_label_april22), axis=0)
df_dataset["ref_label_cycle"] = np.concatenate(
    (true_label_cycle_jun21, true_label_cycle_okt21, true_label_cycle_jan22, true_label_cycle_april22), axis=0)

# Keep only the samples that have a state label; the index values of the
# dropped rows are remembered in removed_indices.
removed_indices = df_dataset[df_dataset['ref_label'].isnull()].index.tolist()
df_dataset = df_dataset[df_dataset['ref_label'].notnull()]
df_dataset = df_dataset.reset_index()

# remove the recognized_label column if a previous experiment added it
if 'recognized_label' in df_dataset.columns:
    df_dataset = df_dataset.drop('recognized_label', axis=1)

# Column 0 is the former index, the last two columns are the labels and
# everything in between is a feature.
x = df_dataset[df_dataset.columns[1:-2]]
y_cycle = df_dataset[df_dataset.columns[-1]]
# Explicit copy: the label merge below must not be a chained assignment on a
# slice of df_dataset.
y_state = df_dataset[df_dataset.columns[-2]].copy()

# normalize feature to range [0;1]
scaler = MinMaxScaler()
# fit() only needs the features; the stray second argument that used to be
# passed here landed in the ignored `y` parameter.
scaler.fit(x)
x = pd.DataFrame(scaler.transform(x), columns=x.columns)

# State 'E' is merged into state 'B'.
y_state[y_state == 'E'] = 'B'

flag_save_results = True

Balance DS1 for training

In [None]:
# Balance the DS1 training set; `le` is used later to map the encoded
# predictions back to state labels.
x_train_balanced, y_train_balanced, le = balance_dataset(x, y_state)

# Print the class distribution before and after balancing.
class_reports = (
    (y_state,
     "Value class-counts in Unbalanced dataset:",
     "Value class-porcentage in Unbalanced dataset:"),
    (y_train_balanced,
     "Value class-counts in Balanced dataset:",
     "Value class-porcentage in Balanced dataset:"),
)
for label_vector, counts_title, percentage_title in class_reports:
    unique_values, counts = np.unique(label_vector, return_counts=True)
    value_counts = dict(zip(unique_values, counts))
    value_porcentages = dict(zip(unique_values, counts / counts.sum() * 100))
    print(counts_title, value_counts)
    print(percentage_title, value_porcentages)

del unique_values, counts, value_counts, value_porcentages
del class_reports, label_vector, counts_title, percentage_title

In [None]:
# Test data: the four DS2 months stacked in chronological order.
df_testset = pd.concat(
    [
        data_csv_jun23,
        data_csv_aug23,
        data_csv_okt23,
        data_csv_dec23,
    ]
)
# Scale with the scaler already fitted on the training data (transform only).
x_test = pd.DataFrame(data=scaler.transform(df_testset), columns=df_testset.columns)

In [None]:
# Experiment 2, seeded classifiers: train on DS1, recognize states and
# duty-cycles on DS2, and write one result set per classifier and seed.
seeds = list(range(0, 10))
classifiers = ["rf", "dt", "xtree", "mlp"]

# One entry per DS2 month: (state file, cycle file, segment start, segment end).
ds2_months = (
    ("/jun23_state.txt", "/jun23_cycle.txt", datetime.datetime(2023, 6, 1), datetime.datetime(2023, 7, 1)),
    ("/aug23_state.txt", "/aug23_cycle.txt", datetime.datetime(2023, 8, 1), datetime.datetime(2023, 9, 1)),
    ("/okt23_state.txt", "/okt23_cycle.txt", datetime.datetime(2023, 10, 1), datetime.datetime(2023, 11, 1)),
    ("/dec23_state.txt", "/dec23_cycle.txt", datetime.datetime(2023, 12, 1), datetime.datetime(2024, 1, 1)),
)

for seed in seeds:
    for classifier in classifiers:
        # train
        clf = train_state_supervised_classifier(classifier, x_train_balanced, y_train_balanced, seed)

        # test state-cycles
        y_predict = clf.predict(x_test)

        # smooth the predictions (median filter, argument 3, per the original
        # note), then decode the classes back to state labels
        y_pred_smoothed = smooth_labels(y_predict, 3)
        y_recognized = le.inverse_transform(y_pred_smoothed.astype(int))

        # the recognized labels are stored on the test frame itself and a
        # re-indexed copy is handed to the segmentation helpers
        df_testset["recognized_label"] = y_recognized
        df_temp = df_testset.copy().reset_index()

        # classify duty-cycle: all state segments first, then the cycles
        # derived from them, month by month
        states_per_month = [create_segments_state(start, end, df_temp)
                            for _, _, start, end in ds2_months]
        cycles_per_month = [create_segments_cycles(month_states)
                            for month_states in states_per_month]

        # save the results in files
        if flag_save_results:
            folder_path = dir_exp2 + classifier + "/" + str(seed)
            os.makedirs(folder_path, exist_ok=True)

            for (state_file, _, _, _), month_states in zip(ds2_months, states_per_month):
                create_reference_label_file(folder_path + state_file, month_states)

            for (_, cycle_file, _, _), month_cycles in zip(ds2_months, cycles_per_month):
                create_reference_label_file(folder_path + cycle_file, month_cycles)

In [None]:
# Experiment 2, single-run classifiers: train on DS1, recognize states and
# duty-cycles on DS2, and write one result set per classifier.

# The seed is set explicitly. This cell used to rely on whatever value `seed`
# was left with by the preceding seeded loop (and failed with a NameError when
# run on its own); 0 is the seed Experiment 1 uses for these classifiers.
seed = 0
classifiers = ["nb", "xgboost"]

# One entry per DS2 month: (state file, cycle file, segment start, segment end).
ds2_months = (
    ("/jun23_state.txt", "/jun23_cycle.txt", datetime.datetime(2023, 6, 1), datetime.datetime(2023, 7, 1)),
    ("/aug23_state.txt", "/aug23_cycle.txt", datetime.datetime(2023, 8, 1), datetime.datetime(2023, 9, 1)),
    ("/okt23_state.txt", "/okt23_cycle.txt", datetime.datetime(2023, 10, 1), datetime.datetime(2023, 11, 1)),
    ("/dec23_state.txt", "/dec23_cycle.txt", datetime.datetime(2023, 12, 1), datetime.datetime(2024, 1, 1)),
)

for classifier in classifiers:
    # train
    clf = train_state_supervised_classifier(classifier, x_train_balanced, y_train_balanced, seed)

    # test state-cycles
    y_predict = clf.predict(x_test)

    # smooth the predictions (median filter, argument 3, per the original
    # note), then decode the classes back to state labels
    y_pred_smoothed = smooth_labels(y_predict, 3)
    y_recognized = le.inverse_transform(y_pred_smoothed.astype(int))

    # the recognized labels are stored on the test frame itself and a
    # re-indexed copy is handed to the segmentation helpers
    df_testset["recognized_label"] = y_recognized
    df_temp = df_testset.copy().reset_index()

    # classify duty-cycle: all state segments first, then the cycles derived
    # from them, month by month
    states_per_month = [create_segments_state(start, end, df_temp)
                        for _, _, start, end in ds2_months]
    cycles_per_month = [create_segments_cycles(month_states)
                        for month_states in states_per_month]

    # save the results in files
    if flag_save_results:
        folder_path = dir_exp2 + classifier
        os.makedirs(folder_path, exist_ok=True)

        for (state_file, _, _, _), month_states in zip(ds2_months, states_per_month):
            create_reference_label_file(folder_path + state_file, month_states)

        for (_, cycle_file, _, _), month_cycles in zip(ds2_months, cycles_per_month):
            create_reference_label_file(folder_path + cycle_file, month_cycles)

## performance

In [None]:
# Score Experiment 2 against the reference duty-cycle labels, print a report
# per classifier and save the summary table as CSV.
#
# The summary rows are collected in a list and turned into a DataFrame once,
# instead of concatenating onto an empty frame inside the loop (pandas
# deprecates the dtype handling of concatenation with empty/all-NA frames).

seeds = list(range(0, 10))

reference_path = './results/reference_cycle_labels/'
# Collar value handed to both sed_eval helpers.
collar = 202.75

result_columns = ['state_classifier', "detection mean", 'detection std',
                  'Abnormal mean F1-score', 'Abnormal std F1-score',
                  'Normal mean F1-score', 'Normal std F1-score',
                  'Overall mean F1-score', 'Overall std F1-score']
result_rows = []


def _print_mean_std(title, values):
    """Print ``title: mean - std`` of *values*, both multiplied by 100."""
    print(title + ": " + str(np.mean(values)*100) + " - " + str(np.std(values)*100))


def _print_percent(title, value):
    """Print ``title: value`` with *value* multiplied by 100."""
    print(title + ": " + str(value*100))


# ---- classifiers evaluated over several seeds: report mean and std ----
classifiers = ["rf", "dt", "xtree", "mlp"]

for classifier_state in classifiers:
    classification_per_seed = []   # one 9-value result per seed
    detection_files_overall = []

    for seed_cycle in seeds:
        result_path = dir_exp2 + classifier_state + "/" + str(seed_cycle) + "/"
        classification_per_seed.append(
            compute_classification_sedeval(reference_path, result_path, collar))
        detection_files_overall.append(
            compute_detection_sedeval(reference_path, result_path, collar))

    # Transpose "per seed" into "per metric"; the order is the return order of
    # compute_classification_sedeval: overall, abnormal, normal (F1, P, R each).
    (f1score_files_overall, precision_files_overall, recall_files_overall,
     f1score_files_abnormal, precision_files_abnormal, recall_files_abnormal,
     f1score_files_normal, precision_files_normal, recall_files_normal) = zip(*classification_per_seed)

    print("---------- "+classifier_state+" ----------")
    _print_mean_std("DETECTION", detection_files_overall)
    print("ABNORMAL:")
    _print_mean_std("F1-score", f1score_files_abnormal)
    _print_mean_std("Precision", precision_files_abnormal)
    _print_mean_std("Recall", recall_files_abnormal)
    print("NORMAL:")
    _print_mean_std("F1-score", f1score_files_normal)
    _print_mean_std("Precision", precision_files_normal)
    _print_mean_std("Recall", recall_files_normal)
    print("OVERALL:")
    _print_mean_std("F1-score", f1score_files_overall)
    _print_mean_std("Precision", precision_files_overall)
    _print_mean_std("Recall", recall_files_overall)

    result_rows.append({'state_classifier': classifier_state,
                        "detection mean": np.mean(detection_files_overall)*100,
                        'detection std': np.std(detection_files_overall)*100,
                        'Abnormal mean F1-score': np.mean(f1score_files_abnormal)*100,
                        'Abnormal std F1-score': np.std(f1score_files_abnormal)*100,
                        'Normal mean F1-score': np.mean(f1score_files_normal)*100,
                        'Normal std F1-score': np.std(f1score_files_normal)*100,
                        'Overall mean F1-score': np.mean(f1score_files_overall)*100,
                        'Overall std F1-score': np.std(f1score_files_overall)*100})

# ---- classifiers evaluated once: single values, the std columns stay empty ----
classifiers = ["xgboost", "nb"]

for classifier_state in classifiers:
    result_path = dir_exp2 + classifier_state + "/"
    (f1score_file, precision_file, recall_file,
     f1score_abnormal, precision_abnormal, recall_abnormal,
     f1score_normal, precision_normal, recall_normal) = compute_classification_sedeval(
        reference_path, result_path, collar)
    detection_file_overall = compute_detection_sedeval(reference_path, result_path, collar)

    print("---------- "+classifier_state+" ----------")
    _print_percent("DETECTION", detection_file_overall)
    print("ABNORMAL:")
    _print_percent("F1-score", f1score_abnormal)
    _print_percent("Precision", precision_abnormal)
    _print_percent("Recall", recall_abnormal)
    print("NORMAL:")
    _print_percent("F1-score", f1score_normal)
    _print_percent("Precision", precision_normal)
    _print_percent("Recall", recall_normal)
    print("OVERALL:")
    _print_percent("F1-score", f1score_file)
    _print_percent("Precision", precision_file)
    _print_percent("Recall", recall_file)

    # np.mean is kept on the single values so these cells are computed the
    # same way as in the seeded rows above.
    result_rows.append({'state_classifier': classifier_state,
                        "detection mean": np.mean(detection_file_overall)*100,
                        'Abnormal mean F1-score': np.mean(f1score_abnormal)*100,
                        'Normal mean F1-score': np.mean(f1score_normal)*100,
                        'Overall mean F1-score': np.mean(f1score_file)*100})

# Keys missing from a row (the std columns of the single-run classifiers)
# become NaN and are written as empty CSV fields.
df_results_exp2 = pd.DataFrame(result_rows, columns=result_columns)
df_results_exp2.to_csv(dir_exp2 + 'experiment2_results.csv', index=False)