Notebook corresponding to the "Approach-2" presented in the paper.

This is the same approach used in the ["Tinyml anomaly detection for industrial machines with periodic duty cycles" (Sensor Application Symposium 2024)](https://ieeexplore.ieee.org/abstract/document/10636584/), and serves as the baseline experiment.

Two experiments are carried on:
1) As in the SAS2024, the performance is evaluated in leave-one-month-out CV in the original 4 months (called DS1).
2) The generalization is evaluated using the whole DS1 for training and the whole DS2 for testing.

In [1]:
from custom_functions import *

import datetime
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# import data, extract feature and preprocessing

different functions used in the notebook

In [2]:
#classify each previously detected duty-cycle
def apply_heuristic_rules(df_testset):
    #establish a copy of detected labels, and then assign the normal/abnormal label
    df_testset["recognized_cycles"] = df_testset["detected_cycles"].copy(deep=True)

    # Create a list to store the non-repeated labels for each cycle
    cycle_transition_sequences = []

    # Initialize variables to track the start index, previous cycle value, and unique labels within each cycle
    start_index = None
    prev_cycle = None
    transition_sequence = []

    # list of valid sequence of Normal cycles
    normal_snequences = ["CDC", "CD"]

    # Iterate through the dataframe
    for index, row in df_testset.iterrows():
        current_cycle = row['detected_cycles']
        current_label = row['recognized_label']
        
        # Check if there is a transition from "no_cycles" to "cycles"
        if prev_cycle == "No_cycle" and current_cycle == "Cycle":
            start_index = index
            transition_sequence = []
        
        # Check if there is a transition from "cycles" to "no_cycles"
        elif prev_cycle == "Cycle" and current_cycle == "No_cycle":
            # Record the transition labels for the current cycle
            cycle_transition_sequences.append((start_index, index - 1, ''.join(transition_sequence)))
            start_index = None
            transition_sequence = []
        
        # Update the transition sequence within the current cycle
        if current_cycle == "Cycle":
            if transition_sequence and transition_sequence[-1] != current_label:
                transition_sequence.append(current_label)
            elif not transition_sequence:
                transition_sequence.append(current_label)
        
        # Update the previous cycle value
        prev_cycle = current_cycle

    # Check if the last cycle is ongoing and record its transition labels if it is
    if start_index is not None:
        cycle_transition_sequences.append((start_index, len(df_testset) - 1, ''.join(transition_sequence)))

    # I need to extend the analysis beyond the limits of the detected duty-cycles to see I there are missing values or 
    # the machine was off (state A) before/after the cicle
    extra_lenght = 3

    # Classify the Normal/Abnormal cycle according to the sequence label
    for cycle_start, cycle_end, transitions in cycle_transition_sequences:

        #first I check if the time difference is equal to 2 minute, otherwise means that there are missing values before/after the duty-cyle 
        i1 = cycle_start-extra_lenght if cycle_start-extra_lenght > 0 else 0
        i2 = cycle_end+extra_lenght if cycle_end+extra_lenght < df_testset.index[-1] else df_testset.index[-1]

        temp1 = (df_testset.iloc[cycle_start]["index"]-df_testset.iloc[i1]["index"]).total_seconds()
        temp2 = (df_testset.iloc[i2]["index"]-df_testset.iloc[cycle_end]["index"]).total_seconds()

        bool_condition1 = temp1==(60.0*extra_lenght) and temp2==(60.0*extra_lenght) #no missing values
        if extra_lenght==1:
            bool_condition2 = df_testset.loc[i1, 'recognized_label']=="B" #before was in idle state
            bool_condition3 = df_testset.loc[i2, 'recognized_label']=="B"#after was in idle state
        elif extra_lenght==2:
            bool_condition2 = df_testset.loc[i1, 'recognized_label']=="B" or df_testset.loc[i1+1, 'recognized_label']=="B" or df_testset.loc[i1+2, 'recognized_label']=="B" #before was in idle state
            bool_condition3 = df_testset.loc[i2, 'recognized_label']=="B" or df_testset.loc[i2-1, 'recognized_label']=="B" or df_testset.loc[i2-2, 'recognized_label']=="B" #after was in idle state
        elif extra_lenght==3:
            bool_condition2 = df_testset.loc[i1, 'recognized_label']=="B" or df_testset.loc[i1+1, 'recognized_label']=="B" #before was in idle state
            bool_condition3 = df_testset.loc[i2, 'recognized_label']=="B" or df_testset.loc[i2-1, 'recognized_label']=="B" #after was in idle state

        if bool_condition1 and bool_condition2 and bool_condition3 and (transitions in normal_snequences): #check the sequence inside the cycle
            df_testset.loc[cycle_start:cycle_end, 'recognized_cycles'] = "Normal"

        else:
            df_testset.loc[cycle_start:cycle_end, 'recognized_cycles'] = "Abnormal"

    return

#extract the duty-cycle bouts in a date range
def create_segments_cycle_classified(start_date,end_date,df_testset):
    complete_range = pd.date_range(start=start_date, end=end_date, freq='1min')
    complete_df = pd.DataFrame({'index': complete_range})

    filtered_df = df_testset[(df_testset['index'] >= start_date) & (df_testset['index'] <= end_date)]

    merged_df = pd.merge(complete_df, filtered_df, on='index', how='left')

    #Compute the start and end of each cycle
    merged_df['group'] = (merged_df['recognized_cycles'] != merged_df['recognized_cycles'].shift()).cumsum()

    dfs = []

    for group_name, group_data in merged_df.groupby('group'):
        label = group_data["recognized_cycles"].iloc[0]
        if (label != None):
            start = group_data["index"].iloc[0]
            finish = group_data["index"].iloc[-1]
            data_to_append = {'start': start, 'end': finish, 'label': label}
            df = pd.DataFrame(data_to_append,index=[group_name])
            dfs.append(df)

    df_classified_cycle = pd.concat(dfs, ignore_index=True)

    df_classified_cycle = df_classified_cycle[df_classified_cycle['label'] != 'No_cycle']
    df_classified_cycle.dropna(subset=['label'], inplace=True)
    df_classified_cycle = df_classified_cycle.reset_index(drop=True)

    return df_classified_cycle

input data

In [3]:
directory="../../../data/"
#first 4 months of data (DS1)
data_csv_jun21 = read_month_data(directory+'Confidential_Drive_data_Jun2021.csv',1)
data_csv_okt21 = read_month_data(directory+'Confidential_Drive_data_Okt2021.csv',1)
data_csv_jan22 = read_month_data(directory+'Confidential_Drive_data_Jan2022.csv',1)
data_csv_april22 = read_month_data(directory+'Confidential_Drive_data_April2022.csv',1)
#new 4 months (DS2)
data_csv_jun23 = read_month_data(directory+'Confidential_Drive_data_June2023_Drift20.csv')
data_csv_aug23 = read_month_data(directory+'Confidential_Drive_data_Aug2023_Drift20.csv')
data_csv_okt23 = read_month_data(directory+'Confidential_Drive_data_Oct2023_Drift20.csv')
data_csv_dec23 = read_month_data(directory+'Confidential_Drive_data_Dec2023_Drift20.csv')

#re-order the column name to be consistent with the previous csv files
desired_order=["High-pressure","Low-pressure","Speed"]
data_csv_jun23=data_csv_jun23[desired_order]
data_csv_aug23=data_csv_aug23[desired_order]
data_csv_okt23=data_csv_okt23[desired_order]
data_csv_dec23=data_csv_dec23[desired_order]

#These data has duplicated entries
data_csv_okt23 = data_csv_okt23[~data_csv_okt23.index.duplicated(keep='first')]

# round to zero speed less than zero
data_csv_jun21.loc[data_csv_jun21['Speed'] < 0 , 'Speed'] = 0
data_csv_okt21.loc[data_csv_okt21['Speed'] < 0 , 'Speed'] = 0
data_csv_jan22.loc[data_csv_jan22['Speed'] < 0 , 'Speed'] = 0
data_csv_april22.loc[data_csv_april22['Speed'] < 0 , 'Speed'] = 0
data_csv_jun23.loc[data_csv_jun23['Speed'] < 0 , 'Speed'] = 0
data_csv_aug23.loc[data_csv_aug23['Speed'] < 0 , 'Speed'] = 0
data_csv_okt23.loc[data_csv_okt23['Speed'] < 0 , 'Speed'] = 0
data_csv_dec23.loc[data_csv_dec23['Speed'] < 0 , 'Speed'] = 0


# complete the dataset with missing values
full_timestamp = pd.date_range(start = data_csv_jun21.index[0], end = data_csv_jun21.index[-1],inclusive="both",freq="1min" )
data_csv_jun21 = data_csv_jun21.reindex(full_timestamp)

full_timestamp = pd.date_range(start = data_csv_okt21.index[0], end = data_csv_okt21.index[-1],inclusive="both",freq="1min" )
data_csv_okt21 = data_csv_okt21.reindex(full_timestamp)

full_timestamp = pd.date_range(start = data_csv_jan22.index[0], end = data_csv_jan22.index[-1],inclusive="both",freq="1min" )
data_csv_jan22 = data_csv_jan22.reindex(full_timestamp)

full_timestamp = pd.date_range(start = data_csv_april22.index[0], end = data_csv_april22.index[-1],inclusive="both",freq="1min" )
data_csv_april22 = data_csv_april22.reindex(full_timestamp)

full_timestamp = pd.date_range(start = data_csv_jun23.index[0], end = data_csv_jun23.index[-1],inclusive="both",freq="1min" )
data_csv_jun23 = data_csv_jun23.reindex(full_timestamp)

full_timestamp = pd.date_range(start = data_csv_aug23.index[0], end = data_csv_aug23.index[-1],inclusive="both",freq="1min" )
data_csv_aug23 = data_csv_aug23.reindex(full_timestamp)

full_timestamp = pd.date_range(start = data_csv_okt23.index[0], end = data_csv_okt23.index[-1],inclusive="both",freq="1min" )
data_csv_okt23 = data_csv_okt23.reindex(full_timestamp)

full_timestamp = pd.date_range(start = data_csv_dec23.index[0], end = data_csv_dec23.index[-1],inclusive="both",freq="1min" )
data_csv_dec23 = data_csv_dec23.reindex(full_timestamp)



#use linear interpolation for the NaN missing values
interpolate_values(data_csv_jun21)
interpolate_values(data_csv_okt21)
interpolate_values(data_csv_jan22)
interpolate_values(data_csv_april22)
interpolate_values(data_csv_jun23)
interpolate_values(data_csv_aug23)
interpolate_values(data_csv_okt23)
interpolate_values(data_csv_dec23)

del desired_order, directory, full_timestamp

compute features

In [4]:
list_data_csv = [data_csv_jun21,data_csv_okt21,data_csv_jan22,data_csv_april22,data_csv_jun23,data_csv_aug23,data_csv_okt23,data_csv_dec23]
for data in list_data_csv:
    extract_features(data)

ground truth reference

In [5]:
#read files from imagimob
directory="../../data/"
column_interest=['Time(Seconds)' , 'Length(Seconds)',"Label(string)"]

#read labels of states
file_imagimob_1 = pd.read_csv(directory+"April_2022/Label.label",usecols=column_interest)
file_imagimob_2 = pd.read_csv(directory+"Jan_2022/Label.label",usecols=column_interest)
file_imagimob_3 = pd.read_csv(directory+"Jun_2021/Label.label",usecols=column_interest)
file_imagimob_4 = pd.read_csv(directory+"Okt_2021/Label.label",usecols=column_interest)

timestamps_april2022 = df_timestamps(file_imagimob_1)
timestamps_jan2022 = df_timestamps(file_imagimob_2)
timestamps_jun2021 = df_timestamps(file_imagimob_3)
timestamps_okt2021 = df_timestamps(file_imagimob_4)

#read labels of duty-cycle
file_imagimob_1 = pd.read_csv(directory+"April_2022/Label_cycle.label",usecols=column_interest)
file_imagimob_2 = pd.read_csv(directory+"Jan_2022/Label_cycle.label",usecols=column_interest)
file_imagimob_3 = pd.read_csv(directory+"Jun_2021/Label_cycle.label",usecols=column_interest)
file_imagimob_4 = pd.read_csv(directory+"Okt_2021/Label_cycle.label",usecols=column_interest)

timestamps_cycle_april2022 = df_timestamps(file_imagimob_1)
timestamps_cycle_jan2022 = df_timestamps(file_imagimob_2)
timestamps_cycle_jun2021 = df_timestamps(file_imagimob_3)
timestamps_cycle_okt2021 = df_timestamps(file_imagimob_4)

In [6]:
#generate vector with the labels of reference (states)
downsampled_freq='1T'
true_label_april22 = ndarray_labels(datetime.datetime(2022, 4, 1),datetime.datetime(2022, 5, 1),timestamps_april2022,downsampled_freq)
true_label_jan22 = ndarray_labels(datetime.datetime(2021, 12, 21),datetime.datetime(2022, 1, 21),timestamps_jan2022,downsampled_freq)
true_label_jun21 = ndarray_labels(datetime.datetime(2021, 6, 1),datetime.datetime(2021, 7, 1),timestamps_jun2021,downsampled_freq)
true_label_okt21 = ndarray_labels(datetime.datetime(2021, 10, 1),datetime.datetime(2021, 11, 1),timestamps_okt2021,downsampled_freq)

#generate vector with the labels of reference (duty-cycle)
true_label_cycle_april22 = ndarray_labels(datetime.datetime(2022, 4, 1),datetime.datetime(2022, 5, 1),timestamps_cycle_april2022,downsampled_freq)
true_label_cycle_jan22 = ndarray_labels(datetime.datetime(2021, 12, 21),datetime.datetime(2022, 1, 21),timestamps_cycle_jan2022,downsampled_freq)
true_label_cycle_jun21 = ndarray_labels(datetime.datetime(2021, 6, 1),datetime.datetime(2021, 7, 1),timestamps_cycle_jun2021,downsampled_freq)
true_label_cycle_okt21 = ndarray_labels(datetime.datetime(2021, 10, 1),datetime.datetime(2021, 11, 1),timestamps_cycle_okt2021,downsampled_freq)

true_label_cycle_april22 = np.where(true_label_cycle_april22 == None, 'No_cycle', true_label_cycle_april22)
true_label_cycle_jan22 = np.where(true_label_cycle_jan22 == None, 'No_cycle', true_label_cycle_jan22)
true_label_cycle_jun21 = np.where(true_label_cycle_jun21 == None, 'No_cycle', true_label_cycle_jun21)
true_label_cycle_okt21 = np.where(true_label_cycle_okt21 == None, 'No_cycle', true_label_cycle_okt21)

imput ground-truth duty-cycle labels

In [7]:
#read files from imagimob
directory="../../data/"
#read labels of duty-cycle
labels_jun21 = import_cycle_labels(directory+"Jun_2021/Label_cycle.label")
labels_okt21 = import_cycle_labels(directory+"Okt_2021/Label_cycle.label")
labels_jan22 = import_cycle_labels(directory+"Jan_2022/Label_cycle.label")
labels_april22 = import_cycle_labels(directory+"April_2022/Label_cycle.label")
labels_jun23 = import_cycle_labels(directory+"June_23/Label_cycle.label")
labels_aug23 = import_cycle_labels(directory+"Aug_23/Label_cycle.label")
labels_okt23 = import_cycle_labels(directory+"Okt_23/Label_cycle.label")
labels_dec23 = import_cycle_labels(directory+"Dec_23/Label_cycle.label")


for data in [labels_jun23,labels_aug23,labels_okt23,labels_dec23]:
    replace_labels_cycles(data)

 Data preparation and pre-processing

In [8]:
true_label_jun21 [true_label_jun21=='E']='B'
true_label_okt21 [true_label_okt21=='E']='B'
true_label_jan22 [true_label_jan22=='E']='B'
true_label_april22 [true_label_april22=='E']='B'

data_DS1=[data_csv_jun21, data_csv_okt21,data_csv_jan22,data_csv_april22]
data_DS2=[data_csv_jun23, data_csv_aug23,data_csv_okt23,data_csv_dec23]

true_state_labels_DS1=[true_label_jun21, true_label_okt21,true_label_jan22,true_label_april22]
df_testset_DS2= pd.concat(data_DS2)

file_name_states_DS1= ["jun2021_state.txt" ,"okt2021_state.txt","jan2022_state.txt","april2022_state.txt"]
file_name_cycles_DS1= ["jun2021_cycle.txt" ,"okt2021_cycle.txt","jan2022_cycle.txt","april2022_cycle.txt"]

scaler = MinMaxScaler()

dir_exp1 = "./results/approach2/DS1/"
dir_exp2 = "./results/approach2/DS2/"

flag_save_results=True

delete not requires variables

In [9]:
del timestamps_april2022, timestamps_jan2022, timestamps_jun2021, timestamps_okt2021
del file_imagimob_1,file_imagimob_2,file_imagimob_3,file_imagimob_4, column_interest, directory

# Experiment 1
Train/test on DS1 using leave-one-month CV

In [None]:
seeds=list(range(0,10))
classifiers=["rf","dt","xtree","mlp"]

for i in range(len(data_DS1)):
    # train_set
    df_dataset = pd.concat([data for j, data in enumerate(data_DS1) if j != i])
    df_dataset["ref_label"]= np.concatenate([data for j, data in enumerate(true_state_labels_DS1) if j != i])

    removed_indices = df_dataset[df_dataset['ref_label'].isnull()].index.tolist()
    df_dataset = df_dataset[df_dataset['ref_label'].notnull()]
    df_dataset=df_dataset.reset_index()

    x_train = df_dataset[df_dataset.columns[1:-1]]
    y_train = df_dataset[df_dataset.columns[-1]]
    x_train_balanced, y_train_balanced, le = balance_dataset(x_train,y_train)
    x_train_balanced=pd.DataFrame(scaler.fit_transform(x_train_balanced), columns=x_train.columns)
    
    #test_set
    x_test=pd.DataFrame(scaler.transform(data_DS1[i]), columns=x_train.columns)

    for seed in seeds:
        for classifier in classifiers:
            #train/test states
            clf = train_state_supervised_classifier(classifier,x_train_balanced, y_train_balanced,seed)
            y_predict = clf.predict(x_test)

            #apply 3rd median filter
            y_pred_smoothed = smooth_labels(y_predict,3)
            
            y_recognized=le.inverse_transform(y_pred_smoothed.astype(int))
            df_temp=data_DS1[i].copy()
            df_temp["recognized_label"]=y_recognized

            # df_temp=df_testset.copy()
            df_temp=df_temp.reset_index()

            #classify duty-cycle
            df_recognized_states = create_segments_state(data_DS1[i].index[0],data_DS1[i].index[-1],df_temp)

            # save the results in files
            if flag_save_results:
                folder_path = dir_exp1+classifier+"/"+str(seed)+"/"
                os.makedirs(folder_path, exist_ok=True)

                create_reference_label_file(folder_path+file_name_states_DS1[i],df_recognized_states)

In [None]:
seed=0
classifiers=["nb","xgboost"]

for i in range(len(data_DS1)):
    # train_set
    df_dataset = pd.concat([data for j, data in enumerate(data_DS1) if j != i])
    df_dataset["ref_label"]= np.concatenate([data for j, data in enumerate(true_state_labels_DS1) if j != i])

    removed_indices = df_dataset[df_dataset['ref_label'].isnull()].index.tolist()
    df_dataset = df_dataset[df_dataset['ref_label'].notnull()]
    df_dataset=df_dataset.reset_index()

    x_train = df_dataset[df_dataset.columns[1:-1]]
    y_train = df_dataset[df_dataset.columns[-1]]
    x_train_balanced, y_train_balanced, le = balance_dataset(x_train,y_train)
    x_train_balanced=pd.DataFrame(scaler.fit_transform(x_train_balanced), columns=x_train.columns)
    
    #test_set
    x_test=pd.DataFrame(scaler.transform(data_DS1[i]), columns=x_train.columns)

    for classifier in classifiers:
        #train/test states
        clf = train_state_supervised_classifier(classifier,x_train_balanced, y_train_balanced,seed)
        y_predict = clf.predict(x_test)

        #apply 3rd median filter
        y_pred_smoothed = smooth_labels(y_predict,3)
        
        y_recognized=le.inverse_transform(y_pred_smoothed.astype(int))
        df_temp=data_DS1[i].copy()
        df_temp["recognized_label"]=y_recognized

        # df_temp=df_testset.copy()
        df_temp=df_temp.reset_index()

        #classify duty-cycle
        df_recognized_states = create_segments_state(data_DS1[i].index[0],data_DS1[i].index[-1],df_temp)

        # save the results in files
        if flag_save_results:
            folder_path = dir_exp1+classifier+"/"
            os.makedirs(folder_path, exist_ok=True)

            create_reference_label_file(folder_path+file_name_states_DS1[i],df_recognized_states)

Once I have all the predicted state labels, I can apply detection using a threshold and its subsequent classification.

In [None]:
downsampled_freq='1T'
seeds=list(range(0,10))
classifiers=["rf","dt","xtree","mlp"]

# classifier with seeds parameters
for seed in seeds:
    for classifier in classifiers:
    
        #import classified state_labels
        folder_path = dir_exp1+classifier+"/"+str(seed)
        aux = df_timestamps(pd.read_csv(folder_path+"/jun2021_state.txt",sep='\t',names=["Time(Seconds)","Length(Seconds)","Label(string)"]))
        data_csv_jun21["recognized_label"] = ndarray_labels(data_csv_jun21.index[0],data_csv_jun21.index[-1],aux,downsampled_freq)
        aux = df_timestamps(pd.read_csv(folder_path+"/okt2021_state.txt",sep='\t',names=["Time(Seconds)","Length(Seconds)","Label(string)"]))
        data_csv_okt21["recognized_label"] = ndarray_labels(data_csv_okt21.index[0],data_csv_okt21.index[-1],aux,downsampled_freq)
        aux = df_timestamps(pd.read_csv(folder_path+"/jan2022_state.txt",sep='\t',names=["Time(Seconds)","Length(Seconds)","Label(string)"]))
        data_csv_jan22["recognized_label"] = ndarray_labels(data_csv_jan22.index[0],data_csv_jan22.index[-1],aux,downsampled_freq)
        aux = df_timestamps(pd.read_csv(folder_path+"/april2022_state.txt",sep='\t',names=["Time(Seconds)","Length(Seconds)","Label(string)"]))
        data_csv_april22["recognized_label"] = ndarray_labels(data_csv_april22.index[0],data_csv_april22.index[-1],aux,downsampled_freq)
        #form testset
        df_testset= pd.concat([data_csv_jun21, data_csv_okt21,data_csv_jan22,data_csv_april22])

        #apply detection threshold
        df_testset.reset_index(inplace=True)
        df_testset["detected_cycles"]='No_cycle'
        df_testset.loc[df_testset['Speed_order3'] > 2.5, 'detected_cycles'] = 'Cycle'

        #classify duty_cycle
        apply_heuristic_rules(df_testset)

        #save results
        df_recognized_cycles_jun21 = create_segments_cycle_classified(data_csv_jun21.index[0],data_csv_jun21.index[-1],df_testset=df_testset)
        df_recognized_cycles_okt21 = create_segments_cycle_classified(data_csv_okt21.index[0],data_csv_okt21.index[-1],df_testset=df_testset)
        df_recognized_cycles_jan22 = create_segments_cycle_classified(data_csv_jan22.index[0],data_csv_jan22.index[-1],df_testset=df_testset)
        df_recognized_cycles_april22 = create_segments_cycle_classified(data_csv_april22.index[0],data_csv_april22.index[-1],df_testset=df_testset)
        if flag_save_results:
            create_reference_label_file(folder_path+"/jun2021_cycle.txt",df_recognized_cycles_jun21)
            create_reference_label_file(folder_path+"/okt2021_cycle.txt",df_recognized_cycles_okt21)
            create_reference_label_file(folder_path+"/jan2022_cycle.txt",df_recognized_cycles_jan22)
            create_reference_label_file(folder_path+"/april2022_cycle.txt",df_recognized_cycles_april22)
            print("Results saved in: "+folder_path)



# classifier without seeds parameters
classifiers=["xgboost","nb"]

for classifier in classifiers:

    #import classified state_labels
    folder_path = dir_exp1+classifier
    aux = df_timestamps(pd.read_csv(folder_path+"/jun2021_state.txt",sep='\t',names=["Time(Seconds)","Length(Seconds)","Label(string)"]))
    data_csv_jun21["recognized_label"] = ndarray_labels(data_csv_jun21.index[0],data_csv_jun21.index[-1],aux,downsampled_freq)
    aux = df_timestamps(pd.read_csv(folder_path+"/okt2021_state.txt",sep='\t',names=["Time(Seconds)","Length(Seconds)","Label(string)"]))
    data_csv_okt21["recognized_label"] = ndarray_labels(data_csv_okt21.index[0],data_csv_okt21.index[-1],aux,downsampled_freq)
    aux = df_timestamps(pd.read_csv(folder_path+"/jan2022_state.txt",sep='\t',names=["Time(Seconds)","Length(Seconds)","Label(string)"]))
    data_csv_jan22["recognized_label"] = ndarray_labels(data_csv_jan22.index[0],data_csv_jan22.index[-1],aux,downsampled_freq)
    aux = df_timestamps(pd.read_csv(folder_path+"/april2022_state.txt",sep='\t',names=["Time(Seconds)","Length(Seconds)","Label(string)"]))
    data_csv_april22["recognized_label"] = ndarray_labels(data_csv_april22.index[0],data_csv_april22.index[-1],aux,downsampled_freq)

    #form testset
    df_testset= pd.concat([data_csv_jun21, data_csv_okt21,data_csv_jan22,data_csv_april22])

    #apply detection threshold
    df_testset.reset_index(inplace=True)
    df_testset["detected_cycles"]='No_cycle'
    df_testset.loc[df_testset['Speed_order3'] > 2.5, 'detected_cycles'] = 'Cycle'

    #classify duty_cycle
    apply_heuristic_rules(df_testset)

    #save results
    df_recognized_cycles_jun21 = create_segments_cycle_classified(data_csv_jun21.index[0],data_csv_jun21.index[-1],df_testset=df_testset)
    df_recognized_cycles_okt21 = create_segments_cycle_classified(data_csv_okt21.index[0],data_csv_okt21.index[-1],df_testset=df_testset)
    df_recognized_cycles_jan22 = create_segments_cycle_classified(data_csv_jan22.index[0],data_csv_jan22.index[-1],df_testset=df_testset)
    df_recognized_cycles_april22 = create_segments_cycle_classified(data_csv_april22.index[0],data_csv_april22.index[-1],df_testset=df_testset)

    if flag_save_results:
        create_reference_label_file(folder_path+"/jun2021_cycle.txt",df_recognized_cycles_jun21)
        create_reference_label_file(folder_path+"/okt2021_cycle.txt",df_recognized_cycles_okt21)
        create_reference_label_file(folder_path+"/jan2022_cycle.txt",df_recognized_cycles_jan22)
        create_reference_label_file(folder_path+"/april2022_cycle.txt",df_recognized_cycles_april22)


## performance

In [None]:
df_results_exp1 = pd.DataFrame(columns=['state_classifier', "detection mean",'detection std',
                                        'Abnormal mean F1-score','Abnormal std F1-score',
                                        'Normal mean F1-score','Normal std F1-score',
                                        'Overall mean F1-score','Overall std F1-score'])

seeds=list(range(0,10))
classifiers=["rf","dt","xtree","mlp"]

dir = './results/reference_cycle_labels/'
reference_path = dir
collar = 202.75

detection_files_abnormal, f1score_files_abnormal, precision_files_abnormal, recall_files_abnormal = [],[],[],[]
detection_files_normal, f1score_files_normal, precision_files_normal, recall_files_normal = [],[],[],[]
detection_files_overall, f1score_files_overall, precision_files_overall, recall_files_overall = [],[],[],[]

for classifier_state in classifiers:
    detection_files_abnormal, f1score_files_abnormal, precision_files_abnormal, recall_files_abnormal = [],[],[],[]
    detection_files_normal, f1score_files_normal, precision_files_normal, recall_files_normal = [],[],[],[]
    detection_files_overall, f1score_files_overall, precision_files_overall, recall_files_overall = [],[],[],[]

    for seed_cycle in seeds:
        result_path = dir_exp1+classifier_state+"/"+str(seed_cycle)+"/"
        f1score_file, precision_file, recall_file, f1score_abnormal, precision_abnormal, recall_abnormal , f1score_normal, precision_normal, recall_normal= compute_classification_sedeval(reference_path,result_path,collar)
        detection_file_overall = compute_detection_sedeval(reference_path,result_path,collar)
        detection_files_overall.append(detection_file_overall)
        f1score_files_overall.append(f1score_file)
        precision_files_overall.append(precision_file)
        recall_files_overall.append(recall_file)
        f1score_files_abnormal.append(f1score_abnormal)
        precision_files_abnormal.append(precision_abnormal)
        recall_files_abnormal.append(recall_abnormal)
        f1score_files_normal.append(f1score_normal)
        precision_files_normal.append(precision_normal)
        recall_files_normal.append(recall_normal)

    print("---------- "+classifier_state+" ----------")
    print("DETECTION: "+ str(np.mean(detection_files_overall)*100) +" - "+ str(np.std(detection_files_overall)*100) )
    print("ABNORMAL:")
    print("F1-score: "+ str(np.mean(f1score_files_abnormal)*100) +" - "+ str(np.std(f1score_files_abnormal)*100) )
    print("Precision: "+ str(np.mean(precision_files_abnormal)*100) +" - "+ str(np.std(precision_files_abnormal)*100) )
    print("Recall: "+ str(np.mean(recall_files_abnormal)*100) +" - "+ str(np.std(recall_files_abnormal)*100) )
    print("NORMAL:")
    print("F1-score: "+ str(np.mean(f1score_files_normal)*100) +" - "+ str(np.std(f1score_files_normal)*100) )
    print("Precision: "+ str(np.mean(precision_files_normal)*100) +" - "+ str(np.std(precision_files_normal)*100) )
    print("Recall: "+ str(np.mean(recall_files_normal)*100) +" - "+ str(np.std(recall_files_normal)*100) )
    print("OVERALL:")
    print("F1-score: "+ str(np.mean(f1score_files_overall)*100) +" - "+ str(np.std(f1score_files_overall)*100) )
    print("Precision: "+ str(np.mean(precision_files_overall)*100) +" - "+ str(np.std(precision_files_overall)*100) )
    print("Recall: "+ str(np.mean(recall_files_overall)*100) +" - "+ str(np.std(recall_files_overall)*100) )

    dflocal = pd.DataFrame({'state_classifier':classifier_state, 
                            "detection mean": np.mean(detection_files_overall)*100,
                            'detection std': np.std(detection_files_overall)*100,
                            'Abnormal mean F1-score':np.mean(f1score_files_abnormal)*100,
                            'Normal mean F1-score':np.mean(f1score_files_normal)*100,
                            'Overall mean F1-score':np.mean(f1score_files_overall)*100,
                            'Abnormal std F1-score':np.std(f1score_files_abnormal)*100,
                            'Normal std F1-score':np.std(f1score_files_normal)*100,
                            'Overall std F1-score':np.std(f1score_files_overall)*100},index=[0])
        
    df_results_exp1 = pd.concat([df_results_exp1, dflocal], ignore_index=True)

classifiers=["xgboost","nb"]

detection_files_abnormal, f1score_files_abnormal, precision_files_abnormal, recall_files_abnormal = [],[],[],[]
detection_files_normal, f1score_files_normal, precision_files_normal, recall_files_normal = [],[],[],[]
detection_files_overall, f1score_files_overall, precision_files_overall, recall_files_overall = [],[],[],[]

for classifier_state in classifiers:
    result_path = dir_exp1+classifier_state+"/"
    f1score_file, precision_file, recall_file, f1score_abnormal, precision_abnormal, recall_abnormal , f1score_normal, precision_normal, recall_normal= compute_classification_sedeval(reference_path,result_path,collar)
    detection_file_overall = compute_detection_sedeval(reference_path,result_path,collar)

    print("---------- "+classifier_state+" ----------")
    print("DETECTION: "+ str(detection_file_overall*100) )
    print("ABNORMAL:")
    print("F1-score: "+ str(f1score_abnormal*100) )
    print("Precision: "+ str(precision_abnormal*100) )
    print("Recall: "+ str(recall_abnormal*100) )
    print("NORMAL:")
    print("F1-score: "+ str(f1score_normal*100) )
    print("Precision: "+ str(precision_normal*100) )
    print("Recall: "+ str(recall_normal*100) )
    print("OVERALL:")
    print("F1-score: "+ str(f1score_file*100)  )
    print("Precision: "+ str(precision_file*100) )
    print("Recall: "+ str(recall_file*100) )

    dflocal = pd.DataFrame({'state_classifier':classifier_state, 
                            "detection mean": np.mean(detection_file_overall)*100,
                            'Abnormal mean F1-score':np.mean(f1score_abnormal)*100,
                            'Normal mean F1-score':np.mean(f1score_normal)*100,
                            'Overall mean F1-score':np.mean(f1score_file)*100},index=[0])
        
    df_results_exp1 = pd.concat([df_results_exp1, dflocal], ignore_index=True)

df_results_exp1.to_csv(dir_exp1 + 'experiment1_results.csv',index=False)

# Experiment 2
Train in DS1 and test in DS2

Data preparation and pre-processing

In [None]:
df_dataset= pd.concat([data_csv_jun21, data_csv_okt21,data_csv_jan22,data_csv_april22])
df_dataset["ref_label"]= np.concatenate((true_label_jun21, true_label_okt21,true_label_jan22,true_label_april22), axis=0)    
df_dataset["ref_label_cycle"]= np.concatenate((true_label_cycle_jun21, true_label_cycle_okt21,true_label_cycle_jan22,true_label_cycle_april22), axis=0)    

removed_indices = df_dataset[df_dataset['ref_label'].isnull()].index.tolist()
df_dataset = df_dataset[df_dataset['ref_label'].notnull()]
df_dataset=df_dataset.reset_index()

# remove the recognized_label column added in the experiment1
if 'recognized_label' in df_dataset.columns:
    df_dataset = df_dataset.drop('recognized_label', axis=1)

x = df_dataset[df_dataset.columns[1:-2]]
y_cycle = df_dataset[df_dataset.columns[-1]]
y_state = df_dataset[df_dataset.columns[-2]]

# normalize feature to range [0;1]
scaler = MinMaxScaler()
scaler.fit(x,4)
x = pd.DataFrame(scaler.transform(x), columns=x.columns)

y_state [y_state=='E']='B'

flag_save_results=True

balance ds1 for training

In [None]:
x_train_balanced, y_train_balanced,le = balance_dataset(x,y_state)

# Print balanced dataset
unique_values, counts = np.unique(y_state, return_counts=True)
value_counts = dict(zip(unique_values, counts))
value_porcentages = dict(zip(unique_values, counts/sum(counts)*100))
print("Value class-counts in Unbalanced dataset:",value_counts)
print("Value class-porcentage in Unbalanced dataset:",value_porcentages)

unique_values, counts = np.unique(y_train_balanced, return_counts=True)
value_counts = dict(zip(unique_values, counts))
value_porcentages = dict(zip(unique_values, counts/sum(counts)*100))
print("Value class-counts in Balanced dataset:",value_counts)
print("Value class-porcentage in Balanced dataset:",value_porcentages)

del unique_values,counts,value_counts,value_porcentages

In [None]:
#test data (DS2)
df_testset= pd.concat([data_csv_jun23, data_csv_aug23,data_csv_okt23,data_csv_dec23])
x_test = pd.DataFrame(scaler.transform(df_testset), columns=df_testset.columns)

Train the state classifiers using only the data belonging to duty-cycles

In [None]:
seeds=list(range(0,10))
classifiers=["rf","dt","xtree","mlp"]
for seed in seeds:
    for classifier in classifiers:
        #train
        clf = train_state_supervised_classifier(classifier,x_train_balanced, y_train_balanced,seed)

        #test state-cycles
        y_predict = clf.predict(x_test)

        #apply 3rd median filter
        y_pred_smoothed = smooth_labels(y_predict,3)
        
        y_recognized=le.inverse_transform(y_pred_smoothed.astype(int))
        df_testset["recognized_label"]=y_recognized

        df_temp=df_testset.copy()
        df_temp=df_temp.reset_index()

        
        df_recognized_states_jun23 = create_segments_state(datetime.datetime(2023, 6, 1),datetime.datetime(2023, 7, 1),df_temp)
        df_recognized_states_aug23 = create_segments_state(datetime.datetime(2023, 8, 1),datetime.datetime(2023, 9, 1),df_temp)
        df_recognized_states_okt23 = create_segments_state(datetime.datetime(2023, 10, 1),datetime.datetime(2023, 11, 1),df_temp)
        df_recognized_states_dec23 = create_segments_state(datetime.datetime(2023, 12, 1),datetime.datetime(2024, 1, 1),df_temp)
        
        # save the results in files
        if flag_save_results:
            folder_path = dir_exp2+classifier+"/"+str(seed)
            os.makedirs(folder_path, exist_ok=True)

            create_reference_label_file(folder_path+"/jun23_state.txt",df_recognized_states_jun23)
            create_reference_label_file(folder_path+"/aug23_state.txt",df_recognized_states_aug23)
            create_reference_label_file(folder_path+"/okt23_state.txt",df_recognized_states_okt23)
            create_reference_label_file(folder_path+"/dec23_state.txt",df_recognized_states_dec23)

In [None]:
classifiers=["xgboost","nb"]
for classifier in classifiers:
    #train
    clf = train_state_supervised_classifier(classifier,x_train_balanced, y_train_balanced,seed)

    #test state-cycles
    y_predict = clf.predict(x_test)

    #apply 3rd median filter
    y_pred_smoothed = smooth_labels(y_predict,3)
    
    y_recognized=le.inverse_transform(y_pred_smoothed.astype(int))
    df_testset["recognized_label"]=y_recognized

    df_temp=df_testset.copy()
    df_temp=df_temp.reset_index()

    #classify duty-cycle
    df_recognized_states_jun23 = create_segments_state(datetime.datetime(2023, 6, 1),datetime.datetime(2023, 7, 1),df_temp)
    df_recognized_states_aug23 = create_segments_state(datetime.datetime(2023, 8, 1),datetime.datetime(2023, 9, 1),df_temp)
    df_recognized_states_okt23 = create_segments_state(datetime.datetime(2023, 10, 1),datetime.datetime(2023, 11, 1),df_temp)
    df_recognized_states_dec23 = create_segments_state(datetime.datetime(2023, 12, 1),datetime.datetime(2024, 1, 1),df_temp)

    # save the results in files
    if flag_save_results:
        folder_path = dir_exp2+classifier
        os.makedirs(folder_path, exist_ok=True)

        create_reference_label_file(folder_path+"/jun23_state.txt",df_recognized_states_jun23)
        create_reference_label_file(folder_path+"/aug23_state.txt",df_recognized_states_aug23)
        create_reference_label_file(folder_path+"/okt23_state.txt",df_recognized_states_okt23)
        create_reference_label_file(folder_path+"/dec23_state.txt",df_recognized_states_dec23)

Once I have all the state labels, I can apply detection using a threshold and its subsequent classification.

In [None]:
downsampled_freq='1T'
seeds=list(range(0,10))
classifiers=["rf","dt","xtree","mlp"]

# classifier with seeds parameters
for seed in seeds:
    for classifier in classifiers:
    
        #import classified state_labels
        folder_path = dir_exp2+classifier+"/"+str(seed)
        aux = df_timestamps(pd.read_csv(folder_path+"/jun23_state.txt",sep='\t',names=["Time(Seconds)","Length(Seconds)","Label(string)"]))
        data_csv_jun23["recognized_label"] = ndarray_labels(data_csv_jun23.index[0],data_csv_jun23.index[-1],aux,downsampled_freq)
        aux = df_timestamps(pd.read_csv(folder_path+"/aug23_state.txt",sep='\t',names=["Time(Seconds)","Length(Seconds)","Label(string)"]))
        data_csv_aug23["recognized_label"] = ndarray_labels(data_csv_aug23.index[0],data_csv_aug23.index[-1],aux,downsampled_freq)
        aux = df_timestamps(pd.read_csv(folder_path+"/okt23_state.txt",sep='\t',names=["Time(Seconds)","Length(Seconds)","Label(string)"]))
        data_csv_okt23["recognized_label"] = ndarray_labels(data_csv_okt23.index[0],data_csv_okt23.index[-1],aux,downsampled_freq)
        aux = df_timestamps(pd.read_csv(folder_path+"/dec23_state.txt",sep='\t',names=["Time(Seconds)","Length(Seconds)","Label(string)"]))
        data_csv_dec23["recognized_label"] = ndarray_labels(data_csv_dec23.index[0],data_csv_dec23.index[-1],aux,downsampled_freq)

        #form testset
        df_testset= pd.concat([data_csv_jun23, data_csv_aug23,data_csv_okt23,data_csv_dec23])

        #apply detection threshold
        df_testset.reset_index(inplace=True)
        df_testset["detected_cycles"]='No_cycle'
        df_testset.loc[df_testset['Speed_order3'] > 2.5, 'detected_cycles'] = 'Cycle'

        #classify duty_cycle
        apply_heuristic_rules(df_testset)

        #save results
        df_recognized_cycles_jun23 = create_segments_cycle_classified(datetime.datetime(2023, 6, 1),datetime.datetime(2023, 7, 1),df_testset=df_testset)
        df_recognized_cycles_aug23 = create_segments_cycle_classified(datetime.datetime(2023, 8, 1),datetime.datetime(2023, 9, 1),df_testset=df_testset)
        df_recognized_cycles_okt23 = create_segments_cycle_classified(datetime.datetime(2023, 10, 1),datetime.datetime(2023, 11, 1),df_testset=df_testset)
        df_recognized_cycles_dec23 = create_segments_cycle_classified(datetime.datetime(2023, 12, 1),datetime.datetime(2024, 1, 1),df_testset=df_testset)

        if flag_save_results:
            create_reference_label_file(folder_path+"/jun23_cycle.txt",df_recognized_cycles_jun23)
            create_reference_label_file(folder_path+"/aug23_cycle.txt",df_recognized_cycles_aug23)
            create_reference_label_file(folder_path+"/okt23_cycle.txt",df_recognized_cycles_okt23)
            create_reference_label_file(folder_path+"/dec23_cycle.txt",df_recognized_cycles_dec23)

# classifier without seeds parameters
classifiers=["xgboost","nb"]

for classifier in classifiers:

    #import classified state_labels
    folder_path = "../results/recognized/experiment_supervised2/DS2/"+classifier
    aux = df_timestamps(pd.read_csv(folder_path+"/jun23_state.txt",sep='\t',names=["Time(Seconds)","Length(Seconds)","Label(string)"]))
    data_csv_jun23["recognized_label"] = ndarray_labels(data_csv_jun23.index[0],data_csv_jun23.index[-1],aux,downsampled_freq)
    aux = df_timestamps(pd.read_csv(folder_path+"/aug23_state.txt",sep='\t',names=["Time(Seconds)","Length(Seconds)","Label(string)"]))
    data_csv_aug23["recognized_label"] = ndarray_labels(data_csv_aug23.index[0],data_csv_aug23.index[-1],aux,downsampled_freq)
    aux = df_timestamps(pd.read_csv(folder_path+"/okt23_state.txt",sep='\t',names=["Time(Seconds)","Length(Seconds)","Label(string)"]))
    data_csv_okt23["recognized_label"] = ndarray_labels(data_csv_okt23.index[0],data_csv_okt23.index[-1],aux,downsampled_freq)
    aux = df_timestamps(pd.read_csv(folder_path+"/dec23_state.txt",sep='\t',names=["Time(Seconds)","Length(Seconds)","Label(string)"]))
    data_csv_dec23["recognized_label"] = ndarray_labels(data_csv_dec23.index[0],data_csv_dec23.index[-1],aux,downsampled_freq)

    #form testset
    df_testset= pd.concat([data_csv_jun23, data_csv_aug23,data_csv_okt23,data_csv_dec23])

    #apply detection threshold
    df_testset.reset_index(inplace=True)
    df_testset["detected_cycles"]='No_cycle'
    df_testset.loc[df_testset['Speed_order3'] > 2.5, 'detected_cycles'] = 'Cycle'

    #classify duty_cycle
    apply_heuristic_rules(df_testset)

    #save results
    df_recognized_cycles_jun23 = create_segments_cycle_classified(datetime.datetime(2023, 6, 1),datetime.datetime(2023, 7, 1),df_testset=df_testset)
    df_recognized_cycles_aug23 = create_segments_cycle_classified(datetime.datetime(2023, 8, 1),datetime.datetime(2023, 9, 1),df_testset=df_testset)
    df_recognized_cycles_okt23 = create_segments_cycle_classified(datetime.datetime(2023, 10, 1),datetime.datetime(2023, 11, 1),df_testset=df_testset)
    df_recognized_cycles_dec23 = create_segments_cycle_classified(datetime.datetime(2023, 12, 1),datetime.datetime(2024, 1, 1),df_testset=df_testset)

    if flag_save_results:
        create_reference_label_file(folder_path+"/jun23_cycle.txt",df_recognized_cycles_jun23)
        create_reference_label_file(folder_path+"/aug23_cycle.txt",df_recognized_cycles_aug23)
        create_reference_label_file(folder_path+"/okt23_cycle.txt",df_recognized_cycles_okt23)
        create_reference_label_file(folder_path+"/dec23_cycle.txt",df_recognized_cycles_dec23)


## performance

In [None]:
df_results_exp2 = pd.DataFrame(columns=['state_classifier', "detection mean",'detection std',
                                        'Abnormal mean F1-score','Abnormal std F1-score',
                                        'Normal mean F1-score','Normal std F1-score',
                                        'Overall mean F1-score','Overall std F1-score'])

seeds=list(range(0,10))
classifiers=["rf","dt","xtree","mlp"]

dir = './results/reference_cycle_labels/'
reference_path = dir
collar = 202.75

detection_files_abnormal, f1score_files_abnormal, precision_files_abnormal, recall_files_abnormal = [],[],[],[]
detection_files_normal, f1score_files_normal, precision_files_normal, recall_files_normal = [],[],[],[]
detection_files_overall, f1score_files_overall, precision_files_overall, recall_files_overall = [],[],[],[]

for classifier_state in classifiers:
    detection_files_abnormal, f1score_files_abnormal, precision_files_abnormal, recall_files_abnormal = [],[],[],[]
    detection_files_normal, f1score_files_normal, precision_files_normal, recall_files_normal = [],[],[],[]
    detection_files_overall, f1score_files_overall, precision_files_overall, recall_files_overall = [],[],[],[]

    for seed_cycle in seeds:
        result_path = dir_exp2+classifier_state+"/"+str(seed_cycle)+"/"
        f1score_file, precision_file, recall_file, f1score_abnormal, precision_abnormal, recall_abnormal , f1score_normal, precision_normal, recall_normal= compute_classification_sedeval(reference_path,result_path,collar)
        detection_file_overall = compute_detection_sedeval(reference_path,result_path,collar)
        detection_files_overall.append(detection_file_overall)
        f1score_files_overall.append(f1score_file)
        precision_files_overall.append(precision_file)
        recall_files_overall.append(recall_file)
        f1score_files_abnormal.append(f1score_abnormal)
        precision_files_abnormal.append(precision_abnormal)
        recall_files_abnormal.append(recall_abnormal)
        f1score_files_normal.append(f1score_normal)
        precision_files_normal.append(precision_normal)
        recall_files_normal.append(recall_normal)

    print("---------- "+classifier_state+" ----------")
    print("DETECTION: "+ str(np.mean(detection_files_overall)*100) +" - "+ str(np.std(detection_files_overall)*100) )
    print("ABNORMAL:")
    print("F1-score: "+ str(np.mean(f1score_files_abnormal)*100) +" - "+ str(np.std(f1score_files_abnormal)*100) )
    print("Precision: "+ str(np.mean(precision_files_abnormal)*100) +" - "+ str(np.std(precision_files_abnormal)*100) )
    print("Recall: "+ str(np.mean(recall_files_abnormal)*100) +" - "+ str(np.std(recall_files_abnormal)*100) )
    print("NORMAL:")
    print("F1-score: "+ str(np.mean(f1score_files_normal)*100) +" - "+ str(np.std(f1score_files_normal)*100) )
    print("Precision: "+ str(np.mean(precision_files_normal)*100) +" - "+ str(np.std(precision_files_normal)*100) )
    print("Recall: "+ str(np.mean(recall_files_normal)*100) +" - "+ str(np.std(recall_files_normal)*100) )
    print("OVERALL:")
    print("F1-score: "+ str(np.mean(f1score_files_overall)*100) +" - "+ str(np.std(f1score_files_overall)*100) )
    print("Precision: "+ str(np.mean(precision_files_overall)*100) +" - "+ str(np.std(precision_files_overall)*100) )
    print("Recall: "+ str(np.mean(recall_files_overall)*100) +" - "+ str(np.std(recall_files_overall)*100) )

    dflocal = pd.DataFrame({'state_classifier':classifier_state, 
                            "detection mean": np.mean(detection_files_overall)*100,
                            'detection std': np.std(detection_files_overall)*100,
                            'Abnormal mean F1-score':np.mean(f1score_files_abnormal)*100,
                            'Normal mean F1-score':np.mean(f1score_files_normal)*100,
                            'Overall mean F1-score':np.mean(f1score_files_overall)*100,
                            'Abnormal std F1-score':np.std(f1score_files_abnormal)*100,
                            'Normal std F1-score':np.std(f1score_files_normal)*100,
                            'Overall std F1-score':np.std(f1score_files_overall)*100},index=[0])
        
    df_results_exp2 = pd.concat([df_results_exp2, dflocal], ignore_index=True)


classifiers=["xgboost","nb"]

detection_files_abnormal, f1score_files_abnormal, precision_files_abnormal, recall_files_abnormal = [],[],[],[]
detection_files_normal, f1score_files_normal, precision_files_normal, recall_files_normal = [],[],[],[]
detection_files_overall, f1score_files_overall, precision_files_overall, recall_files_overall = [],[],[],[]

for classifier_state in classifiers:
    result_path = dir_exp2+classifier_state+"/"
    f1score_file, precision_file, recall_file, f1score_abnormal, precision_abnormal, recall_abnormal , f1score_normal, precision_normal, recall_normal= compute_classification_sedeval(reference_path,result_path,collar)
    detection_file_overall = compute_detection_sedeval(reference_path,result_path,collar)

    print("---------- "+classifier_state+" ----------")
    print("DETECTION: "+ str(detection_file_overall*100) )
    print("ABNORMAL:")
    print("F1-score: "+ str(f1score_abnormal*100) )
    print("Precision: "+ str(precision_abnormal*100) )
    print("Recall: "+ str(recall_abnormal*100) )
    print("NORMAL:")
    print("F1-score: "+ str(f1score_normal*100) )
    print("Precision: "+ str(precision_normal*100) )
    print("Recall: "+ str(recall_normal*100) )
    print("OVERALL:")
    print("F1-score: "+ str(f1score_file*100)  )
    print("Precision: "+ str(precision_file*100) )
    print("Recall: "+ str(recall_file*100) )

    dflocal = pd.DataFrame({'state_classifier':classifier_state,
                            "detection mean": np.mean(detection_file_overall)*100, 
                            'Abnormal mean F1-score':np.mean(f1score_abnormal)*100,
                            'Normal mean F1-score':np.mean(f1score_normal)*100,
                            'Overall mean F1-score':np.mean(f1score_file)*100},index=[0])
        
    df_results_exp2 = pd.concat([df_results_exp2, dflocal], ignore_index=True)

df_results_exp2.to_csv(dir_exp2 + 'experiment2_results.csv',index=False)

# Deployment on MCU

Data preparation and pre-processing

In [10]:
df_dataset= pd.concat([data_csv_jun21, data_csv_okt21,data_csv_jan22,data_csv_april22])

df_dataset["ref_label"]= np.concatenate((true_label_jun21, true_label_okt21,true_label_jan22,true_label_april22), axis=0)    
df_dataset["ref_label_cycle"]= np.concatenate((true_label_cycle_jun21, true_label_cycle_okt21,true_label_cycle_jan22,true_label_cycle_april22), axis=0)    

removed_indices = df_dataset[df_dataset['ref_label'].isnull()].index.tolist()
df_dataset = df_dataset[df_dataset['ref_label'].notnull()]
df_dataset=df_dataset.reset_index()

# remove the recognized_label column added in the experiment1 or experiment2
if 'recognized_label' in df_dataset.columns:
    df_dataset = df_dataset.drop('recognized_label', axis=1)

x = df_dataset[df_dataset.columns[1:-2]]
y_cycle = df_dataset[df_dataset.columns[-1]]
y_state = df_dataset[df_dataset.columns[-2]]

y_state [y_state=='E']='B'

# normalize feature to range [0;1]
scaler = MinMaxScaler(clip=True)
scaler.fit(x)
x_train = pd.DataFrame(scaler.transform(x), columns=x.columns)

y_state [y_state=='E']='B'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_state [y_state=='E']='B'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_state [y_state=='E']='B'


balance ds1 for training

In [11]:
x_train_balanced, y_train_balanced,le = balance_dataset(x_train,y_state)

# Print balanced dataset
unique_values, counts = np.unique(y_state, return_counts=True)
value_counts = dict(zip(unique_values, counts))
value_porcentages = dict(zip(unique_values, counts/sum(counts)*100))
print("Value class-counts in Unbalanced dataset:",value_counts)
print("Value class-porcentage in Unbalanced dataset:",value_porcentages)

unique_values, counts = np.unique(y_train_balanced, return_counts=True)
value_counts = dict(zip(unique_values, counts))
value_porcentages = dict(zip(unique_values, counts/sum(counts)*100))
print("Value class-counts in Balanced dataset:",value_counts)
print("Value class-porcentage in Balanced dataset:",value_porcentages)

del unique_values,counts,value_counts,value_porcentages

Value class-counts in Unbalanced dataset: {'A': 10566, 'B': 123236, 'C': 11444, 'D': 24298}
Value class-porcentage in Unbalanced dataset: {'A': 6.232010569527674, 'B': 72.68673618647667, 'C': 6.7498702401736415, 'D': 14.331383003822019}
Value class-counts in Balanced dataset: {0: 21132, 1: 30809, 2: 22888, 3: 24298}
Value class-porcentage in Balanced dataset: {0: 21.31810707476268, 1: 31.080331292180734, 2: 23.089571963239077, 3: 24.511989669817506}


In [12]:
print("SCALED VALUES TO BE ADDED IN THE C-CODE")
print(f"Minimun: ",scaler.data_min_)
print(f"Maximun: ",scaler.data_max_)

SCALED VALUES TO BE ADDED IN THE C-CODE
Minimun:  [ 0.          0.          0.          0.          0.          0.
 -1.34333333  0.          0.          0.         -1.194      -1.37      ]
Maximun:  [155.93        41.73        45.55        45.39333333 155.35333333
  41.31333333 120.82333333  45.36       154.672       40.976
 119.466      123.51      ]


train and test data

In [13]:
#test data (DS2)
df_testset= pd.concat([data_csv_jun23, data_csv_aug23,data_csv_okt23,data_csv_dec23])
x_test_float = pd.DataFrame(scaler.transform(df_testset), columns=df_testset.columns,index=df_testset.index)

x_train_float= x_train_balanced.copy()
x_train_float.describe()

Unnamed: 0,High-pressure,Low-pressure,Speed,Speed_order3,High-pressure_order3,Low-pressure_order3,Diff-pressure_order3,Speed_order5,High-pressure_order5,Low-pressure_order5,Diff-pressure_order5,Diff-pressure
count,99127.0,99127.0,99127.0,99127.0,99127.0,99127.0,99127.0,99127.0,99127.0,99127.0,99127.0,99127.0
mean,0.332104,0.568258,0.436464,0.437485,0.333488,0.574119,0.240925,0.435835,0.335079,0.578888,0.242838,0.235758
std,0.284168,0.319134,0.482399,0.478376,0.284036,0.321885,0.299536,0.47349,0.283725,0.323944,0.301045,0.294641
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.196146,0.484783,0.0,0.0,0.201605,0.490399,0.058445,0.0,0.202642,0.494607,0.057998,0.057335
50%,0.247804,0.675533,0.0,0.0,0.24971,0.682266,0.084011,0.0,0.252483,0.687964,0.090356,0.075673
75%,0.410463,0.834891,0.987706,0.990454,0.413747,0.842908,0.243192,0.99052,0.418893,0.849619,0.254031,0.234625
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## xtree internal-state classifiers

In [None]:
from emlearn import convert

classifier="xtree"
seed=0
dir_model_pred = "./results/approach2/"

### float

In [None]:
# train a state classifier using all data from DS1 and test it on DS2
clf = train_state_supervised_classifier(classifier,x_train_float, y_train_balanced,seed)

#test state-cycles
y_predict = clf.predict(x_test_float)
y_pred_smoothed = smooth_labels(y_predict,3)     #apply 3rd median filter
y_recognized=le.inverse_transform(y_pred_smoothed.astype(int))
df_temp=df_testset.copy()
df_temp["recognized_label"]=y_recognized
df_temp=df_temp.reset_index()

#classify duty-cycle
df_recognized_states_jun23 = create_segments_state(datetime.datetime(2023, 6, 1),datetime.datetime(2023, 7, 1),df_temp)
df_recognized_states_aug23 = create_segments_state(datetime.datetime(2023, 8, 1),datetime.datetime(2023, 9, 1),df_temp)
df_recognized_states_okt23 = create_segments_state(datetime.datetime(2023, 10, 1),datetime.datetime(2023, 11, 1),df_temp)
df_recognized_states_dec23 = create_segments_state(datetime.datetime(2023, 12, 1),datetime.datetime(2024, 1, 1),df_temp)

folder_path = dir_model_pred+"MCU/"+classifier+"/float"
os.makedirs(folder_path, exist_ok=True)

if flag_save_results:
    create_reference_label_file(folder_path+"/jun23_state.txt",df_recognized_states_jun23)
    create_reference_label_file(folder_path+"/aug23_state.txt",df_recognized_states_aug23)
    create_reference_label_file(folder_path+"/okt23_state.txt",df_recognized_states_okt23)
    create_reference_label_file(folder_path+"/dec23_state.txt",df_recognized_states_dec23)        

    cmodel = convert(clf, method='inline',dtype='float')
    cmodel.save(file = "./results/c_code/approach2/xtree_float.h", name='clf')

Remove in the header file the array "static const EmlTreesNode clf_nodes[]", which is not used.

In [17]:
## apply threshold and classify duty-cycles

#import classified state_labels
aux = df_timestamps(pd.read_csv(folder_path+"/jun23_state.txt",sep='\t',names=["Time(Seconds)","Length(Seconds)","Label(string)"]))
data_csv_jun23["recognized_label"] = ndarray_labels(data_csv_jun23.index[0],data_csv_jun23.index[-1],aux,downsampled_freq)
aux = df_timestamps(pd.read_csv(folder_path+"/aug23_state.txt",sep='\t',names=["Time(Seconds)","Length(Seconds)","Label(string)"]))
data_csv_aug23["recognized_label"] = ndarray_labels(data_csv_aug23.index[0],data_csv_aug23.index[-1],aux,downsampled_freq)
aux = df_timestamps(pd.read_csv(folder_path+"/okt23_state.txt",sep='\t',names=["Time(Seconds)","Length(Seconds)","Label(string)"]))
data_csv_okt23["recognized_label"] = ndarray_labels(data_csv_okt23.index[0],data_csv_okt23.index[-1],aux,downsampled_freq)
aux = df_timestamps(pd.read_csv(folder_path+"/dec23_state.txt",sep='\t',names=["Time(Seconds)","Length(Seconds)","Label(string)"]))
data_csv_dec23["recognized_label"] = ndarray_labels(data_csv_dec23.index[0],data_csv_dec23.index[-1],aux,downsampled_freq)

#form testset
df_tmp=x_test_float.copy()
df_tmp["recognized_label"] = pd.concat([data_csv_jun23["recognized_label"], data_csv_aug23["recognized_label"],
                                        data_csv_okt23["recognized_label"],data_csv_dec23["recognized_label"]])

#apply detection threshold
df_tmp.reset_index(inplace=True)
df_tmp["detected_cycles"]='No_cycle'
threshold_speed = 2.5
threshold_speed_normalized = (threshold_speed-scaler.data_min_[2])/scaler.data_range_[2]
# threshold_speed_quant = (threshold_speed_normalized*(2**32 - 1)).astype('float')
df_tmp.loc[df_tmp['Speed_order3'] > threshold_speed_normalized ,'detected_cycles'] = 'Cycle'

#classify duty_cycle
apply_heuristic_rules(df_tmp)

#save results
df_recognized_cycles_jun23 = create_segments_cycle_classified(datetime.datetime(2023, 6, 1),datetime.datetime(2023, 7, 1),df_testset=df_tmp)
df_recognized_cycles_aug23 = create_segments_cycle_classified(datetime.datetime(2023, 8, 1),datetime.datetime(2023, 9, 1),df_testset=df_tmp)
df_recognized_cycles_okt23 = create_segments_cycle_classified(datetime.datetime(2023, 10, 1),datetime.datetime(2023, 11, 1),df_testset=df_tmp)
df_recognized_cycles_dec23 = create_segments_cycle_classified(datetime.datetime(2023, 12, 1),datetime.datetime(2024, 1, 1),df_testset=df_tmp)


if flag_save_results:
    create_reference_label_file(folder_path+"/jun23_cycle.txt",df_recognized_cycles_jun23)
    create_reference_label_file(folder_path+"/aug23_cycle.txt",df_recognized_cycles_aug23)
    create_reference_label_file(folder_path+"/okt23_cycle.txt",df_recognized_cycles_okt23)
    create_reference_label_file(folder_path+"/dec23_cycle.txt",df_recognized_cycles_dec23)

### uint8

In [18]:
# data quantization
x_train_uint8 = pd.DataFrame((x_train_balanced*(2**8 - 1)).astype('uint8'), columns=x.columns)
x_test_uint8 = pd.DataFrame((x_test_float*(2**8 - 1)).astype('uint8'), columns=x.columns)

x_train_uint8.describe()

Unnamed: 0,High-pressure,Low-pressure,Speed,Speed_order3,High-pressure_order3,Low-pressure_order3,Diff-pressure_order3,Speed_order5,High-pressure_order5,Low-pressure_order5,Diff-pressure_order5,Diff-pressure
count,99127.0,99127.0,99127.0,99127.0,99127.0,99127.0,99127.0,99127.0,99127.0,99127.0,99127.0,99127.0
mean,84.297366,144.528948,111.096977,111.352931,84.636567,145.991808,60.878641,110.926468,85.040251,147.222704,61.413308,59.544251
std,72.33871,81.205404,122.810907,121.785839,72.311531,81.895703,76.427025,120.535658,72.231558,82.425757,76.771326,75.186669
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,50.0,123.0,0.0,0.0,51.0,125.0,14.0,0.0,51.0,126.0,14.0,14.0
50%,63.0,172.0,0.0,0.0,63.0,173.0,21.0,0.0,64.0,175.0,23.0,19.0
75%,104.0,212.0,251.0,252.0,105.0,214.0,62.0,252.0,106.0,216.0,64.0,59.0
max,255.0,255.0,255.0,255.0,255.0,255.0,254.0,254.0,255.0,255.0,255.0,254.0


In [None]:
clf = train_state_supervised_classifier(classifier,x_train_uint8, y_train_balanced,seed)

#test state-cycles
y_predict = clf.predict(x_test_uint8)
y_pred_smoothed = smooth_labels(y_predict,3)     #apply 3rd median filter
y_recognized=le.inverse_transform(y_pred_smoothed.astype(int))
df_temp=df_testset.copy()
df_temp["recognized_label"]=y_recognized
df_temp=df_temp.reset_index()

#classify duty-cycle
df_recognized_states_jun23 = create_segments_state(datetime.datetime(2023, 6, 1),datetime.datetime(2023, 7, 1),df_temp)
df_recognized_states_aug23 = create_segments_state(datetime.datetime(2023, 8, 1),datetime.datetime(2023, 9, 1),df_temp)
df_recognized_states_okt23 = create_segments_state(datetime.datetime(2023, 10, 1),datetime.datetime(2023, 11, 1),df_temp)
df_recognized_states_dec23 = create_segments_state(datetime.datetime(2023, 12, 1),datetime.datetime(2024, 1, 1),df_temp)

folder_path = dir_model_pred+ "/MCU/"+classifier+"/uint8"
os.makedirs(folder_path, exist_ok=True)

if flag_save_results:
    create_reference_label_file(folder_path+"/jun23_state.txt",df_recognized_states_jun23)
    create_reference_label_file(folder_path+"/aug23_state.txt",df_recognized_states_aug23)
    create_reference_label_file(folder_path+"/okt23_state.txt",df_recognized_states_okt23)
    create_reference_label_file(folder_path+"/dec23_state.txt",df_recognized_states_dec23)        

    cmodel = convert(clf, method='inline',dtype='uint8_t')
    cmodel.save(file= "./results/c_code/approach2/xtree_uint8.h", name='clf')

In [20]:
## apply threshold and classify duty-cycles

#import classified state_labels
aux = df_timestamps(pd.read_csv(folder_path+"/jun23_state.txt",sep='\t',names=["Time(Seconds)","Length(Seconds)","Label(string)"]))
data_csv_jun23["recognized_label"] = ndarray_labels(data_csv_jun23.index[0],data_csv_jun23.index[-1],aux,downsampled_freq)
aux = df_timestamps(pd.read_csv(folder_path+"/aug23_state.txt",sep='\t',names=["Time(Seconds)","Length(Seconds)","Label(string)"]))
data_csv_aug23["recognized_label"] = ndarray_labels(data_csv_aug23.index[0],data_csv_aug23.index[-1],aux,downsampled_freq)
aux = df_timestamps(pd.read_csv(folder_path+"/okt23_state.txt",sep='\t',names=["Time(Seconds)","Length(Seconds)","Label(string)"]))
data_csv_okt23["recognized_label"] = ndarray_labels(data_csv_okt23.index[0],data_csv_okt23.index[-1],aux,downsampled_freq)
aux = df_timestamps(pd.read_csv(folder_path+"/dec23_state.txt",sep='\t',names=["Time(Seconds)","Length(Seconds)","Label(string)"]))
data_csv_dec23["recognized_label"] = ndarray_labels(data_csv_dec23.index[0],data_csv_dec23.index[-1],aux,downsampled_freq)

#form testset
df_tmp=x_test_uint8.copy()
df_tmp["recognized_label"] = pd.concat([data_csv_jun23["recognized_label"], data_csv_aug23["recognized_label"],
                                        data_csv_okt23["recognized_label"],data_csv_dec23["recognized_label"]])

#apply detection threshold
df_tmp.reset_index(inplace=True)
df_tmp["detected_cycles"]='No_cycle'
threshold_speed = 2.5
threshold_speed_normalized = (threshold_speed-scaler.data_min_[2])/scaler.data_range_[2]
threshold_speed_quant = (threshold_speed_normalized*(2**8 - 1)).astype('float')
df_tmp.loc[df_tmp['Speed_order3'] > threshold_speed_quant ,'detected_cycles'] = 'Cycle'

#classify duty_cycle
apply_heuristic_rules(df_tmp)

#save results
df_recognized_cycles_jun23 = create_segments_cycle_classified(datetime.datetime(2023, 6, 1),datetime.datetime(2023, 7, 1),df_testset=df_tmp)
df_recognized_cycles_aug23 = create_segments_cycle_classified(datetime.datetime(2023, 8, 1),datetime.datetime(2023, 9, 1),df_testset=df_tmp)
df_recognized_cycles_okt23 = create_segments_cycle_classified(datetime.datetime(2023, 10, 1),datetime.datetime(2023, 11, 1),df_testset=df_tmp)
df_recognized_cycles_dec23 = create_segments_cycle_classified(datetime.datetime(2023, 12, 1),datetime.datetime(2024, 1, 1),df_testset=df_tmp)

if flag_save_results:
    create_reference_label_file(folder_path+"/jun23_cycle.txt",df_recognized_cycles_jun23)
    create_reference_label_file(folder_path+"/aug23_cycle.txt",df_recognized_cycles_aug23)
    create_reference_label_file(folder_path+"/okt23_cycle.txt",df_recognized_cycles_okt23)
    create_reference_label_file(folder_path+"/dec23_cycle.txt",df_recognized_cycles_dec23)

## performance

In [None]:
df_results = pd.DataFrame(columns=['quantization','state_classifier', "detection mean",
                                    'Abnormal mean F1-score','Normal mean F1-score','Overall mean F1-score'])

dir = './results/reference_cycle_labels/'
reference_path = dir
collar = 202.75

classifiers=["xtree"]
quantizations=["float","uint8"]
for classifier_state in classifiers:
    for q in quantizations:
        result_path = dir_model_pred + 'MCU/'+classifier_state+"/"+q+"/"
        f1score_file, precision_file, recall_file, f1score_abnormal, precision_abnormal, recall_abnormal , f1score_normal, precision_normal, recall_normal= compute_classification_sedeval(reference_path,result_path,collar)
        detection_file_overall = compute_detection_sedeval(reference_path,result_path,collar)

        print("---------- "+classifier_state+ " - "+q+" ----------")
        print("DETECTION: "+ str(detection_file_overall*100) )
        print("ABNORMAL:")
        print("F1-score: "+ str(f1score_abnormal*100) )
        print("Precision: "+ str(precision_abnormal*100) )
        print("Recall: "+ str(recall_abnormal*100) )
        print("NORMAL:")
        print("F1-score: "+ str(f1score_normal*100) )
        print("Precision: "+ str(precision_normal*100) )
        print("Recall: "+ str(recall_normal*100) )
        print("OVERALL:")
        print("F1-score: "+ str(f1score_file*100)  )
        print("Precision: "+ str(precision_file*100) )
        print("Recall: "+ str(recall_file*100) )

        dflocal = pd.DataFrame({'quantization':q,
                                'state_classifier':classifier_state, 
                                "detection mean": np.mean(detection_file_overall)*100,
                                'Abnormal mean F1-score':np.mean(f1score_abnormal)*100,
                                'Normal mean F1-score':np.mean(f1score_normal)*100,
                                'Overall mean F1-score':np.mean(f1score_file)*100},index=[0])
            
        df_results = pd.concat([df_results, dflocal], ignore_index=True)

df_results.to_csv(dir_model_pred + 'MCU/results_mcu.csv',index=False)

---------- xtree - float ----------
DETECTION: 99.16317991631799
ABNORMAL:
F1-score: 64.36781609195403
Precision: 53.84615384615385
Recall: 80.0
NORMAL:
F1-score: 87.76978417266189
Precision: 94.72049689440993
Recall: 81.76943699731903
OVERALL:
F1-score: 81.3807531380753
Precision: 81.3807531380753
Recall: 81.3807531380753
---------- xtree - uint8 ----------
DETECTION: 99.16317991631799
ABNORMAL:
F1-score: 64.88549618320612
Precision: 54.14012738853503
Recall: 80.95238095238095
NORMAL:
F1-score: 87.89625360230548
Precision: 95.01557632398755
Recall: 81.76943699731903
OVERALL:
F1-score: 81.58995815899581
Precision: 81.58995815899581
Recall: 81.58995815899581
