## Import libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from os import path
import sklearn
# %matplotlib qt

## Variables to set

In [None]:
# Name of the processed capture file and its location relative to this notebook.
DATASET = "32FINAL.csv"
DATAPATH = path.join("..", "capture", "processed", DATASET)

# Switch for the optional normalization cell below.
NORMALIZE_DATASET = False

# Entries for the preview figure (one subplot row per entry).
SAMPLES_TO_VIEW = [2, 240, 500, 780]

## Load dataset

In [None]:
# Read the processed capture file, then report how many rows each class has.
dataset = pd.read_csv(DATAPATH)
unique, counts = np.unique(dataset['Class'], return_counts=True)
print({label: count for label, count in zip(unique, counts)})

## Normalize dataset to eliminate negative values

In [None]:
if NORMALIZE_DATASET:
    # Split the frame: every column but the last holds signal values, the
    # last column holds the class label.
    dataset_array = dataset.iloc[:, :-1].to_numpy()
    dataset_classes = np.expand_dims(dataset.iloc[:, -1].to_numpy(), axis=1)
    dataset_columns = list(dataset.columns)

    ds_min = dataset_array.min()
    ds_max = dataset_array.max()
    print("Current minimum: {}, Current maximum: {}".format(ds_min, ds_max))

    # The target range has the same width as the source range, i.e. the scale
    # is translated from [ds_min, ds_max] to [0, ds_max - ds_min].
    new_min = 0
    new_max = ds_max - ds_min
    print("New minimum: {}, New maximum: {}".format(new_min, new_max))

    # Normalization: because both ranges have the same width, rescaling is a
    # plain shift. Subtracting directly is exact for integer data, whereas the
    # divide-then-multiply round trip can land just below an integer
    # (e.g. 1/49*49 evaluates to 0.9999999999999999) and then be truncated one
    # unit too low by the int64 cast. It also avoids a division by zero when
    # every value is identical.
    new_array = (dataset_array - ds_min).astype("int64")

    new_array = np.concatenate((new_array, dataset_classes), axis=1)

    # Convert to DataFrame again and save next to the other extraction
    # results, named after the configured dataset.
    dataset = pd.DataFrame(new_array, columns=dataset_columns, index=None)
    dataset.to_csv(path.join("..", "results", "extraction", DATASET[:-4] + "_NORM.csv"), sep=",", index=False)

In [None]:
# Shared x axis for the preview plots: positions 0..99.
X = list(range(100))
print(X)

## Generate a figure with plots of the initial samples

In [None]:
# Layout: one subplot row per entry of SAMPLES_TO_VIEW and one column per
# signal group. Every group spans len(X) consecutive columns of the frame and
# the last column is the class label, so the group count is derived from the
# frame instead of being fixed at 7 (keeps the cell usable after the Tmp
# columns have been dropped).
n_rows = len(SAMPLES_TO_VIEW)
samples_per_signal = len(X)
n_signals = (dataset.shape[1] - 1) // samples_per_signal

# squeeze=False keeps axs two-dimensional even for a single row or column.
fig, axs = plt.subplots(n_rows, n_signals, sharex='col', squeeze=False,
                        figsize=(3 * n_signals, n_rows * 3))
for i, sample in enumerate(SAMPLES_TO_VIEW):
    for j in range(n_signals):
        first = j * samples_per_signal
        # Plot the dataset row named in SAMPLES_TO_VIEW, not the loop counter.
        y = dataset.iloc[sample, first:first + samples_per_signal]
        axs[i, j].plot(X, y)
        if i == 0:
            # Column names look like "<measure>0"; strip the trailing index.
            axs[i, j].set_title(dataset.columns[first][:-1], fontsize=15)

fig.suptitle("Exemplo com "+str(n_rows)+" amostras", fontsize=25)
# plt.show()
fig.savefig(path.join("..","results","extraction","plot",DATASET[:-4]+".PNG"), dpi=300)

# Feature Extraction
## Import extraction libraries
Using code from Navar M. M. Nascimento -> https://github.com/navarmn/feature_extraction_signal for Fourier and HOS extraction.
Using code from Geraldo Luís Bezerra Ramalho -> private Google Colaboratory notebook, for extraction using the Structural Cooccurrence Matrix (SCM) method.

In [None]:
from src.feature_extraction import Fourier, HOS
from src.SCM import SCM

In [None]:
# One Fourier extractor per fundamental frequency (25, 30 and 35); all three
# share fs=100.0 and the same harmonics.
fe_fourier25, fe_fourier30, fe_fourier35 = (
    Fourier(fundamental=fundamental, fs=100.0, harmonics=(1, 10, 20, 30))
    for fundamental in (25.0, 30.0, 35.0)
)
fe_HOS = HOS()
# SCM instance built from single-element zero signals.
fe_SCM = SCM(f=np.array([0]), g=np.array([0]), minmax=[0,0], NL=8, d=[0, 0], verbose=False)

Our dataset is composed of 1100 sensor captures (dataset rows). Each capture has 100 samples per measure (AcX, AcY, AcZ, GyX, GyY, GyZ, Tmp). The Tmp values are discarded in this work. So we need to extract each measure's vector and generate a new dataset of features.

In [None]:
# excluding Tmp columns
tmp_head = ["Tmp" + str(i) for i in range(100)]
# errors="ignore" removes whichever Tmp columns are present and skips the
# rest, so the cell is safe to re-run and does not raise when only part of the
# Tmp columns exist (the previous guard only looked at "Tmp0").
dataset = dataset.drop(columns=tmp_head, errors="ignore")
tmp_head.clear()

import time

def current_milli_time():
    """Return the current wall-clock time as a whole number of milliseconds."""
    return int(round(time.time() * 1000))

## Fourier extraction
The Fourier extractor receives as parameters the fundamental frequency, the sampling frequency and the harmonics (multiples) of the input signal. Because the classes differ in fundamental frequency, each class must be extracted with its own extractor configuration.

Each measure (AcX, AcY, AcZ, GyX, GyY and GyZ) is extracted separately, since they are different signals with 100 values each. Extraction yields 4 attributes per measure, resulting in a dataset with 1100 rows and 25 columns ((4 attributes X 6 measures) + Class).

In [None]:
def fourier_extraction(dataset, extractors, measures):
    """Extract Fourier features from every capture (row) of ``dataset``.

    Each row is split into ``len(measures)`` equally long signals, where the
    columns belonging to one measure are contiguous. Every signal is passed to
    the extractor selected by the row's class, and the returned feature
    vectors are concatenated into one output row followed by the class.

    Parameters
    ----------
    dataset : pandas.DataFrame
        One capture per row: the signal columns plus a ``"Class"`` column.
    extractors : sequence
        Indexed by class number. Each item must expose ``transform(signal)``
        returning a mapping with a ``"features"`` entry.
    measures : list of str
        Measure names; they become the prefixes of the output columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``<measure><i>`` for every extracted feature plus an integer
        ``"Class"`` column. An empty ``dataset`` yields an empty frame with
        the default header of four features per measure.
    """
    import time  # function-scope import keeps this cell runnable on its own

    print('Fourier: ')
    n_measures = len(measures)

    # Separate labels from signals by column name, so the result does not
    # depend on "Class" being the last column.
    class_column = dataset['Class'].to_numpy()
    signal_rows = dataset.drop(columns='Class').to_numpy()

    time_list = []
    out_data_list = []
    feature_counts = None

    for class_value, flat_signals in zip(class_column, signal_rows):
        # The label may arrive as a float when the frame holds float columns;
        # a sequence can only be indexed with an integer.
        class_num = int(class_value)
        # Fortran order puts consecutive source columns into the same signal.
        dataset_fourier = pd.DataFrame(
            np.reshape(flat_signals, (-1, n_measures), order='F'), columns=measures)
        extractor = extractors[class_num]

        # perf_counter is monotonic and high resolution; values are kept in
        # milliseconds without rounding away sub-millisecond durations.
        extract_start = time.perf_counter()
        row_features = [extractor.transform(dataset_fourier[measure])['features']
                        for measure in measures]
        time_list.append((time.perf_counter() - extract_start) * 1000.0)

        if feature_counts is None:
            feature_counts = [len(features) for features in row_features]
        out_data_list.append(
            [value for features in row_features for value in features] + [class_num])

    if feature_counts is None:
        # No rows were processed: fall back to four features per measure.
        feature_counts = [4] * n_measures

    out_head = [measure + str(i)
                for measure, count in zip(measures, feature_counts)
                for i in range(count)]
    out_head.append("Class")

    # The reshape is a no-op for regular data and gives an empty input the
    # right number of columns.
    out_array = np.asarray(out_data_list).reshape(len(out_data_list), len(out_head))
    out_dataframe = pd.DataFrame(out_array, columns=out_head, index=None)
    out_dataframe = out_dataframe.astype({"Class": int})

    print("Amostras: ", len(time_list))
    if time_list:
        time_mean = np.mean(time_list)
        print("Extraction time mean :", time_mean)
        time_std = np.std(time_list)
        print("Extraction time std: ", time_std)

    return out_dataframe

## HOS extraction
The Higher-Order Statistics extractor does not receive parameters. All classes are extracted in the same way. The resulting feature vector holds four statistics of the input signal: RMS, variance, skewness and kurtosis.

Each measure (AcX, AcY, AcZ, GyX, GyY and GyZ) is extracted separately, since they are different signals with 100 values each. Extraction yields 4 attributes per measure, resulting in a dataset with 1100 rows and 25 columns ((4 attributes X 6 measures) + Class).

In [None]:
def HOS_extraction(dataset, extractor, measures):
    """Extract HOS features from every capture (row) of ``dataset``.

    Each row is split into ``len(measures)`` equally long signals, where the
    columns belonging to one measure are contiguous. Every signal is passed to
    ``extractor.transform`` and the returned feature vectors are concatenated
    into one output row followed by the class.

    Parameters
    ----------
    dataset : pandas.DataFrame
        One capture per row: the signal columns plus a ``"Class"`` column.
    extractor : object
        Must expose ``transform(signal)`` returning a mapping with a
        ``"features"`` entry.
    measures : list of str
        Measure names; they become the prefixes of the output columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``<measure><i>`` for every extracted feature plus an integer
        ``"Class"`` column. An empty ``dataset`` yields an empty frame with
        the default header of four features per measure.
    """
    import time  # function-scope import keeps this cell runnable on its own

    print('HOS: ')
    n_measures = len(measures)

    # Separate labels from signals by column name, so the result does not
    # depend on "Class" being the last column.
    class_column = dataset['Class'].to_numpy()
    signal_rows = dataset.drop(columns='Class').to_numpy()

    time_list = []
    out_data_list = []
    feature_counts = None

    for class_num, flat_signals in zip(class_column, signal_rows):
        # Fortran order puts consecutive source columns into the same signal.
        dataset_HOS = pd.DataFrame(
            np.reshape(flat_signals, (-1, n_measures), order='F'), columns=measures)

        # perf_counter is monotonic and high resolution; values are kept in
        # milliseconds without rounding away sub-millisecond durations.
        extract_start = time.perf_counter()
        row_features = [extractor.transform(dataset_HOS[measure])['features']
                        for measure in measures]
        time_list.append((time.perf_counter() - extract_start) * 1000.0)

        if feature_counts is None:
            feature_counts = [len(features) for features in row_features]
        out_data_list.append(
            [value for features in row_features for value in features] + [class_num])

    if feature_counts is None:
        # No rows were processed: fall back to four features per measure.
        feature_counts = [4] * n_measures

    out_head = [measure + str(i)
                for measure, count in zip(measures, feature_counts)
                for i in range(count)]
    out_head.append("Class")

    # The reshape is a no-op for regular data and gives an empty input the
    # right number of columns.
    out_array = np.asarray(out_data_list).reshape(len(out_data_list), len(out_head))
    out_dataframe = pd.DataFrame(out_array, columns=out_head, index=None)
    out_dataframe = out_dataframe.astype({"Class": int})

    print("Amostras: ", len(time_list))
    if time_list:
        time_mean = np.mean(time_list)
        print("Extraction time mean :", time_mean)
        time_std = np.std(time_list)
        print("Extraction time std: ", time_std)

    return out_dataframe

## SCM extraction
Structural Cooccurrence Matrix is an analysis method that can be used as an extractor. All classes are extracted in the same way. The resulting feature vector holds eight features of the input signal, computed from a matrix that compares the raw signal with the same signal after a low-pass filter: COR, IDM, ENT, CSD, CSR, MDR, DKL and CAD.

Each measure (AcX, AcY, AcZ, GyX, GyY and GyZ) is extracted separately, since they are different signals with 100 values each. Extraction yields 8 attributes per measure, resulting in a dataset with 1100 rows and 49 columns ((8 attributes X 6 measures) + Class).

In [None]:
def SCM_extraction(dataset, extractor, signal_filter, measures):
    """Extract SCM attributes from every capture (row) of ``dataset``.

    Each row is split into ``len(measures)`` equally long signals, where the
    columns belonging to one measure are contiguous. For every signal a fresh
    ``SCM`` object is built with the signal as both ``f`` and ``g``,
    ``signal_filter`` is installed as its ``function_k``, and the computed
    attributes are concatenated into one output row followed by the class.

    Parameters
    ----------
    dataset : pandas.DataFrame
        One capture per row: the signal columns plus a ``"Class"`` column.
    extractor : object
        Kept for interface compatibility. It is not used: a new ``SCM`` is
        constructed for each signal.
    signal_filter : callable
        Assigned to ``function_k`` of every ``SCM`` object.
    measures : list of str
        Measure names; they become the prefixes of the output columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``<measure><i>`` for every computed attribute plus an integer
        ``"Class"`` column. An empty ``dataset`` yields an empty frame with
        the default header of eight attributes per measure.
    """
    import time  # function-scope import keeps this cell runnable on its own

    print('SCM: ')
    n_measures = len(measures)

    # Separate labels from signals by column name, so the result does not
    # depend on "Class" being the last column.
    class_column = dataset['Class'].to_numpy()
    signal_rows = dataset.drop(columns='Class').to_numpy()

    time_list = []
    out_data_list = []
    attribute_counts = None

    for class_num, flat_signals in zip(class_column, signal_rows):
        # Fortran order puts consecutive source columns into the same signal.
        dataset_SCM = pd.DataFrame(
            np.reshape(flat_signals, (-1, n_measures), order='F'), columns=measures)
        row_attributes = []

        # perf_counter is monotonic and high resolution; values are kept in
        # milliseconds without rounding away sub-millisecond durations.
        extract_start = time.perf_counter()
        for measure in measures:
            # f and g are the same signal, each shaped as a single row.
            signal_f = np.expand_dims(dataset_SCM[measure].to_numpy(dtype="int64"), axis=1).T
            signal_g = np.expand_dims(dataset_SCM[measure].to_numpy(dtype="int64"), axis=1).T
            # A separate local name avoids overwriting the `extractor` argument.
            scm = SCM(f=signal_f, g=signal_g, minmax=[0,0], NL=8, d=[0, 0], verbose=False)
            scm.function_k = signal_filter
            scm.d = np.array([[0, 0]])
            scm.compute_matrix()
            scm.compute_attributes()
            row_attributes.append(list(scm.attributes.values()))
        time_list.append((time.perf_counter() - extract_start) * 1000.0)

        if attribute_counts is None:
            attribute_counts = [len(attributes) for attributes in row_attributes]
        out_data_list.append(
            [value for attributes in row_attributes for value in attributes] + [class_num])

    if attribute_counts is None:
        # No rows were processed: fall back to eight attributes per measure.
        attribute_counts = [8] * n_measures

    out_head = [measure + str(i)
                for measure, count in zip(measures, attribute_counts)
                for i in range(count)]
    out_head.append("Class")

    # The reshape is a no-op for regular data and gives an empty input the
    # right number of columns.
    out_array = np.asarray(out_data_list).reshape(len(out_data_list), len(out_head))
    out_dataframe = pd.DataFrame(out_array, columns=out_head, index=None)
    out_dataframe = out_dataframe.astype({"Class": int})

    print("Amostras: ", len(time_list))
    if time_list:
        time_mean = np.mean(time_list)
        print("Extraction time mean :", time_mean)
        time_std = np.std(time_list)
        print("Extraction time std: ", time_std)

    return out_dataframe

## Extraction general

After calling all the extraction functions and generating a dataframe of features for each one, each dataframe is saved as a CSV file in the destination folder.

In [None]:
import scipy.ndimage

# Names given to the six signals of each capture.
measures = ["AcX", "AcY", "AcZ", "GyX", "GyY", "GyZ"]
# Extractor lookup by class number (class 0 to 5): three entries for the
# 25 extractor, two for the 30 extractor and one for the 35 extractor.
fourier_extractors = [fe_fourier25] * 3 + [fe_fourier30] * 2 + [fe_fourier35]

# user-defined k (low-pass average filter)
def SCM_filter(s):
    """Smooth ``s`` with a 3x3 moving-average kernel.

    The input is convolved as float with a 3x3 kernel of ones using
    reflecting borders, divided by the kernel sum, rounded, and cast back to
    the dtype of ``s``.

    Parameters
    ----------
    s : numpy.ndarray
        Signal to filter.

    Returns
    -------
    numpy.ndarray
        Filtered signal with the same shape and dtype as ``s``.
    """
    kernel = np.ones((3, 3))
    # scipy.ndimage.convolve is the public entry point; the legacy
    # scipy.ndimage.filters namespace is deprecated.
    ret = scipy.ndimage.convolve(s.astype(float), kernel, mode="reflect") / kernel.sum()
    return np.around(ret).astype(s.dtype)

# Extraction functions
fourier_dataframe = fourier_extraction(dataset, fourier_extractors, measures)
HOS_dataframe = HOS_extraction(dataset, fe_HOS, measures)
SCM_dataframe = SCM_extraction(dataset, fe_SCM, SCM_filter, measures)

# Save CSV's: one file per extractor, named after the source dataset
for suffix, frame in (("_Fourier.csv", fourier_dataframe),
                      ("_HOS.csv", HOS_dataframe),
                      ("_SCM.csv", SCM_dataframe)):
    frame.to_csv(path.join("..", "results", "extraction", DATASET[:-4] + suffix), sep=",", index=False)

# Features plotting

Now the features are displayed and the plot is saved. The plot is a scatter matrix, which shows the dispersion of the elements by comparing pairs of features of a selected measure.

In [None]:
def discrete_cmap(N, base_cmap=None):
    """Build a colormap with ``N`` discrete colors sampled from ``base_cmap``.

    Parameters
    ----------
    N : int
        Number of colors.
    base_cmap : str or Colormap, optional
        Colormap to sample; ``None`` selects Matplotlib's default colormap.

    Returns
    -------
    matplotlib.colors.LinearSegmentedColormap
        Colormap named ``<base name><N>`` with ``N`` levels.
    """
    from matplotlib.colors import LinearSegmentedColormap

    # pyplot.get_cmap is available across Matplotlib versions, unlike
    # plt.cm.get_cmap which newer releases no longer provide.
    base = plt.get_cmap(base_cmap)
    color_list = base(np.linspace(0, 1, N))
    cmap_name = base.name + str(N)
    # Call from_list on the class: not every colormap type (e.g. listed
    # colormaps such as the default one) exposes it as an attribute.
    return LinearSegmentedColormap.from_list(cmap_name, color_list, N)

def plot_features(data_name, dataframe, classes, features, features_name, extractor_name):
    """Save a scatter-matrix plot of ``features`` colored by class.

    Parameters
    ----------
    data_name : str
        Dataset name; used in the title and in the output file name.
    dataframe : pandas.DataFrame
        Feature frame containing the ``features`` columns and a ``"Class"``
        column.
    classes : sequence of str
        Legend labels, one per class, in class-number order.
    features : iterable of str
        Columns of ``dataframe`` to plot against each other.
    features_name : str
        Measure name; used in the title and in the output file name.
    extractor_name : str
        Extractor name; used in the title and in the output file name.

    The figure is written under ``../results/extraction/plot`` and then
    closed.
    """
    n_classes = len(classes)
    # Build the colormap once and share it between the scatter and the legend.
    cmap = discrete_cmap(n_classes, 'rainbow')

    # Reserve room on the right for the legend without changing the global
    # rcParams for figures created elsewhere.
    with plt.rc_context({"figure.subplot.right": .8}):
        # vmin/vmax pin the color scale to the full class range, so each class
        # keeps the color shown in the legend even when a class is absent.
        pd.plotting.scatter_matrix(dataframe[list(features)], figsize=(24,13),
                                   c=dataframe["Class"], label=classes, s=100, alpha=.8,
                                   cmap=cmap, vmin=0, vmax=n_classes - 1)

    # Empty lines act as legend proxies, one per class color.
    handles = [plt.plot([], [], color=cmap(i), ls="", marker=".",
                        markersize=np.sqrt(50))[0] for i in range(n_classes)]

    plt.legend(handles, classes, loc=(1.02,0))
    plt.suptitle(data_name + " - " + features_name + " - " + extractor_name, fontsize=24)
    plt.savefig(path.join("..","results","extraction","plot",data_name+"_"+features_name+"_"+extractor_name+".png"),
                dpi=300)
    # Close the figure (rather than only clearing it) so repeated calls do not
    # accumulate open figures.
    plt.close()

## Plotting general

In [None]:
labels = ["25 NORMAL", "25 REVERSO", "25 VAZIO", "30 NORMAL", "30 VAZIO", "35 NORMAL"]
features_name = "GyZ"

# Column names of the chosen measure: four for the Fourier and HOS frames,
# eight for the SCM frame.
features = [features_name + str(i) for i in range(4)]
SCM_features = [features_name + str(j) for j in range(8)]

# One scatter-matrix figure per extractor.
for frame, columns, extractor_name in ((fourier_dataframe, features, "Fourier"),
                                       (HOS_dataframe, features, "HOS"),
                                       (SCM_dataframe, SCM_features, "SCM")):
    plot_features(DATASET[:-4], frame, labels, columns, features_name, extractor_name)

features.clear()
SCM_features.clear()