In [1]:
# Master switch for the export step at the end of the notebook: when True, each
# split DataFrame is written to ./dataset/processed/{split}_data.csv; when False
# the splits are only built in memory.
save_dataset = False

In [2]:
import numpy as np
from utils import dataProcessing as dataP
import json
import pandas as pd
from pathlib import Path
# Seed NumPy's legacy global RNG so the per-class np.random.shuffle calls that
# define the train/val/test membership give the same result on every run.
np.random.seed(1222024)

In [3]:
# Make sure the folder that receives the processed CSV files exists.
Path("./dataset/processed/").mkdir(parents=True, exist_ok=True)

# Class labels that the samples are grouped by.
classes_k = [
    'Classe A',
    'Classe B',
    'Classe C',
    'Classe D',
    'Classe E',
]

# Label array for the whole dataset; its length is used as the sample count.
# NOTE: allow_pickle=True unpickles arbitrary objects - only load trusted files.
classes_data = np.load('./dataset/raw/Classes.npy', allow_pickle=True)
n_samples = len(classes_data)

# Composição do dataset
* treino: 70%, validação: 15%, teste: 15%
* Como o dataset é balanceado, vou manter essa característica em cada um dos conjuntos
    * Para isso, a divisão seleciona o mesmo número de elementos de cada classe para cada parte do dataset.

In [4]:
# Fraction of every class assigned to each split, in the order train, val, test.
split_dist = [.7, .15, .15]
# Samples available per class, assuming the classes are balanced.
n_per_class = n_samples // len(classes_k)

# dataset_split_idxs[label] = [start, stop): positions inside each class's
# shuffled index array that belong to that split.
dataset_split_idxs = {}
first_i = 0
last_i = 0
# Only force the final boundary when the fractions are meant to cover the
# whole class; a deliberately partial split_dist is left untouched.
covers_whole_class = bool(np.isclose(sum(split_dist), 1.0))
for pos, (label, d) in enumerate(zip(['train', 'val', 'test'], split_dist)):
    last_i = int(first_i + n_per_class * d)
    if covers_whole_class and pos == len(split_dist) - 1:
        # int() truncates at every boundary, so with fractional products the
        # last split would stop short of n_per_class and silently drop
        # samples (e.g. 50 per class -> test = [42, 49)). Give the remainder
        # to the last split instead.
        last_i = n_per_class
    dataset_split_idxs[label] = [first_i, last_i]
    first_i = last_i

In [5]:
# For every class, gather the flat positions of its samples in classes_data and
# shuffle them in place (global NumPy RNG), so that slicing the array later
# yields a random subset of that class.
class_idxs = {}
for class_name in classes_k:
    members = np.flatnonzero(classes_data == class_name)
    np.random.shuffle(members)
    class_idxs[class_name] = members

# Coleta das features

In [6]:
# Number of sensor files (Dados_1.npy ... Dados_<n_sensors>.npy) to read.
n_sensors = 3

# sensor_data[i] holds the samples of sensor i after gap filling.
sensor_data = []
for i in range(n_sensors):
    # NOTE: allow_pickle=True unpickles arbitrary objects - only load trusted files.
    readings = np.load(f'./dataset/raw/Dados_{i+1}.npy', allow_pickle=True)

    # Replace every sample by its gap-filled version, writing back into the
    # loaded array.
    for j in range(n_samples):
        readings[j] = dataP.fill_the_gaps(readings[j])

    sensor_data.append(readings)

In [7]:
# Settings forwarded to the dataP helpers.
n = 200          # given to get_RMS (twice) and apply_fft; presumably a window / FFT size - TODO confirm
freq = 10000     # given as the frequency argument; presumably the sampling rate in Hz - TODO confirm
time = 1 / freq  # not read again by the cells visible here

# Each list receives one entry per sensor, in sensor_data order.
rms_global, peak, peak2peak, crista, fft_ys = [], [], [], [], []

for s_data in sensor_data:
    # Scalar features.
    rms = dataP.get_RMS(s_data, freq, n, n)
    rms_global.append(rms.flatten())

    sensor_peak = dataP.get_peak(s_data)
    peak.append(sensor_peak)
    # The crest-factor helper receives the RMS as returned (not flattened).
    crista.append(dataP.get_crista(sensor_peak, rms))
    peak2peak.append(dataP.get_peak2peak(s_data))

    # Spectrum, scaled by this sensor's largest value so amplitudes are <= 1.
    _, yfs = dataP.apply_fft(s_data, freq, n)
    max_v = np.max(yfs)
    fft_ys.append(yfs / max_v)


## Normalização

In [8]:
# Raw scalar features, keyed by metric name; every value is a list with one
# array per sensor.
metrics_map = {
    "RMS": rms_global,
    "Peak": peak,
    "Peak2Peak": peak2peak,
    "Crista": crista
}
# Min-max normalised counterpart of metrics_map (same keys, same sensor order).
metrics_norm_map = {
    "RMS": [],
    "Peak": [],
    "Peak2Peak": [],
    "Crista": []
}

# Normalisation limits, indexed as ft_limits[metric]['s<sensor>'] -> (min, max).
# Read through a context manager so the file handle is closed right away.
with open('./norm_feat_params.json') as limits_file:
    ft_limits = json.load(limits_file)

for m, sensor_values in metrics_map.items():
    for s, values in enumerate(sensor_values):
        min_v, max_v = ft_limits[m][f's{s}']
        # Min-max scaling with the stored limits of this metric and sensor.
        metrics_norm_map[m].append((values - min_v) / (max_v - min_v))


# Separação das samples

In [9]:
# Column name -> dtype for the exported tables. Insertion order defines the
# column order: label first, then the scalar features of every sensor, then the
# FFT amplitudes of every sensor.
schema = {'Classe': 'string'}
for s in range(n_sensors):
    schema[f's{s}_RMS'] = 'float64'
    schema[f's{s}_Peak'] = 'float64'
    schema[f's{s}_Peak2Peak'] = 'float64'
    schema[f's{s}_Crista'] = 'float64'
for s in range(n_sensors):
    # Size the amplitude columns from this sensor's own spectrum instead of the
    # `yfs` loop variable left over from the feature-extraction cell (which only
    # describes the last sensor). This matches how the export loop sizes them.
    for idx in range(len(fft_ys[s][0])):
        schema[f's{s}_amp_{idx}'] = 'float64'

In [10]:
# Build one table per split and optionally write it to disk.
for split_key in ('val', 'test', 'train'):
    # The same [start, stop) window is cut out of every class's shuffled index
    # array, so each class contributes the same number of rows to the split.
    start, stop = dataset_split_idxs[split_key]
    split_data_idxs = [
        sample_idx
        for shuffled in class_idxs.values()
        for sample_idx in shuffled[start:stop]
    ]

    # Empty, typed frame with one row per selected sample.
    row_labels = list(range(len(split_data_idxs)))
    df_split = pd.DataFrame(columns=schema.keys(), index=row_labels).astype(schema)
    df_split.loc[:, ['Classe']] = classes_data[split_data_idxs]

    for s in range(n_sensors):
        # Normalised scalar features of sensor s.
        for m, m_data in metrics_norm_map.items():
            df_split.loc[:, [f's{s}_{m}']] = m_data[s][split_data_idxs]

        # Scaled FFT amplitudes of sensor s, one column per spectrum position.
        fft_vals = fft_ys[s][split_data_idxs]
        for idx in range(len(fft_vals[0])):
            df_split.loc[:, [f's{s}_amp_{idx}']] = fft_vals[:, idx]

    if save_dataset:
        df_split.to_csv(f"./dataset/processed/{split_key}_data.csv", sep=',', index=False, encoding='utf-8')