
# Generación del dataset


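Descargar el dataset **NASA Bearings** desde [Kaggle](https://www.kaggle.com/datasets/vinayak123tyagi/bearing-dataset) y descomprimirlo en la carpeta `./archive/`, que es donde el código busca los archivos de cada experimento.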




# Importar librerías

Importar aquellas librerías que serán utilizadas en el trabajo.

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import skew, kurtosis
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
import math
from cmath import sqrt
from fileinput import filename
import os
from os.path import isfile, join
from matplotlib.pyplot import axis
from numpy import divide


### Definición de funciones generadoras de features

In [2]:
# Feature configuration and names of the output dataset columns

# Switches for the features computed independently on every bearing.
# The insertion order of this dict defines the column order within a bearing.
single_features = {
    'aMean': True,
    'std': True,
    'irq': True,
    'skew': True,
    'kurtosis': True,
    'f0': True,
    'Pf0': True,
    'Xrms': True,
    'zeroX': True,
    'p2p': True,
    'crest': True,
    'clearance': True,
    'shape': True,
    'impulse': True,
}
# Switches for the features computed between pairs of bearings.
multi_features = {'covar': True}
n_bearings = 4  # Number of bearings

# Single-channel columns: "<feature>_<bearing>", grouped by bearing.
df_columns = [
    name + '_' + str(bearing)
    for bearing in range(n_bearings)
    for name, enabled in single_features.items()
    if enabled
]
# Multi-channel columns: "<feature>_<i><j>" for every bearing pair with i < j.
df_columns += [
    name + '_' + str(first) + str(second)
    for first in range(n_bearings)
    for second in range(first + 1, n_bearings)
    for name, enabled in multi_features.items()
    if enabled
]

print('Columns of the dataset: ', df_columns)

def GetIQR(df):
    """Return the interquartile range (Q3 - Q1) of every column of ``df``.

    Args:
        df: DataFrame with one numeric column per channel.

    Returns:
        NumPy array of shape ``(1, n_columns)`` holding the IQR of each column,
        in column order.
    """
    q1 = df.quantile(0.25)
    q3 = df.quantile(0.75)
    iqr = q3 - q1
    # -1 lets NumPy infer the width, so the function is no longer tied to
    # exactly four channels.
    return np.array(iqr).reshape(1, -1)

def GetFundFreq(df, Fs, n_harmonics=1, ignore_dc=False):
    """Return the strongest spectral components of every channel.

    The magnitude spectrum of each column is computed with a real FFT along
    the sample axis and its bins are ranked by magnitude, largest first.

    Args:
        df: Array-like with one row per sample and one column per channel.
            A 1-D input is treated as a single channel.
        Fs: Sampling frequency; returned frequencies are in the same unit.
        n_harmonics: Number of components to return per channel.
        ignore_dc: If True, the 0-frequency (DC) bin is excluded from the
            ranking altogether.

    Returns:
        A list with one sublist per channel. Each sublist holds up to
        ``n_harmonics`` tuples ``(magnitude, frequency)`` sorted by decreasing
        magnitude.
    """
    signal = np.asarray(df)
    if signal.ndim == 1:
        # Treat a single series as one channel instead of failing below.
        signal = signal[:, np.newaxis]

    fft_abs = np.abs(np.fft.rfft(signal, axis=0))
    freq = np.fft.rfftfreq(signal.shape[0], d=1/Fs)

    if ignore_dc:
        # rfftfreq always puts the 0-frequency bin first. Dropping it before
        # ranking keeps DC out of the result even when it is not the single
        # strongest bin (it used to leak in for n_harmonics > 1).
        fft_abs = fft_abs[1:]
        freq = freq[1:]

    harmonics = []
    for column in fft_abs.T:
        # Rank (magnitude, frequency) pairs by decreasing magnitude.
        ranked = sorted(zip(column, freq), reverse=True)
        harmonics.append(ranked[:n_harmonics])
    return harmonics

def GetZeroCrossings(df_in):
    """Count the sign changes between consecutive samples of every column.

    Args:
        df_in: DataFrame with one numeric column per channel.

    Returns:
        List with the number of sign changes of each column, in column order.
    """
    def _count_sign_flips(series):
        # A crossing is any position where the sign bit differs from the
        # sign bit of the previous sample.
        is_negative = np.signbit(series)
        flipped = np.diff(is_negative)
        return np.nonzero(flipped)[0].size

    return [_count_sign_flips(df_in[name]) for name in df_in.columns]

def GetRMS(df_in):
    """Compute the root mean square of every column.

    Args:
        df_in: DataFrame with one numeric column per channel and one row per
            sample.

    Returns:
        List with the RMS value of each column, in column order.
    """
    n_samples = df_in.shape[0]
    return [
        np.abs(np.sqrt(np.square(df_in[name]).sum() / n_samples))
        for name in df_in.columns
    ]


def GetCovar(df_in):
    """Return the covariance of every pair of distinct columns.

    Args:
        df_in: DataFrame (or 2-D array-like) with one column per channel and
            one row per sample.

    Returns:
        1-D NumPy array with the strict upper triangle of the covariance
        matrix in row-major order, i.e. for four channels:
        cov_01, cov_02, cov_03, cov_12, cov_13, cov_23.
        The array is empty when there is a single channel.
    """
    # np.cov collapses to a 0-d array for a single variable; atleast_2d keeps
    # the indexing below valid in that case.
    covm = np.atleast_2d(np.cov(df_in, rowvar=False))
    # Size the triangle from the matrix itself rather than assuming 4 channels.
    return covm[np.triu_indices(covm.shape[0], k=1)]

def AddNewRow(df_out, df_in, Fs=20000):
    """Compute one feature row from a raw recording and append it to ``df_out``.

    The module-level ``single_features`` and ``multi_features`` switches decide
    which features are computed. Values are laid out to match the module-level
    ``df_columns``: every enabled feature of channel 0, then of channel 1, and
    so on, followed by the pairwise features.

    Args:
        df_out: DataFrame holding the rows computed so far (may be empty or
            None).
        df_in: Raw recording with one column per channel and one row per
            sample.
        Fs: Sampling frequency handed to ``GetFundFreq``. Defaults to 20000,
            the value that used to be hard-coded here.

    Returns:
        A DataFrame with the rows of ``df_out`` followed by the new row.

    Raises:
        ValueError: If the number of computed values differs from
            ``len(df_columns)`` (raised by the final reshape), or if a dominant
            spectral magnitude is exactly 0 (raised by ``math.log10``).
    """
    abs_mean = np.array(df_in.abs().mean(axis=0)).flatten()
    X_rms = GetRMS(df_in)
    # NOTE(review): crest, clearance and impulse use the signed maximum, not
    # max(|x|) as the usual definitions do - confirm this is intended.
    peak = np.max(df_in, axis=0)

    # One entry per enabled feature, each holding one value per channel.
    # The order must follow the key order of ``single_features``.
    rows = []
    if single_features['aMean']:
        rows.append(abs_mean)
    if single_features['std']:
        rows.append(np.array(df_in.std(axis=0)).flatten())
    if single_features['irq']:
        rows.append(GetIQR(df_in))
    if single_features['skew']:
        rows.append(skew(df_in, axis=0))
    if single_features['kurtosis']:
        rows.append(kurtosis(df_in, axis=0))
    if single_features['f0'] or single_features['Pf0']:
        # Both features come from the same dominant component, so compute it
        # once. Pf0 no longer disappears silently when f0 is switched off.
        freqs = GetFundFreq(df_in, Fs, n_harmonics=1, ignore_dc=True)
        if single_features['f0']:
            rows.append([h[0][1] for h in freqs])
        if single_features['Pf0']:
            # 20*log10(|X|) equals 10*log10(|X|^2), so this already is the
            # power of the component expressed in dB.
            rows.append([20*math.log10(h[0][0]) for h in freqs])
    if single_features['Xrms']:
        rows.append(X_rms)
    if single_features['zeroX']:
        rows.append(GetZeroCrossings(df_in))
    if single_features['p2p']:
        # max - min is correct for any signal; |max| + |min| overestimates
        # when the whole recording stays on one side of zero.
        rows.append(np.array(peak - np.min(df_in, axis=0)))
    if single_features['crest']:
        rows.append(np.array(np.divide(peak, X_rms)))
    if single_features['clearance']:
        mean_sqrt_abs = np.sum(np.sqrt(np.abs(df_in)), axis=0)/df_in.shape[0]
        rows.append(np.array(np.divide(peak, np.square(mean_sqrt_abs))))
    if single_features['shape']:
        rows.append(np.array(np.divide(X_rms, abs_mean)))
    if single_features['impulse']:
        rows.append(np.array(np.divide(peak, abs_mean)))

    if rows:
        f_vals = np.vstack(rows).astype(float)
    else:
        f_vals = np.empty((0, df_in.shape[1]))
    # Column-major flattening groups the values by channel, matching the
    # "<feature>_<channel>" ordering of df_columns.
    f_vals = f_vals.flatten(order='F')

    # Now append the cross-channel features
    if multi_features['covar']:
        f_vals = np.append(f_vals, GetCovar(df_in))

    new_row = pd.DataFrame(f_vals.reshape(1, len(df_columns)), columns=df_columns)
    if df_out is None or len(df_out) == 0:
        # Concatenating onto an empty frame turns every column into object
        # dtype (and is deprecated in recent pandas); start from the new row.
        return new_row
    return pd.concat([df_out, new_row], axis=0)




Columns of the dataset:  ['aMean_0', 'std_0', 'irq_0', 'skew_0', 'kurtosis_0', 'f0_0', 'Pf0_0', 'Xrms_0', 'zeroX_0', 'p2p_0', 'crest_0', 'clearance_0', 'shape_0', 'impulse_0', 'aMean_1', 'std_1', 'irq_1', 'skew_1', 'kurtosis_1', 'f0_1', 'Pf0_1', 'Xrms_1', 'zeroX_1', 'p2p_1', 'crest_1', 'clearance_1', 'shape_1', 'impulse_1', 'aMean_2', 'std_2', 'irq_2', 'skew_2', 'kurtosis_2', 'f0_2', 'Pf0_2', 'Xrms_2', 'zeroX_2', 'p2p_2', 'crest_2', 'clearance_2', 'shape_2', 'impulse_2', 'aMean_3', 'std_3', 'irq_3', 'skew_3', 'kurtosis_3', 'f0_3', 'Pf0_3', 'Xrms_3', 'zeroX_3', 'p2p_3', 'crest_3', 'clearance_3', 'shape_3', 'impulse_3', 'covar_01', 'covar_02', 'covar_03', 'covar_12', 'covar_13', 'covar_23']


### Cargar los archivos del experimento de a uno y computar los features

In [4]:
# Folders holding the raw recordings of each experiment
PATH_EXPERIMENT_2 = "./archive/2nd_test/2nd_test/"
PATH_EXPERIMENT_3 = "./archive/3rd_test/4th_test/txt/"

PATH_EXPERIMENT = PATH_EXPERIMENT_2

files_to_process = 20 # if -1, process all files in path folder

# os.listdir returns entries in arbitrary order, but the target cell labels the
# LAST row as the failure, so the processing order must be deterministic.
# NOTE(review): assumes the file names sort chronologically - confirm.
file_names = sorted(
    name for name in os.listdir(PATH_EXPERIMENT)
    if os.path.isfile(os.path.join(PATH_EXPERIMENT, name))
)
if files_to_process == -1:
    files_to_process = len(file_names)
# Never ask for more files than the folder holds (nor a negative amount).
files_to_process = max(0, min(files_to_process, len(file_names)))

df_in = None
df_out = pd.DataFrame(columns=df_columns)

# Load the recordings one at a time and turn each into one feature row.
for file_name in file_names[:files_to_process]:
    # os.path.join keeps the path valid on every OS (a literal '\\' only
    # works on Windows).
    pathText = os.path.join(PATH_EXPERIMENT, file_name)
    df_in = pd.read_csv(pathText, delimiter='\t', header=None)
    df_out = AddNewRow(df_out, df_in)

# Every appended row carries index 0; renumber them 0..n-1.
df_out = df_out.reset_index(drop=True)
print(df_out.head())
print("shape:", df_out.shape)



# Shape of our dataset:
# #sample | aMean_0 | std_0 | .... | aMean_1 | std_1 | ..... | aMean_3 | std_3 | ... ||| failure_0 | failure_1 | ... | failure_3
#                                                                                      None      | outer_race| ... | None

    aMean_0     std_0  irq_0    skew_0 kurtosis_0        f0_0      Pf0_0  \
0  0.058332  0.073477  0.096  0.083993   0.628763  985.351562  52.733122   
1  0.058997   0.07534  0.097  0.052142   0.648291  985.351562  51.201049   
2  0.060239  0.076191    0.1  0.032808   0.513475  985.351562  50.779361   
3  0.061453  0.078693    0.1  0.041486   1.157953  985.351562  51.079815   
4  0.061361  0.078439  0.103  0.028224   0.603177  985.351562  50.279226   

     Xrms_0 zeroX_0  p2p_0  ...   crest_3 clearance_3   shape_3  impulse_3  \
0  0.074179  6593.0   0.84  ...  3.567239    5.323107  1.256323   4.481604   
1  0.075382  6867.0  0.757  ...  4.438435     6.64634  1.259563   5.590488   
2   0.07623  6637.0  0.903  ...  6.305078    9.483004  1.263337   7.965439   
3  0.078724  6659.0  1.184  ...  9.030509   13.503735  1.260116  11.379491   
4  0.078474  6765.0  0.782  ...   4.29268    6.429704   1.25982   5.408004   

   covar_01  covar_02  covar_03  covar_12  covar_13  covar_23  
0  0.00139

### Agrego columna target
El usuario debe indicar cuál rodamiento (1 a 4) falló al final del experimento; solo la última muestra del dataset se marca como falla de ese rodamiento.

In [15]:
# Add target columns
from numpy import zeros

bearing_that_failed = 1 # Bearing: 1 - 4

# An out-of-range value must not be accepted: 0 would silently label the last
# bearing through negative indexing.
if not 1 <= bearing_that_failed <= n_bearings:
    raise ValueError(
        'bearing_that_failed must be between 1 and ' + str(n_bearings)
        + ', got ' + str(bearing_that_failed)
    )
if df_out.shape[0] == 0:
    raise ValueError('df_out is empty: there is no sample to label')

# One indicator column per bearing, all zeros except for the last sample of
# the bearing that failed at the end of the experiment.
results = zeros((df_out.shape[0], n_bearings))
results[-1, bearing_that_failed - 1] = 1  # Bearing that failed at the end

target_columns = ['y' + str(i) for i in range(n_bearings)]
# Sharing df_out's index keeps the column-wise concat aligned row by row.
pd_results = pd.DataFrame(data=results, columns=target_columns, index=df_out.index)
df_out = pd.concat([df_out, pd_results], axis=1)
print(df_out.tail())

      aMean_0     std_0  irq_0    skew_0 kurtosis_0         f0_0      Pf0_0  \
0    0.058332  0.073477  0.096  0.083993   0.628763   985.351562  52.733122   
1    0.058997   0.07534  0.097  0.052142   0.648291   985.351562  51.201049   
2    0.060239  0.076191    0.1  0.032808   0.513475   985.351562  50.779361   
3    0.061453  0.078693    0.1  0.041486   1.157953   985.351562  51.079815   
4    0.061361  0.078439  0.103  0.028224   0.603177   985.351562  50.279226   
..        ...       ...    ...       ...        ...          ...        ...   
979  0.453318  0.725014   0.63 -0.510556  12.577705    4398.4375  65.182241   
980  0.337575  0.462001  0.528 -0.325368   3.759972  4397.460938  62.634087   
981  0.351094  0.483844  0.542 -0.377095   4.891755  4846.679688   60.05091   
982  0.001857  0.000987    0.0  0.579698   3.637513     58.59375  15.223287   
983  0.001168     0.001  0.002  0.317032  -1.609774     58.59375  18.504699   

       Xrms_0 zeroX_0  p2p_0  ...  covar_01  covar_

### Exportar a CSV

In [17]:
# Write the finished dataset (features plus targets) to disk, without the
# row index.
PATH_DATASET = "./dataset02.csv"
df_out.to_csv(path_or_buf=PATH_DATASET, index=False)