# Functions to Read & Sample Data

In [1]:
# Load packages
import os
import pandas as pd
import numpy as np
import patoolib
import pathlib
from tqdm import tqdm
from scipy import signal

### Decompress data sets

In [2]:
# Set filepaths for the 3 bearing test data sets.
# os.path.join picks the separator of the host OS instead of hard-coding the
# Windows backslash; on Windows the resulting strings are unchanged.
ims_root = os.path.join(os.getcwd(), 'Data', 'IMS')
dir_1t = os.path.join(ims_root, '1st_test')
dir_2t = os.path.join(ims_root, '2nd_test')
dir_3t = os.path.join(ims_root, '3rd_test')

In [3]:
# Report which data sets still need extracting: print the name of every test
# whose folder is absent from Data/IMS while its '.rar' archive is present.
# The folder is listed once and the loop variable no longer shadows the
# builtin `dir`.
ims_entries = os.listdir(os.path.join(os.getcwd(), 'Data', 'IMS'))
for test_name in ['1st_test', '2nd_test', '3rd_test']:
    if test_name not in ims_entries and (test_name + '.rar') in ims_entries:
        print(test_name)

In [4]:
# Un-RAR every data set whose target folder does not exist yet
for target_dir in (dir_1t, dir_2t, dir_3t):
    already_extracted = os.path.exists(target_dir)
    if not already_extracted:
        # The archive path is the target folder path with '.rar' appended
        patoolib.extract_archive(target_dir + '.rar', outdir=target_dir)

### Read data

In [5]:
# Per-experiment settings for Test 1, Test 2 and Test 3.
# The four lists are parallel: the same index refers to the same data set.
dset = [1, 2, 3]

# Start time of each experiment
start_ts = [pd.to_datetime("2003-10-22 12:06:24"),
            pd.to_datetime("2004-02-12 10:32:39"),
            pd.to_datetime("2004-03-04 09:27:46")]

# Folder holding the raw files of each experiment. The third data set is read
# from a '4th_test/txt' sub-folder of its extraction directory.
# NOTE: `dir` shadows the Python builtin; the name is kept because later cells
# read it.
dir = [dir_1t,
       dir_2t,
       os.path.join(dir_3t, '4th_test', 'txt')]

# Column names applied to each loaded frame: the data set's signal columns
# followed by the three columns added while reading.
colnames = [['b1x', 'b1y', 'b2x', 'b2y', 'b3x', 'b3y', 'b4x', 'b4y', 'time', 'measurement_id','counter'],
            ['b1x', 'b2x', 'b3x', 'b4x', 'time', 'measurement_id','counter'],
            ['b1x', 'b2x', 'b3x', 'b4x', 'time', 'measurement_id','counter']]

In [6]:
# One row per data set, assembled from the parallel lists defined above
metadata_columns = {
    'dset': dset,
    'start_ts': start_ts,
    'dir': dir,
    'colnames': colnames,
}
metadata = pd.DataFrame(metadata_columns)
metadata

Unnamed: 0,dset,start_ts,dir,colnames
0,1,2003-10-22 12:06:24,C:\Users\nadav.rindler\OneDrive - American Red...,"[b1x, b1y, b2x, b2y, b3x, b3y, b4x, b4y, time,..."
1,2,2004-02-12 10:32:39,C:\Users\nadav.rindler\OneDrive - American Red...,"[b1x, b2x, b3x, b4x, time, measurement_id, cou..."
2,3,2004-03-04 09:27:46,C:\Users\nadav.rindler\OneDrive - American Red...,"[b1x, b2x, b3x, b4x, time, measurement_id, cou..."


In [7]:
# Write the metadata table to metadata.csv (header row kept, index column omitted)
metadata.to_csv('metadata.csv', index=False)

In [8]:
# Read data
    # Source: https://www.kaggle.com/code/andersgb/nasa-bearing-dataset-outlier-detection

def read_dataset(data_dir, first_ts, colnames, rate=20480):
    """Load every snapshot file in ``data_dir`` into one long DataFrame.

    Each file must be named as a timestamp in ``%Y.%m.%d.%H.%M.%S`` format and
    hold tab-separated numeric columns without a header row. Files are read in
    sorted name order; directory entries that are not regular files are skipped.

    Parameters
    ----------
    data_dir : str or path-like
        Directory containing the snapshot files.
    first_ts : pandas.Timestamp
        Reference time; the ``time`` column is expressed in seconds since it.
    colnames : list of str
        Names assigned to the output columns: the file's own columns first,
        then the three added columns (time, measurement id, counter).
    rate : int, default 20480
        Number of rows every file must contain. It is also used as the
        sampling rate, so consecutive rows are ``1 / rate`` seconds apart.

    Returns
    -------
    pandas.DataFrame
        All files concatenated with a fresh integer index. Signal columns are
        float32; the added columns are the time offset in seconds, the
        zero-based position of the file in sorted order (uint32), and the
        zero-based row number within the file (int64).

    Raises
    ------
    ValueError
        If ``data_dir`` contains no files.
    RuntimeError
        If a file does not contain exactly ``rate`` rows.
    """
    files = sorted(p for p in pathlib.Path(data_dir).iterdir() if p.is_file())
    if not files:
        raise ValueError(f"No data files found in {data_dir}")

    # Identical for every file, so computed once instead of once per iteration.
    step_s = 1 / rate  # 20 kHz sampling -- but these are 1 second snapshots! So appears to be 20,480 Hz
    sample_offsets = np.arange(rate) * step_s
    sample_index = np.arange(rate, dtype=np.int64)

    all_dfs = []
    for file_counter, f in enumerate(tqdm(files)):
        df = pd.read_csv(f, sep="\t", header=None, dtype=np.float32)
        # Validate first: the pre-computed arrays above assume exactly `rate` rows.
        if len(df) != rate:
            raise RuntimeError(f"Unexpected file length {len(df)} in {f}")
        ts = pd.to_datetime(f.name, format="%Y.%m.%d.%H.%M.%S")
        measurement_delta = (ts - first_ts).total_seconds()
        df["time"] = measurement_delta + sample_offsets
        df["measurement_id"] = np.full(rate, file_counter, dtype=np.uint32)
        # Row number within the file; avoids a groupby over the full concatenated frame.
        df["counter"] = sample_index
        all_dfs.append(df)

    df_out = pd.concat(all_dfs, ignore_index=True)
    df_out.columns = colnames
    return df_out

In [9]:
# Load the first data set (index 0 of the per-dataset lists); every file must hold 20,480 rows
test_1 = read_dataset(
    data_dir=dir[0],
    first_ts=start_ts[0],
    colnames=colnames[0],
    rate=20480,
)

100%|██████████████████████████████████████████████████████████████████████████████| 2156/2156 [01:26<00:00, 25.07it/s]


In [10]:
# (rows, columns) of the concatenated first-test frame
test_1.shape

(44154880, 11)

In [11]:
# Total rows divided by the 20,480 rows per file = number of snapshot files loaded
len(test_1) / 20480

2156.0

### Down-sample data with filter

In [12]:
# Downsample the signal after applying an anti-aliasing filter.
    # Source: https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.decimate.html
        # by default, order 8 Chebyshev type I filter is used if ftype is 'iir'. HOWEVER this results in all 'nan' values
        # a FIR filter with Hamming window is used if ftype is 'fir' (its default length depends on the SciPy version).
    # Source: https://en.wikipedia.org/wiki/Anti-aliasing_filter

def down_sample(df, col, q, ftype='fir'):
    """Return column ``col`` of ``df`` decimated by the integer factor ``q``.

    The column is low-pass filtered and then downsampled by
    ``scipy.signal.decimate``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the signal.
    col : str
        Name of the column to downsample.
    q : int
        Downsampling factor.
    ftype : {'fir', 'iir'}, default 'fir'
        Anti-aliasing filter type passed to ``scipy.signal.decimate``. 'fir' is
        the default because the IIR filter was observed to produce all-NaN
        output on this data.

    Returns
    -------
    numpy.ndarray
        The filtered, downsampled samples.

    Notes
    -----
    The whole column is filtered as one continuous signal.
    NOTE(review): if ``df`` concatenates separate recordings, the filter also
    runs across their boundaries -- confirm this is acceptable.
    """
    samples = np.asarray(df[col])
    return signal.decimate(samples, q, ftype=ftype)

In [13]:
# Downsample the 'b1x' column of the first data set by a factor of 10
test_1_ds = down_sample(df=test_1, col='b1x', q=10)

In [14]:
# Length of the downsampled signal
test_1_ds.shape

(4415488,)

In [15]:
# Peek at the first ten downsampled values
test_1_ds[:10]

array([-0.09390984, -0.10298537, -0.12269048, -0.05235113, -0.09705996,
       -0.09167194, -0.12314022, -0.08880151, -0.12440187, -0.11103717],
      dtype=float32)