### In this file the data is preprocessed in order to be readable for the CNN

# Imports

In [15]:
#Base libraries
import os
import csv

#Important mathematical libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

#Wavelet transformation libraries
from ssqueezepy import cwt, icwt

#Machine learning libraries
from sklearn.model_selection import train_test_split

#Other
# tqdm makes loading bar
from tqdm import tqdm


# Functions and Classes

In [16]:
# Finding anomalies in one column
def find_column_anomaly(df, column):
    """Flag the values of one column that lie more than 20% away from the column mean.

    Args:
        df: DataFrame holding the data.
        column: Label of the column to inspect.

    Returns:
        Boolean NumPy array with one entry per row; True marks a value that is
        above the upper bound or below the lower bound.
    """
    values = df[column]
    mean = values.mean()
    # Order the two bounds explicitly: for a negative mean, mean*1.2 is the
    # SMALLER number, and comparing against the bounds unordered would flag
    # every single row as an anomaly.
    lower_bound = min(mean * 0.8, mean * 1.2)
    upper_bound = max(mean * 0.8, mean * 1.2)
    column_anomaly = (values > upper_bound) | (values < lower_bound)
    return column_anomaly.values

# Finding anomalies in multiple columns
def find_anomalies_std(df, columns):
    """Add a boolean "Anomaly" column that combines the per-column anomaly flags.

    Every column in `columns` is checked with find_column_anomaly. A row is
    marked as an anomaly when more than 70% of the checked columns flag it.

    Args:
        df: DataFrame to analyse; it is modified in place.
        columns: Labels of the columns to check.

    Returns:
        The same DataFrame object, now carrying the "Anomaly" column.
    """
    # One boolean array per inspected column
    per_column_flags = [find_column_anomaly(df, name) for name in columns]

    # Row by row: share of the inspected columns that raised a flag
    flagged_share = [
        sum(row_flags) / len(columns) for row_flags in zip(*per_column_flags)
    ]

    df["Anomaly"] = flagged_share
    df["Anomaly"] = df["Anomaly"] > 0.7
    return df

# Creating frames
def get_sets(df, columns, step, row_length):
    """Cut the data into fixed-length frames and label every frame.

    Frame i covers the rows step*i up to (not including) step*i + row_length.
    Only complete frames are produced.

    Args:
        df: DataFrame containing `columns` and an 'Anomaly' column.
        columns: Labels of the columns copied into the frames.
        step: Number of rows between the starts of two consecutive frames.
        row_length: Number of rows in every frame.

    Returns:
        Tuple (X, Y). X is a float array of shape
        (frames, row_length, len(columns)) holding the frame values. Y is an
        integer array of shape (frames, 1) that is 1 when the 'Anomaly' values
        inside the frame sum to more than zero and 0 otherwise.
    """
    values = df[columns].values
    anomalies = df['Anomaly'].values

    # Count the complete frames with integer arithmetic (no float rounding) and
    # clamp at zero, so data shorter than one frame yields empty arrays instead
    # of a negative array dimension.
    rows_num = max(0, int((len(df) - row_length) // step) + 1)

    X = np.zeros((rows_num, row_length, len(columns)))
    Y = np.zeros((rows_num, 1))

    for i in range(rows_num):
        first_element = step * i
        last_element = first_element + row_length
        X[i] = values[first_element:last_element]
        Y[i] = anomalies[first_element:last_element].sum()

    # Collapse the per-frame anomaly counts into a 0/1 label
    Y = np.where(Y > 0, 1, 0)
    return X, Y

# Creating wavelets
def wavelet_transformation(X, columns):
    """Apply the 'morlet' continuous wavelet transform to every column of every frame.

    The real and the imaginary part of each transform are stored as two
    separate channels, in the order real, imaginary, column by column.

    Args:
        X: Array of frames indexed as X[frame][row, column].
        columns: Column labels; only their count is used, to size the output.

    Returns:
        Float array of shape (len(X), H, W, 2 * len(columns)), where (H, W) is
        the shape of the transform of the first column of the first frame.
    """
    # Transform a single signal up front, only to learn the output shape
    probe_shape = cwt(X[0][:, 0], 'morlet')[0].shape
    data = np.zeros((len(X), probe_shape[0], probe_shape[1], len(columns) * 2))

    for frame_idx, frame in enumerate(tqdm(X, desc="Data creation progress...")):
        planes = []
        for col_idx in range(len(frame[0])):
            Wx, _ = cwt(frame[:, col_idx], 'morlet')
            planes.append(Wx.real)
            planes.append(Wx.imag)
        # Stack the 2-D planes along a new trailing channel axis
        data[frame_idx] = np.stack(planes, axis=2)
    return data


# Create the directory if it does not exist
def create_directory(dataset_name):
    """Create the folder datasets/<dataset_name>/ relative to the working directory.

    Missing parent folders are created as well. A folder that already exists
    is left untouched.

    Args:
        dataset_name: Name of the sub-folder inside 'datasets/'.

    Raises:
        FileExistsError: If the path already exists but is not a directory.
    """
    dir_path = f'datasets/{dataset_name}/'
    # exist_ok=True replaces the separate existence check, so there is no gap
    # between "check" and "create" in which the folder could appear.
    os.makedirs(dir_path, exist_ok=True)

# Reading CSV

- Reads parameters directly into a data frame

In [17]:
# Thermal values: load the processed thermal CSV into the DataFrame `df`
df = pd.read_csv('../data/thm/processed_thermal_data.csv')

# OR

# EPS values: uncomment the line below (and comment out the one above)
# to load the EPS CSV into `df` instead
#df = pd.read_csv('../data/eps/processed_test_power_measurements_2.csv')

# Parameter setup


### Reading existing columns

In [18]:
# Print the column names of df, numbered from 1.
# The first column (position 0) is left out of the listing.
columns_list = list(df.columns)
for i, column in enumerate(columns_list[1:]):
    print(f"{i+1}) {column}")

1) created_on
2) MB_temp
3) BMB_temp
4) B1_temp
5) B2_temp
6) B1_temp_adj


### Setting variables

- __frame_size__ (variable `row_length`) is the number of consecutive rows in each frame that the data is split into
- __step_size__ (variable `step`) is the number of rows between the starts of two consecutive frames
- __test_size__ is the fraction of all frames that is reserved for the test set; the remaining frames are used for training
- __columns__ is the list of names of the columns that should be processed

In [19]:
# Number of rows between the starts of two consecutive frames (passed to get_sets)
step = 20
# Number of rows in every frame (passed to get_sets)
row_length = 40
# Passed to train_test_split as test_size: the share of the frames put into the test split
test_size = 0.6
# Columns of df that are cut into frames and wavelet-transformed
columns = ["B1_temp"]

# Data preprocessing

In [21]:
# Limiting the size of the data set: only the last 100000 rows are kept
df_new = df.iloc[-100000:,:]

# Splitting the data into frames of `row_length` rows, one frame every `step` rows.
# get_sets reads df_new['Anomaly'], so that column has to exist at this point.
# NOTE(review): none of the cells shown here calls find_anomalies_std — confirm
# where the 'Anomaly' column is created.
X, Y =  get_sets(df_new, columns, step, row_length)

# Creating a wavelet transform of every frame
data = wavelet_transformation(X, columns)

# Creating training and testing data sets; shuffle=False keeps the frames in
# their original order instead of shuffling them before the split
x_train, x_test, y_train, y_test = train_test_split(data, Y, test_size=test_size, shuffle=False) 

Data creation progress...: 100%|██████████| 4999/4999 [00:41<00:00, 119.37it/s]


# Saving the data

In [22]:
# Naming the directory after the settings used:
# "<columns joined by &>-<row_length>L<step>S<test_size*100 truncated to an int>T"
dataset_name = f'{"&".join(columns)}-{row_length}L{step}S{int(test_size*100)}T'

# Creating the directory if it does not exist yet
create_directory(dataset_name)

# Saving as .npy files:
#   X.npy              the frames before the wavelet transform
#   original_data.npy  the selected columns of df_new together with the "Anomaly" column
#   x_*/y_*.npy        the wavelet-transformed train/test splits and their labels
np.save(f'datasets/{dataset_name}/X.npy', X)
np.save(f'datasets/{dataset_name}/original_data.npy', df_new[columns+["Anomaly"]].values)
np.save(f'datasets/{dataset_name}/x_train.npy', x_train)
np.save(f'datasets/{dataset_name}/y_train.npy', y_train)
np.save(f'datasets/{dataset_name}/x_test.npy', x_test)
np.save(f'datasets/{dataset_name}/y_test.npy', y_test)

# Log
print("Save done")

Save done
