In [None]:
import pandas as pd
from collections import OrderedDict
import torch
import numpy as np
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from torch import nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from pymongo import MongoClient

In [114]:
fdi_db = MongoClient('localhost', 27017).fdi
detector_metadata = fdi_db.detector_metadata

In [115]:
meta = list(detector_metadata.find({}))
print(meta)

[{'_id': ObjectId('6240f18b08d0c9c605fbeba5'), 'threshold': 0.1923428960391967}]


In [116]:
'''
DATA REPRESENTATION

1 => SINGLE READ | 2 => ADD FEATURES | 3 => WINDOW TO FEATURES
'''
DATA_REPRESENTATION = 2

'''
DOWNSAMPLE FACTOR

1 => 10hz *original rate* | 2 => 5Hz | 5 => 2Hz | 10 => 1hz
'''
DOWNSAMPLE_FACTOR = 5

'''
WINDOWS LENGHT

* needs divisor by datapoints target
* considering downsample factor = 5

1 => WINDOW DISABLED | 2 => 1 second | 4 => 2 seconds | 10 => 5 seconds | 20 => 10 seconds | 200 => 100 seconds *full flight*
'''
WINDOW_LENGHT =  1

'''
LOSS FACTOR [0,1]

Ignores outliers in calculating the stats of losses in regenerated data.
'''
LOSS_FACTOR = .96

'''
TRAIN_SIZE [0,1]

Percentage of samples to be trained
'''
TRAIN_SIZE = 0.8

'''
PATH_DATASET

'''
PATH_DATASET = '../../dataset/original/'

In [117]:
dict_ds_original = {
    'data_ds3_normal_t1_original' : pd.read_csv(PATH_DATASET+'F16_DS3_normal_t1.csv', header=None),
}

In [118]:
dict_ds = dict_ds_original.copy()

if dict_ds['data_ds3_normal_t1_original'].shape[0] % DOWNSAMPLE_FACTOR != 0:
    raise Exception('Needs to be ?shape? divisor')

for n, dataset_name in enumerate(dict_ds):
    dataset = dict_ds[dataset_name].to_numpy()

    downsampled = dataset[::DOWNSAMPLE_FACTOR]

    x, y = downsampled.shape

    # resample
    dict_ds[dataset_name] = pd.DataFrame(downsampled.reshape((int(x/WINDOW_LENGHT),y*WINDOW_LENGHT)))

In [119]:
# ADD COLUMNS WITH DIFF PREVIOUS VALUES

if (DATA_REPRESENTATION == 2):
    frame_size = int(1000/DOWNSAMPLE_FACTOR)

    for n, dataset_name in enumerate(dict_ds):
        dataset = dict_ds[dataset_name].to_numpy()

        dimension = dataset.shape[1]
        samples = dataset.shape[0]

        # GENERATE NEW DIMENSIONS
        dataset = np.concatenate((dataset, np.zeros((samples,dimension))), axis=1)

        for f in np.arange(0,int(samples/frame_size)):
            # OBTAIN THE FRAME FLIGHT
            frame = dataset[f*frame_size:(f+1)*frame_size, 0:dimension]

            # CALCULATE DIFFERENCE
            chunk = np.diff(frame, axis=0)

            # DONT CALCULATE THE DIFFERENCE FOR EACH FIRST TIMESTEP
            chunk = np.insert(chunk, 0, frame[0, 0:dimension], axis=0)

            # UPDATE DATASET WITH NEW FRAME INTO NEW DIMENSIONS
            dataset[f*frame_size:(f+1)*frame_size,dimension:dimension*2] = chunk

        dict_ds[dataset_name] = pd.DataFrame(dataset)



In [120]:
ss = StandardScaler()

data_ds3_t1_normal = dict_ds['data_ds3_normal_t1_original']

# fit values
ss.partial_fit(data_ds3_t1_normal)

# transform values
data_ds3_t1_normal = ss.transform(data_ds3_t1_normal)

# append normal labels
data_ds3_t1_normal = np.append(data_ds3_t1_normal, np.zeros((data_ds3_t1_normal.shape[0],1)), axis = 1)


In [121]:
dimension = data_ds3_t1_normal.shape[1]-1

# FUNCTIONS AND CLASSES
class Autoencoder(nn.Module):
    def __init__(self, encode_l, decode_l):
        super().__init__()
        self.encoder = nn.Sequential(encode_l)
        self.decoder = nn.Sequential(decode_l)

    def forward(self, x):
        return self.decoder(self.encoder(x))

def run_train(net, train_loader, num_epochs, optimizer, loss_func):
    train_loss = []
    for epoch in range(num_epochs):
        running_loss = 0.0
        losses = []
        for n, (real_samples, _) in enumerate(train_loader):

            net.zero_grad()

            ### forward ###
            if cuda:
                output = net(real_samples.type(torch.FloatTensor).cuda())
                loss = loss_func(output, real_samples.type(torch.FloatTensor).cuda())
            else:
                output = net(real_samples.type(torch.FloatTensor))
                loss = loss_func(output, real_samples.type(torch.FloatTensor))

            running_loss += loss.item()
            losses.append(loss.item)

            ### backward ###
            loss.backward()
            optimizer.step()

        step_loss = running_loss / len(train_loader)
        train_loss.append(step_loss)

        ### log ###
        #print('epoch [{}/{}], loss:{:.4f}'.format(epoch + 1, num_epochs, loss))

    return net, output, train_loss, losses

def generate_ground_truth(data_test):
    ground_truth = []

    for j, x in enumerate(data_test):

        if x.shape[0] == dimension: # X_test
            ground_truth.append(0)
        else: # % others: get column label
            if x[-1] != 0:
                ground_truth.append(1)
            else:
                ground_truth.append(0)

    return ground_truth

def generate_losses(data_test, net, loss_function):
    losses = []
    #regenerate_data = np.zeros((1, dimension))
    for j, x in enumerate(data_test):

        if x.shape[0] > dimension:
            x = x[:-1]

        real = x.reshape(1,-1).astype(np.float32)

        #regenerate_data[j] = regenerate.cpu().detach().numpy()

        if cuda:
            regenerate = net(torch.from_numpy(real).cuda())
            loss_ae = loss_function(regenerate, torch.from_numpy(real).cuda()).item()
        else:
            regenerate = net(torch.from_numpy(real))
            loss_ae = loss_function(regenerate, torch.from_numpy(real)).item()

        losses.append(loss_ae)

    return losses

def generate_y_hat(losses, loss_threshold):
    y_hat = []

    for l in losses:
        if l < loss_threshold:
            y_hat.append(0)
        else:
            y_hat.append(1)

    return y_hat

def tester(data_test, net, loss_function, loss_threshold = 1):

    ground_truth = []
    losses = []

    for n, dataset_name in enumerate(data_test):
        dataset = data_test[dataset_name]

        ground_truth = ground_truth + generate_ground_truth(dataset)

        losses = losses + generate_losses(dataset, net, loss_function)

    y_hat = generate_y_hat(losses, loss_threshold)

    return confusion_matrix(ground_truth, y_hat, normalize='true'), losses, ground_truth, y_hat

def generate_encode_decode_layers(layers, output_layer):
    od_encode = []
    od_decode = []
    # encode
    for _, layer in enumerate(layers):
        n = _ + 1

        if (len(layers) == n):
            break

        od_encode.append(('l'+str((len(od_encode)+1)), nn.Linear(layers[_],layers[n])))

        if (len(layers) != n+1):
            od_encode.append(('l'+str((len(od_encode)+1)), nn.ReLU()))

    # decode
    layers.reverse()
    for _, layer in enumerate(layers):
        n = _ + 1

        if (len(layers) == n):
            break

        od_decode.append(('l'+str((len(od_decode)+1)), nn.Linear(layers[_],layers[n])))

        if (len(layers) != n+1):
            od_decode.append(('l'+str((len(od_decode)+1)), nn.ReLU()))
        else:
            od_decode.append(('l'+str((len(od_decode)+1)), output_layer))

    return OrderedDict(od_encode), OrderedDict(od_decode)

def train(layers, last_layer, lr, epochs, batch_size, X_train, optim, loss_fnc, net = []):
    encode_l, decode_l = generate_encode_decode_layers(layers, last_layer)

    if (net == []):
        net = Autoencoder(encode_l, decode_l)

    if cuda:
        net.cuda()

    if (optim == 'ADAM'):
        optimizer = torch.optim.Adam(net.parameters(), lr=lr, weight_decay=1e-5)
    elif(optim == 'SGD'):
        optimizer = torch.optim.SGD(net.parameters(), lr=lr)
    elif(optim == 'RMSprop'):
        optimizer = torch.optim.RMSprop(net.parameters(), lr=lr)

    torch.manual_seed(111)

    # sets
    train_set = [
        (X_train, X_train) for i in range(len(X_train))
    ]

    train_loader = torch.utils.data.DataLoader(
        train_set, batch_size=batch_size, shuffle=False
    )

    # train
    net, output, loss, losses = run_train(net, train_loader, epochs, optimizer, loss_fnc)

    return net, output, loss, losses, loss_fnc

In [122]:
X_train, X_test, y_train, y_test = train_test_split(data_ds3_t1_normal[:, :-1], data_ds3_t1_normal[:, -1], test_size=1-TRAIN_SIZE, random_state=42, shuffle=False)

In [None]:
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

architecture = [54, 32, 9]
last_layer = nn.Tanh()
batch_size = 32
optim = 'RMSprop'
loss_fnc = nn.L1Loss()

EPOCHS = 1
block_size = 1000
log = None

block_train = int(X_train.shape[0] / block_size)
net = []
agg_loss = []
count = 0

for n in np.arange(0, block_train):

    count = count+1
    print('.', end='')

    if block_size*(n+1) < X_train.shape[0]:
        data = X_train[block_size * n : block_size*(n+1)]
    else:
        data = X_train[block_size * n:]

    net, output, loss, losses, loss_function = train(layers=architecture.copy(), last_layer=last_layer, lr=1e-3, epochs=EPOCHS, batch_size=batch_size, X_train=data, optim=optim, loss_fnc=loss_fnc, net=net)

    losses_batch_step_src = generate_losses(data_test=data, net=net, loss_function=loss_fnc)
    np_losses = np.sort(np.array(losses_batch_step_src))
    losses_batch_step = np_losses[:int(len(np_losses)*.95)]
    phi_batch_step = np.mean(losses_batch_step, dtype=np.float64) + np.std(losses_batch_step, ddof=1, dtype=np.float64)

    print("phi:", phi_batch_step)

    detector_metadata.update_one({"_id" : meta[0]["_id"]},
                                 {"$set": {"threshold": phi_batch_step}})

torch.save(net, '../app/autoencoder_model.pt')

.phi: 0.17199702762463778
.phi: 0.18638658336658653
.