La proporzione tra job sintetici e originali nella classe anomala è la stessa sia nel trainset che nel testset

In [None]:
import json, os, random
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.metrics import AUC, Precision, Recall
from tcn import TCN

from collections import Counter
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

# Fraction of each subset (synthetic, actual label 0, actual label 1) that goes
# to the training split; the remaining rows form the test split.
SPLIT = 0.8
# Maximum number of rows kept per label from the `actual` dataframe
# (applied through groupby('label').head(NORM_TOT)).
NORM_TOT = 2500
# NOTE(review): not referenced anywhere in the visible part of this notebook;
# presumably the intended number of anomalous samples -- confirm or remove.
ANOM_TOT = 500
# Number of leading features kept per timestep by normalize(); also the size of
# the last axis of the model input.
DIM = 18

def normalize(x):
    """Standardize one JSON-encoded time series and keep its first DIM features.

    ``x`` is a JSON string that must decode to something ``StandardScaler``
    accepts as a 2-D input (rows are treated as samples, columns as features).
    The scaler is fitted on this single series only, so the statistics are
    per-series, not global.  Returns a list of rows, each cut to at most DIM
    values.
    """
    series = json.loads(x)
    scaled_rows = StandardScaler().fit_transform(series).tolist()
    # Truncation happens after scaling; columns are scaled independently, so
    # the dropped columns do not influence the kept ones.
    return [row[:DIM] for row in scaled_rows]

def _split_frame(frame, fraction):
    """Split ``frame`` positionally into (head, tail) at ``int(len(frame) * fraction)``.

    Replaces ``np.split(frame, [cut])``, which depends on the deprecated
    ``DataFrame.swapaxes`` and no longer works on recent pandas releases.
    """
    cut = int(len(frame) * fraction)
    return frame.iloc[:cut], frame.iloc[cut:]


def _shuffle_in_unison(samples, labels):
    """Shuffle samples and labels with one shared permutation.

    Returns ``(samples, labels)`` as two tuples, using the ``random`` module's
    global generator (a single ``random.shuffle`` call per invocation).
    """
    pairs = list(zip(samples, labels))
    random.shuffle(pairs)
    shuffled_samples, shuffled_labels = zip(*pairs)
    return shuffled_samples, shuffled_labels


# --- synthetic jobs -------------------------------------------------------
# The resulting frames are named *_l1, i.e. they are treated as label-1
# (anomalous) rows; the label column itself is taken from the file unchanged.
synth = pd.read_csv(os.path.join('data', 'synth_timeseries_anom_3.csv'), sep='|', header=0)
synth['timeseries'] = synth['timeseries'].apply(normalize)

synth_train_l1, synth_test_l1 = _split_frame(synth, SPLIT)
del synth

# --- actual jobs ----------------------------------------------------------
actual = pd.read_csv(os.path.join('data', 'actual_timeseries_anom_3.csv'), sep='|', header=0)
actual['timeseries'] = actual['timeseries'].apply(normalize)

# Shuffle the rows, then keep at most NORM_TOT rows per label.
actual = actual.sample(frac=1).reset_index(drop=True)
actual = actual.groupby('label').head(NORM_TOT).reset_index(drop=True)

actual_l0 = actual.loc[actual['label'] == 0]  # at most NORM_TOT rows
actual_l1 = actual.loc[actual['label'] == 1]  # original author's note: 86 rows
del actual

# Each subset is split separately with the same SPLIT fraction, so the ratio of
# synthetic to actual label-1 rows is the same in the train and test sets.
actual_train_l0, actual_test_l0 = _split_frame(actual_l0, SPLIT)
actual_train_l1, actual_test_l1 = _split_frame(actual_l1, SPLIT)
del actual_l0
del actual_l1

# DataFrame.append was removed in pandas 2.0; pd.concat yields the same row
# order (actual label 0, actual label 1, synthetic) with a fresh 0..n-1 index.
train_df = pd.concat([actual_train_l0, actual_train_l1, synth_train_l1], ignore_index=True)
test_df = pd.concat([actual_test_l0, actual_test_l1, synth_test_l1], ignore_index=True)
del actual_train_l0
del actual_train_l1
del synth_train_l1
del actual_test_l0
del actual_test_l1
del synth_test_l1

# build train dataset

train_X = train_df['timeseries'].tolist()
train_Y = train_df['label'].tolist()
del train_df

print(f'Train dataset: {Counter(train_Y)}')
train_X, train_Y = _shuffle_in_unison(train_X, train_Y)

# build test dataset

test_X = test_df['timeseries'].tolist()
test_Y = test_df['label'].tolist()
del test_df

print(f'Test dataset: {Counter(test_Y)}')
test_X, test_Y = _shuffle_in_unison(test_X, test_Y)

# Classification

In [None]:
def train_data_gen():
    """Yield training samples one at a time, forever.

    Each item is ``(x, y)``: ``x`` is one entry of ``train_X`` with a leading
    axis of size 1 added, and ``y`` is the matching ``train_Y`` entry with two
    leading axes added (shape ``(1, 1)`` for a scalar label).  After the last
    sample the generator starts over from the first, so the consumer has to
    bound the number of steps itself.
    """
    while True:
        for series, label in zip(train_X, train_Y):
            batch_x = np.expand_dims(series, axis=0)
            batch_y = np.expand_dims(label, axis=(0, 1))
            yield batch_x, batch_y

def test_data_gen():
    """Yield test samples one at a time, forever.

    Each item is ``(x, y)``: ``x`` is one entry of ``test_X`` with a leading
    axis of size 1 added, and ``y`` is the matching ``test_Y`` entry with two
    leading axes added (shape ``(1, 1)`` for a scalar label).  After the last
    sample the generator starts over from the first, so the consumer has to
    bound the number of steps itself.
    """
    while True:
        for series, label in zip(test_X, test_Y):
            batch_x = np.expand_dims(series, axis=0)
            batch_y = np.expand_dims(label, axis=(0, 1))
            yield batch_x, batch_y

def _f_scores(precisions, recalls):
    """Return the per-epoch F1 score for matching precision/recall lists.

    An epoch where precision + recall is zero (no positive predictions and no
    true positives) gets 0.0 instead of raising ZeroDivisionError, which would
    otherwise discard the results of a finished training run.  0.0 also keeps
    the history line parseable as JSON by the plotting cell.
    """
    return [
        2 * ((p * r) / (p + r)) if (p + r) else 0.0
        for p, r in zip(precisions, recalls)
    ]


TRAINSET_SIZE = len(train_X)
TESTSET_SIZE = len(test_X)
EPOCHS = 40

# Batch size is fixed to 1 and the time axis is left unspecified (None), so
# every series is fed on its own; the last axis holds DIM features.
i = Input(batch_shape=(1, None, DIM))

o = TCN(nb_filters=8, kernel_size=4, nb_stacks=1, dilations=(1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024), use_layer_norm=True, dropout_rate=0.3)(i)
o = Dense(1, activation='sigmoid')(o)

m = Model(inputs=[i], outputs=[o])
# Metrics are named explicitly: Keras otherwise auto-suffixes the names
# ('auc_1', 'precision_1', ...) when this cell is re-run in the same session,
# and the history lookups below would fail with KeyError.
m.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy', Recall(name='recall'), Precision(name='precision'), AUC(name='auc')],
)

# One step per sample (batch size 1); the "validation" data is the test set,
# so every val_* series below is a per-epoch test-set metric.
hist = m.fit(train_data_gen(), epochs=EPOCHS, steps_per_epoch=TRAINSET_SIZE, max_queue_size=1, validation_data=test_data_gen(), validation_steps=TESTSET_SIZE)

acc = hist.history['accuracy']
loss = hist.history['loss']
auc = hist.history['auc']
precision = hist.history['precision']
recall = hist.history['recall']
fscore = _f_scores(precision, recall)

val_acc = hist.history['val_accuracy']
val_loss = hist.history['val_loss']
val_auc = hist.history['val_auc']
val_precision = hist.history['val_precision']
val_recall = hist.history['val_recall']
val_fscore = _f_scores(val_precision, val_recall)

# (title, training curve, test curve, training colour, test colour)
# NOTE(review): both AUC curves use the same colour, as in the original cell,
# so they cannot be told apart in the figure.
for title, train_curve, val_curve, train_color, val_color in (
    ('Accuracy', acc, val_acc, '#885A89', '#613F61'),
    ('Loss', loss, val_loss, '#8AA8A1', '#5B7B74'),
    ('AUC', auc, val_auc, 'green', 'green'),
    ('Precision', precision, val_precision, '#CBCBD4', '#87879B'),
    ('Recall', recall, val_recall, '#D1B490', '#AE8049'),
    ('FScore', fscore, val_fscore, '#EE7B30', '#B9530F'),
):
    plt.title(title)
    plt.plot(train_curve, color=train_color)
    plt.plot(val_curve, color=val_color)
    plt.show()

# Append one '|'-separated line per run; each field is the repr of a list of
# per-epoch values.  The context manager closes the file even if the write fails.
with open('history.txt', 'a+') as f:
    f.write(f'{acc}|{loss}|{auc}|{precision}|{recall}|{fscore}|{val_acc}|{val_loss}|{val_auc}|{val_precision}|{val_recall}|{val_fscore}\n')

In [5]:
import json
import matplotlib.pyplot as plt
import numpy as np

import os

metrics = ['acc', 'loss', 'auc', 'precision', 'recall', 'fscore', 'val_acc', 'val_loss', 'val_auc', 'val_precision', 'val_recall', 'val_fscore']
labels = ['training accuracy', 'training loss', 'training AUC-ROC', 'training precision', 'training recall', 'training F1-score', 'testing accuracy', 'testing loss', 'testing AUC-ROC', 'testing precision', 'testing recall', 'testing F1-score']
colors = ['#C33149', '#156064', '#F18F01', '#7006C7', '#C706B4', '#26A96C', '#C33149', '#156064', '#F18F01', '#7006C7', '#C706B4', '#26A96C']

# NOTE(review): the training cell appends to 'history.txt'; this file is
# presumably a renamed copy of it -- confirm.
HISTORY_PATH = 'history_3000_40.txt'
PLOT_DIR = 'plots/3000/'

# Read the log once instead of re-opening it for every metric, and close it
# deterministically.  Each non-blank line is one training run whose
# '|'-separated fields follow the order of `metrics`.
with open(HISTORY_PATH, 'r') as history_file:
    run_fields = [line.split('|') for line in history_file if line.strip()]
if not run_fields:
    raise ValueError(f'{HISTORY_PATH} contains no training runs')

# savefig does not create missing directories.
os.makedirs(PLOT_DIR, exist_ok=True)

# Loop-local names are deliberately distinct from `m`, `hist` and `acc`, which
# the training cell uses for the model, the fit history and the accuracy list.
for j, metric in enumerate(metrics):
    run_curves = []
    plt.ylim([0, 1])
    plt.xlabel('epochs')
    plt.ylabel(labels[j])
    for run_idx, fields in enumerate(run_fields):
        curve = json.loads(fields[j])
        run_curves.append(np.array(curve))
        # Later runs are drawn more opaque; clamp because matplotlib rejects
        # alpha values above 1 (reached once there are more than 12 runs).
        opacity = min(1.0, 0.4 + ((run_idx + 1) / 6) * 0.3)
        plt.plot(curve, color=colors[j], alpha=opacity)
    # Epoch-wise mean across runs; zip stops at the shortest run.
    mean_curve = [np.mean(epoch_values) for epoch_values in zip(*run_curves)]
    print(f'{labels[j]}: {mean_curve[-1]}')
    plt.plot(mean_curve, color=colors[j])
    plt.savefig(PLOT_DIR + metric + '.png')
    plt.clf()

training accuracy: 0.9861608982086182
training loss: 0.0402168445289135
training AUC-ROC: 0.996762490272522
training precision: 0.9588644623756408
training recall: 0.9578947424888611
training F1-score: 0.9583717557864718
testing accuracy: 0.9783693671226501
testing loss: 0.08703010529279709
testing AUC-ROC: 0.9890316486358642
testing precision: 0.9282669425010681
testing recall: 0.9445544600486755
testing F1-score: 0.9359512054677859


<Figure size 432x288 with 0 Axes>