In [14]:
import os
import re
import math
import datetime
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

from numpy.random import seed
import tensorflow as tf

from keras.layers import Input, Dropout, Dense, LSTM, TimeDistributed, RepeatVector, Flatten
from keras.models import Model
from keras import regularizers

In [2]:
# Define dataset to import

# Building ids that are always removed from a selection (see get_selected_ids).
ignore_ids = [223, 45, 19, 105, 75, 63, 58, 59]

# Folder containing metadata.csv and the per-building "<bat_id>.csv" files.
# It can be overridden with the GAC_DATA_DIR environment variable so the
# notebook is not tied to a single machine; the original location stays the default.
path_data = os.environ.get('GAC_DATA_DIR') or 'C:/Users/Orson/Documents/Digital_Lab/Projet_GAC/Datasets/'
# File paths are built with plain string concatenation (path_data + name),
# so the folder must end with a path separator.
if not path_data.endswith(('/', '\\')):
    path_data += '/'

def get_selected_ids(selection):
    """Return the building ids from metadata.csv that satisfy ``selection``.

    ``selection`` maps a filter name to a value. The filter name decides how
    the value is used:

    * ``max_<col>``: keep rows with ``<col> <= value`` (skipped when value is None)
    * ``min_<col>``: keep rows with ``<col> >= value`` (skipped when value is None)
    * ``in_<col>``:  keep rows whose ``<col>`` is in value (skipped when value is empty)
    * any other key: treated as a column name, keep rows with ``col == value``

    Ids listed in the module-level ``ignore_ids`` are always removed.

    Returns the remaining ``bat_id`` values as an array.
    """
    meta = pd.read_csv(path_data + "metadata.csv", sep=';')

    for key, value in selection.items():
        # The prefixes include the underscore so that a plain column whose name
        # merely starts with "in"/"min"/"max" still goes to the equality branch.
        if key.startswith("max_"):
            if value is not None:
                meta = meta[meta[key[4:]] <= value]
        elif key.startswith("min_"):
            if value is not None:
                meta = meta[meta[key[4:]] >= value]
        elif key.startswith("in_"):
            if value:
                meta = meta[meta[key[3:]].isin(value)]
        else:
            meta = meta[meta[key] == value]

    # Remove the manually excluded buildings
    meta = meta[~meta["bat_id"].isin(ignore_ids)]
    return meta["bat_id"].values

# get list of buildings data corresponding to the ids selected

def get_list_of_datasets(ids):
    """Load the "<bat_id>.csv" files of ``path_data`` whose id is in ``ids``.

    Returns ``(list_of_datasets, list_of_ids)``: the loaded frames and, at the
    same positions, the building id parsed from each file name.
    """
    wanted = set(ids)
    # Escaped dot: the original pattern "^[0-9]+.csv" also accepted names such
    # as "123csv", which then failed in int() because they contain no '.'.
    pattern = re.compile(r"[0-9]+\.csv")

    list_of_datasets = []
    list_of_ids = []
    # os.listdir returns entries in arbitrary order; sorting makes the order of
    # the datasets (and any split derived from it) reproducible.
    for filename in sorted(os.listdir(path_data)):
        if not pattern.fullmatch(filename):
            continue
        bat_id = int(filename[:filename.find('.')])
        if bat_id in wanted:
            list_of_datasets.append(load_file(path_data + filename))
            list_of_ids.append(bat_id)
    return list_of_datasets, list_of_ids

# Load files

def load_file(path):
    """Read one ';'-separated building CSV into a DataFrame.

    The first row is used as header, and the ``timestamp`` column is parsed as
    datetimes and used as the index.
    """
    # infer_datetime_format is deprecated in pandas 2.x (format inference is
    # the default behaviour there), so it is no longer passed.
    return pd.read_csv(
        path,
        sep=";",
        header=0,
        parse_dates=['timestamp'],
        index_col=['timestamp'],
    )

In [3]:
# Filters handed to get_selected_ids: plain keys are equality filters on the
# metadata column of the same name; the min_/max_/in_ entries are disabled
# here because they are None or empty.
selection = dict(
    time_step=15,
    is_house=False,
    min_bat_id=None,
    max_bat_id=None,
    in_bat_id=[],
)
ids = get_selected_ids(selection)
list_of_datasets, list_of_ids = get_list_of_datasets(ids)

In [None]:
# scale each building data and return an array with all the scaled datasets

def scaling(data, ids=None):
    """Min-max scale each building dataset and return the scaled frames.

    Parameters
    ----------
    data : list of DataFrame
        One frame per building, indexed by timestamp. Column 0 is used as
        ``active_power`` and, when a ``temperature`` column exists, column 1 is
        used as ``temperature``.
        NOTE(review): this relies on the CSV column order - confirm.
    ids : sequence, optional
        Building id for each frame of ``data``. Defaults to the module-level
        ``list_of_ids``.

    Returns
    -------
    list of DataFrame
        One frame per building with columns ``active_power``, optionally
        ``temperature``, and ``bat_id``, indexed by ``timestamp``.
    """
    if ids is None:
        ids = list_of_ids
    if len(ids) < len(data):
        raise ValueError("ids must provide one building id per dataset")

    data_scaled = []
    for df_bat, bat_id in zip(data, ids):
        # A fresh scaler per building: each building is scaled to its own range.
        scaler = MinMaxScaler()
        bat_scaled = scaler.fit_transform(df_bat)

        columns = {'timestamp': df_bat.index,
                   'active_power': bat_scaled[:, 0]}
        # if the dataset features the temperature, it is scaled too and kept
        if 'temperature' in df_bat.columns:
            columns['temperature'] = bat_scaled[:, 1]
        columns['bat_id'] = bat_id

        data_scaled.append(pd.DataFrame(columns).set_index('timestamp'))
    # The original version built this list but never returned it.
    return data_scaled

# create sequences of a predetermined duration from time series
# the model learns to reconstruct these sequences

def create_sequences(sequence, time_steps):
    """Cut a frame into overlapping windows of ``time_steps`` consecutive rows.

    Only windows whose rows all carry the same building id are kept, so that
    no window straddles two buildings.

    Parameters
    ----------
    sequence : DataFrame
        Rows of all buildings stacked one after the other. The building id is
        read from the ``bat_id`` column when present, otherwise from column 1.
    time_steps : int
        Number of rows per window.

    Returns
    -------
    ndarray of shape (n_windows, time_steps, n_columns)
        ``n_windows`` is 0 when no window qualifies.
    """
    # Convert once; DataFrame.values can build a new array on every access.
    values = sequence.values
    # Look the id column up by name: with a temperature column present,
    # column 1 is the temperature, not the building id.
    if 'bat_id' in sequence.columns:
        id_col = sequence.columns.get_loc('bat_id')
    else:
        id_col = 1
    building_ids = values[:, id_col]

    output = []
    # "+ 1" so that the last full window (ending on the final row) is included.
    for start in range(len(values) - time_steps + 1):
        window_ids = building_ids[start:start + time_steps]
        if np.all(window_ids == window_ids[0]):
            output.append(values[start:start + time_steps])

    if not output:
        # np.stack refuses an empty list; return a correctly shaped empty array.
        return np.empty((0, time_steps, values.shape[1]), dtype=values.dtype)
    return np.stack(output)

# build train set and test set from the data obtained using get_list_of_datasets

def get_train_test_sets(data, time_steps):
    """Build the train and test window arrays from a list of building datasets.

    The datasets are scaled with ``scaling``; the first 80% of the datasets
    (by position in ``data``) form the training set and the rest the test set.
    Each set is cut into windows with ``create_sequences`` and windows
    containing NaNs are dropped.

    Parameters
    ----------
    data : list of DataFrame
        Datasets as returned by ``get_list_of_datasets``.
    time_steps : int
        Number of rows per window.

    Returns
    -------
    (X_train, X_test) : float32 arrays of shape (n_windows, time_steps, n_features)
        ``n_features`` is 2 (active_power, temperature) when the first dataset
        has a ``temperature`` column, otherwise 1 (active_power).

    Raises
    ------
    ValueError
        If ``data`` is too short for the 80% split to leave any training dataset.
    """
    split = math.floor(len(data) * 0.8)
    if split == 0:
        raise ValueError("at least 2 datasets are needed to build a train/test split")

    # scale data
    data_scaled = scaling(data)

    # if the datasets feature the temperature, it is included in the model
    has_temperature = 'temperature' in data[0].columns
    if has_temperature:
        columns = ['active_power', 'temperature', 'bat_id']
    else:
        columns = ['active_power', 'bat_id']
    n_features = len(columns) - 1  # every column except bat_id

    def _to_windows(frames):
        # One concat per set (instead of growing a frame in a loop), with a
        # fixed column order so positional slicing below is well defined.
        stacked = pd.concat(frames).reindex(columns=columns)
        windows = create_sequences(stacked, time_steps).astype(np.float32)
        # Keep the feature axis (0:n_features rather than a bare index) so the
        # result stays 3-D even with a single feature.
        windows = windows[:, :, 0:n_features]
        # remove sequences with NaNs
        return windows[~np.isnan(windows).any(axis=(1, 2))]

    X_train = _to_windows(data_scaled[:split])
    X_test = _to_windows(data_scaled[split:len(data)])

    return X_train, X_test

In [None]:
def autoencoder_model(X):
    """Build an (uncompiled) LSTM autoencoder sized after ``X``.

    ``X`` must be 3-D: ``X.shape[1]`` is used as the number of time steps and
    ``X.shape[2]`` as the number of features. The network compresses each
    window to a 4-unit vector, repeats it over the time axis and decodes it
    back to one ``X.shape[2]``-sized output per time step.
    """
    n_steps, n_features = X.shape[1], X.shape[2]

    inputs = Input(shape=(n_steps, n_features))

    # Encoder: 32-unit LSTM over the sequence, then a 4-unit LSTM that keeps
    # only its last output.
    encoded = LSTM(32, activation='relu', return_sequences=True,
                   kernel_regularizer=regularizers.l2(0.00))(inputs)
    encoded = Dropout(rate=0.2)(encoded)
    bottleneck = LSTM(4, activation='relu', return_sequences=False)(encoded)

    # Decoder: repeat the bottleneck vector once per time step and mirror the
    # encoder.
    decoded = RepeatVector(n_steps)(bottleneck)
    decoded = LSTM(4, activation='relu', return_sequences=True)(decoded)
    decoded = LSTM(32, activation='relu', return_sequences=True)(decoded)
    decoded = Dropout(rate=0.2)(decoded)

    output = TimeDistributed(Dense(n_features))(decoded)
    return Model(inputs=inputs, outputs=output)

In [None]:
# Window length, in rows, of the sequences fed to the autoencoder.
# get_train_test_sets requires it; the original call omitted the argument.
# NOTE(review): 96 is an assumed value (one day if rows are 15 minutes apart,
# as selection['time_step'] = 15 suggests) - confirm the intended length.
TIME_STEPS = 96

X_train, X_test = get_train_test_sets(list_of_datasets, TIME_STEPS)
model = autoencoder_model(X_train)
# Mean absolute error between the input window and its reconstruction
model.compile(optimizer='adam', loss='mae')
model.summary()

In [None]:
# Number of passes over the training windows
nb_epochs = 5
# The autoencoder learns to reproduce its input, so X_train is both input and target.
# The validation data is taken out of X_train through validation_split; a
# dedicated validation set may be a better choice for time series.
history = model.fit(
    x=X_train,
    y=X_train,
    epochs=nb_epochs,
    validation_split=0.1,
).history

In [None]:
# plot evolution of loss and validation loss

plt.figure()
plt.plot(history['loss'], label='Training loss')
plt.plot(history['val_loss'], label='Validation loss')
plt.xlabel('Epoch')
plt.ylabel('Loss (MAE)')
# The labels given to plot() are only displayed once a legend is requested.
plt.legend()

In [None]:
# Reconstruct the training windows with the fitted model
X_train_pred = model.predict(X_train)
# Reconstruction error per training window: absolute error averaged over the
# time axis, keeping only the first feature.
train_mae_loss = np.absolute(X_train_pred - X_train).mean(axis=1)[:, 0]

plt.figure()
plt.hist(train_mae_loss, bins=50)
plt.xlabel('Train MAE loss')
plt.ylabel('Number of samples')

In [None]:
# Reconstruct the test windows. The original cell used `temp_test`, a name no
# visible cell defines; X_test is the test array built with X_train.
X_test_pred = model.predict(X_test)
# get reconstruction error for each sample of the test set
# (absolute error averaged over the time axis, first feature only)
test_mae_loss = np.mean(np.abs(X_test_pred - X_test), axis=1)[:, 0]
plt.figure()
plt.hist(test_mae_loss, bins=20)
plt.xlabel('Test MAE loss')
plt.ylabel('Number of samples')

In [None]:
# arbitrary threshold on the reconstruction error
threshold = 0.15
# Boolean mask: True for every test window whose error exceeds the threshold
anomalies = np.greater(test_mae_loss, threshold)

In [None]:
# get indices of anomalous data in the test set
# A row position data_idx is flagged when every entry of
# anomalies[data_idx - TIME_STEPS + 1 : data_idx] is True (the slice end is
# exclusive, so the entry at data_idx itself is not checked).
# NOTE(review): this cell needs `df_test` and `TIME_STEPS` in the kernel
# namespace. In the visible code df_test only exists as a local variable of
# get_train_test_sets, and TIME_STEPS must equal the window length used to
# build X_test - confirm where both come from.
# NOTE(review): indexing `anomalies` by row position presumes one window per
# consecutive row of df_test, but create_sequences and the NaN filter can drop
# windows - verify the alignment.
anomalous_data_indices = []
for data_idx in range(TIME_STEPS - 1, len(df_test) - TIME_STEPS + 1):
    if np.all(anomalies[data_idx - (TIME_STEPS) + 1 : data_idx]):
        anomalous_data_indices.append(data_idx)
        
# get values of anomalous data (rows selected by position)
df_anomalies = df_test.iloc[anomalous_data_indices]

# get ids of buildings that have anomalous data (sorted, without duplicates)
list_id = np.unique(df_anomalies['bat_id'])

In [None]:
# get consumption data of buildings that have anomalies
# (one frame per id, in the same order as list_id)
list_dfs = [df_test[df_test['bat_id'] == bat_id] for bat_id in list_id]

# get anomalous data samples, again one frame per id of list_id
list_dfs_anomalies = [df_anomalies[df_anomalies['bat_id'] == bat_id] for bat_id in list_id]

In [None]:
%matplotlib inline

# plot anomalous data alongside the consumption timeline of buildings

fig, axes = plt.subplots(nrows=5, ncols=3, figsize=(14,10))
fig.tight_layout()
n = len(list_id)
for i in range(n):
    df_bat = list_dfs[i]
    df_bat_anomalies = list_dfs_anomalies[i]
    plt.subplot(5,3,i+1)
    plt.plot(df_bat[['active_power']])
    plt.plot(df_bat_anomalies[['active_power']],linestyle='',marker='.',color='red')
plt.savefig(path_data+'145buildings_anomalies.png')
plt.show()