<a href="https://colab.research.google.com/github/mnoorchenar/SmartMeterData/blob/main/LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [64]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go
import plotly.offline as pyo
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, LSTM

# Load the data
df = pd.read_csv('https://raw.githubusercontent.com/mnoorchenar/data/main/Smart_Meter_Data/SmartMeterData_ID_1017024.csv')  # Replace 'data.csv' with your actual data file name
df['Time'] = pd.to_datetime(df['Time'])  # Convert the 'Time' column to a datetime format
train_df = df[df['Time'].dt.year == 2021].copy()  # Subset the data for training
test_df = df[df['Time'].dt.year == 2022].copy()  # Subset the data for testing

def preProcess(data):

  # Check if a column is numeric and force it to numeric if it's not
  for col in data.columns:
      if not col=='Time':
        if not pd.api.types.is_numeric_dtype(data[col]):
            data[col] = pd.to_numeric(data[col], errors='coerce')

  # Remove rows that contain at least one NaN value
  data = data.dropna()

  # data = (data - data.min()) / (data.max() - data.min())


  # Remove columns that all values are NaN
  data = data.loc[:, ~np.isnan(data).all()]
  data.reset_index(inplace=True, drop=True)
  
  return data

train_df = preProcess(train_df)
test_df = preProcess(test_df)

print(train_df.shape)
train_df.head()

(32547, 20)


Unnamed: 0,Time,VAL,STATE,year,month,day,hour,minute,second,weekday,is_weekend,time_of_day,Temp (°C),Dew Point Temp (°C),Rel Hum (%),Precip. Amount (mm),Wind Dir (10s deg),Wind Spd (km/h),Visibility (km),Stn Press (kPa)
0,2021-01-01 05:00:00,0.054,0,2021,1,1,5,0,0,4,0,0.208333,-0.4,-3.6,79.0,0.0,13.0,11.0,16.1,100.47
1,2021-01-01 05:15:00,0.053,0,2021,1,1,5,15,0,4,0,0.21875,-0.4,-3.6,79.0,0.0,13.0,11.0,16.1,100.47
2,2021-01-01 05:30:00,0.051,0,2021,1,1,5,30,0,4,0,0.229167,-0.4,-3.6,79.0,0.0,13.0,11.0,16.1,100.47
3,2021-01-01 05:45:00,0.055,0,2021,1,1,5,45,0,4,0,0.239583,-0.4,-3.6,79.0,0.0,13.0,11.0,16.1,100.47
4,2021-01-01 06:00:00,0.056,0,2021,1,1,6,0,0,4,0,0.25,-1.5,-4.2,82.0,0.0,9.0,15.0,16.1,100.38


In [65]:
# # Normalize the data
# scaler = MinMaxScaler()
train_data = scaler.fit_transform(train_df.drop(columns = ['Time'], axis=1))
test_data = scaler.transform(test_df.drop(columns = ['Time'], axis=1))

# Prepare the data for LSTM
def prepare_data(data, sequence_length):
    X = []
    y = []
    for i in range(len(data) - sequence_length):
        X.append(data[i:i+sequence_length])
        y.append(data[i+sequence_length])
    X = np.array(X)
    y = np.array(y)
    return X, y

sequence_length = 7 # Use past 7 days of data to predict the next day
train_X, train_y = prepare_data(train_data, sequence_length)
test_X, test_y = prepare_data(test_data, sequence_length)

# Train the LSTM model
model = Sequential()
model.add(LSTM(50, input_shape=(train_X.shape[1], train_X.shape[2])))
model.add(Dense(1))
model.compile(loss='mse', optimizer='adam')
model.fit(train_X, train_y, epochs=5, batch_size=16, validation_split=0.1, verbose=2)

# Make predictions on the test data
test_pred = model.predict(test_X)

# Calculate the reconstruction error for each time step
test_error = np.abs(test_pred - test_y)

# Calculate the mean and standard deviation of the reconstruction error
mean_error = np.mean(test_error)
std_error = np.std(test_error)

# Detect anomalies based on the reconstruction error
anomaly_threshold = mean_error + 3 * std_error
anomaly_indices = np.where(test_error > anomaly_threshold)[0]

Epoch 1/5
1831/1831 - 13s - loss: 0.1211 - val_loss: 0.1255 - 13s/epoch - 7ms/step
Epoch 2/5
1831/1831 - 10s - loss: 0.1206 - val_loss: 0.1254 - 10s/epoch - 6ms/step
Epoch 3/5
1831/1831 - 9s - loss: 0.1205 - val_loss: 0.1254 - 9s/epoch - 5ms/step
Epoch 4/5
1831/1831 - 9s - loss: 0.1205 - val_loss: 0.1254 - 9s/epoch - 5ms/step
Epoch 5/5
1831/1831 - 9s - loss: 0.1205 - val_loss: 0.1254 - 9s/epoch - 5ms/step


In [67]:
# Create Plotly traces for the VAL data and anomaly points
# val_trace_train = go.Scatter(x=train_df['Time'], y=train_df['VAL'], mode='lines', name='Training data')
val_trace_test = go.Scatter(x=test_df['Time'], y=test_df['VAL'], mode='lines', name='Test data')
# anomaly_trace_train = go.Scatter(x=train_df[train_df['anomaly'] == 1]['Time'], y=train_df[train_df['anomaly'] == 1]['VAL'], mode='markers', name='Anomalies in training data', marker=dict(color='red', size=8))
anomaly_trace_test = go.Scatter(x=test_df.iloc[anomaly_indices]['Time'], y=test_df.iloc[anomaly_indices]['VAL'], mode='markers', name='Anomalies in test data', marker=dict(color='red', size=8))

# Create the Plotly layout and figure
layout = go.Layout(title='Anomaly detection with One-Class SVM', xaxis=dict(title='Time'), yaxis=dict(title='Value'))
fig = go.Figure(data=[val_trace_test, anomaly_trace_test], layout=layout)

# Show the Plotly figure
fig.show()
