In [1]:
import pandas as pd
import numpy as np
import json
import seaborn as sns
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import LSTM, Dense ,Input, LSTM, Concatenate, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import load_model


from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import joblib

2024-05-09 22:53:15.834910: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-09 22:53:15.834953: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-09 22:53:15.834978: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-09 22:53:15.917270: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
window = 24
horizon = 6

In [3]:
df = pd.read_csv('data/netstats_hourly_4_3.csv')

**Load Artifacts**

In [4]:
# using root mean square error to punish model more heavily for missing spikes
def rmse(y_true, y_pred):
    return tf.sqrt(tf.reduce_mean(tf.square(y_true - y_pred)))

In [5]:
model_path = './model/lstm_model.keras' 

model = load_model(model_path, custom_objects={'rmse': rmse})

# Load the saved OneHotEncoder fit object
ohefit = joblib.load('./model/ohefit.save')

In [6]:
def feature_create(df_in):
    df = df_in.copy()
    
    df.loc[:,'link_loc'] = df['link'] + "_" + df['location']
    
    df['rx_gbs'] = pd.to_numeric(df['rx_gbs'], errors='coerce')
    df['tx_gbs'] = pd.to_numeric(df['tx_gbs'], errors='coerce')
    
    df['rx_gbs_delta'] = df.groupby('link_loc')['rx_gbs'].diff()

    # Calculate tx_bytes_delta_delta
    df['tx_gbs_delta'] = df.groupby('link_loc')['tx_gbs'].diff()

    # The first delta_delta value for each site_host group will be NaN because there's no previous value to subtract from.
    # You might want to fill these NaN values depending on your requirements, for example, with 0s:
    df['rx_gbs_delta'] = df['rx_gbs_delta'].fillna(0)
    df['tx_gbs_delta'] = df['tx_gbs_delta'].fillna(0)
    
    # ensure 'time' is a datetime64 type
    df['time'] = pd.to_datetime(df['time'])

    # Extract hour of day
    df['hour_of_day'] = df['time'].dt.hour

    # Extract day of the week (Monday=0, Sunday=6)
    df['day_of_week'] = df['time'].dt.dayofweek
    
        # Encode 'hour_of_day' cyclically
    df['hour_sin'] = np.sin(2 * np.pi * df['hour_of_day']/24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour_of_day']/24)

    # Encode 'day_of_week' cyclically
    df['day_sin'] = np.sin(2 * np.pi * df['day_of_week']/7)
    df['day_cos'] = np.cos(2 * np.pi * df['day_of_week']/7)
        
    df.drop(columns=['hour_of_day','day_of_week'],inplace=True)

    # Transform the new data using the loaded OneHotEncoder fit object
    new_data_encoded = ohefit.transform(df[['link', 'location']])
    df_encoded = pd.concat([df,new_data_encoded],axis = 1).drop(columns= ['link', 'location','link_loc'])
    
    #ensure df is sorted by time 
    df_encoded.sort_values(by=['time'],ascending=True,inplace=True)
  
    # scale
    #features_to_scale = [col for col in df_encoded.columns if 'delta' in col]
    # Transform the selected columns using the loaded StandardScaler
    #df_encoded[features_to_scale] = scaler.transform(df_encoded[features_to_scale])
    
    # Temporal features, ensure you pass just number of observations equal to window
    temporal_features = df_encoded[['rx_gbs', 'rx_gbs_delta', 'tx_gbs', 'tx_gbs_delta']].tail(window).values
    temporal_features = temporal_features.reshape(1, window, 4)  # Reshaping to match the input shape expected by the LSTM

    # Non-temporal features (one-hot encoded site and host)
    non_temporal_features_list = [col for col in df_encoded.columns if 'link' in col or 'location' in col]

    # Select the last row for non-temporal features and reshape
    non_temporal_features = df_encoded[non_temporal_features_list].iloc[-1].values
    non_temporal_features = non_temporal_features.reshape(1, -1)  # Reshaping to 1 row, with columns inferred

    # Semi-temporal features
    semi_temporal_features = df_encoded[['hour_sin', 'hour_cos', 'day_sin', 'day_cos']].iloc[-1].values
    semi_temporal_features = semi_temporal_features.reshape(1, 4)  # Reshaping to match the expected input shape
    
    del(df)
    del(df_encoded)
    
    return temporal_features, non_temporal_features, semi_temporal_features

Pick a random link / location

In [7]:
sample_df = df[['link','location']].drop_duplicates().sample()
link_pick = sample_df['link'].values[0]
location_pick = sample_df['location'].values[0]

# test_df has only one combintation  of link_pick and location_pick
test_df = df[(df.link == link_pick) & (df.location == location_pick)]
test_df.sort_values(by=['time'],inplace=True)

# pick a random segment of size window + 1
start_row = np.random.randint(0, len(test_df) - (window+1))
random_segment = df.iloc[start_row:start_row + (window+1)]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df.sort_values(by=['time'],inplace=True)


In [8]:
temporal_features, non_temporal_features, semi_temporal_features = feature_create(random_segment)

# Combine into a list for prediction as the model expects
model_input = [temporal_features, non_temporal_features, semi_temporal_features]


prediction = model.predict(model_input)



In [9]:
prediction

array([[[14.540914 , -0.696883 ],
        [12.542822 , -1.6714047],
        [ 9.879547 , -2.334466 ],
        [ 6.973678 , -2.5771434],
        [ 4.6453276, -2.242193 ],
        [ 2.9436607, -1.8395052]]], dtype=float32)

In [15]:
prediction.shape

(1, 6, 2)

In [10]:
prediction[:, :,0].tolist()

[[14.540913581848145,
  12.542821884155273,
  9.879547119140625,
  6.973678112030029,
  4.645327568054199,
  2.9436607360839844]]

In [11]:
prediction.reshape(6,2)

array([[14.540914 , -0.696883 ],
       [12.542822 , -1.6714047],
       [ 9.879547 , -2.334466 ],
       [ 6.973678 , -2.5771434],
       [ 4.6453276, -2.242193 ],
       [ 2.9436607, -1.8395052]], dtype=float32)

In [12]:
from sklearn.metrics import root_mean_squared_error

In [13]:
actuals = random_array = np.random.rand(6, 2)

In [14]:
actuals

array([[0.154385  , 0.072896  ],
       [0.76268352, 0.67854077],
       [0.33468818, 0.79431822],
       [0.13507733, 0.79170452],
       [0.44860849, 0.74289053],
       [0.88491224, 0.28844935]])

In [15]:
root_mean_squared_error(actuals, prediction.reshape(6,2))

5.891349550530071

In [31]:
prediction.reshape((6,2))[:,0].tolist()

[0.40420058369636536,
 0.45891812443733215,
 0.49683326482772827,
 0.4980129599571228,
 0.528127908706665,
 0.5192015171051025]