In [93]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import io
import plotly.express as px

# Restore the previously saved Keras model from disk.
model = tf.keras.models.load_model('lstm_autoencoder_model.keras')

# Read the dataset; the 'Date' column is parsed as datetimes and becomes the index.
df = pd.read_csv(
    'anomaly_detection.csv',
    index_col='Date',
    parse_dates=['Date'],
)
# Optional: restrict the analysis to a sub-period.
# df = df.loc['2016-07-20':'2016-12-31']
df.shape

(365, 4)

In [94]:
def create_sequences(data, seq_length):
    """Stack every contiguous window of ``seq_length`` rows of ``data``.

    Windows advance one row at a time, so an input of ``len(data)`` rows
    produces ``len(data) - seq_length + 1`` windows. When ``seq_length``
    exceeds ``len(data)`` no window exists and an empty array is returned.

    Args:
        data: Sliceable sequence (list, NumPy array, ...) of rows.
        seq_length: Number of consecutive rows in each window.

    Returns:
        ``np.ndarray`` whose first axis indexes the windows.
    """
    n_windows = len(data) - seq_length + 1
    return np.array([data[start:start + seq_length] for start in range(n_windows)])


In [95]:
# Window length (in rows) fed to the model. Every slice that aligns losses
# with dates must use this same value, so it is defined once and reused
# (the original referenced an undefined `seq_length` further down, which only
# worked because of leftover kernel state).
seq_len = 3
sequences = create_sequences(df.values, seq_length=seq_len)
print(sequences.shape)

# Make predictions: the model reconstructs each window.
predictions = model.predict(sequences)
print(predictions.shape)

# Calculate loss: MAE reduces the last axis, leaving one value per row of each
# window; averaging over axis 1 then gives a single score per window.
data_loss = tf.keras.losses.mae(predictions, sequences)
print(data_loss.shape)
data_loss_mean = np.mean(data_loss, axis=1)
print(data_loss_mean.shape)

# Check for anomalies (you need to define your threshold).
# NOTE: using the mean loss as threshold is only an example -- by construction
# it flags a large share of the windows (175 of 363 in the recorded run).
threshold = np.mean(data_loss_mean)  # Example threshold
anomalies = data_loss_mean > threshold

print(threshold)

# Window i covers rows i .. i + seq_len - 1, so it is labelled with the date of
# its last row; skipping the first seq_len - 1 index entries aligns the two.
anomalous_dates = df.index[seq_len - 1:][anomalies]
print(anomalous_dates.shape)






# # Extract date range for anomalies
# anomalous_dates = df.index[seq_length-1:][anomalies]
# date_range = f"{anomalous_dates.min()} - {anomalous_dates.max()}"

(363, 3, 4)
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 69ms/step
(363, 3, 4)
(363, 3)
(363,)
2.744121393640437
(175,)


In [96]:
# Pair each window's mean loss with the date of the window's last row.
new_results_dated = pd.DataFrame(
    {
        'Dates': df.index[seq_len - 1:],
        'Loss': data_loss_mean,
    }
)
display(new_results_dated)

## Static matplotlib alternative (kept for reference)
# results_dated.plot(x='Dates',y='Loss',figsize=(10,5))
# plt.axhline(y=threshold, color='red', linestyle='--', linewidth=2, label='Threshold')

#### Plotly plot for interactivity
fig = px.line(
    new_results_dated,
    x='Dates',
    y='Loss',
    title='Loss vs. Dates for new model',
    labels={'Loss': 'Loss', 'Dates': 'Dates'},
)

# Horizontal dashed line marking the anomaly threshold computed earlier.
fig.add_hline(
    y=threshold,
    line={'color': 'red', 'width': 2, 'dash': 'dash'},
    annotation_text='Threshold_2',
    annotation_position='top left',
)

# Customize the layout. This call is left as the cell's final expression so
# that the returned figure is what the notebook renders.
fig.update_layout(
    xaxis_title='Dates',
    yaxis_title='Loss',
    legend_title='Legend',
    template='plotly_white',
)

Unnamed: 0,Dates,Loss
0,2016-01-03,4.792051
1,2016-01-04,4.239948
2,2016-01-05,3.596130
3,2016-01-06,3.236550
4,2016-01-07,3.097019
...,...,...
358,2016-12-26,4.330601
359,2016-12-27,4.229176
360,2016-12-28,4.240789
361,2016-12-29,4.254258


ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [97]:
# Keep only the rows whose loss is strictly above the threshold.
potential_anom = new_results_dated.loc[new_results_dated['Loss'].gt(threshold)]

In [99]:
def find_consecutive_ranges(dates, min_days=3, max_days=14):
    """Group dates into runs of consecutive days and keep runs of a given length.

    A run ends whenever the gap between one date and the next is not exactly
    one day. Every run found is printed (start, end, length); only runs whose
    length in days lies in ``[min_days, max_days]`` (inclusive) are returned.

    Args:
        dates: Sequence of date-like objects whose difference exposes ``.days``
            (e.g. ``datetime.date``, ``datetime.datetime``, ``pd.Timestamp``).
            The run detection compares each element with its predecessor, so
            the input is expected to be sorted ascending without duplicates.
        min_days: Minimum run length (inclusive) to keep.
        max_days: Maximum run length (inclusive) to keep.

    Returns:
        List of ``(start_date, end_date)`` tuples, one per kept run, in input
        order. An empty input yields an empty list (the original raised
        ``IndexError`` on ``dates[0]``).
    """
    # Nothing to group -- happens when no loss exceeds the threshold.
    if len(dates) == 0:
        return []

    consecutive_ranges = []

    def _close_run(run_start, run_end):
        # Report the finished run and keep it when its length is within bounds.
        range_length = (run_end - run_start).days + 1
        if min_days <= range_length <= max_days:
            consecutive_ranges.append((run_start, run_end))
        print(f"Range: {run_start} to {run_end} - Length: {range_length} days")

    start_date = prev_date = dates[0]
    for date in dates[1:]:
        if (date - prev_date).days != 1:
            # Gap (or repeat) detected: the current run ended at prev_date.
            _close_run(start_date, prev_date)
            start_date = date
        prev_date = date

    # The last run is never closed inside the loop.
    _close_run(start_date, prev_date)

    return consecutive_ranges

# Identify consecutive date ranges among the above-threshold dates.
consecutive_dates = potential_anom['Dates'].tolist()
consecutive_ranges = find_consecutive_ranges(consecutive_dates)

# Expand every kept (start, end) range into its individual days.
selected_dates = []
for start_date, end_date in consecutive_ranges:
    selected_dates.extend(pd.date_range(start=start_date, end=end_date).tolist())
print(len(selected_dates))

# Work with 'YYYY-MM-DD' strings from here on.
selected_dates = [ts.strftime('%Y-%m-%d') for ts in selected_dates]

# Build the list of the first calendar days of the dataset, which are removed
# from the selection below.
# NOTE(review): the reason for excluding exactly 6 days is not visible here
# (presumably a warm-up period) -- confirm and move to a config constant.
n_leading_days_excluded = 6
start_date = df.index.min().date()
date_list = pd.date_range(start=start_date, periods=n_leading_days_excluded)
date_list = [ts.date() for ts in date_list]
# Convert the list of dates to strings.
date_string_list = [date.strftime('%Y-%m-%d') for date in date_list]
print(date_string_list)

selected_dates = [date for date in selected_dates if date not in date_string_list]
print(len(selected_dates))

# Cast the strings back to datetimes before matching: calling `isin` on a
# datetime64 column with string values is deprecated and will stop matching
# in a future pandas version.
selected_timestamps = pd.to_datetime(selected_dates, format='%Y-%m-%d')
selected_df = potential_anom[potential_anom['Dates'].isin(selected_timestamps)]

# `date_range` is a string in both branches (the original produced a
# one-element list when nothing was selected, which was inconsistent with the
# comma-separated string built otherwise).
if not selected_df.empty:
    date_range = ", ".join([date.strftime('%Y-%m-%d %H:%M:%S') for date in selected_df['Dates']])
else:
    date_range = "No anomalies detected"

# Alternative summary: only the first and last selected date.
# date_range = f"{selected_df['Dates'].min().strftime('%Y-%m-%d %H:%M:%S')} - {selected_df['Dates'].max().strftime('%Y-%m-%d %H:%M:%S')}"
print(type(date_range))

Range: 2016-01-03 00:00:00 to 2016-01-16 00:00:00 - Length: 14 days
Range: 2016-01-26 00:00:00 to 2016-03-05 00:00:00 - Length: 40 days
Range: 2016-04-02 00:00:00 to 2016-04-18 00:00:00 - Length: 17 days
Range: 2016-04-22 00:00:00 to 2016-06-14 00:00:00 - Length: 54 days
Range: 2016-08-08 00:00:00 to 2016-08-12 00:00:00 - Length: 5 days
Range: 2016-08-16 00:00:00 to 2016-08-16 00:00:00 - Length: 1 days
Range: 2016-09-07 00:00:00 to 2016-09-14 00:00:00 - Length: 8 days
Range: 2016-10-10 00:00:00 to 2016-10-25 00:00:00 - Length: 16 days
Range: 2016-11-20 00:00:00 to 2016-11-21 00:00:00 - Length: 2 days
Range: 2016-12-13 00:00:00 to 2016-12-30 00:00:00 - Length: 18 days
27
['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04', '2016-01-05', '2016-01-06']
23
<class 'str'>



The behavior of 'isin' with dtype=datetime64[ns] and castable values (e.g. strings) is deprecated. In a future version, these will not be considered matching by isin. Explicitly cast to the appropriate dtype before calling isin instead.

