In [1]:
import os
import csv
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import re

In [10]:
folder_path = r'C:\Users\marci\Documents\PhD\TIme Series - AUC - Area Under the Curve\12 Hours'

# Trailing "_<train>_<start>_<end>.txt" part of a dataset filename.
FILENAME_PATTERN = re.compile(r'_(\d+)_(\d+)_(\d+)\.txt')

# Timestamp given to the first sample of every file; each following sample
# is stamped one hour later.
INITIAL_DATETIME = datetime(2021, 1, 1, 0, 0, 0)  # 2021-01-01 00:00:00

# Number of samples, counted from the anomaly start index, that are summed
# for the ANOMALY check.
# NOTE(review): a slice of 23 samples covers 23 hours, not a full 24-hour day,
# and the end index parsed from the filename is never used -- confirm whether
# the window should be 24 samples or should end at test_index_end.
ANOMALY_WINDOW_HOURS = 23

# A day is flagged when its sum lies more than this many standard deviations
# away from the training mean of the same weekday.
threshold_multiplier = 3

# Statistics used for a weekday that never occurs in the training data; NaN
# makes every threshold comparison evaluate to False.
NO_STATISTICS = (float('nan'),) * 4


def parse_split_indices(filename):
    """Extract the three integer indices encoded at the end of *filename*.

    Returns a ``(train_index, test_index_start, test_index_end)`` tuple, or
    ``None`` when the name does not contain ``_<int>_<int>_<int>.txt``.
    """
    matches = FILENAME_PATTERN.search(filename)
    if matches is None:
        return None
    return tuple(int(group) for group in matches.groups())


def load_hourly_energy(file_path):
    """Read a header-less single-column file into an 'Energy' DataFrame.

    The rows are indexed by consecutive hourly timestamps starting at
    ``INITIAL_DATETIME``.
    """
    energy_df = pd.read_csv(file_path, header=None, names=['Energy'], dtype=float)
    energy_df.index = pd.date_range(
        start=INITIAL_DATETIME,
        periods=len(energy_df),
        freq=timedelta(hours=1),
    )
    return energy_df


def daily_energy_sums(hourly_df):
    """Sum *hourly_df* per calendar day.

    Returns a DataFrame with one row per date and the columns
    ``'Date'`` (``datetime.date`` objects) and ``'Energy'``.
    """
    daily_df = hourly_df.groupby(hourly_df.index.date).sum().reset_index()
    daily_df.columns = ['Date', 'Energy']
    return daily_df


def weekday_statistics(train_daily_df):
    """Compute the training statistics of the daily sums for each weekday.

    *train_daily_df* needs the columns ``'Day_Name'`` and ``'Energy'``.
    Returns a dict mapping each weekday name present in the data to a
    ``(mean, std, max, min)`` tuple.
    """
    return {
        day_name: (energy.mean(), energy.std(), energy.max(), energy.min())
        for day_name, energy in train_daily_df.groupby('Day_Name')['Energy']
    }


def threshold_band(mean_energy, std_energy, multiplier):
    """Return ``(upper, lower)`` = mean plus/minus *multiplier* times std."""
    spread = multiplier * std_energy
    return mean_energy + spread, mean_energy - spread


def print_train_statistics(statistics, upper_threshold, lower_threshold):
    """Print the two '---> TRAIN' detail lines for one weekday."""
    mean_energy, std_energy, max_energy, min_energy = statistics
    print(f"---> TRAIN (mean/std/max/min): {mean_energy} {std_energy} {max_energy} {min_energy}")
    print(f"---> TRAIN (upper/lower): {upper_threshold} {lower_threshold}")


# The guard keeps the helpers above importable without touching the disk.
# When run as a notebook cell or a script, __name__ is "__main__" and the loop
# runs at module level, so its variables stay visible to the code that follows.
if __name__ == "__main__":
    # Get filenames in the specified folder_path
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)

        # Work with files ending with .txt
        if not filename.endswith('.txt'):
            continue

        # Print the current filename
        print("***** " + filename + " *****")
        split_indices = parse_split_indices(filename)
        if split_indices is None:
            # Without the index suffix there is nothing to split on; skip the
            # file instead of failing on a missing regex match.
            print(f"Skipping {filename}: name does not end with _<train>_<start>_<end>.txt")
            continue
        # train_index: #19823 (24 Hours)
        # test_index_start: #22032 (24 Hours)
        # test_index_end: #22055 (24 Hours)
        train_index, test_index_start, test_index_end = split_indices

        ###
        # ALL Data
        ###
        electricity_df = load_hourly_energy(file_path)

        ###
        # TRAIN Data: mean, standard deviation, max, and min
        ###
        train_df = electricity_df[:train_index]
        train_daily_energy_sum_df = daily_energy_sums(train_df)
        train_daily_energy_sum_df['Date'] = pd.to_datetime(train_daily_energy_sum_df['Date'])
        train_daily_energy_sum_df['Day_Name'] = train_daily_energy_sum_df['Date'].dt.day_name()
        # The per-weekday statistics depend only on the training data, so they
        # are computed once per file rather than once per compared day.
        weekday_stats = weekday_statistics(train_daily_energy_sum_df)

        ###
        # TEST Data: sum and compare with TRAIN Data
        ###
        # NOTE(review): the training slice stops before row train_index and this
        # slice starts after it, so that one row belongs to neither set --
        # confirm this is intended.
        test_df = electricity_df[train_index + 1:]
        test_daily_energy_sum_df = daily_energy_sums(test_df)

        test_days_above_threshold = []
        test_days_below_threshold = []

        for _, row in test_daily_energy_sum_df.iterrows():
            day_name = pd.Timestamp(row['Date']).day_name()  # Extract day name from the date
            mean_energy, std_energy, _, _ = weekday_stats.get(day_name, NO_STATISTICS)
            upper_threshold, lower_threshold = threshold_band(mean_energy, std_energy, threshold_multiplier)
            if row['Energy'] > upper_threshold:
                test_days_above_threshold.append(row)
                print(f"TEST Above Threshold: {row['Date']} {row['Energy']} {upper_threshold}")
            elif row['Energy'] < lower_threshold:
                test_days_below_threshold.append(row)
                print(f"TEST Below Threshold: {row['Date']} {row['Energy']} {lower_threshold}")

        ###
        # ANOMALY Data: sum and compare with TRAIN Data
        ###
        anomaly_df = electricity_df[test_index_start:test_index_start + ANOMALY_WINDOW_HOURS]
        anomaly_daily_energy_sum_df = daily_energy_sums(anomaly_df)

        days_above_threshold = []
        days_below_threshold = []

        for _, row in anomaly_daily_energy_sum_df.iterrows():
            day_name = pd.Timestamp(row['Date']).day_name()  # Extract day name from the date
            statistics = weekday_stats.get(day_name, NO_STATISTICS)
            upper_threshold, lower_threshold = threshold_band(statistics[0], statistics[1], threshold_multiplier)

            # The statistics are always printed for the anomaly window, whether
            # or not the day is flagged.
            print(f"ANOMALY Statistics: {row['Date']} {day_name} {row['Energy']} {upper_threshold}")
            print_train_statistics(statistics, upper_threshold, lower_threshold)

            if row['Energy'] > upper_threshold:
                days_above_threshold.append(row)
                print(f"ANOMALY Above Threshold: {row['Date']} {day_name} {row['Energy']} {upper_threshold}")
                print_train_statistics(statistics, upper_threshold, lower_threshold)
            elif row['Energy'] < lower_threshold:
                days_below_threshold.append(row)
                print(f"ANOMALY Below Threshold: {row['Date']} {day_name} {row['Energy']} {lower_threshold}")
                print_train_statistics(statistics, upper_threshold, lower_threshold)

for _ in range(3):
    print("*******************")

# Print, for every weekday, the statistics of the training daily sums and the
# resulting threshold band. train_daily_energy_sum_df and threshold_multiplier
# are whatever the per-file loop above left behind, i.e. they describe the
# last file that was processed.
for day_name in ("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"):
    # Select this weekday's daily sums once and derive all statistics from it.
    day_energy = train_daily_energy_sum_df.loc[train_daily_energy_sum_df['Day_Name'] == day_name, 'Energy']
    mean_energy = day_energy.mean()
    std_energy = day_energy.std()
    max_energy = day_energy.max()
    min_energy = day_energy.min()
    upper_threshold = mean_energy + (threshold_multiplier * std_energy)
    lower_threshold = mean_energy - (threshold_multiplier * std_energy)
    print("TRAINING STATISTICS: " + day_name)
    print(f"---> TRAIN (mean/std/max/min): {mean_energy} {std_energy} {max_energy} {min_energy}")
    print(f"---> TRAIN (upper/lower): {upper_threshold} {lower_threshold}")

***** 000_UCR_Anomaly_IESOToronto202120222023_19823_22032_22043.txt *****
ANOMALY Statistics: 2023-07-08 Saturday 130817.0 168111.98793407597
---> TRAIN (mean/std/max/min): 129361.17796610169 12916.936655991427 170444.0 108621.0
---> TRAIN (upper/lower): 168111.98793407597 90610.36799812742
***** 001_UCR_Anomaly_IESOToronto202120222023_19823_22032_22043.txt *****
ANOMALY Statistics: 2023-07-08 Saturday 131255.83279946918 168111.98793407597
---> TRAIN (mean/std/max/min): 129361.17796610169 12916.936655991427 170444.0 108621.0
---> TRAIN (upper/lower): 168111.98793407597 90610.36799812742
***** 002_UCR_Anomaly_IESOToronto202120222023_19823_22032_22043.txt *****
ANOMALY Statistics: 2023-07-08 Saturday 126514.0 168111.98793407597
---> TRAIN (mean/std/max/min): 129361.17796610169 12916.936655991427 170444.0 108621.0
---> TRAIN (upper/lower): 168111.98793407597 90610.36799812742
***** 003_UCR_Anomaly_IESOToronto202120222023_19823_22032_22043.txt *****
ANOMALY Statistics: 2023-07-08 Saturday 