In [4]:
# Remove test data from the windowed data so that only training years remain.
# WARNING: this cell permanently deletes files below the current working
# directory; run it only from the folder that holds the window-data directories.
import os

training_years = ['2020', '2021']
for window_data in os.listdir():
    obsea_dir = os.path.join(window_data, 'OBSEA')
    # Skip anything that is not a window-data directory with an OBSEA subfolder
    # (e.g. a stray file whose name happens to contain 'OBSEA_').
    if 'OBSEA_' not in window_data or not os.path.isdir(obsea_dir):
        continue

    for label_fname in os.listdir(obsea_dir):
        if '_label' not in label_fname:
            continue

        # The first four characters of the file name are treated as the year.
        if label_fname[:4] in training_years:
            continue

        # Each label file is removed together with its companion file, whose
        # name is the label file name without the '_label' marker.
        data_fname = label_fname.replace('_label', '')
        for stale_fname in (label_fname, data_fname):
            stale_path = os.path.join(obsea_dir, stale_fname)
            # The companion may already be gone (e.g. an interrupted earlier
            # run); skipping it keeps the cell safe to re-run.
            if os.path.exists(stale_path):
                os.remove(stale_path)

In [8]:
# prepare train_val_split
from sklearn.model_selection import train_test_split
import pandas as pd

# read metrics generated by create_window_data
df = pd.read_csv('metrics.csv', index_col=0)
# Keep only the rows whose index label mentions 2022 or 2023.
# NOTE(review): the cleanup cell above keeps only 2020/2021 files while this
# filter selects 2022/2023 -- confirm which years are meant for training.
# The .copy() makes the filtered frame independent before a column is added.
df = df[(df.index.str.contains('2022')) | (df.index.str.contains('2023'))].copy()  # | (df.index.str.contains('2021'))]
if df.empty:
    raise ValueError("No rows in metrics.csv match the selected years; cannot build a split.")

# The label of a row is the name of the column holding its largest value.
df['label'] = df.idxmax(axis=1)

# split data for training and validation
validation_size = 0.15
num_samples = len(df)
val_samples = int(num_samples * validation_size)

# Get counts of each class to balance
class_counts = df['label'].value_counts()
min_class_val_size = val_samples // len(class_counts)  # Equal number from each class

# Collect indices for the validation set: the first `min_class_val_size` rows
# of every class, in file order. A class with fewer rows than that contributes
# all of its rows to validation.
val_indices = []
for label in class_counts.index:
    indices = df[df['label'] == label].index.tolist()
    val_indices.extend(indices[:min_class_val_size])

# Remaining rows are for training. A boolean mask keeps them in their original
# order; a set difference would order them by string hash, which changes
# between kernel sessions and makes the exported split non-reproducible.
in_validation = df.index.isin(val_indices)

# Split the data
train_df = df.loc[~in_validation]
val_df = df.loc[val_indices]
train_indices = train_df.index.tolist()

In [9]:
# Report how many rows ended up in each split.
n_train_rows = len(train_df)
n_val_rows = len(val_df)
print(f"Training Set Size: {n_train_rows}")
print(f"Validation Set Size: {n_val_rows}")

# Per-class breakdown of the training split.
print("\nTraining Set Distribution:")
train_label_counts = train_df['label'].value_counts()
print(train_label_counts)

# Per-class breakdown of the validation split.
print("\nValidation Set Distribution:")
val_label_counts = val_df['label'].value_counts()
print(val_label_counts)


Training Set Size: 459
Validation Set Size: 72

Training Set Distribution:
AutoEncoder (AE)              300
CBLOF                          69
HBOS                           26
LOF                            23
RobustPCA                      17
DenoisingAutoEncoder (DAE)     13
PCC                             9
COPOD                           2
Name: label, dtype: int64

Validation Set Distribution:
AutoEncoder (AE)              9
CBLOF                         9
HBOS                          9
LOF                           9
RobustPCA                     9
DenoisingAutoEncoder (DAE)    9
PCC                           9
COPOD                         9
Name: label, dtype: int64


In [10]:
# Export the split as file names: every index label gets a '.csv' suffix.
# The lists get their own names so that train_df / val_df stay DataFrames;
# rebinding them to lists made this cell (and the report cell above) fail
# when re-run.
train_files = [x + '.csv' for x in train_df.index.tolist()]
val_files = [x + '.csv' for x in val_df.index.tolist()]

# One row per split ('train_set', 'val_set'), one file name per column.
split_files = {'train_set': train_files,
               'val_set': val_files}
data2df = pd.DataFrame.from_dict(split_files, orient='index')
data2df.to_csv('train_val_split.csv')

In [None]:
# remember to move train_val_split.csv to reproducibility_guide/