In [1]:
# Install the notebook's dependencies with the same interpreter that runs this
# kernel (`sys.executable`), into the user site (`--user`), quietly (`-q`).
import sys 
# NOTE(review): tensorflow is installed here and again below with the
# `and-cuda` extra; none of the packages are version-pinned.
!{sys.executable} -m pip install --user tensorflow -q
!{sys.executable} -m pip install --user pyod -q
!{sys.executable} -m pip install --user 'tensorflow[and-cuda]' -q 
!{sys.executable} -m pip install --user optuna -q 

[0m

# First step: load and preprocess the time series

In [2]:
import pandas as pd 
import psutil
import json
import matplotlib.pyplot as plt
import preprocess_functions as f 
import time 
import stats_helper as helper
import numpy as np
from scipy.stats import zscore
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import MinMaxScaler

# Input locations: directory holding the raw series and the JSON label file.
PATH = 'realKnownCause/'
ERRORPATH = 'labels/combined_labels.json'

# Parse the label file once; the same object is passed to the preprocessing
# helper for every series below.
with open(ERRORPATH, "r") as file:
    error_file = json.load(file)

# `f.get_all_files` / `f.process_time_series` are project helpers
# (preprocess_functions). The three lists stay index-aligned: entry i of
# `time_diffs` and `preprocessed_dfs` belongs to `file_names[i]`.
file_names = f.get_all_files(PATH)
time_diffs = []
preprocessed_dfs = []
for file_name in file_names:
    time_diff, preprocessed_df = f.process_time_series(file_name, error_file)
    time_diffs.append(time_diff)
    preprocessed_dfs.append(preprocessed_df)

def window_sizes_freq(time_diff):
    """Derive window lengths (in samples) from a sampling interval.

    Args:
        time_diff: Spacing between consecutive samples. The 60*60*24 numerator
            implies seconds -- TODO confirm against `process_time_series`.

    Returns:
        dict with two integer entries:
            'daily'       -- 86400 / time_diff, truncated toward zero
            'time_of_day' -- a quarter of 'daily', truncated toward zero
    """
    samples_per_day = int(60*60*24 / time_diff)
    return {
        'daily': samples_per_day,
        'time_of_day': int(samples_per_day / 4),
    }

# One window-size dict per input series, in the same order as `time_diffs`.
window_sizes = list(map(window_sizes_freq, time_diffs))

# Discriminative score

In [7]:
import tensorflow as tf
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from utils import extract_time 
from tensorflow.keras.callbacks import LambdaCallback

def train_test_divide(ori_data, generated_data, ori_time, generated_time):
    """Split real and synthetic samples into 80% train / 20% test.

    Each data array is split together with its companion `*_time` array so
    the pairs stay aligned. Both splits use `random_state=42`.

    Returns:
        (train_x, train_x_hat, test_x, test_x_hat,
         train_t, train_t_hat, test_t, test_t_hat)
        where the `_hat` entries come from the generated data.
    """
    split_options = dict(test_size=0.2, random_state=42)

    real_parts = train_test_split(ori_data, ori_time, **split_options)
    fake_parts = train_test_split(generated_data, generated_time, **split_options)

    train_x, test_x, train_t, test_t = real_parts
    train_x_hat, test_x_hat, train_t_hat, test_t_hat = fake_parts

    return (
        train_x, train_x_hat,
        test_x, test_x_hat,
        train_t, train_t_hat,
        test_t, test_t_hat,
    )

def t_discriminative_score_metrics(ori_data, generated_data, iterations=200, batch_size=128, patience=10):
    """Post-hoc discriminative score: can a small GRU tell real from synthetic?

    A two-layer GRU classifier is trained on 80% of the real (label 1) and
    synthetic (label 0) sequences and scored on the remaining 20%.

    Args:
        ori_data: Real sequences, array-like of shape (no, seq_len, dim).
        generated_data: Synthetic sequences; reshaped to (-1, seq_len, dim).
        iterations: Maximum number of training epochs.
        batch_size: Mini-batch size used for training.
        patience: Epochs without improvement of the *training* loss before
            early stopping restores the best weights.

    Returns:
        |0.5 - accuracy| on the held-out split. 0 means the classifier is at
        chance level (synthetic data indistinguishable); 0.5 means it
        separates the two sets perfectly.
    """
    ori_data = np.array(ori_data)
    generated_data = np.array(generated_data)

    # ori_data must be 3-D; seq_len and dim drive the reshape and the GRU width.
    _, seq_len, dim = ori_data.shape

    # `extract_time` is a project helper (utils). Its first return value is
    # only carried through the split below; the second one is not needed here.
    ori_time, _ = extract_time(ori_data)
    generated_time, _ = extract_time(generated_data)

    # Network parameters
    hidden_dim = max(1, int(dim / 2))

    class Discriminator(tf.keras.Model):
        """GRU -> GRU -> sigmoid classifier emitting P(sequence is real)."""

        def __init__(self, hidden_dim):
            super(Discriminator, self).__init__()
            self.gru1 = tf.keras.layers.GRU(hidden_dim, return_sequences=True)
            self.gru2 = tf.keras.layers.GRU(hidden_dim, return_sequences=False)
            self.dense = tf.keras.layers.Dense(1, activation="sigmoid")

        def call(self, x):
            # A single 2-D (seq_len, feature_dim) sample gets a batch axis.
            if len(x.shape) == 2:
                x = tf.expand_dims(x, axis=0)

            h = self.gru1(x)
            h = self.gru2(h)
            y_hat = self.dense(h)
            return y_hat

    discriminator = Discriminator(hidden_dim)
    optimizer = tf.keras.optimizers.Adam()
    loss_fn = tf.keras.losses.BinaryCrossentropy()

    # Train/test division for both original and generated data
    train_x, train_x_hat, test_x, test_x_hat, train_t, train_t_hat, test_t, test_t_hat = \
        train_test_divide(ori_data, generated_data, ori_time, generated_time)

    # Force (batch, seq_len, dim). Using `dim` instead of a hard-coded 1 keeps
    # multivariate samples intact: reshaping to (..., 1) would silently turn
    # every feature column into a separate fake "sample" when dim > 1.
    train_x = train_x.reshape(-1, seq_len, dim)
    train_x_hat = train_x_hat.reshape(-1, seq_len, dim)
    test_x = test_x.reshape(-1, seq_len, dim)
    test_x_hat = test_x_hat.reshape(-1, seq_len, dim)

    # Labels: 1 for real, 0 for synthetic.
    train_labels_real = np.ones((train_x.shape[0], 1))
    train_labels_fake = np.zeros((train_x_hat.shape[0], 1))
    train_x_combined = np.concatenate((train_x, train_x_hat), axis=0)
    train_labels_combined = np.concatenate((train_labels_real, train_labels_fake), axis=0)

    # Shuffle so batches mix real and synthetic samples.
    indices = np.arange(train_x_combined.shape[0])
    np.random.shuffle(indices)
    train_x_combined = train_x_combined[indices]
    train_labels_combined = train_labels_combined[indices]

    # Combined held-out set, used as validation data during fit.
    test_labels_real = np.ones((test_x.shape[0], 1))
    test_labels_fake = np.zeros((test_x_hat.shape[0], 1))
    test_x_combined = np.concatenate((test_x, test_x_hat), axis=0)
    test_labels_combined = np.concatenate((test_labels_real, test_labels_fake), axis=0)

    # The dataset repeats forever, so the epoch length is set explicitly below.
    train_dataset = tf.data.Dataset.from_tensor_slices((train_x_combined, train_labels_combined)).batch(batch_size).repeat()

    # Early stopping watches the training loss, not the validation loss.
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=patience, restore_best_weights=True)

    discriminator.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

    # At least one step per epoch: the floor division alone yields 0 when
    # there are fewer training samples than `batch_size`.
    steps_per_epoch = max(1, len(train_x_combined) // batch_size)

    discriminator.fit(
        train_dataset,
        epochs=iterations,
        steps_per_epoch=steps_per_epoch,
        validation_data=(test_x_combined, test_labels_combined),
        callbacks=[early_stopping]
    )

    # Predict on test set
    y_pred_real = discriminator(test_x)
    y_pred_fake = discriminator(test_x_hat)

    y_pred_final = np.squeeze(np.concatenate((y_pred_real.numpy(), y_pred_fake.numpy()), axis=0))
    y_label_final = np.concatenate((np.ones(len(y_pred_real)), np.zeros(len(y_pred_fake))), axis=0)

    # Compute accuracy and discriminative score
    acc = accuracy_score(y_label_final, (y_pred_final > 0.5))
    discriminative_score = np.abs(0.5 - acc)

    return discriminative_score


# Predictive score

In [8]:
import tensorflow as tf
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import GRU, Dense, Dropout, Bidirectional
from tensorflow.keras.models import Sequential

def t_predictive_score_metrics(ori_data, generated_data, epochs=50, batch_size=128, n_splits=5):
    """Post-hoc predictive score: train on synthetic data, test on real data.

    A one-step-ahead GRU predictor is fitted on the *generated* sequences and
    scored on the *original* sequences. Previously the generated data was
    prepared but never used, so the score did not depend on the generator.

    For each of the `n_splits` folds over the generated data, the model is
    trained on the fold's training part, early-stopped on the fold's held-out
    synthetic part, and evaluated on all original sequences.

    Args:
        - ori_data: Original time-series data, array-like (no, seq_len, dim)
        - generated_data: Generated synthetic data, array-like of 3-D shape
        - epochs: Maximum number of training epochs per fold
        - batch_size: Batch size for training and prediction
        - n_splits: Number of K-fold splits over the generated data

    Returns:
        - predictive_score: Mean over folds of the MAE on the original data
    """

    # Convert to NumPy arrays
    ori_data = np.asarray(ori_data, dtype=np.float32)
    generated_data = np.asarray(generated_data, dtype=np.float32)

    # Only `dim` is needed (for the hidden size); unpacking checks ori_data is 3-D.
    _, _, dim = ori_data.shape

    def prepare_data(data):
        """Build (input, target) pairs: last column at steps [0, T-1) -> [1, T)."""
        X = np.array([d[:-1, -1:] for d in data])
        Y = np.array([d[1:, -1:] for d in data])
        return X, Y

    X_ori, Y_ori = prepare_data(ori_data)
    X_gen, Y_gen = prepare_data(generated_data)

    # Real targets are the same for every fold; flatten once.
    Y_ori_flat = Y_ori.reshape(-1)

    def build_model(hidden_dim):
        """Bidirectional GRU -> dropout -> linear head, trained with MAE loss."""
        model = Sequential()
        model.add(Bidirectional(GRU(hidden_dim, activation='tanh', return_sequences=True, kernel_regularizer=tf.keras.regularizers.l2(0.01))))
        model.add(Dropout(0.2))  # Dropout layer to reduce overfitting
        model.add(Dense(1, activation=None))  # Output layer with no activation
        model.compile(optimizer='adam', loss='mae')  # Mean Absolute Error Loss
        return model

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    mae_scores = []

    # Folds are drawn from the synthetic data so no real sample is used for
    # fitting or early stopping.
    for train_index, val_index in kf.split(X_gen):
        X_train, X_val = X_gen[train_index], X_gen[val_index]
        Y_train, Y_val = Y_gen[train_index], Y_gen[val_index]

        # Hidden dimension is half the input dimension (at least 1).
        predictor = build_model(hidden_dim=max(1, int(dim / 2)))

        early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

        predictor.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,
                      validation_data=(X_val, Y_val), callbacks=[early_stopping], verbose=1)

        # Score the synthetic-trained model on the real sequences.
        pred_Y = predictor.predict(X_ori, batch_size=batch_size)

        mae_score = mean_absolute_error(Y_ori_flat, pred_Y.reshape(-1))
        mae_scores.append(mae_score)

    # Compute mean MAE across all folds
    predictive_score = np.mean(mae_scores)

    return predictive_score


# Hyperparameter search: TimeGAN tuned with Optuna

In [5]:
from timegan_v2_18 import timegan 
import optuna
import time
import numpy as np
import json

In [None]:
import json
import optuna
import numpy as np

def objective(trial, data_type,
              target_file='realKnownCause/rogue_agent_key_hold.csv',
              log_path='timeganlogs/rogue_hold.json'):
    """Optuna objective: train TimeGAN on one series and score the synthetic data.

    Reads the notebook-level `window_sizes`, `preprocessed_dfs` and
    `file_names` lists and only processes entries whose file name equals
    `target_file`.

    Args:
        trial: Optuna trial used to sample hyperparameters and report scores.
        data_type: 'daily' selects the daily window; any other value selects
            the 'time_of_day' window.
        target_file: File name (as it appears in `file_names`) to optimise on.
        log_path: JSON-lines file that receives one record per finished trial.

    Returns:
        Mean over processed series of (discriminative score + predictive
        score); lower is better.

    Raises:
        ValueError: If no entry of `file_names` equals `target_file`, or the
            series is not longer than the selected window.
        optuna.TrialPruned: If the study's pruner asks to stop the trial.
    """
    import os  # local import: only needed to create the log directory

    # Sample hyperparameters
    hidden_dim = trial.suggest_int('hidden_dim', 10, 50, step=10)
    num_layers = trial.suggest_int('num_layer', 2, 3)
    batch_size = trial.suggest_int('batch_size', 32, 256, step=32)
    module = trial.suggest_categorical('module', ['gru', 'lstm'])

    window_key = 'daily' if data_type == 'daily' else 'time_of_day'
    metric_iteration = 5  # repetitions averaged for each metric
    combined_scores = []

    for window_size, preprocessed_df, file_name in zip(window_sizes, preprocessed_dfs, file_names):
        if file_name != target_file:
            continue
        print(f'Processing TimeGAN for {file_name} ({data_type} sequence)')
        data = np.array(preprocessed_df['value']).reshape(-1, 1)

        # Overlapping sliding windows of `seq_len` samples, stride 1.
        seq_len = window_size[window_key]
        time_series = [data[i:i + seq_len] for i in range(len(data) - seq_len)]
        if not time_series:
            raise ValueError(
                f'{file_name}: series of length {len(data)} is too short '
                f'for a window of {seq_len} samples'
            )

        ori_data = np.array(time_series)

        # Parameters for the model
        parameters = {
            "hidden_dim": hidden_dim,
            "num_layer": num_layers,
            "iterations": 10000,
            "batch_size": batch_size,
            "module": module
        }

        # Generate synthetic data for the sequence
        generated_data = timegan(ori_data, parameters)

        # Discriminative score for the sequence
        discriminative_scores = [
            t_discriminative_score_metrics(ori_data, generated_data, iterations=200, batch_size=batch_size, patience=10)
            for _ in range(metric_iteration)
        ]

        final_discri_score = np.round(np.mean(discriminative_scores), 4)
        print(f'{data_type.capitalize()} Discriminative score: {final_discri_score}')

        # Predictive score for the sequence
        predictive_scores = [
            t_predictive_score_metrics(ori_data, generated_data, epochs=50, batch_size=batch_size, n_splits=5)
            for _ in range(metric_iteration)
        ]

        final_pred_score = np.round(np.mean(predictive_scores), 4)
        print(f'{data_type.capitalize()} Predictive score: {final_pred_score}')

        # Combine the scores (you can adjust the weight of each score if needed)
        combined_score = final_discri_score + final_pred_score
        combined_scores.append(combined_score)

        # The pruner decides from reported intermediate values only; without
        # this report `should_prune()` can never return True.
        trial.report(float(combined_score), step=len(combined_scores) - 1)
        if trial.should_prune():
            raise optuna.TrialPruned()

    # Fail loudly instead of returning NaN from the mean of an empty list.
    if not combined_scores:
        raise ValueError(f'No entry in file_names matches {target_file!r}')

    average_combined_score = float(np.mean(combined_scores))

    # Create a dictionary for the current trial's results
    trial_results = {
        'trial_number': trial.number,
        'params': trial.params,
        'combined_score': average_combined_score
    }

    # Append one JSON record per trial, creating the log directory on first use.
    log_dir = os.path.dirname(log_path)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)
    with open(log_path, 'a') as json_file:
        json.dump(trial_results, json_file)
        json_file.write('\n')

    # Return the average combined score
    return average_combined_score


# Daily-window hyperparameter search: minimise the combined score with a
# median pruner attached to the study.
study_daily = optuna.create_study(
    direction='minimize',
    pruner=optuna.pruners.MedianPruner(),
)
study_daily.optimize(
    lambda trial: objective(trial, 'daily'),
    n_trials=3,  # number of trials for the daily optimisation
)

# Optuna Study Setup for Time-of-Day Sequence Optimization (if needed)
# study_time_of_day = optuna.create_study(direction='minimize', pruner=optuna.pruners.MedianPruner())
# study_time_of_day.optimize(lambda trial: objective(trial, 'time_of_day'), n_trials=20)  # Number of trials for time_of_day optimization

# Report the winning configuration and its score.
print("Best hyperparameters for daily sequence:", study_daily.best_params)
print("Best combined score for daily sequence:", study_daily.best_value)

# Persist the best hyperparameters (overwrites any previous file).
best_model_stats = dict(best_params=study_daily.best_params)

with open('timeganlogs/best_timegan_model_stats_rogue_hold.json', 'w') as json_file:
    json.dump(best_model_stats, json_file)


[I 2025-03-06 15:39:44,675] A new study created in memory with name: no-name-4dbef573-ea8b-49f3-b67f-0dfdcc9fa45f


Processing TimeGAN for realKnownCause/rogue_agent_key_hold.csv (daily sequence)
Training completed
Epoch 1/200
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 19ms/step - accuracy: 0.9022 - loss: 0.6871 - val_accuracy: 1.0000 - val_loss: 0.6714
Epoch 2/200
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.9618 - loss: 0.6591 - val_accuracy: 1.0000 - val_loss: 0.6054
Epoch 3/200
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.9982 - loss: 0.5731 - val_accuracy: 0.9911 - val_loss: 0.4657
Epoch 4/200
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.9940 - loss: 0.4262 - val_accuracy: 0.9921 - val_loss: 0.3333
Epoch 5/200
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.9977 - loss: 0.3076 - val_accuracy: 1.0000 - val_loss: 0.2517
Epoch 6/200
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - ac

In [None]:
def objective(trial, data_type,
              target_file='realKnownCause/cpu_utilization_asg_misconfiguration.csv'):
    """Optuna objective: train TimeGAN on one series and score the synthetic data.

    NOTE(review): this redefines the `objective` from the earlier cell.

    Reads the notebook-level `window_sizes`, `preprocessed_dfs` and
    `file_names` lists and only processes entries whose file name equals
    `target_file`.

    Args:
        trial: Optuna trial used to sample hyperparameters and report scores.
        data_type: 'daily' selects the daily window; any other value selects
            the 'time_of_day' window.
        target_file: File name (as it appears in `file_names`) to optimise on.

    Returns:
        Mean over processed series of (discriminative score + predictive
        score); lower is better.

    Raises:
        ValueError: If no entry of `file_names` equals `target_file`, or the
            series is not longer than the selected window.
        optuna.TrialPruned: If the study's pruner asks to stop the trial.
    """

    # Sample hyperparameters
    hidden_dim = trial.suggest_int('hidden_dim', 10, 50, step=10)
    num_layers = trial.suggest_int('num_layer', 2, 3)
    batch_size = trial.suggest_int('batch_size', 32, 256, step=32)
    module = trial.suggest_categorical('module', ['gru', 'lstm'])

    window_key = 'daily' if data_type == 'daily' else 'time_of_day'
    metric_iteration = 5  # repetitions averaged for each metric
    combined_scores = []

    for window_size, preprocessed_df, file_name in zip(window_sizes, preprocessed_dfs, file_names):
        if file_name != target_file:
            continue
        print(f'Processing TimeGAN for {file_name} ({data_type} sequence)')
        data = np.array(preprocessed_df['value']).reshape(-1, 1)

        # Overlapping sliding windows of `seq_len` samples, stride 1.
        seq_len = window_size[window_key]
        time_series = [data[i:i + seq_len] for i in range(len(data) - seq_len)]
        if not time_series:
            raise ValueError(
                f'{file_name}: series of length {len(data)} is too short '
                f'for a window of {seq_len} samples'
            )

        ori_data = np.array(time_series)

        # Parameters for the model
        parameters = {
            "hidden_dim": hidden_dim,
            "num_layer": num_layers,
            "iterations": 10000,
            "batch_size": batch_size,
            "module": module
        }

        # Generate synthetic data for the sequence
        generated_data = timegan(ori_data, parameters)

        # Discriminative score for the sequence
        discriminative_scores = [
            t_discriminative_score_metrics(ori_data, generated_data, iterations=200, batch_size=batch_size, patience=10)
            for _ in range(metric_iteration)
        ]

        final_discri_score = np.round(np.mean(discriminative_scores), 4)
        print(f'{data_type.capitalize()} Discriminative score: {final_discri_score}')

        # Predictive score for the sequence
        predictive_scores = [
            t_predictive_score_metrics(ori_data, generated_data, epochs=50, batch_size=batch_size, n_splits=5)
            for _ in range(metric_iteration)
        ]

        final_pred_score = np.round(np.mean(predictive_scores), 4)
        print(f'{data_type.capitalize()} Predictive score: {final_pred_score}')

        # Combine the scores (you can adjust the weight of each score if needed)
        combined_score = final_discri_score + final_pred_score
        combined_scores.append(combined_score)

        # The pruner decides from reported intermediate values only; without
        # this report `should_prune()` can never return True.
        trial.report(float(combined_score), step=len(combined_scores) - 1)
        if trial.should_prune():
            raise optuna.TrialPruned()

    # Fail loudly instead of returning NaN from the mean of an empty list.
    if not combined_scores:
        raise ValueError(f'No entry in file_names matches {target_file!r}')

    # Return the average combined score
    return np.mean(combined_scores)


# Optuna Study Setup for Daily Sequence Optimization
study_daily = optuna.create_study(direction='minimize', pruner=optuna.pruners.MedianPruner())
study_daily.optimize(lambda trial: objective(trial, 'daily'), n_trials=1)  # Number of trials for daily optimization

# Optuna Study Setup for Time-of-Day Sequence Optimization
#study_time_of_day = optuna.create_study(direction='minimize', pruner=optuna.pruners.MedianPruner())
#study_time_of_day.optimize(lambda trial: objective(trial, 'time_of_day'), n_trials=20)  # Number of trials for time_of_day optimization

# Best hyperparameters and result for daily sequence
print("Best hyperparameters for daily sequence:", study_daily.best_params)
print("Best combined score for daily sequence:", study_daily.best_value)

# Store the best results. `best_params` was never defined as a bare name
# (NameError on a fresh kernel); the value lives on the study object.
best_model_stats = {
    'best_params': study_daily.best_params
}

with open('timeganlogs/best_timegan_model_stats_cpu.json', 'w') as json_file:
    json.dump(best_model_stats, json_file)


[I 2025-03-05 09:46:11,336] A new study created in memory with name: no-name-876b558d-cb88-4927-8bca-3f20f5401544


Processing TimeGAN for realKnownCause/cpu_utilization_asg_misconfiguration.csv (daily sequence)
Training completed
Epoch 1/200
[1m888/888[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 14ms/step - accuracy: 0.6124 - loss: 0.6425 - val_accuracy: 0.9979 - val_loss: 0.2107
Epoch 2/200
[1m888/888[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 13ms/step - accuracy: 0.8440 - loss: 0.4152 - val_accuracy: 1.0000 - val_loss: 0.1153
Epoch 3/200
[1m888/888[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 13ms/step - accuracy: 0.9955 - loss: 0.1109 - val_accuracy: 1.0000 - val_loss: 0.0676
Epoch 4/200
[1m888/888[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 13ms/step - accuracy: 1.0000 - loss: 0.0601 - val_accuracy: 1.0000 - val_loss: 0.0422
Epoch 5/200
[1m888/888[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 13ms/step - accuracy: 1.0000 - loss: 0.0380 - val_accuracy: 1.0000 - val_loss: 0.0275
Epoch 6/200
[1m888/888[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

In [None]:
# Show the dict of best hyperparameters built by the most recently run study cell.
best_model_stats