In [1]:
# Step1:

import os
import numpy as np
import re
from scipy.fft import fft
from sklearn.model_selection import train_test_split

def load_data(mode):
    """Load every non-empty ``.txt`` signal stored in ``./<mode>/``.

    Files are visited in ascending order of the first integer found in their
    name, and each file is parsed as whitespace-separated floats.

    Args:
        mode: Directory of the mode relative to the working directory,
            e.g. ``'base/ModeA'``.

    Returns:
        A ``(data, labels)`` tuple of NumPy arrays. ``data`` holds one entry
        per non-empty file; ``labels`` holds 0 for every entry when ``mode``
        is exactly ``'base/ModeM'`` and 1 otherwise.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be read.
        ValueError: If a file contains a token that is not a valid float.
    """
    mode_directory = f"./{mode}/"
    label = 0 if mode == 'base/ModeM' else 1  # 0 for ModeM, 1 for normal modes

    def numeric_order(file_name):
        # Order by the first run of digits; names without digits sort last
        # (alphabetically) instead of crashing the whole load.
        match = re.search(r'\d+', file_name)
        if match is None:
            return (1, 0, file_name)
        return (0, int(match.group()), file_name)

    # Keep only .txt files BEFORE parsing numbers, so unrelated directory
    # entries (hidden files, checkpoints, READMEs) cannot raise IndexError.
    signal_files = sorted(
        (name for name in os.listdir(mode_directory) if name.endswith('.txt')),
        key=numeric_order,
    )

    data = []
    for file_name in signal_files:
        file_path = os.path.join(mode_directory, file_name)
        with open(file_path, 'r') as file:
            signal_data = [float(value) for value in file.read().split()]

        # Empty files carry no signal and are skipped
        if signal_data:
            data.append(signal_data)

    labels = [label] * len(data)
    return np.array(data), np.array(labels)

def extract_features(data):
    """Return the FFT magnitude spectrum of every signal in ``data``.

    Each signal is transformed on its own with ``scipy.fft.fft`` and the
    complex result is reduced to its absolute value. The spectra are
    returned as one NumPy array, in the same order as the input signals.
    """
    magnitude_spectra = [np.abs(fft(row)) for row in data]
    return np.array(magnitude_spectra)

# Read the raw signals of every mode. load_data labels ModeM as 0 and the
# other modes (A-D) as 1.
mode_a_data, mode_a_labels = load_data('base/ModeA')
mode_b_data, mode_b_labels = load_data('base/ModeB')
mode_c_data, mode_c_labels = load_data('base/ModeC')
mode_d_data, mode_d_labels = load_data('base/ModeD')
mode_m_data, mode_m_labels = load_data('base/ModeM')

# Hold out 10% of each mode for testing. The fixed random_state makes the
# split identical on every run.
validation_ratio = 0.1

(mode_a_data_train, mode_a_data_test,
 mode_a_labels_train, mode_a_labels_test) = train_test_split(
    mode_a_data, mode_a_labels, test_size=validation_ratio, random_state=42)

(mode_b_data_train, mode_b_data_test,
 mode_b_labels_train, mode_b_labels_test) = train_test_split(
    mode_b_data, mode_b_labels, test_size=validation_ratio, random_state=42)

(mode_c_data_train, mode_c_data_test,
 mode_c_labels_train, mode_c_labels_test) = train_test_split(
    mode_c_data, mode_c_labels, test_size=validation_ratio, random_state=42)

(mode_d_data_train, mode_d_data_test,
 mode_d_labels_train, mode_d_labels_test) = train_test_split(
    mode_d_data, mode_d_labels, test_size=validation_ratio, random_state=42)

(mode_m_data_train, mode_m_data_test,
 mode_m_labels_train, mode_m_labels_test) = train_test_split(
    mode_m_data, mode_m_labels, test_size=validation_ratio, random_state=42)

# The baseline is built from the training part of modes A-D only; ModeM is
# left out of it. Rows are stacked along the first axis (the default).
normal_mode_data_train = np.concatenate([
    mode_a_data_train,
    mode_b_data_train,
    mode_c_data_train,
    mode_d_data_train,
])
normal_mode_labels_train = np.concatenate([
    mode_a_labels_train,
    mode_b_labels_train,
    mode_c_labels_train,
    mode_d_labels_train,
])

# FFT-magnitude features of the baseline signals
baseline_data = extract_features(normal_mode_data_train)


In [2]:
# Sanity check: one row per baseline training signal, one column per FFT coefficient
baseline_data.shape

(359, 20000)

In [3]:
# Step2:

import os
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

# Standard Autoencoder (AUE): this function only builds and compiles the
# model; training is done by the caller via ``fit``.
def create_autoencoder(input_dim, encoding_dim=128, output_activation='sigmoid'):
    """Build and compile a single-hidden-layer dense autoencoder.

    Args:
        input_dim: Number of features per sample (width of the input and of
            the reconstruction).
        encoding_dim: Number of units in the ReLU bottleneck layer.
        output_activation: Activation of the reconstruction layer. The
            default ``'sigmoid'`` can only produce values in (0, 1), so it is
            only able to reproduce inputs that were scaled into that range;
            pass ``'linear'`` when the model is fed unscaled features.

    Returns:
        A Keras ``Model`` compiled with the Adam optimizer and mean squared
        error loss. The model is not trained.
    """
    input_layer = Input(shape=(input_dim,))
    encoded = Dense(encoding_dim, activation='relu')(input_layer)
    decoded = Dense(input_dim, activation=output_activation)(encoded)

    autoencoder = Model(input_layer, decoded)
    autoencoder.compile(optimizer='adam', loss='mean_squared_error')

    return autoencoder

# Sparse Autoencoder:
from keras import regularizers

def create_sparse_autoencoder(input_dim, sparsity_factor=0.1, encoding_dim=128,
                              output_activation='sigmoid'):
    """Build and compile a dense autoencoder with an L1 activity penalty.

    Same topology as the standard autoencoder, except that the bottleneck
    layer's activations are penalised with an L1 activity regularizer.

    Args:
        input_dim: Number of features per sample (width of the input and of
            the reconstruction).
        sparsity_factor: Coefficient of the L1 activity regularizer applied
            to the bottleneck layer.
        encoding_dim: Number of units in the ReLU bottleneck layer.
        output_activation: Activation of the reconstruction layer. The
            default ``'sigmoid'`` can only produce values in (0, 1); pass
            ``'linear'`` when the model is fed unscaled features.

    Returns:
        A Keras ``Model`` compiled with the Adam optimizer and mean squared
        error loss. The model is not trained.
    """
    # Take the regularizer from the same Keras implementation as the layers
    # (tensorflow.keras) rather than from the standalone ``keras`` package,
    # so objects from two different Keras installs are never mixed.
    from tensorflow.keras import regularizers

    input_layer = Input(shape=(input_dim,))
    encoded = Dense(
        encoding_dim,
        activation='relu',
        activity_regularizer=regularizers.l1(sparsity_factor),
    )(input_layer)
    decoded = Dense(input_dim, activation=output_activation)(encoded)

    autoencoder = Model(input_layer, decoded)
    autoencoder.compile(optimizer='adam', loss='mean_squared_error')

    return autoencoder


# Create and train the AUE using the baseline data.
#
# Seed Python, NumPy and TensorFlow first: weight initialisation and batch
# shuffling are random, so without a seed every strangeness value and
# p-value computed later changes on each Restart & Run All. 42 mirrors the
# random_state used for the train/test splits.
import tensorflow as tf

tf.keras.utils.set_random_seed(42)

# NOTE(review): baseline_data holds raw FFT magnitudes, while
# create_autoencoder ends in a sigmoid layer whose outputs lie in (0, 1).
# MinMaxScaler is imported in this cell but never applied - confirm whether
# the features should be scaled. If so, the same fitted scaler must also be
# applied to every array that is later passed to autoencoder.predict.
input_dim = baseline_data.shape[1]
autoencoder = create_autoencoder(input_dim)
autoencoder.fit(baseline_data, baseline_data, epochs=50, batch_size=32, shuffle=True)

# Per-sample reconstruction error of an autoencoder
def compute_reconstruction_errors(data, autoencoder):
    """Return the mean squared reconstruction error of each row of ``data``.

    ``autoencoder.predict(data)`` supplies the reconstruction; the squared
    difference to the input is averaged over axis 1, giving one error value
    per sample.
    """
    residual = data - autoencoder.predict(data)
    return np.mean(np.square(residual), axis=1)

# Score every baseline sample by how well the trained AUE reconstructs it
baseline_errors = compute_reconstruction_errors(baseline_data, autoencoder)

# Strangeness training list: the baseline errors as a list in ascending order
strangeness_training_list = list(baseline_errors)
strangeness_training_list.sort()

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [4]:
# Inspect the baseline reconstruction errors (sorted in ascending order)
strangeness_training_list

[32487489.433941748,
 34954822.47939434,
 35373177.00636929,
 36062298.46131416,
 36398516.079635724,
 36421246.72735629,
 37111511.85174423,
 37572521.95020863,
 37757022.678125314,
 37942187.595069535,
 37981382.2094477,
 38011967.97947988,
 38097885.71071648,
 38173329.517251864,
 38253546.52345376,
 38553443.23279424,
 38725026.060353994,
 38760227.21303732,
 38879549.440432124,
 38884581.62048061,
 39029589.45943749,
 39136480.673666194,
 39512396.80635826,
 39843505.85360439,
 39881047.69433657,
 40158314.82839735,
 40163190.16098584,
 40272271.31767195,
 40482750.635243416,
 40547361.666336305,
 40791367.87699693,
 40910419.479755245,
 41319078.57384192,
 41353056.32492645,
 41363484.93158914,
 41775962.213933006,
 41874902.19310641,
 42026355.92233774,
 42120001.88429694,
 42135630.04760939,
 42136807.28887127,
 42189588.880703226,
 42190766.75364763,
 42319363.78288391,
 42486937.073319085,
 42600066.10237312,
 42601607.429063134,
 42771733.30478636,
 42790830.56142981,
 42870

In [5]:
# Step3

# Pool the held-out split of every mode (A-D and M) into one test set.
# Signals and labels are concatenated in the same mode order so that row i
# of the data always matches label i.
test_data = np.concatenate([
    mode_a_data_test,
    mode_b_data_test,
    mode_c_data_test,
    mode_d_data_test,
    mode_m_data_test,
])
test_labels = np.concatenate([
    mode_a_labels_test,
    mode_b_labels_test,
    mode_c_labels_test,
    mode_d_labels_test,
    mode_m_labels_test,
])

# FFT-magnitude features, computed the same way as for the baseline
test_data_fft = extract_features(test_data)



In [6]:
# Spot-check the labels of the held-out ModeA samples
mode_a_labels_test

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [7]:
# step4

# Reconstruct the test features with the trained autoencoder
test_data_reconstructed = autoencoder.predict(test_data_fft)

# Strangeness of a test sample = its mean squared reconstruction error,
# averaged over the feature axis
reconstruction_errors_test = np.square(
    test_data_fft - test_data_reconstructed
).mean(axis=1)

# Keep the strangeness values as a plain Python list
strangeness_values_test = reconstruction_errors_test.tolist()




In [8]:
# Step5

N = len(strangeness_training_list)

# p-value of each test sample against the sorted baseline strangeness list.
# np.searchsorted (default side='left') returns the position of the first
# training value that is >= the test strangeness, so N - position is the
# number of training values higher than or equal to it. The p-value is that
# count plus one, divided by N + 1.
test_p_values = [
    (N - np.searchsorted(strangeness_training_list, test_strangeness) + 1) / (N + 1)
    for test_strangeness in strangeness_values_test
]


In [9]:
# Inspect the test p-values (same row order as test_labels)
test_p_values

[0.7333333333333333,
 0.7583333333333333,
 0.9305555555555556,
 0.4361111111111111,
 0.7333333333333333,
 0.5638888888888889,
 0.825,
 0.7055555555555556,
 0.55,
 0.43333333333333335,
 0.16111111111111112,
 0.7777777777777778,
 0.34444444444444444,
 0.11388888888888889,
 0.5944444444444444,
 0.8416666666666667,
 0.4638888888888889,
 0.5222222222222223,
 0.5944444444444444,
 0.825,
 0.15833333333333333,
 0.7194444444444444,
 0.34444444444444444,
 0.45555555555555555,
 0.4888888888888889,
 0.5944444444444444,
 0.39444444444444443,
 0.9222222222222223,
 0.7305555555555555,
 0.7472222222222222,
 0.3416666666666667,
 0.09166666666666666,
 0.22777777777777777,
 0.1361111111111111,
 0.325,
 0.575,
 0.04722222222222222,
 0.3416666666666667,
 0.28888888888888886,
 0.175,
 0.002777777777777778,
 0.002777777777777778,
 0.005555555555555556,
 0.002777777777777778,
 0.002777777777777778,
 0.002777777777777778,
 0.005555555555555556,
 0.002777777777777778,
 0.002777777777777778,
 0.00277777777777777

In [10]:
# Step6

from sklearn.metrics import roc_curve, auc

# ROC analysis with the p-value used as the score: in test_labels the normal
# modes carry label 1 and ModeM carries label 0.
fpr, tpr, thresholds = roc_curve(test_labels, test_p_values)
roc_auc = auc(fpr, tpr)
print("AUC:", roc_auc)

# Array form is required for the element-wise comparisons below
test_p_values = np.array(test_p_values)

# A confidence level of 0.95 corresponds to a p-value threshold of 0.05
confidence_level = 0.95
threshold = 1 - confidence_level

# A sample is flagged when its p-value is below the threshold.
# True positive: flagged and labelled 0 (ModeM).
# False positive: flagged and labelled 1 (normal mode).
true_positives = np.sum((test_labels == 0) & (test_p_values < threshold))
false_positives = np.sum((test_labels == 1) & (test_p_values < threshold))

print("True Positives:", true_positives)
print("False Positives:", false_positives)


AUC: 1.0
True Positives: 10
False Positives: 1


In [11]:
# Step8

# Load the signals stored in Test/TestWT.
# The path uses a forward slash: it resolves on Windows and POSIX alike,
# whereas a backslash here would form the invalid escape sequence '\T' and
# only work as a Windows separator.
# load_data assigns label 1 to every directory other than 'base/ModeM', so
# test_data_label carries no ground-truth information for this set.
test_data_min2, test_data_label = load_data('Test/TestWT')

# Extract features using FFT
test_data_min2_fft = extract_features(test_data_min2)

# Use the Autoencoder to predict the reconstructed data for the test set
test_data_reconstructed_min2 = autoencoder.predict(test_data_min2_fft)

# Calculate the reconstruction error for each data point
reconstruction_errors_test_min2 = np.mean(
    (test_data_min2_fft - test_data_reconstructed_min2) ** 2, axis=1)

# Form a list of strangeness values for the test set
strangeness_values_test_min2 = reconstruction_errors_test_min2.tolist()

# Calculate the p-values against the sorted baseline strangeness list.
# np.searchsorted (default side='left') gives the position of the first
# training value >= the test strangeness, so N - position counts the
# training values that are higher than or equal to it.
N = len(strangeness_training_list)
test_min2_p_values = [
    (N - np.searchsorted(strangeness_training_list, test_strangeness_min2) + 1) / (N + 1)
    for test_strangeness_min2 in strangeness_values_test_min2
]



In [12]:
# Save the Test/TestWT p-values to a text file in the working directory,
# formatted with six decimal places (fmt='%.6f')
np.savetxt('p_values_min2.txt', test_min2_p_values, fmt='%.6f')
