<div style="text-align: center; font-size: 40px;">
    <b>Final Project</b>
    <br>
    Jarry Guillaume
    <br>
    
</div>


In [2]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import arff
import pandas as pd
from kmedoids import KMedoids
import os 
import glob

For this project, we will use the categorical datasets from the ADRepository-Anomaly-detection-datasets GitHub repository, which is available here:

- https://github.com/GuansongPang/ADRepository-Anomaly-detection-datasets?tab=readme-ov-file

Since our article is focused on anomaly detection for discrete time series, these datasets will allow us to deploy some of the techniques showcased in the article. Let's start!

In [18]:
# Load every file of the repository's "categorical data" folder that scipy can
# parse as ARFF; each successfully parsed file contributes one (data, meta) pair.
folder_path = "ADRepository-Anomaly-detection-datasets/categorical data/"
datasets = []

for filepath in glob.glob(os.path.join(folder_path, "*")):
    try:
        data, meta = arff.loadarff(filepath)
    except Exception as exc:
        # Best effort: skip files that are not parseable ARFF, but say why.
        # `Exception` (rather than a bare `except`) lets KeyboardInterrupt and
        # SystemExit propagate so the cell can still be interrupted.
        print(f"Error while parsing file : {filepath} ({type(exc).__name__}: {exc})")
    else:
        datasets.append((data, meta))

Error while parsing file : ADRepository-Anomaly-detection-datasets/categorical data\census-income-full-nominal.tar.xz
Error while parsing file : ADRepository-Anomaly-detection-datasets/categorical data\covertype_nominal_4vs123567.tar.xz
Error while parsing file : ADRepository-Anomaly-detection-datasets/categorical data\Reuters-corn-100.arff
Error while parsing file : ADRepository-Anomaly-detection-datasets/categorical data\w7a-libsvm-nonsparse.tar.xz


## Intro : Generating Synthetic Data :

Since the time-series data we found online was rarely labeled, we propose to generate synthetic data so that we can implement and test as many algorithms as possible for our discrete anomaly detection library. We can then test our algorithms on real, less thoroughly labeled data.

### Markovian models :     

This class generates synthetic data in the form of discrete Markovian sequences.

In [3]:
# Symbol pool for the generated sequences: the sequence generators take a prefix
# of this string and render state/symbol index i as the i-th letter.
ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

In [None]:
class MarkovianSequences:
    """Generate synthetic discrete sequences from a Markov chain or an HMM.

    Two modes are supported:

    * ``hidden_matrix is None`` -- plain Markov chain. ``transition_matrix`` is
      the square state-to-state transition matrix and the states themselves
      are the emitted symbols.
    * ``hidden_matrix`` given -- hidden Markov model. ``hidden_matrix`` is the
      square hidden-state transition matrix and row ``h`` of
      ``transition_matrix`` is the emission distribution of hidden state ``h``
      (shape ``(n_hidden, n_symbols)``; it does not have to be square).

    Symbols are the first ``n_symbols`` characters of the module-level
    ``ALPHABET``. Sampling uses NumPy's global random state, so
    ``np.random.seed`` makes the output reproducible.

    Args:
        transition_matrix: 2-D array-like of row-stochastic probabilities.
        hidden_matrix: optional 2-D square array-like of row-stochastic
            hidden-state transition probabilities.
        n_sequences: number of sequences returned by
            ``generate_all_sequences``.
        sequence_length: number of symbols per sequence. The first symbol is
            always emitted, so values below 1 still yield one symbol.

    Raises:
        ValueError: if a matrix has the wrong shape, contains negative
            entries, has a row that does not sum to 1, or needs more symbols
            than ``ALPHABET`` provides.
    """

    def __init__(self, transition_matrix, hidden_matrix=None, n_sequences=100, sequence_length=50):
        # Accept nested lists as well as ndarrays.
        self.transition_matrix = np.asarray(transition_matrix, dtype=float)
        if self.transition_matrix.ndim != 2:
            raise ValueError("transition_matrix must be a 2-D array.")
        self.hidden_matrix = (
            None if hidden_matrix is None else np.asarray(hidden_matrix, dtype=float)
        )

        # The number of symbols is the number of columns: for an HMM the rows
        # index hidden states, not symbols.
        self.n_symbols = self.transition_matrix.shape[1]
        if self.n_symbols > len(ALPHABET):
            raise ValueError(
                f"At most {len(ALPHABET)} symbols are supported, got {self.n_symbols}."
            )
        self.symbols = ALPHABET[:self.n_symbols]
        self.sequence_length = sequence_length
        self.n_sequences = n_sequences
        self.n_hidden = None  # set by check_probabilities() for an HMM
        self.check_probabilities()

    @staticmethod
    def _check_stochastic_rows(matrix, name):
        """Raise ValueError unless every row of *matrix* is a probability vector."""
        if np.any(matrix < 0):
            raise ValueError(f"{name} contains negative probabilities.")
        bad_rows = np.flatnonzero(~np.isclose(matrix.sum(axis=1), 1.0))
        if bad_rows.size:
            raise ValueError(f"Row {bad_rows[0]} of {name} does not sum to 1.")

    def check_probabilities(self):
        """Validate the shapes and row sums of the configured matrices.

        Sets ``self.n_hidden`` when a hidden matrix is configured.

        Raises:
            ValueError: on any inconsistency (see the class docstring).
        """
        self._check_stochastic_rows(self.transition_matrix, "transition_matrix")

        if self.hidden_matrix is None:
            # A plain chain indexes rows by the current symbol, so the matrix
            # must map symbols to symbols.
            n_rows, n_cols = self.transition_matrix.shape
            if n_rows != n_cols:
                raise ValueError(
                    "transition_matrix must be square when no hidden_matrix is given."
                )
            return

        if self.hidden_matrix.ndim != 2 or self.hidden_matrix.shape[0] != self.hidden_matrix.shape[1]:
            raise ValueError("hidden_matrix must be square.")
        self._check_stochastic_rows(self.hidden_matrix, "hidden_matrix")
        self.n_hidden = self.hidden_matrix.shape[0]
        if self.n_hidden != self.transition_matrix.shape[0]:
            raise ValueError("Number of hidden states does not match the dimension of transition_matrix.")

    def generate_sequence(self, initial_state=None):
        """Sample one sequence from the plain Markov chain.

        Args:
            initial_state: index of the first state; drawn uniformly at random
                when omitted.

        Returns:
            A list of single-character symbols.
        """
        if initial_state is None:
            current_state = np.random.choice(self.n_symbols)
        else:
            current_state = initial_state

        sequence = [self.symbols[current_state]]
        for _ in range(self.sequence_length - 1):
            current_state = np.random.choice(self.n_symbols, p=self.transition_matrix[current_state])
            sequence.append(self.symbols[current_state])
        return sequence

    def generate_hidden_sequence(self, initial_state=None):
        """Sample one observation sequence from the hidden Markov model.

        Args:
            initial_state: index of the first hidden state; drawn uniformly at
                random when omitted.

        Returns:
            A list of single-character emitted symbols (hidden states are not
            returned).

        Raises:
            ValueError: if the instance was built without ``hidden_matrix``.
        """
        if self.hidden_matrix is None:
            raise ValueError("generate_hidden_sequence requires a hidden_matrix.")

        if initial_state is None:
            hidden_state = np.random.choice(self.n_hidden)
        else:
            hidden_state = initial_state

        # Emit from the initial hidden state, then alternate transition/emission.
        emitted = np.random.choice(self.n_symbols, p=self.transition_matrix[hidden_state])
        sequence = [self.symbols[emitted]]

        for _ in range(self.sequence_length - 1):
            hidden_state = np.random.choice(self.n_hidden, p=self.hidden_matrix[hidden_state])
            emitted = np.random.choice(self.n_symbols, p=self.transition_matrix[hidden_state])
            sequence.append(self.symbols[emitted])

        return sequence

    def generate_all_sequences(self):
        """Return ``n_sequences`` sequences, using the HMM sampler when a hidden matrix is set."""
        sample = (
            self.generate_sequence
            if self.hidden_matrix is None
            else self.generate_hidden_sequence
        )
        return [sample() for _ in range(self.n_sequences)]

In [19]:
class MarkovianDatasetGenerator:
    """Build one dataset out of several Markov / hidden Markov sequence models.

    ``transition_matrices[i]`` and ``hidden_matrices[i]`` together configure
    the i-th ``MarkovianSequences`` model; a ``None`` entry in
    ``hidden_matrices`` makes that model a plain Markov chain.

    Args:
        transition_matrices: sequence of transition (or emission) matrices,
            one per model.
        hidden_matrices: sequence of hidden-state transition matrices (or
            ``None`` entries), one per model. Passing ``None`` for the whole
            argument is shorthand for "no model is hidden".
        n_sequences: number of sequences generated by each model.
        sequence_length: length of every generated sequence.

    Raises:
        ValueError: if the two sequences of matrices differ in length.
    """

    def __init__(self, transition_matrices, hidden_matrices, n_sequences=100, sequence_length=50):
        if hidden_matrices is None:
            hidden_matrices = [None] * len(transition_matrices)
        self.transition_matrices = transition_matrices
        self.hidden_matrices = hidden_matrices
        self.n_sequences = n_sequences
        self.sequence_length = sequence_length
        self.dataset = []
        self.generators = self.init_transform()

    def init_transform(self):
        """(Re)build one sequence generator per configured model and reset the dataset.

        Returns:
            The list of generators, also stored in ``self.generators``.

        Raises:
            ValueError: if ``transition_matrices`` and ``hidden_matrices``
                differ in length.
        """
        n_transition = len(self.transition_matrices)
        n_hidden = len(self.hidden_matrices)
        # zip() would silently drop the unmatched models, so fail loudly instead.
        if n_transition != n_hidden:
            raise ValueError(
                "transition_matrices and hidden_matrices must have the same length "
                f"(got {n_transition} and {n_hidden}); use None entries in "
                "hidden_matrices for plain Markov chains."
            )

        self.dataset = []
        self.generators = [
            MarkovianSequences(
                transition_matrix,
                hidden_matrix=hidden_matrix,
                n_sequences=self.n_sequences,
                sequence_length=self.sequence_length,
            )
            for transition_matrix, hidden_matrix in zip(self.transition_matrices, self.hidden_matrices)
        ]
        return self.generators

    def generate(self):
        """Sample every model and return all sequences, concatenated in model order."""
        self.dataset = []
        for generator in self.generators:
            self.dataset.extend(generator.generate_all_sequences())
        return self.dataset

Now let us generate a dataset that mixes a hidden Markov model with two Markov models, all wrapped into the same dataset. We will also add an anomaly dataset, generated by another Markov model with different probabilities.

In [None]:
# Hidden-state transition matrix of the hidden Markov model.
hidden_matrix = np.array([
    [0.7, 0.2, 0.1],
    [0.1, 0.8, 0.1],
    [0.2, 0.2, 0.6]
])

# Emission matrix of the hidden Markov model (one row per hidden state).
transition_matrix_example = np.array([
    [0.6, 0.3, 0.1],  # Emission distribution from hidden state 0
    [0.2, 0.5, 0.3],  # Emission distribution from hidden state 1
    [0.1, 0.2, 0.7]   # Emission distribution from hidden state 2
])

# Transition kernel of a plain (fully observed) Markov chain.
transition_kernel_example = np.array([
    [0.7, 0.2, 0.1],
    [0.1, 0.7, 0.2],
    [0.2, 0.1, 0.7]
])

# One entry per model, in the same order in both lists; a None hidden matrix
# marks a plain Markov chain. Both lists must be populated, otherwise no
# generator is built and the dataset is empty.
# TODO: append the additional Markov chain and the anomaly model here.
transition_matrices = [transition_matrix_example, transition_kernel_example]
hidden_matrices = [hidden_matrix, None]

generator = MarkovianDatasetGenerator(transition_matrices, hidden_matrices)
dataset = generator.generate()

# I) Kernel-Based Techniques

In [7]:
class KernellBase:
    """Kernel (similarity) based anomaly scoring for discrete sequences.

    A test sequence is scored by the inverse of its similarity to the training
    data, either to its k-th most similar training sequence
    (``knearest_predict``) or to the closest cluster medoid
    (``clustering_predict``). Higher scores mean "more anomalous".

    Args:
        dataset: indexable collection of training sequences.
        similarity_metric: callable ``(seq_a, seq_b) -> float``. It is used as
            ``distance = 1 - similarity``, so it is presumably expected to lie
            in [0, 1].
    """

    def __init__(self, dataset, similarity_metric):
        self.dataset = dataset
        self.similarity_metric = similarity_metric
        self.similarity_matrix = None
        self.distance_matrix = None
        self.medoids = None

    @staticmethod
    def _inverse_similarity(similarity):
        """Return ``1 / similarity``, or infinity when the similarity is zero."""
        if similarity == 0:
            return float("inf")
        return 1 / similarity

    def compute_similarity_matrix(self, n_clusters=5):
        """Compute pairwise similarities and cluster the dataset with k-medoids.

        Fills ``similarity_matrix``, ``distance_matrix`` and ``medoids``.

        Args:
            n_clusters: number of medoids to extract.

        Returns:
            The symmetric ``(n, n)`` similarity matrix.
        """
        n = len(self.dataset)
        self.similarity_matrix = np.zeros((n, n))
        for i, left in enumerate(self.dataset):
            # The metric is assumed symmetric: compute the upper triangle only.
            for j in range(i, n):
                sim = self.similarity_metric(left, self.dataset[j])
                self.similarity_matrix[i, j] = sim
                self.similarity_matrix[j, i] = sim

        self.distance_matrix = 1 - self.similarity_matrix
        clusterer = KMedoids(n_clusters=n_clusters, random_state=42)
        clusterer.fit(self.distance_matrix)
        # With a precomputed distance matrix the estimator exposes the medoids
        # as row indices; map them back to sequences one by one so that a
        # plain list dataset works too.
        self.medoids = [self.dataset[int(index)] for index in clusterer.medoid_indices_]
        return self.similarity_matrix

    def knearest_predict(self, test_sequence, k_nearest=5):
        """Score a sequence by the inverse similarity to its k-th nearest neighbour.

        Args:
            test_sequence: sequence to score.
            k_nearest: rank (1-based) of the neighbour to use; clamped to the
                dataset size.

        Returns:
            The anomaly score (``inf`` when the similarity is zero).

        Raises:
            ValueError: if ``k_nearest`` is below 1 or the dataset is empty.
        """
        if k_nearest < 1:
            raise ValueError("k_nearest must be at least 1.")

        similarities = sorted(
            (self.similarity_metric(test_sequence, sequence) for sequence in self.dataset),
            reverse=True,
        )
        if not similarities:
            raise ValueError("Cannot score a sequence against an empty dataset.")

        # Rank k is at index k - 1 once sorted by decreasing similarity.
        rank = min(k_nearest, len(similarities))
        return self._inverse_similarity(similarities[rank - 1])

    def clustering_predict(self, test_sequence, n_clusters=5):
        """Score a sequence by the inverse similarity to its closest medoid.

        The clustering is computed on first use; ``n_clusters`` is only taken
        into account for that first fit.

        Args:
            test_sequence: sequence to score.
            n_clusters: number of medoids used when the clustering has not
                been computed yet.

        Returns:
            The anomaly score (``inf`` when no medoid has positive similarity).
        """
        if self.similarity_matrix is None or self.medoids is None:
            self.compute_similarity_matrix(n_clusters)

        best = max(
            (self.similarity_metric(test_sequence, medoid) for medoid in self.medoids),
            default=0,
        )
        return self._inverse_similarity(max(0, best))

Now let us try our kernel-based methods with the normalized longest common subsequence (nLCS) kernel suggested in the article.

In [11]:
def LCS_length(seq1, seq2):
    """Return the length of the longest common subsequence of two sequences.

    Classic dynamic programme over prefixes; only the previous and the current
    row of the table are kept in memory.
    """
    width = len(seq2)
    previous_row = [0] * (width + 1)
    for left_item in seq1:
        current_row = [0]
        for col, right_item in enumerate(seq2, start=1):
            if left_item == right_item:
                # Matching items extend the LCS of both shorter prefixes.
                current_row.append(previous_row[col - 1] + 1)
            else:
                current_row.append(max(previous_row[col], current_row[col - 1]))
        previous_row = current_row
    return previous_row[width]

def nLCS(seq1, seq2):
    """Return the normalized longest-common-subsequence similarity.

    The LCS length is divided by the geometric mean of the two sequence
    lengths, giving a value in [0, 1] (1 for identical sequences).

    Returns:
        The similarity as a float; 0.0 when either sequence is empty, since
        nothing can be shared and the normalization would divide by zero.
    """
    # len() rather than truthiness so that array-like sequences work too.
    if len(seq1) == 0 or len(seq2) == 0:
        return 0.0
    lcs = LCS_length(seq1, seq2)
    return lcs / ((len(seq1) * len(seq2)) ** 0.5)

In [None]:
# Fit the kernel-based detector on the synthetic dataset with the nLCS kernel.
kernell_based = KernellBase(dataset, nLCS)
# The number of medoids must be given explicitly; 5 matches the default used
# by KernellBase.clustering_predict.
kernell_based.compute_similarity_matrix(n_clusters=5)

# II) Window Based Techniques :

In [None]:
class WindowBased: 
    def __init__(self, window_length): 
        self.window_length = window_length