<div style="text-align: center; font-size: 40px;">
    <b>Final Project</b>
    <br>
    Mouad ID SOUGOU
    <br>
    
</div>


In [1]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import arff
import pandas as pd
from MarkovianTechniques import FixedMarkovianBased, VariableMarkovianBased, SparseMarkovRIPPER, SparseMarkovTransducer
import os 
import glob

## Intro : Generating Synthetic Data :

Since the time-series data we found online was rarely labeled, we propose to generate synthetic data so that we can implement and test as many algorithms as possible for our discrete anomaly detection library. We can then test our algorithms on real, sparsely labeled data.

### Markovian models :     

This class generates synthetic discrete sequences from a Markov chain (or from a hidden Markov model when a hidden transition matrix is supplied).

In [3]:
# Pool of symbols for the synthetic sequences: a model with n states uses the
# first n letters, so at most 26 distinct states can be represented.
ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

In [4]:
class MarkovianSequences:
    """Generate synthetic discrete sequences from a Markov chain or a hidden Markov model.

    Without ``hidden_matrix`` the object is a first-order Markov chain:
    ``transition_matrix[i][j]`` is the probability of moving from symbol ``i``
    to symbol ``j``.  With ``hidden_matrix`` the hidden states follow
    ``hidden_matrix`` and row ``h`` of ``transition_matrix`` is used as the
    emission distribution of hidden state ``h``.

    Symbols are the first ``n`` letters of the module-level ``ALPHABET``, where
    ``n`` is the number of rows of ``transition_matrix``.

    Sampling relies on NumPy's global random state (``np.random.choice``), so
    call ``np.random.seed`` beforehand to obtain reproducible sequences.

    Parameters
    ----------
    transition_matrix : array-like of shape (n, n)
        Row-stochastic matrix (non-negative entries, each row sums to 1).
    hidden_matrix : array-like of shape (n, n), optional
        Row-stochastic transition matrix of the hidden states.
    n_sequences : int
        Number of sequences returned by ``generate_all_sequences``.
    sequence_length : int
        Number of symbols per sequence; must be at least 1.

    Raises
    ------
    ValueError
        If a matrix is not a square row-stochastic matrix, if the two matrices
        disagree on the number of states, if there are more states than
        letters in ``ALPHABET``, or if ``sequence_length`` is smaller than 1.
    """

    def __init__(self, transition_matrix, hidden_matrix=None, n_sequences=100, sequence_length=50):
        # np.asarray accepts nested lists as well as ndarrays (an existing
        # float ndarray is kept as-is, without copying).
        self.transition_matrix = np.asarray(transition_matrix, dtype=float)
        self.hidden_matrix = None if hidden_matrix is None else np.asarray(hidden_matrix, dtype=float)
        if sequence_length < 1:
            raise ValueError("sequence_length must be at least 1.")
        self.sequence_length = sequence_length
        self.n_sequences = n_sequences
        self.n_hidden = None
        self.check_probabilities()

        # Only computed once the matrix is known to be 2-D and square.
        self.n_symbols = self.transition_matrix.shape[0]
        if self.n_symbols > len(ALPHABET):
            # Slicing ALPHABET would silently yield too few symbols and the
            # generators would later fail with an obscure IndexError.
            raise ValueError(
                f"transition_matrix has {self.n_symbols} states but only "
                f"{len(ALPHABET)} symbols are available."
            )
        self.symbols = ALPHABET[:self.n_symbols]

    @staticmethod
    def _check_stochastic(matrix, name):
        """Raise ``ValueError`` unless ``matrix`` is a non-empty, square, row-stochastic matrix."""
        matrix = np.asarray(matrix)
        if matrix.ndim != 2 or matrix.shape[0] != matrix.shape[1]:
            raise ValueError(f"{name} must be square.")
        if matrix.shape[0] == 0:
            raise ValueError(f"{name} must not be empty.")
        if np.any(matrix < 0):
            raise ValueError(f"{name} contains negative probabilities.")
        for i, row in enumerate(matrix):
            if not np.isclose(np.sum(row), 1.0):
                raise ValueError(f"Row {i} of {name} does not sum to 1.")

    def check_probabilities(self):
        """Validate both matrices and set ``self.n_hidden``.

        ``n_hidden`` is the number of hidden states, or ``None`` when no
        ``hidden_matrix`` was supplied.

        Raises
        ------
        ValueError
            If a matrix is not square and row-stochastic, or if the number of
            hidden states differs from the number of rows of
            ``transition_matrix``.
        """
        self._check_stochastic(self.transition_matrix, "transition_matrix")

        if self.hidden_matrix is None:
            self.n_hidden = None
            return

        self._check_stochastic(self.hidden_matrix, "hidden_matrix")
        self.n_hidden = self.hidden_matrix.shape[0]
        if self.n_hidden != self.transition_matrix.shape[0]:
            raise ValueError("Number of hidden states does not match the dimension of transition_matrix.")

    def generate_sequence(self, initial_state=None):
        """Sample one sequence from the (visible) Markov chain.

        Parameters
        ----------
        initial_state : int, optional
            Index of the first symbol; drawn uniformly at random when omitted.

        Returns
        -------
        list of str
            ``sequence_length`` symbols, starting with the initial state's symbol.
        """
        if initial_state is None:
            current_state = np.random.choice(self.n_symbols)
        else:
            current_state = initial_state

        sequence = [self.symbols[current_state]]
        for _ in range(self.sequence_length - 1):
            current_state = np.random.choice(self.n_symbols, p=self.transition_matrix[current_state])
            sequence.append(self.symbols[current_state])
        return sequence

    def generate_hidden_sequence(self, initial_state=None):
        """Sample one sequence of emitted symbols from the hidden Markov model.

        Parameters
        ----------
        initial_state : int, optional
            Index of the first hidden state; drawn uniformly at random when
            omitted.

        Returns
        -------
        list of str
            ``sequence_length`` emitted symbols (hidden states are not returned).

        Raises
        ------
        ValueError
            If the object was built without a ``hidden_matrix``.
        """
        if self.hidden_matrix is None or self.n_hidden is None:
            raise ValueError("generate_hidden_sequence requires a hidden_matrix.")

        if initial_state is None:
            current_hidden_state = np.random.choice(self.n_hidden)
        else:
            current_hidden_state = initial_state

        # Row h of transition_matrix is the emission distribution of hidden state h.
        emitted_symbol = np.random.choice(self.n_symbols, p=self.transition_matrix[current_hidden_state])
        sequence = [self.symbols[emitted_symbol]]

        for _ in range(self.sequence_length - 1):
            current_hidden_state = np.random.choice(self.n_hidden, p=self.hidden_matrix[current_hidden_state])
            emitted_symbol = np.random.choice(self.n_symbols, p=self.transition_matrix[current_hidden_state])
            sequence.append(self.symbols[emitted_symbol])

        return sequence

    def generate_all_sequences(self):
        """Return ``n_sequences`` sequences, using the hidden model when one was supplied."""
        if self.hidden_matrix is not None:
            make_sequence = self.generate_hidden_sequence
        else:
            make_sequence = self.generate_sequence
        return [make_sequence() for _ in range(self.n_sequences)]

In [5]:
class MarkovianDatasetGenerator:
    """Build one dataset by pooling sequences from several Markovian models.

    Each entry of ``transition_matrices`` is paired with the entry of
    ``hidden_matrices`` at the same position and handed to a
    ``MarkovianSequences`` generator.

    Parameters
    ----------
    transition_matrices : iterable of array-like
        One transition matrix per model.
    hidden_matrices : iterable of (array-like or None), optional
        One hidden matrix per model; use ``None`` for a model without hidden
        states.  When omitted, no model has hidden states.
    n_sequences : int
        Number of sequences generated by each model.
    sequence_length : int
        Length of every generated sequence.

    Raises
    ------
    ValueError
        If ``transition_matrices`` and ``hidden_matrices`` differ in length.
    """

    def __init__(self, transition_matrices, hidden_matrices=None, n_sequences=100, sequence_length=50):
        self.transition_matrices = list(transition_matrices)
        if hidden_matrices is None:
            hidden_matrices = [None] * len(self.transition_matrices)
        self.hidden_matrices = list(hidden_matrices)
        # zip() would silently drop the unmatched models, so fail loudly instead.
        if len(self.hidden_matrices) != len(self.transition_matrices):
            raise ValueError(
                f"Got {len(self.transition_matrices)} transition matrices but "
                f"{len(self.hidden_matrices)} hidden matrices; pass one hidden matrix "
                "(or None) per transition matrix."
            )
        self.n_sequences = n_sequences
        self.sequence_length = sequence_length
        self.dataset = []
        self.generators = self.init_transform()

    def init_transform(self):
        """(Re)build one ``MarkovianSequences`` per model and clear the dataset.

        Returns
        -------
        list
            The generators, also stored in ``self.generators``.
        """
        self.dataset = []
        self.generators = [
            MarkovianSequences(transition_matrix,
                               hidden_matrix=hidden_matrix,
                               n_sequences=self.n_sequences,
                               sequence_length=self.sequence_length)
            for transition_matrix, hidden_matrix in zip(self.transition_matrices, self.hidden_matrices)
        ]
        return self.generators

    def generate(self):
        """Generate a fresh dataset: the sequences of every model, concatenated in model order.

        Returns
        -------
        list
            The sequences, also stored in ``self.dataset``.
        """
        self.dataset = []
        for generator in self.generators:
            self.dataset.extend(generator.generate_all_sequences())
        return self.dataset

In [6]:
# Seed NumPy's global random state so that "Restart Kernel -> Run All"
# regenerates the same training set (MarkovianSequences samples with
# np.random.choice, which reads that global state).
np.random.seed(42)

# We first generate synthetic data from the given transition matrix
# (3 states, i.e. symbols 'A', 'B' and 'C'; every row sums to 1).
transition_matrix = np.array([
    [0.1, 0.6, 0.3],
    [0.4, 0.4, 0.2],
    [0.2, 0.3, 0.5]
])
markov_generator = MarkovianSequences(transition_matrix, n_sequences=100, sequence_length=50)
train_sequences = markov_generator.generate_all_sequences()

# III) Markovian Techniques

## 1) Fixed Markovian Techniques 

This technique “learns” the conditional probability of occurrence of a given symbol using a given fixed history $k$.

In [7]:
# Initialize and train the Markovian anomaly detector on the synthetic
# training sequences. FixedMarkovianBased is imported from the local
# MarkovianTechniques module; its implementation is not shown in this notebook.
detector_fixed = FixedMarkovianBased(k=3)  # History length k = 3
detector_fixed.train(train_sequences)

In [8]:
# In-distribution check: a 7-symbol window (positions 20..26) cut from the
# first training sequence.
test_sequence = train_sequences[0][20:27]
print(f"\nTest Sequence: {test_sequence}")

# Compute the anomaly score for the test sequence
anomaly_score = detector_fixed.compute_anomaly_score(test_sequence)
print(f"Anomaly Score for the Test Sequence: {anomaly_score}")


Test Sequence: ['B', 'C', 'C', 'C', 'C', 'C', 'A']
Anomaly Score for the Test Sequence: 68.65714865662405


In [9]:
# Hand-written sequence (not sliced from the training set) over the same
# 'A'/'B'/'C' symbols, scored with the fixed-history detector for comparison.
test_sequence = ['A', 'B', 'A', 'C', 'B', 'A', 'A']
print(f"\nTest Sequence: {test_sequence}")

# Compute the anomaly score for the test sequence
anomaly_score = detector_fixed.compute_anomaly_score(test_sequence)
print(f"Anomaly Score for the Test Sequence: {anomaly_score}")


Test Sequence: ['A', 'B', 'A', 'C', 'B', 'A', 'A']
Anomaly Score for the Test Sequence: 786.2371221940447


We first try it on a sequence taken from the training set, and then on a different one. Higher anomaly scores indicate a greater likelihood that the sequence is anomalous.

## 2) Variable Markovian Techniques

This technique addresses a key limitation of fixed Markov models. Instead of always using a context of size $k-1$, it dynamically adjusts the size of the context. Variable Markovian techniques thereby mitigate the problem of rare contexts, which is a weakness of the fixed technique: if a context is rare in the training data, its probability will be very low.

In [10]:
# Train the variable-context detector on the same synthetic training set.
# VariableMarkovianBased is imported from the local MarkovianTechniques module.
# NOTE(review): max_depth presumably caps the context length — confirm against the module.
detector_variable = VariableMarkovianBased(max_depth=3)
detector_variable.train(train_sequences)


In [11]:
# Same 7-symbol training window (positions 20..26 of the first training
# sequence) as used for the fixed detector, now scored by the variable detector.
test_sequence = train_sequences[0][20:27]
print(f"\nTest Sequence: {test_sequence}")

# Compute the anomaly score for the test sequence
anomaly_score = detector_variable.compute_anomaly_score(test_sequence)
print(f"Anomaly Score for the Test Sequence: {anomaly_score}")


Test Sequence: ['B', 'C', 'C', 'C', 'C', 'C', 'A']
Anomaly Score for the Test Sequence: 5.992480162952356


In [12]:
# Test sequence containing 'D', 'E' and 'F': the training data was generated
# from a 3-state matrix, so it only contains the symbols 'A', 'B' and 'C'.
test_sequence = ['A', 'B', 'C', 'D', 'E', 'F']
anomaly_score = detector_variable.compute_anomaly_score(test_sequence)
print(f"Anomaly Score for Test Sequence {test_sequence}: {anomaly_score}")

Anomaly Score for Test Sequence ['A', 'B', 'C', 'D', 'E', 'F']: 22.853816272508112


## 3) Sparse Markovian Techniques

This technique aims to increase flexibility compared to Fixed and Variable Markovian Techniques. The key difference is that instead of relying on **contiguous** and immediately preceding symbols as the context, Sparse Markovian Techniques allow for **gaps** or wildcards in the context.

#### Sparse Markovian Transducer technique
The SMT method builds a sparse suffix tree in which contexts can contain wildcards, and uses this tree to compute the probability of a sequence through context matching, backing off to shorter contexts when necessary.

In [None]:

# Build the sparse Markov transducer by inserting the training sequences one
# by one. SparseMarkovTransducer is imported from the local
# MarkovianTechniques module.
smt = SparseMarkovTransducer(max_depth=3, wildcard_positions=[1])  # Wildcard at position 1
for sequence in train_sequences:
    smt.insert(sequence)

In [28]:
# In-distribution check for the SMT: a 7-symbol window (positions 30..36) cut
# from the first training sequence.
test_sequence = train_sequences[0][30:37]
print(f"\nTest Sequence: {test_sequence}")

# Report both the sequence probability and the anomaly score given by the model.
probability = smt.compute_sequence_probability(test_sequence)
anomaly_score = smt.compute_anomaly_score(test_sequence)
print(f"Probability of Test Sequence: {probability}")
print(f"Anomaly Score of Test Sequence: {anomaly_score}")


Test Sequence: ['B', 'B', 'B', 'C', 'C', 'C', 'A']
Probability of Test Sequence: 0.0014916014206201528
Anomaly Score of Test Sequence: 6.507904957246444


In [30]:
# Hand-written sequence (not sliced from the training set), scored with the
# SMT for comparison with the training window.
test_sequence = ['A', 'A', 'B', 'B', 'B', 'A', 'A']
print(f"\nTest Sequence: {test_sequence}")

# Report both the sequence probability and the anomaly score given by the model.
probability = smt.compute_sequence_probability(test_sequence)
anomaly_score = smt.compute_anomaly_score(test_sequence)
print(f"Probability of Test Sequence: {probability}")
print(f"Anomaly Score of Test Sequence: {anomaly_score}")


Test Sequence: ['A', 'A', 'B', 'B', 'B', 'A', 'A']
Probability of Test Sequence: 0.0003029588651361972
Anomaly Score of Test Sequence: 8.101913520297353


#### Rule Based technique (**R**epeated **I**ncremental **P**runing to **P**roduce **E**rror **R**eduction)
The RIPPER method extracts (context, next symbol) pairs from the training sequences, encodes the symbols as numeric features, and trains a decision-tree-based classifier to predict the next symbol for a given context.

In [31]:
# Print a section banner in the cell output, then train the rule-based
# detector on the synthetic training sequences. SparseMarkovRIPPER is imported
# from the local MarkovianTechniques module.
print("\n--- RIPPER-Based Sparse Markov ---")
ripper = SparseMarkovRIPPER(max_depth=3)
ripper.train(train_sequences)


--- RIPPER-Based Sparse Markov ---


In [33]:
# In-distribution check for the RIPPER-based detector: the same 7-symbol
# window (positions 30..36 of the first training sequence) used for the SMT.
test_sequence = train_sequences[0][30:37]
print(f"\nTest Sequence: {test_sequence}")

anomaly_score_ripper = ripper.compute_anomaly_score(test_sequence)
print(f"Anomaly Score of Test Sequence: {anomaly_score_ripper}")


Test Sequence: ['B', 'B', 'B', 'C', 'C', 'C', 'A']
Anomaly Score of Test Sequence: 14.026231589279927


In [38]:
# Hand-written sequence (not sliced from the training set), scored with the
# RIPPER-based detector for comparison with the training window.
test_sequence = ['A', 'A', 'A', 'A', 'B', 'A', 'A']
print(f"\nTest Sequence: {test_sequence}")

# The sequence is printed once only (the original cell printed it twice).
anomaly_score_ripper = ripper.compute_anomaly_score(test_sequence)
print(f"Anomaly Score of Test Sequence: {anomaly_score_ripper}")


Test Sequence: ['A', 'A', 'A', 'A', 'B', 'A', 'A']
Test Sequence: ['A', 'A', 'A', 'A', 'B', 'A', 'A']
Anomaly Score of Test Sequence: 20.82862635260424
