<a href="https://colab.research.google.com/github/mitsouma/IOT_Anomalies_Detection_System_Based_on_Tensor_Decomposition/blob/main/Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
!pip install -U tensorly



In [28]:
import numpy as np
import pandas as pd
from tensorly.decomposition import parafac
import tensorly as tl
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from tensorly import unfold, fold
from tensorly.tenalg import khatri_rao
from numpy.linalg import lstsq
from tensorly.cp_tensor import cp_to_tensor

In [26]:
class AnomalyDetectionPipeline:
    """Tensor-decomposition based anomaly detection pipeline for flow data.

    The pipeline runs five steps:

    1. build two 4-D tensors (time window x Src IP x Dst IP x feature),
       one from "normal" flows and one from the flows to analyse;
    2. fit a low-rank CP (PARAFAC) model on the normal tensor;
    3. project the second tensor onto that model and keep the residuals;
    4. aggregate the residuals per (time window, Dst IP, feature);
    5. train a neural network on the aggregated scores (a classifier when
       labels are available, an autoencoder otherwise).

    Optional config keys read with ``config.get``: ``'random_state'``
    (forwarded to ``parafac``) and ``'attack_label'`` (value of the
    ``Label`` column that marks an attack, default ``'Attack'``).
    """

    # Defaults used for every key missing from a user-supplied config.
    DEFAULT_CONFIG = {
        'tensor_rank': 10,        # Rank of the CP decomposition
        'window_size': 300,       # Time window length in seconds
        'learning_rate': 0.001,
        'batch_size': 32,
        'epochs': 50
    }

    # Order of the entries along the last tensor axis.
    FEATURE_NAMES = ('count', 'bytes', 'duration', 'packets')

    def __init__(self, config=None):
        """
        Parameters:
        -----------
        config : dict, optional
            Overrides for ``DEFAULT_CONFIG``. Keys that are not supplied
            keep their default value.
        """
        # Merge instead of replace so a partial config cannot cause a
        # KeyError later on.
        self.config = {**self.DEFAULT_CONFIG, **(config or {})}
        self.model = None
        self.scaler = StandardScaler()

    @staticmethod
    def _time_window_ids(df, window_size):
        """Return the time-window id of every row of ``df`` as a Series.

        ``relative_time`` is cast to int64 and divided by
        ``10**9 * window_size``; this presumably expects nanosecond values
        (e.g. a timedelta64[ns] column) -- TODO confirm against the loader.
        """
        if window_size <= 0:
            raise ValueError("window_size must be a positive number of seconds")
        return df['relative_time'].astype(np.int64) // (10**9 * window_size)

    @staticmethod
    def _index_to_name(mapping):
        """Return ``mapping`` as index -> name, accepting either orientation."""
        if all(isinstance(key, (int, np.integer)) for key in mapping):
            return dict(mapping)
        return {idx: name for name, idx in mapping.items()}

    @staticmethod
    def _fill_tensor(df, tensor, window_to_idx, src_ip_to_idx, dst_ip_to_idx):
        """Write the aggregated flow features of ``df`` into ``tensor`` in place."""
        grouped = df.groupby(['time_window', 'Src IP', 'Dst IP'])
        for (time_window, src_ip, dst_ip), group in grouped:
            # Integer indexing on the first three axes returns a view, so
            # writing to ``cell`` updates ``tensor``.
            cell = tensor[window_to_idx[time_window],
                          src_ip_to_idx[src_ip],
                          dst_ip_to_idx[dst_ip]]
            cell[0] = len(group)                        # count
            cell[1] = group['Flow Bytes/s'].sum()       # bytes
            cell[2] = group['Flow Duration'].mean()     # duration
            cell[3] = group['Flow Packets/s'].sum()     # packets

    def create_graph_tensor(self, df_normal, df_train, window_size=None):
        """
        Step 1: Build two tensors (normal and train) with identical dimensions.

        Both tensors share the same axes, built from the union of the time
        windows, source IPs and destination IPs of the two DataFrames.

        Note: a ``'time_window'`` column is added to both input DataFrames
        (they are modified in place).

        Parameters:
        -----------
        df_normal : pd.DataFrame
            Flows considered normal. Must contain the columns 'Src IP',
            'Dst IP', 'relative_time', 'Flow Bytes/s', 'Flow Duration' and
            'Flow Packets/s'.
        df_train : pd.DataFrame
            Flows to analyse, with the same columns.
        window_size : int, optional
            Time window length in seconds; defaults to
            ``config['window_size']``.

        Returns:
        --------
        tensor_data : dict
            The two tensors (keys 'normal' and 'train'), the sorted list of
            time windows and the IP / feature mappings.

        Raises:
        -------
        ValueError
            If ``window_size`` is not positive.
        """
        if window_size is None:
            window_size = self.config['window_size']

        # Use both DataFrames so the two tensors get the same IP axes.
        combined_df = pd.concat([df_normal, df_train], ignore_index=True)
        src_ips = combined_df['Src IP'].unique()
        dst_ips = combined_df['Dst IP'].unique()

        src_ip_to_idx = {ip: idx for idx, ip in enumerate(src_ips)}
        dst_ip_to_idx = {ip: idx for idx, ip in enumerate(dst_ips)}
        idx_to_feature = dict(enumerate(self.FEATURE_NAMES))

        df_normal['time_window'] = self._time_window_ids(df_normal, window_size)
        df_train['time_window'] = self._time_window_ids(df_train, window_size)

        all_time_windows = sorted(pd.concat([
            df_normal['time_window'], df_train['time_window']
        ]).unique())
        window_to_idx = {tw: idx for idx, tw in enumerate(all_time_windows)}

        # Axes: [time window, Src IP, Dst IP, feature]
        shape = (len(all_time_windows), len(src_ips), len(dst_ips),
                 len(self.FEATURE_NAMES))
        tensor_normal = np.zeros(shape, dtype=np.float32)
        tensor_train = np.zeros(shape, dtype=np.float32)

        self._fill_tensor(df_normal, tensor_normal, window_to_idx,
                          src_ip_to_idx, dst_ip_to_idx)
        self._fill_tensor(df_train, tensor_train, window_to_idx,
                          src_ip_to_idx, dst_ip_to_idx)

        return {
            'normal': tensor_normal,
            'train': tensor_train,
            'src_ip_to_idx': src_ip_to_idx,
            'dst_ip_to_idx': dst_ip_to_idx,
            'time_windows': all_time_windows,
            'mappings': {
                'Src_IP': src_ip_to_idx,
                'Dst_IP': dst_ip_to_idx,
                'features_names': idx_to_feature
            }
        }

    def tensor_decomposition(self, tensor_normal, rank=None):
        """
        Step 2: CP-decompose the normal tensor and rebuild its low-rank version.

        Parameters:
        -----------
        tensor_normal : np.ndarray
            Normal tensor to decompose.
        rank : int, optional
            Rank of the decomposition; defaults to ``config['tensor_rank']``.

        Returns:
        --------
        decomposition_result : dict
            'low_rank_tensor' (reconstruction), 'factors' (list with one
            factor matrix per mode), 'weights' (component weights returned
            by ``parafac``) and 'rank'.
        """
        if rank is None:
            rank = self.config['tensor_rank']

        # parafac returns a CPTensor that unpacks as (weights, factors); it
        # cannot be unpacked directly into the factor matrices.
        weights, factors = parafac(
            tensor_normal, rank=rank, init='random', tol=1e-6,
            random_state=self.config.get('random_state')
        )
        factors = list(factors)

        low_rank_tensor = cp_to_tensor((weights, factors))

        return {
            'low_rank_tensor': low_rank_tensor,
            'factors': factors,
            'weights': weights,
            'rank': rank
        }

    def normal_space_projection(self, tensor_train, decomposition_result):
        """
        Step 3: Project the training tensor onto the normal subspace.

        The non-temporal factors of the normal model are kept fixed and a new
        temporal factor is fitted to ``tensor_train`` by least squares. The
        residual is what the normal model cannot explain.

        Parameters:
        -----------
        tensor_train : np.ndarray
            Tensor to project; its first axis is time.
        decomposition_result : dict
            Output of ``tensor_decomposition``.

        Returns:
        --------
        residuals : np.ndarray
            ``tensor_train`` minus its reconstruction, same shape as
            ``tensor_train``.
        """
        factors = decomposition_result['factors']
        if hasattr(factors, 'factors'):
            # A CPTensor was stored instead of the plain list of factors.
            factors = factors.factors
        non_temporal = list(factors)[1:]

        n_windows = tensor_train.shape[0]

        # Row-major unfolding along time: one row per time slice, with the
        # first remaining axis varying slowest.
        unfolded = tensor_train.reshape(n_windows, -1)

        # The factors must be passed in axis order so that the rows of the
        # Khatri-Rao product line up with the row-major unfolding above.
        # The product and its pseudo-inverse do not depend on the time
        # slice, so they are computed once.
        kr_product = khatri_rao(non_temporal)
        kr_pinv = np.linalg.pinv(kr_product.T)

        # Least-squares solution of  unfolded = new_A @ kr_product.T
        new_A = (unfolded @ kr_pinv).astype(np.float64)

        # Component weights are absorbed by the fitted temporal factor.
        V_hat = cp_to_tensor((None, [new_A, *non_temporal]))

        residuals = tensor_train - V_hat

        return residuals

    def aggregation_scoring(self, residuals, mappings, time_windows):
        """
        Step 4: Aggregate residuals over Src IP for every
        (time window, Dst IP, feature) triple.

        Parameters:
        -----------
        residuals : np.ndarray
            Residual tensor of shape (Time, Src IP, Dst IP, Feature).
        mappings : dict
            Must contain 'Dst_IP' (IP -> index) and 'features_names'
            (index -> name; the inverse orientation is accepted too).
        time_windows : list
            Time-window ids corresponding to the time axis.

        Returns:
        --------
        scores_df : pd.DataFrame
            One row per (time window, Dst IP, feature) with the columns
            'Time_window', 'Dst_IP', 'Feature', 'mean_abs_residual',
            'max_abs_residual', 'std_residual', 'sum_squared_residual'.

        Raises:
        -------
        ValueError
            If ``time_windows`` has fewer entries than the time axis.
        """
        time_dim, _, dst_dim, feat_dim = residuals.shape
        if len(time_windows) < time_dim:
            raise ValueError(
                "time_windows has fewer entries than the time axis of residuals"
            )

        idx_to_dst_ip = {idx: ip for ip, idx in mappings['Dst_IP'].items()}
        idx_to_feature = self._index_to_name(mappings['features_names'])

        # from_product enumerates with the first level varying slowest,
        # which matches the row-major ravel of the (time, dst, feature)
        # arrays computed below.
        keys = pd.MultiIndex.from_product(
            [
                list(time_windows)[:time_dim],
                [idx_to_dst_ip[d] for d in range(dst_dim)],
                [idx_to_feature[f] for f in range(feat_dim)],
            ],
            names=['Time_window', 'Dst_IP', 'Feature']
        )
        scores_df = keys.to_frame(index=False)

        # Reduce over the Src IP axis (axis 1) for all triples at once.
        abs_residuals = np.abs(residuals)
        scores_df['mean_abs_residual'] = abs_residuals.mean(axis=1).ravel()
        scores_df['max_abs_residual'] = abs_residuals.max(axis=1).ravel()
        scores_df['std_residual'] = residuals.std(axis=1).ravel()
        scores_df['sum_squared_residual'] = (residuals ** 2).sum(axis=1).ravel()

        return scores_df

    def _build_classifier(self, input_dim):
        """Return a compiled binary classifier for the supervised mode."""
        model = tf.keras.Sequential([
            tf.keras.layers.Input(shape=(input_dim,)),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dropout(0.2),
            tf.keras.layers.Dense(32, activation='relu'),
            tf.keras.layers.Dropout(0.2),
            tf.keras.layers.Dense(16, activation='relu'),
            tf.keras.layers.Dense(1, activation='sigmoid')
        ])
        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=self.config['learning_rate']),
            loss='binary_crossentropy',
            metrics=['accuracy']
        )
        return model

    def _build_autoencoder(self, input_dim, encoding_dim=8):
        """Return a compiled autoencoder for the unsupervised mode."""
        input_layer = tf.keras.layers.Input(shape=(input_dim,))
        encoded = tf.keras.layers.Dense(32, activation='relu')(input_layer)
        encoded = tf.keras.layers.Dense(16, activation='relu')(encoded)
        encoded = tf.keras.layers.Dense(encoding_dim, activation='relu')(encoded)

        decoded = tf.keras.layers.Dense(16, activation='relu')(encoded)
        decoded = tf.keras.layers.Dense(32, activation='relu')(decoded)
        # Linear output: the inputs are standardized, hence unbounded and
        # partly negative, which a sigmoid could never reproduce.
        decoded = tf.keras.layers.Dense(input_dim, activation='linear')(decoded)

        model = tf.keras.Model(inputs=input_layer, outputs=decoded)
        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=self.config['learning_rate']),
            loss='mse'
        )
        return model

    def build_deep_learning_model(self, score_df):
        """
        Step 5: Train a neural network on the aggregated anomaly scores.

        A binary classifier is trained when ``score_df`` has a 'label'
        column; otherwise an autoencoder is trained to reconstruct the
        scaled features.

        Parameters:
        -----------
        score_df : pd.DataFrame
            Aggregated scores, optionally with a 'label' column.

        Returns:
        --------
        dict
            'model' (trained Keras model), 'features' (scaled input matrix)
            and 'mode' ('supervised' or 'unsupervised').
        """
        if 'label' in score_df.columns:
            y = score_df['label'].values
            mode = 'supervised'
        else:
            y = None
            mode = 'unsupervised'

        features = score_df.drop(columns=['label'], errors='ignore')

        # One-hot encode every non-numeric column (e.g. 'Dst_IP' and
        # 'Feature') so the scaler only sees numbers.
        categorical = [
            column for column in features.columns
            if not pd.api.types.is_numeric_dtype(features[column])
        ]
        if categorical:
            features = pd.get_dummies(features, columns=categorical, dtype=int)

        X_scaled = self.scaler.fit_transform(features)
        input_dim = X_scaled.shape[1]

        if mode == 'supervised':
            model = self._build_classifier(input_dim)
            targets = y
        else:
            model = self._build_autoencoder(input_dim)
            targets = X_scaled

        model.fit(
            X_scaled, targets,
            batch_size=self.config['batch_size'],
            epochs=self.config['epochs'],
            validation_split=0.2,
            verbose=1
        )

        # Store and return the model in both modes.
        self.model = model
        return {
            'model': model,
            'features': X_scaled,
            'mode': mode
        }

    def _window_labels(self, df_train, scores_df):
        """Return one 0/1 label per row of ``scores_df``.

        A (time window, Dst IP) pair is labelled 1 when at least one flow of
        ``df_train`` in that window and towards that destination carries the
        attack label; pairs without any flow get 0. The scores are
        aggregated over Src IP, so the label is too.
        """
        attack_label = self.config.get('attack_label', 'Attack')
        is_attack = (df_train['Label'] == attack_label).astype(int)
        window_ids = self._time_window_ids(df_train, self.config['window_size'])

        label_map = is_attack.groupby(
            [window_ids.to_numpy(), df_train['Dst IP'].to_numpy()]
        ).max().to_dict()

        return [
            label_map.get(key, 0)
            for key in zip(scores_df['Time_window'], scores_df['Dst_IP'])
        ]

    def run_pipeline(self, df_normal, df_train, rank=None, with_labels=False):
        """
        Run the complete pipeline.

        Parameters:
        -----------
        df_normal : pd.DataFrame
            Flows considered normal.
        df_train : pd.DataFrame
            Flows to analyse / train on.
        rank : int, optional
            Rank of the CP decomposition for this run; defaults to
            ``config['tensor_rank']``.
        with_labels : bool, optional
            When True and ``df_train`` has a 'Label' column, a 'label'
            column is added to the scores and a classifier is trained.

        Returns:
        --------
        results : dict
            'tensor_data', 'decomposition_result', 'residuals', 'scores_df'
            and 'model_result'.
        """
        # Step 1: build the tensors
        tensor_data = self.create_graph_tensor(df_normal, df_train)
        tensor_normal = tensor_data['normal']
        tensor_train = tensor_data['train']
        mappings = tensor_data['mappings']
        time_windows = tensor_data['time_windows']

        # Step 2: decompose the normal tensor
        decomposition_result = self.tensor_decomposition(tensor_normal, rank=rank)

        # Step 3: project the training tensor onto the normal space
        residuals = self.normal_space_projection(tensor_train, decomposition_result)

        # Step 4: aggregate the scores
        scores_df = self.aggregation_scoring(residuals, mappings, time_windows)

        # Attach labels when requested and available
        if with_labels and 'Label' in df_train.columns:
            scores_df['label'] = self._window_labels(df_train, scores_df)

        # Step 5: build and train the deep learning model
        model_result = self.build_deep_learning_model(scores_df)

        return {
            'tensor_data': tensor_data,
            'decomposition_result': decomposition_result,
            'residuals': residuals,
            'scores_df': scores_df,
            'model_result': model_result
        }

# Example usage:
# config = {
#     'tensor_rank': 10,
#     'window_size': 300,
#     'learning_rate': 0.001,
#     'batch_size': 32,
#     'epochs': 50
# }
# pipeline = AnomalyDetectionPipeline(config)
# results = pipeline.run_pipeline(df_normal, df_train, rank=config['tensor_rank'], with_labels=True)