# Company-Specific Autoencoders for Anomaly Detection on Scraped PDF Data

This notebook implements and evaluates company-specific autoencoder models for detecting anomalous documents within a dataset of PDFs scraped from Scribd, pertaining to various Algerian companies.

**Key Steps and Content:**

* **Data Loading & Preprocessing:** Loads pre-computed SBERT embeddings for documents of multiple Algerian companies. For each company, its own documents are treated as "normal" data.
* **Company-Specific Model Training:** Iterates through each company, training a dedicated autoencoder model using only that company's "normal" document embeddings. The autoencoder learns to reconstruct these normal embeddings with low error.
* **Anomaly Injection for Evaluation:** For each company-specific model, "anomalous" documents are simulated by injecting a small number of documents from other companies into its validation and test sets.
* **Threshold Optimization & Evaluation:** A Mean Squared Error (MSE) reconstruction threshold is determined for each model by maximizing the F1-score on its validation set. The model is then evaluated on its test set using metrics like F1-score, precision, recall, and a confusion matrix.
* **Results Aggregation:** The performance metrics (threshold, F1, precision, recall, confusion matrix) for each company-specific model are collected and typically saved or displayed, forming the basis for Table 2.7 in the thesis.

AADL

In [13]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model, load_model
from sklearn.metrics import confusion_matrix, classification_report, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
# Path to folder containing embeddings
folder_path = 'C:/Users/rammo/Desktop/Data sensitivity discovery/Anomaly Detection in docs/Scaping dataset/scribd_test_2'

# Filter for .npy files
embedding_file ='C:/Users/rammo/Desktop/Data sensitivity discovery/Anomaly Detection in docs/Scaping dataset/documents and embeddings/all_docs_v2_test_AADL.npy'

filename_only = os.path.basename(embedding_file)  # Gives: all_docs_v2_test_Sonelgaz.npy
model_name = f"model_{filename_only.replace('.npy', '')}.keras"
model_path = os.path.join(folder_path, model_name)

print(f"Processing {embedding_file}...")
    
    # Load embeddings
embeddings = np.load(embedding_file)

    
# Split embeddings: 80% for training, 20% for normal test
x_train, x_test_normal = train_test_split(embeddings, test_size=0.2, random_state=42)

# ========== MODEL SETUP ==========
input_dim = x_train.shape[1]
encoder_input = Input(shape=(input_dim,))
x = Dense(256, activation='relu')(encoder_input)
encoded = Dense(32, activation='relu')(x)
x = Dense(256, activation='relu')(encoded)
decoded = Dense(input_dim, activation='sigmoid')(x)
autoencoder = Model(encoder_input, decoded)
autoencoder.compile(optimizer='adam', loss='mse')

# ========== TRAIN MODEL IF NOT EXIST ==========

print("Training new autoencoder...")
autoencoder.fit(
        x_train, x_train,
        epochs=50,
        batch_size=256,
        shuffle=True,
        validation_split=0.1,
        verbose=1
    )
autoencoder.save(model_path)
print(f"Saved model  to {model_path}")

# ========== ANOMALY EMBEDDINGS ==========
anomaly_sources = [
    # 'all_docs_v2_test_AADL.npy',
    'all_docs_v2_test_Air_Algérie.npy',
    'all_docs_v2_test_Algérie_Poste.npy',
    'all_docs_v2_test_Algérie_Télécom.npy',
    'all_docs_v2_test_Crédit_Populaire_dAlgérie.npy',
    'all_docs_v2_test_Emploitic.npy',
    'all_docs_v2_test_ICOSNET.npy',
    'all_docs_v2_test_Ooredoo.npy',
    'all_docs_v2_test_Ouedkniss.npy',
    'all_docs_v2_test_Sonelgaz.npy',
    'all_docs_v2_test_Sonatrach.npy',
    'all_docs_v2_test_Yassir.npy'   
]

anomaly_embeddings = np.concatenate([
    np.load(os.path.join(folder_path, src))[0:1] for src in anomaly_sources
], axis=0)

anomaly_embeddings_2 = np.concatenate([
    np.load(os.path.join(folder_path, src))[1:2] for src in anomaly_sources
], axis=0)

# ========== TEST SET CONSTRUCTION ==========
X_tmp = np.concatenate([x_test_normal, anomaly_embeddings], axis=0)
y_tmp = np.concatenate([
    np.zeros(len(x_test_normal)),  # normal
    np.ones(len(anomaly_embeddings))  # anomaly
])

# Split into validation and test (50-50)
X_val, X_test, y_val, y_test = train_test_split(
    X_tmp, y_tmp, test_size=0.5, stratify=y_tmp, random_state=42
)

# Append additional anomalies to test set
X_test_normal_only = X_test[y_test == 0]
y_test_normal_only = y_test[y_test == 0]  

# Step 4: Append new anomalies to test set
X_test = np.concatenate([X_test_normal_only, anomaly_embeddings_2], axis=0)
y_test = np.concatenate([
    y_test_normal_only,                      # normal (0s)
    np.ones(len(anomaly_embeddings_2))  ])

# ========== PREDICTIONS ==========
recon_val = autoencoder.predict(X_val)
recon_test = autoencoder.predict(X_test)

mse_val = np.mean(np.square(X_val - recon_val), axis=1)
mse_test = np.mean(np.square(X_test - recon_test), axis=1)

# ========== THRESHOLD SELECTION ==========
sorted_mse = np.sort(mse_val)
f1_scores = []
for thresh in sorted_mse:
    preds = (mse_val > thresh).astype(int)
    f1_scores.append(f1_score(y_val, preds))

best_thresh = sorted_mse[np.argmax(f1_scores)]
print(f"Best threshold: {best_thresh:.6f}")
print(f"Best F1 score (val): {max(f1_scores):.4f}")

# ========== FINAL EVALUATION ==========
y_pred_test = (mse_test > best_thresh).astype(int)

print("\nTest Evaluation:")
print(f"F1 Score   : {f1_score(y_test, y_pred_test):.4f}")
print(f"Precision  : {precision_score(y_test, y_pred_test):.4f}")
print(f"Recall     : {recall_score(y_test, y_pred_test):.4f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_test))
print(classification_report(y_test, y_pred_test))


Processing C:/Users/rammo/Desktop/Data sensitivity discovery/Anomaly Detection in docs/Scaping dataset/documents and embeddings/all_docs_v2_test_AADL.npy...
Training new autoencoder...
Epoch 1/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - loss: 0.2516 - val_loss: 0.2511
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 169ms/step - loss: 0.2512 - val_loss: 0.2506
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 144ms/step - loss: 0.2506 - val_loss: 0.2498
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 164ms/step - loss: 0.2498 - val_loss: 0.2486
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 149ms/step - loss: 0.2486 - val_loss: 0.2470
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 172ms/step - loss: 0.2469 - val_loss: 0.2447
Epoch 7/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 120ms/step - loss: 0.2445 - val_loss:

air algerie

In [12]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model, load_model
from sklearn.metrics import confusion_matrix, classification_report, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
# Path to folder containing embeddings
folder_path = 'C:/Users/rammo/Desktop/Data sensitivity discovery/Anomaly Detection in docs/Scaping dataset/scribd_test_2'

# Filter for .npy files
embedding_file ='C:/Users/rammo/Desktop/Data sensitivity discovery/Anomaly Detection in docs/Scaping dataset/documents and embeddings/all_docs_v2_test_Air_Algérie.npy'

filename_only = os.path.basename(embedding_file)  # Gives: all_docs_v2_test_Sonelgaz.npy
model_name = f"model_{filename_only.replace('.npy', '')}.keras"
model_path = os.path.join(folder_path, model_name)

print(f"Processing {embedding_file}...")
    
    # Load embeddings
embeddings = np.load(embedding_file)

    
# Split embeddings: 80% for training, 20% for normal test
x_train, x_test_normal = train_test_split(embeddings, test_size=0.2, random_state=42)

# ========== MODEL SETUP ==========
input_dim = x_train.shape[1]
encoder_input = Input(shape=(input_dim,))
x = Dense(256, activation='relu')(encoder_input)
encoded = Dense(32, activation='relu')(x)
x = Dense(256, activation='relu')(encoded)
decoded = Dense(input_dim, activation='sigmoid')(x)
autoencoder = Model(encoder_input, decoded)
autoencoder.compile(optimizer='adam', loss='mse')

# ========== TRAIN MODEL IF NOT EXIST ==========

print("Training new autoencoder...")
autoencoder.fit(
        x_train, x_train,
        epochs=50,
        batch_size=256,
        shuffle=True,
        validation_split=0.1,
        verbose=1
    )
autoencoder.save(model_path)
print(f"Saved model  to {model_path}")

# ========== ANOMALY EMBEDDINGS ==========
anomaly_sources = [
    'all_docs_v2_test_AADL.npy',
    # 'all_docs_v2_test_Air_Algérie.npy',
    'all_docs_v2_test_Algérie_Poste.npy',
    'all_docs_v2_test_Algérie_Télécom.npy',
    'all_docs_v2_test_Crédit_Populaire_dAlgérie.npy',
    'all_docs_v2_test_Emploitic.npy',
    'all_docs_v2_test_ICOSNET.npy',
    'all_docs_v2_test_Ooredoo.npy',
    'all_docs_v2_test_Ouedkniss.npy',
    'all_docs_v2_test_Sonelgaz.npy',
    'all_docs_v2_test_Sonatrach.npy',
    'all_docs_v2_test_Yassir.npy'   
]

anomaly_embeddings = np.concatenate([
    np.load(os.path.join(folder_path, src))[0:1] for src in anomaly_sources
], axis=0)

anomaly_embeddings_2 = np.concatenate([
    np.load(os.path.join(folder_path, src))[1:2] for src in anomaly_sources
], axis=0)

# ========== TEST SET CONSTRUCTION ==========
X_tmp = np.concatenate([x_test_normal, anomaly_embeddings], axis=0)
y_tmp = np.concatenate([
    np.zeros(len(x_test_normal)),  # normal
    np.ones(len(anomaly_embeddings))  # anomaly
])

# Split into validation and test (50-50)
X_val, X_test, y_val, y_test = train_test_split(
    X_tmp, y_tmp, test_size=0.5, stratify=y_tmp, random_state=42
)

# Append additional anomalies to test set
X_test_normal_only = X_test[y_test == 0]
y_test_normal_only = y_test[y_test == 0]  

# Step 4: Append new anomalies to test set
X_test = np.concatenate([X_test_normal_only, anomaly_embeddings_2], axis=0)
y_test = np.concatenate([
    y_test_normal_only,                      # normal (0s)
    np.ones(len(anomaly_embeddings_2))  ])

# ========== PREDICTIONS ==========
recon_val = autoencoder.predict(X_val)
recon_test = autoencoder.predict(X_test)

mse_val = np.mean(np.square(X_val - recon_val), axis=1)
mse_test = np.mean(np.square(X_test - recon_test), axis=1)

# ========== THRESHOLD SELECTION ==========
sorted_mse = np.sort(mse_val)
f1_scores = []
for thresh in sorted_mse:
    preds = (mse_val > thresh).astype(int)
    f1_scores.append(f1_score(y_val, preds))

best_thresh = sorted_mse[np.argmax(f1_scores)]
print(f"Best threshold: {best_thresh:.6f}")
print(f"Best F1 score (val): {max(f1_scores):.4f}")

# ========== FINAL EVALUATION ==========
y_pred_test = (mse_test > best_thresh).astype(int)

print("\nTest Evaluation:")
print(f"F1 Score   : {f1_score(y_test, y_pred_test):.4f}")
print(f"Precision  : {precision_score(y_test, y_pred_test):.4f}")
print(f"Recall     : {recall_score(y_test, y_pred_test):.4f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_test))
print(classification_report(y_test, y_pred_test))


Processing C:/Users/rammo/Desktop/Data sensitivity discovery/Anomaly Detection in docs/Scaping dataset/documents and embeddings/all_docs_v2_test_Air_Algérie.npy...
Training new autoencoder...
Epoch 1/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - loss: 0.2515 - val_loss: 0.2509
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 130ms/step - loss: 0.2510 - val_loss: 0.2501
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 136ms/step - loss: 0.2502 - val_loss: 0.2490
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 197ms/step - loss: 0.2490 - val_loss: 0.2473
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 112ms/step - loss: 0.2472 - val_loss: 0.2451
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 198ms/step - loss: 0.2448 - val_loss: 0.2421
Epoch 7/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 118ms/step - loss: 0.2415 - va

algerie poste

In [11]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model, load_model
from sklearn.metrics import confusion_matrix, classification_report, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
# Path to folder containing embeddings
folder_path = 'C:/Users/rammo/Desktop/Data sensitivity discovery/Anomaly Detection in docs/Scaping dataset/scribd_test_2'

# Filter for .npy files
embedding_file ='C:/Users/rammo/Desktop/Data sensitivity discovery/Anomaly Detection in docs/Scaping dataset/documents and embeddings/all_docs_v2_test_Algérie_Poste.npy'

filename_only = os.path.basename(embedding_file)  # Gives: all_docs_v2_test_Sonelgaz.npy
model_name = f"model_{filename_only.replace('.npy', '')}.keras"
model_path = os.path.join(folder_path, model_name)

print(f"Processing {embedding_file}...")
    
    # Load embeddings
embeddings = np.load(embedding_file)

    
# Split embeddings: 80% for training, 20% for normal test
x_train, x_test_normal = train_test_split(embeddings, test_size=0.2, random_state=42)

# ========== MODEL SETUP ==========
input_dim = x_train.shape[1]
encoder_input = Input(shape=(input_dim,))
x = Dense(256, activation='relu')(encoder_input)
encoded = Dense(32, activation='relu')(x)
x = Dense(256, activation='relu')(encoded)
decoded = Dense(input_dim, activation='sigmoid')(x)
autoencoder = Model(encoder_input, decoded)
autoencoder.compile(optimizer='adam', loss='mse')

# ========== TRAIN MODEL IF NOT EXIST ==========

print("Training new autoencoder...")
autoencoder.fit(
        x_train, x_train,
        epochs=50,
        batch_size=256,
        shuffle=True,
        validation_split=0.1,
        verbose=1
    )
autoencoder.save(model_path)
print(f"Saved model  to {model_path}")

# ========== ANOMALY EMBEDDINGS ==========
anomaly_sources = [
    'all_docs_v2_test_AADL.npy',
    'all_docs_v2_test_Air_Algérie.npy',
    # 'all_docs_v2_test_Algérie_Poste.npy',
    'all_docs_v2_test_Algérie_Télécom.npy',
    'all_docs_v2_test_Crédit_Populaire_dAlgérie.npy',
    'all_docs_v2_test_Emploitic.npy',
    'all_docs_v2_test_ICOSNET.npy',
    'all_docs_v2_test_Ooredoo.npy',
    'all_docs_v2_test_Ouedkniss.npy',
    'all_docs_v2_test_Sonelgaz.npy',
    'all_docs_v2_test_Sonatrach.npy',
    'all_docs_v2_test_Yassir.npy'   
]

anomaly_embeddings = np.concatenate([
    np.load(os.path.join(folder_path, src))[0:1] for src in anomaly_sources
], axis=0)

anomaly_embeddings_2 = np.concatenate([
    np.load(os.path.join(folder_path, src))[1:2] for src in anomaly_sources
], axis=0)

# ========== TEST SET CONSTRUCTION ==========
X_tmp = np.concatenate([x_test_normal, anomaly_embeddings], axis=0)
y_tmp = np.concatenate([
    np.zeros(len(x_test_normal)),  # normal
    np.ones(len(anomaly_embeddings))  # anomaly
])

# Split into validation and test (50-50)
X_val, X_test, y_val, y_test = train_test_split(
    X_tmp, y_tmp, test_size=0.5, stratify=y_tmp, random_state=42
)

# Append additional anomalies to test set
X_test_normal_only = X_test[y_test == 0]
y_test_normal_only = y_test[y_test == 0]  

# Step 4: Append new anomalies to test set
X_test = np.concatenate([X_test_normal_only, anomaly_embeddings_2], axis=0)
y_test = np.concatenate([
    y_test_normal_only,                      # normal (0s)
    np.ones(len(anomaly_embeddings_2))  ])

# ========== PREDICTIONS ==========
recon_val = autoencoder.predict(X_val)
recon_test = autoencoder.predict(X_test)

mse_val = np.mean(np.square(X_val - recon_val), axis=1)
mse_test = np.mean(np.square(X_test - recon_test), axis=1)

# ========== THRESHOLD SELECTION ==========
sorted_mse = np.sort(mse_val)
f1_scores = []
for thresh in sorted_mse:
    preds = (mse_val > thresh).astype(int)
    f1_scores.append(f1_score(y_val, preds))

best_thresh = sorted_mse[np.argmax(f1_scores)]
print(f"Best threshold: {best_thresh:.6f}")
print(f"Best F1 score (val): {max(f1_scores):.4f}")

# ========== FINAL EVALUATION ==========
y_pred_test = (mse_test > best_thresh).astype(int)

print("\nTest Evaluation:")
print(f"F1 Score   : {f1_score(y_test, y_pred_test):.4f}")
print(f"Precision  : {precision_score(y_test, y_pred_test):.4f}")
print(f"Recall     : {recall_score(y_test, y_pred_test):.4f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_test))
print(classification_report(y_test, y_pred_test))


Processing C:/Users/rammo/Desktop/Data sensitivity discovery/Anomaly Detection in docs/Scaping dataset/documents and embeddings/all_docs_v2_test_Algérie_Poste.npy...
Training new autoencoder...
Epoch 1/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - loss: 0.2516 - val_loss: 0.2511
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 158ms/step - loss: 0.2511 - val_loss: 0.2504
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 210ms/step - loss: 0.2504 - val_loss: 0.2495
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 139ms/step - loss: 0.2495 - val_loss: 0.2481
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 169ms/step - loss: 0.2480 - val_loss: 0.2461
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 150ms/step - loss: 0.2459 - val_loss: 0.2433
Epoch 7/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step - loss: 0.2431 - 

Algérie_Télécom

In [15]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model, load_model
from sklearn.metrics import confusion_matrix, classification_report, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
# Path to folder containing embeddings
folder_path = 'C:/Users/rammo/Desktop/Data sensitivity discovery/Anomaly Detection in docs/Scaping dataset/scribd_test_2'

# Filter for .npy files
embedding_file ='C:/Users/rammo/Desktop/Data sensitivity discovery/Anomaly Detection in docs/Scaping dataset/documents and embeddings/all_docs_v2_test_Algérie_Télécom.npy'

# for filename in embedding_files:
filename_only = os.path.basename(embedding_file)  # Gives: all_docs_v2_test_Sonelgaz.npy
model_name = f"model_{filename_only.replace('.npy', '')}.keras"
model_path = os.path.join(folder_path, model_name)

print(f"Processing {embedding_file}...")
    
    # Load embeddings
embeddings = np.load(embedding_file)

    # # Skip empty files or corrupted embeddings
    # if embeddings.size == 0:
    #     print(f"Skipping {filename} (empty)")
    #     continue

# Split embeddings: 80% for training, 20% for normal test
x_train, x_test_normal = train_test_split(embeddings, test_size=0.2, random_state=42)

# ========== MODEL SETUP ==========
input_dim = x_train.shape[1]
encoder_input = Input(shape=(input_dim,))
x = Dense(256, activation='relu')(encoder_input)
encoded = Dense(32, activation='relu')(x)
x = Dense(256, activation='relu')(encoded)
decoded = Dense(input_dim, activation='sigmoid')(x)
autoencoder = Model(encoder_input, decoded)
autoencoder.compile(optimizer='adam', loss='mse')

# ========== TRAIN MODEL IF NOT EXIST ==========

print("Training new autoencoder...")
autoencoder.fit(
        x_train, x_train,
        epochs=50,
        batch_size=256,
        shuffle=True,
        validation_split=0.1,
        verbose=1
    )
autoencoder.save(model_path)
print(f"Saved model  to {model_path}")

# ========== ANOMALY EMBEDDINGS ==========
anomaly_sources = [
    'all_docs_v2_test_AADL.npy',
    'all_docs_v2_test_Air_Algérie.npy',
    'all_docs_v2_test_Algérie_Poste.npy',
    # 'all_docs_v2_test_Algérie_Télécom.npy',
    'all_docs_v2_test_Crédit_Populaire_dAlgérie.npy',
    'all_docs_v2_test_Emploitic.npy',
    'all_docs_v2_test_ICOSNET.npy',
    'all_docs_v2_test_Ooredoo.npy',
    'all_docs_v2_test_Ouedkniss.npy',
    'all_docs_v2_test_Sonelgaz.npy',
    'all_docs_v2_test_Sonatrach.npy',
    'all_docs_v2_test_Yassir.npy'
]

anomaly_embeddings = np.concatenate([
    np.load(os.path.join(folder_path, src))[0:1] for src in anomaly_sources
], axis=0)

anomaly_embeddings_2 = np.concatenate([
    np.load(os.path.join(folder_path, src))[1:2] for src in anomaly_sources
], axis=0)

# ========== TEST SET CONSTRUCTION ==========
X_tmp = np.concatenate([x_test_normal, anomaly_embeddings], axis=0)
y_tmp = np.concatenate([
    np.zeros(len(x_test_normal)),  # normal
    np.ones(len(anomaly_embeddings))  # anomaly
])

# Split into validation and test (50-50)
X_val, X_test, y_val, y_test = train_test_split(
    X_tmp, y_tmp, test_size=0.5, stratify=y_tmp, random_state=42
)

# Append additional anomalies to test set
X_test_normal_only = X_test[y_test == 0]
y_test_normal_only = y_test[y_test == 0]  

# Step 4: Append new anomalies to test set
X_test = np.concatenate([X_test_normal_only, anomaly_embeddings_2], axis=0)
y_test = np.concatenate([
    y_test_normal_only,                      # normal (0s)
    np.ones(len(anomaly_embeddings_2))       # new anomalies (1s)
])
# ========== PREDICTIONS ==========
recon_val = autoencoder.predict(X_val)
recon_test = autoencoder.predict(X_test)

mse_val = np.mean(np.square(X_val - recon_val), axis=1)
mse_test = np.mean(np.square(X_test - recon_test), axis=1)

# ========== THRESHOLD SELECTION ==========
sorted_mse = np.sort(mse_val)
f1_scores = []
for thresh in sorted_mse:
    preds = (mse_val > thresh).astype(int)
    f1_scores.append(f1_score(y_val, preds))

best_thresh = sorted_mse[np.argmax(f1_scores)]
print(f"Best threshold: {best_thresh:.6f}")
print(f"Best F1 score (val): {max(f1_scores):.4f}")

# ========== FINAL EVALUATION ==========
y_pred_test = (mse_test > best_thresh).astype(int)

print("\nTest Evaluation:")
print(f"F1 Score   : {f1_score(y_test, y_pred_test):.4f}")
print(f"Precision  : {precision_score(y_test, y_pred_test):.4f}")
print(f"Recall     : {recall_score(y_test, y_pred_test):.4f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_test))
print("Classification Report:")
print(classification_report(y_test, y_pred_test, target_names=['Normal', 'Anomaly']))


Processing C:/Users/rammo/Desktop/Data sensitivity discovery/Anomaly Detection in docs/Scaping dataset/documents and embeddings/all_docs_v2_test_Algérie_Télécom.npy...
Training new autoencoder...
Epoch 1/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - loss: 0.2516 - val_loss: 0.2510
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 133ms/step - loss: 0.2510 - val_loss: 0.2502
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 123ms/step - loss: 0.2502 - val_loss: 0.2490
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 132ms/step - loss: 0.2491 - val_loss: 0.2472
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 114ms/step - loss: 0.2474 - val_loss: 0.2447
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step - loss: 0.2451 - val_loss: 0.2413
Epoch 7/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 174ms/step - loss: 0.2419 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Crédit_Populaire_dAlgérie

In [18]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model, load_model
from sklearn.metrics import confusion_matrix, classification_report, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
# Path to folder containing embeddings
folder_path = 'C:/Users/rammo/Desktop/Data sensitivity discovery/Anomaly Detection in docs/Scaping dataset/scribd_test_2'

# Filter for .npy files
embedding_file ='C:/Users/rammo/Desktop/Data sensitivity discovery/Anomaly Detection in docs/Scaping dataset/documents and embeddings/all_docs_v2_test_Crédit_Populaire_dAlgérie.npy'

# for filename in embedding_files:
filename_only = os.path.basename(embedding_file)  # Gives: all_docs_v2_test_Sonelgaz.npy
model_name = f"model_{filename_only.replace('.npy', '')}.keras"
model_path = os.path.join(folder_path, model_name)

print(f"Processing {embedding_file}...")
    
    # Load embeddings
embeddings = np.load(embedding_file)

    # # Skip empty files or corrupted embeddings
    # if embeddings.size == 0:
    #     print(f"Skipping {filename} (empty)")
    #     continue

# Split embeddings: 80% for training, 20% for normal test
x_train, x_test_normal = train_test_split(embeddings, test_size=0.2, random_state=42)

# ========== MODEL SETUP ==========
input_dim = x_train.shape[1]
encoder_input = Input(shape=(input_dim,))
x = Dense(256, activation='relu')(encoder_input)
encoded = Dense(32, activation='relu')(x)
x = Dense(256, activation='relu')(encoded)
decoded = Dense(input_dim, activation='sigmoid')(x)
autoencoder = Model(encoder_input, decoded)
autoencoder.compile(optimizer='adam', loss='mse')

# ========== TRAIN MODEL IF NOT EXIST ==========

print("Training new autoencoder...")
autoencoder.fit(
        x_train, x_train,
        epochs=50,
        batch_size=256,
        shuffle=True,
        validation_split=0.1,
        verbose=1
    )
autoencoder.save(model_path)
print(f"Saved model  to {model_path}")

# ========== ANOMALY EMBEDDINGS ==========
anomaly_sources = [
    'all_docs_v2_test_AADL.npy',
    'all_docs_v2_test_Air_Algérie.npy',
    'all_docs_v2_test_Algérie_Poste.npy',
    'all_docs_v2_test_Algérie_Télécom.npy',
    # 'all_docs_v2_test_Crédit_Populaire_dAlgérie.npy',
    'all_docs_v2_test_Emploitic.npy',
    'all_docs_v2_test_ICOSNET.npy',
    'all_docs_v2_test_Ooredoo.npy',
    'all_docs_v2_test_Ouedkniss.npy',
    'all_docs_v2_test_Sonelgaz.npy',
    'all_docs_v2_test_Sonatrach.npy',
    'all_docs_v2_test_Yassir.npy'
]

anomaly_embeddings = np.concatenate([
    np.load(os.path.join(folder_path, src))[0:1] for src in anomaly_sources
], axis=0)

anomaly_embeddings_2 = np.concatenate([
    np.load(os.path.join(folder_path, src))[1:2] for src in anomaly_sources
], axis=0)

# ========== TEST SET CONSTRUCTION ==========
X_tmp = np.concatenate([x_test_normal, anomaly_embeddings], axis=0)
y_tmp = np.concatenate([
    np.zeros(len(x_test_normal)),  # normal
    np.ones(len(anomaly_embeddings))  # anomaly
])

# Split into validation and test (50-50)
X_val, X_test, y_val, y_test = train_test_split(
    X_tmp, y_tmp, test_size=0.5, stratify=y_tmp, random_state=42
)

# Append additional anomalies to test set
X_test_normal_only = X_test[y_test == 0]
y_test_normal_only = y_test[y_test == 0]  

# Step 4: Append new anomalies to test set
X_test = np.concatenate([X_test_normal_only, anomaly_embeddings_2], axis=0)
y_test = np.concatenate([
    y_test_normal_only,                      # normal (0s)
    np.ones(len(anomaly_embeddings_2))       # new anomalies (1s)
])
# ========== PREDICTIONS ==========
recon_val = autoencoder.predict(X_val)
recon_test = autoencoder.predict(X_test)

mse_val = np.mean(np.square(X_val - recon_val), axis=1)
mse_test = np.mean(np.square(X_test - recon_test), axis=1)

# ========== THRESHOLD SELECTION ==========
sorted_mse = np.sort(mse_val)
f1_scores = []
for thresh in sorted_mse:
    preds = (mse_val > thresh).astype(int)
    f1_scores.append(f1_score(y_val, preds))

best_thresh = sorted_mse[np.argmax(f1_scores)]
print(f"Best threshold: {best_thresh:.6f}")
print(f"Best F1 score (val): {max(f1_scores):.4f}")

# ========== FINAL EVALUATION ==========
y_pred_test = (mse_test > best_thresh).astype(int)

print("\nTest Evaluation:")
print(f"F1 Score   : {f1_score(y_test, y_pred_test):.4f}")
print(f"Precision  : {precision_score(y_test, y_pred_test):.4f}")
print(f"Recall     : {recall_score(y_test, y_pred_test):.4f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_test))
print("Classification Report:")
print(classification_report(y_test, y_pred_test, target_names=['Normal', 'Anomaly']))


Processing C:/Users/rammo/Desktop/Data sensitivity discovery/Anomaly Detection in docs/Scaping dataset/documents and embeddings/all_docs_v2_test_Crédit_Populaire_dAlgérie.npy...
Training new autoencoder...
Epoch 1/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - loss: 0.2516 - val_loss: 0.2511
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 166ms/step - loss: 0.2511 - val_loss: 0.2505
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 195ms/step - loss: 0.2505 - val_loss: 0.2496
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 154ms/step - loss: 0.2496 - val_loss: 0.2482
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 156ms/step - loss: 0.2483 - val_loss: 0.2462
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 156ms/step - loss: 0.2464 - val_loss: 0.2433
Epoch 7/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 149ms/step - los

emploitic

In [19]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model, load_model
from sklearn.metrics import confusion_matrix, classification_report, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
# Path to folder containing embeddings
folder_path = 'C:/Users/rammo/Desktop/Data sensitivity discovery/Anomaly Detection in docs/Scaping dataset/scribd_test_2'

# Filter for .npy files
embedding_file ='C:/Users/rammo/Desktop/Data sensitivity discovery/Anomaly Detection in docs/Scaping dataset/documents and embeddings/all_docs_v2_test_Emploitic.npy'

# for filename in embedding_files:
filename_only = os.path.basename(embedding_file)  # Gives: all_docs_v2_test_Sonelgaz.npy
model_name = f"model_{filename_only.replace('.npy', '')}.keras"
model_path = os.path.join(folder_path, model_name)

print(f"Processing {embedding_file}...")
    
    # Load embeddings
embeddings = np.load(embedding_file)

    # # Skip empty files or corrupted embeddings
    # if embeddings.size == 0:
    #     print(f"Skipping {filename} (empty)")
    #     continue

# Split embeddings: 80% for training, 20% for normal test
x_train, x_test_normal = train_test_split(embeddings, test_size=0.2, random_state=42)

# ========== MODEL SETUP ==========
input_dim = x_train.shape[1]
encoder_input = Input(shape=(input_dim,))
x = Dense(256, activation='relu')(encoder_input)
encoded = Dense(32, activation='relu')(x)
x = Dense(256, activation='relu')(encoded)
decoded = Dense(input_dim, activation='sigmoid')(x)
autoencoder = Model(encoder_input, decoded)
autoencoder.compile(optimizer='adam', loss='mse')

# ========== TRAIN MODEL IF NOT EXIST ==========

print("Training new autoencoder...")
autoencoder.fit(
        x_train, x_train,
        epochs=50,
        batch_size=256,
        shuffle=True,
        validation_split=0.1,
        verbose=1
    )
autoencoder.save(model_path)
print(f"Saved model  to {model_path}")

# ========== ANOMALY EMBEDDINGS ==========
anomaly_sources = [
    'all_docs_v2_test_AADL.npy',
    'all_docs_v2_test_Air_Algérie.npy',
    'all_docs_v2_test_Algérie_Poste.npy',
    'all_docs_v2_test_Algérie_Télécom.npy',
    'all_docs_v2_test_Crédit_Populaire_dAlgérie.npy',
    # 'all_docs_v2_test_Emploitic.npy',
    'all_docs_v2_test_ICOSNET.npy',
    'all_docs_v2_test_Ooredoo.npy',
    'all_docs_v2_test_Ouedkniss.npy',
    'all_docs_v2_test_Sonelgaz.npy',
    'all_docs_v2_test_Sonatrach.npy',
    'all_docs_v2_test_Yassir.npy'
]

anomaly_embeddings = np.concatenate([
    np.load(os.path.join(folder_path, src))[0:1] for src in anomaly_sources
], axis=0)

anomaly_embeddings_2 = np.concatenate([
    np.load(os.path.join(folder_path, src))[1:2] for src in anomaly_sources
], axis=0)

# ========== TEST SET CONSTRUCTION ==========
X_tmp = np.concatenate([x_test_normal, anomaly_embeddings], axis=0)
y_tmp = np.concatenate([
    np.zeros(len(x_test_normal)),  # normal
    np.ones(len(anomaly_embeddings))  # anomaly
])

# Split into validation and test (50-50)
X_val, X_test, y_val, y_test = train_test_split(
    X_tmp, y_tmp, test_size=0.5, stratify=y_tmp, random_state=42
)

# Append additional anomalies to test set
X_test_normal_only = X_test[y_test == 0]
y_test_normal_only = y_test[y_test == 0]  

# Step 4: Append new anomalies to test set
X_test = np.concatenate([X_test_normal_only, anomaly_embeddings_2], axis=0)
y_test = np.concatenate([
    y_test_normal_only,                      # normal (0s)
    np.ones(len(anomaly_embeddings_2))       # new anomalies (1s)
])
# ========== PREDICTIONS ==========
recon_val = autoencoder.predict(X_val)
recon_test = autoencoder.predict(X_test)

mse_val = np.mean(np.square(X_val - recon_val), axis=1)
mse_test = np.mean(np.square(X_test - recon_test), axis=1)

# ========== THRESHOLD SELECTION ==========
sorted_mse = np.sort(mse_val)
f1_scores = []
for thresh in sorted_mse:
    preds = (mse_val > thresh).astype(int)
    f1_scores.append(f1_score(y_val, preds))

best_thresh = sorted_mse[np.argmax(f1_scores)]
print(f"Best threshold: {best_thresh:.6f}")
print(f"Best F1 score (val): {max(f1_scores):.4f}")

# ========== FINAL EVALUATION ==========
y_pred_test = (mse_test > best_thresh).astype(int)

print("\nTest Evaluation:")
print(f"F1 Score   : {f1_score(y_test, y_pred_test):.4f}")
print(f"Precision  : {precision_score(y_test, y_pred_test):.4f}")
print(f"Recall     : {recall_score(y_test, y_pred_test):.4f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_test))
print("Classification Report:")
print(classification_report(y_test, y_pred_test, target_names=['Normal', 'Anomaly']))


Processing C:/Users/rammo/Desktop/Data sensitivity discovery/Anomaly Detection in docs/Scaping dataset/documents and embeddings/all_docs_v2_test_Emploitic.npy...
Training new autoencoder...
Epoch 1/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - loss: 0.2515 - val_loss: 0.2509
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 138ms/step - loss: 0.2509 - val_loss: 0.2500
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 132ms/step - loss: 0.2501 - val_loss: 0.2488
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 126ms/step - loss: 0.2489 - val_loss: 0.2469
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 146ms/step - loss: 0.2470 - val_loss: 0.2444
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 214ms/step - loss: 0.2444 - val_loss: 0.2408
Epoch 7/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 157ms/step - loss: 0.2409 - val_

icosnet

In [20]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model, load_model
from sklearn.metrics import confusion_matrix, classification_report, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
# Path to folder containing embeddings
folder_path = 'C:/Users/rammo/Desktop/Data sensitivity discovery/Anomaly Detection in docs/Scaping dataset/scribd_test_2'

# Filter for .npy files
embedding_file ='C:/Users/rammo/Desktop/Data sensitivity discovery/Anomaly Detection in docs/Scaping dataset/documents and embeddings/all_docs_v2_test_ICOSNET.npy'

# for filename in embedding_files:
filename_only = os.path.basename(embedding_file)  # Gives: all_docs_v2_test_Sonelgaz.npy
model_name = f"model_{filename_only.replace('.npy', '')}.keras"
model_path = os.path.join(folder_path, model_name)

print(f"Processing {embedding_file}...")
    
    # Load embeddings
embeddings = np.load(embedding_file)

    # # Skip empty files or corrupted embeddings
    # if embeddings.size == 0:
    #     print(f"Skipping {filename} (empty)")
    #     continue

# Split embeddings: 80% for training, 20% for normal test
x_train, x_test_normal = train_test_split(embeddings, test_size=0.2, random_state=42)

# ========== MODEL SETUP ==========
input_dim = x_train.shape[1]
encoder_input = Input(shape=(input_dim,))
x = Dense(256, activation='relu')(encoder_input)
encoded = Dense(32, activation='relu')(x)
x = Dense(256, activation='relu')(encoded)
decoded = Dense(input_dim, activation='sigmoid')(x)
autoencoder = Model(encoder_input, decoded)
autoencoder.compile(optimizer='adam', loss='mse')

# ========== TRAIN MODEL IF NOT EXIST ==========

print("Training new autoencoder...")
autoencoder.fit(
        x_train, x_train,
        epochs=50,
        batch_size=256,
        shuffle=True,
        validation_split=0.1,
        verbose=1
    )
autoencoder.save(model_path)
print(f"Saved model  to {model_path}")

# ========== ANOMALY EMBEDDINGS ==========
anomaly_sources = [
    'all_docs_v2_test_AADL.npy',
    'all_docs_v2_test_Air_Algérie.npy',
    'all_docs_v2_test_Algérie_Poste.npy',
    'all_docs_v2_test_Algérie_Télécom.npy',
    'all_docs_v2_test_Crédit_Populaire_dAlgérie.npy',
    'all_docs_v2_test_Emploitic.npy',
    # 'all_docs_v2_test_ICOSNET.npy',
    'all_docs_v2_test_Ooredoo.npy',
    'all_docs_v2_test_Ouedkniss.npy',
    'all_docs_v2_test_Sonelgaz.npy',
    'all_docs_v2_test_Sonatrach.npy',
    'all_docs_v2_test_Yassir.npy'
]

anomaly_embeddings = np.concatenate([
    np.load(os.path.join(folder_path, src))[0:1] for src in anomaly_sources
], axis=0)

anomaly_embeddings_2 = np.concatenate([
    np.load(os.path.join(folder_path, src))[1:2] for src in anomaly_sources
], axis=0)

# ========== TEST SET CONSTRUCTION ==========
X_tmp = np.concatenate([x_test_normal, anomaly_embeddings], axis=0)
y_tmp = np.concatenate([
    np.zeros(len(x_test_normal)),  # normal
    np.ones(len(anomaly_embeddings))  # anomaly
])

# Split into validation and test (50-50)
X_val, X_test, y_val, y_test = train_test_split(
    X_tmp, y_tmp, test_size=0.5, stratify=y_tmp, random_state=42
)

# Append additional anomalies to test set
X_test_normal_only = X_test[y_test == 0]
y_test_normal_only = y_test[y_test == 0]  

# Step 4: Append new anomalies to test set
X_test = np.concatenate([X_test_normal_only, anomaly_embeddings_2], axis=0)
y_test = np.concatenate([
    y_test_normal_only,                      # normal (0s)
    np.ones(len(anomaly_embeddings_2))       # new anomalies (1s)
])
# ========== PREDICTIONS ==========
recon_val = autoencoder.predict(X_val)
recon_test = autoencoder.predict(X_test)

mse_val = np.mean(np.square(X_val - recon_val), axis=1)
mse_test = np.mean(np.square(X_test - recon_test), axis=1)

# ========== THRESHOLD SELECTION ==========
sorted_mse = np.sort(mse_val)
f1_scores = []
for thresh in sorted_mse:
    preds = (mse_val > thresh).astype(int)
    f1_scores.append(f1_score(y_val, preds))

best_thresh = sorted_mse[np.argmax(f1_scores)]
print(f"Best threshold: {best_thresh:.6f}")
print(f"Best F1 score (val): {max(f1_scores):.4f}")

# ========== FINAL EVALUATION ==========
y_pred_test = (mse_test > best_thresh).astype(int)

print("\nTest Evaluation:")
print(f"F1 Score   : {f1_score(y_test, y_pred_test):.4f}")
print(f"Precision  : {precision_score(y_test, y_pred_test):.4f}")
print(f"Recall     : {recall_score(y_test, y_pred_test):.4f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_test))
print("Classification Report:")
print(classification_report(y_test, y_pred_test, target_names=['Normal', 'Anomaly']))


Processing C:/Users/rammo/Desktop/Data sensitivity discovery/Anomaly Detection in docs/Scaping dataset/documents and embeddings/all_docs_v2_test_ICOSNET.npy...
Training new autoencoder...
Epoch 1/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - loss: 0.2514 - val_loss: 0.2509
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 174ms/step - loss: 0.2509 - val_loss: 0.2502
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 173ms/step - loss: 0.2502 - val_loss: 0.2491
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 160ms/step - loss: 0.2491 - val_loss: 0.2477
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 275ms/step - loss: 0.2474 - val_loss: 0.2455
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 131ms/step - loss: 0.2451 - val_loss: 0.2426
Epoch 7/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 146ms/step - loss: 0.2418 - val_lo

ooredoo

In [21]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model, load_model
from sklearn.metrics import confusion_matrix, classification_report, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
# Path to folder containing embeddings
folder_path = 'C:/Users/rammo/Desktop/Data sensitivity discovery/Anomaly Detection in docs/Scaping dataset/scribd_test_2'

# Filter for .npy files
embedding_file ='C:/Users/rammo/Desktop/Data sensitivity discovery/Anomaly Detection in docs/Scaping dataset/documents and embeddings/all_docs_v2_test_Ooredoo.npy'

# for filename in embedding_files:
filename_only = os.path.basename(embedding_file)  # Gives: all_docs_v2_test_Sonelgaz.npy
model_name = f"model_{filename_only.replace('.npy', '')}.keras"
model_path = os.path.join(folder_path, model_name)

print(f"Processing {embedding_file}...")
    
    # Load embeddings
embeddings = np.load(embedding_file)

    # # Skip empty files or corrupted embeddings
    # if embeddings.size == 0:
    #     print(f"Skipping {filename} (empty)")
    #     continue

# Split embeddings: 80% for training, 20% for normal test
x_train, x_test_normal = train_test_split(embeddings, test_size=0.2, random_state=42)

# ========== MODEL SETUP ==========
input_dim = x_train.shape[1]
encoder_input = Input(shape=(input_dim,))
x = Dense(256, activation='relu')(encoder_input)
encoded = Dense(32, activation='relu')(x)
x = Dense(256, activation='relu')(encoded)
decoded = Dense(input_dim, activation='sigmoid')(x)
autoencoder = Model(encoder_input, decoded)
autoencoder.compile(optimizer='adam', loss='mse')

# ========== TRAIN MODEL IF NOT EXIST ==========

print("Training new autoencoder...")
autoencoder.fit(
        x_train, x_train,
        epochs=50,
        batch_size=256,
        shuffle=True,
        validation_split=0.1,
        verbose=1
    )
autoencoder.save(model_path)
print(f"Saved model  to {model_path}")

# ========== ANOMALY EMBEDDINGS ==========
anomaly_sources = [
    'all_docs_v2_test_AADL.npy',
    'all_docs_v2_test_Air_Algérie.npy',
    'all_docs_v2_test_Algérie_Poste.npy',
    'all_docs_v2_test_Algérie_Télécom.npy',
    'all_docs_v2_test_Crédit_Populaire_dAlgérie.npy',
    'all_docs_v2_test_Emploitic.npy',
    'all_docs_v2_test_ICOSNET.npy',
    # 'all_docs_v2_test_Ooredoo.npy',
    'all_docs_v2_test_Ouedkniss.npy',
    'all_docs_v2_test_Sonelgaz.npy',
    'all_docs_v2_test_Sonatrach.npy',
    'all_docs_v2_test_Yassir.npy'
]

anomaly_embeddings = np.concatenate([
    np.load(os.path.join(folder_path, src))[0:1] for src in anomaly_sources
], axis=0)

anomaly_embeddings_2 = np.concatenate([
    np.load(os.path.join(folder_path, src))[1:2] for src in anomaly_sources
], axis=0)

# ========== TEST SET CONSTRUCTION ==========
X_tmp = np.concatenate([x_test_normal, anomaly_embeddings], axis=0)
y_tmp = np.concatenate([
    np.zeros(len(x_test_normal)),  # normal
    np.ones(len(anomaly_embeddings))  # anomaly
])

# Split into validation and test (50-50)
X_val, X_test, y_val, y_test = train_test_split(
    X_tmp, y_tmp, test_size=0.5, stratify=y_tmp, random_state=42
)

# Append additional anomalies to test set
X_test_normal_only = X_test[y_test == 0]
y_test_normal_only = y_test[y_test == 0]  

# Step 4: Append new anomalies to test set
X_test = np.concatenate([X_test_normal_only, anomaly_embeddings_2], axis=0)
y_test = np.concatenate([
    y_test_normal_only,                      # normal (0s)
    np.ones(len(anomaly_embeddings_2))       # new anomalies (1s)
])
# ========== PREDICTIONS ==========
recon_val = autoencoder.predict(X_val)
recon_test = autoencoder.predict(X_test)

mse_val = np.mean(np.square(X_val - recon_val), axis=1)
mse_test = np.mean(np.square(X_test - recon_test), axis=1)

# ========== THRESHOLD SELECTION ==========
sorted_mse = np.sort(mse_val)
f1_scores = []
for thresh in sorted_mse:
    preds = (mse_val > thresh).astype(int)
    f1_scores.append(f1_score(y_val, preds))

best_thresh = sorted_mse[np.argmax(f1_scores)]
print(f"Best threshold: {best_thresh:.6f}")
print(f"Best F1 score (val): {max(f1_scores):.4f}")

# ========== FINAL EVALUATION ==========
y_pred_test = (mse_test > best_thresh).astype(int)

print("\nTest Evaluation:")
print(f"F1 Score   : {f1_score(y_test, y_pred_test):.4f}")
print(f"Precision  : {precision_score(y_test, y_pred_test):.4f}")
print(f"Recall     : {recall_score(y_test, y_pred_test):.4f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_test))
print("Classification Report:")
print(classification_report(y_test, y_pred_test, target_names=['Normal', 'Anomaly']))


Processing C:/Users/rammo/Desktop/Data sensitivity discovery/Anomaly Detection in docs/Scaping dataset/documents and embeddings/all_docs_v2_test_Ooredoo.npy...
Training new autoencoder...
Epoch 1/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - loss: 0.2516 - val_loss: 0.2511
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 141ms/step - loss: 0.2511 - val_loss: 0.2505
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 138ms/step - loss: 0.2505 - val_loss: 0.2495
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 148ms/step - loss: 0.2495 - val_loss: 0.2480
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 169ms/step - loss: 0.2481 - val_loss: 0.2459
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 179ms/step - loss: 0.2460 - val_loss: 0.2430
Epoch 7/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 157ms/step - loss: 0.2431 - val_lo

oudkniss

In [9]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model, load_model
from sklearn.metrics import confusion_matrix, classification_report, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
# Path to folder containing embeddings
folder_path = 'C:/Users/rammo/Desktop/Data sensitivity discovery/Anomaly Detection in docs/Scaping dataset/scribd_test_2'

# Filter for .npy files
embedding_file ='C:/Users/rammo/Desktop/Data sensitivity discovery/Anomaly Detection in docs/Scaping dataset/documents and embeddings/all_docs_v2_test_Ouedkniss.npy'

# for filename in embedding_files:
filename_only = os.path.basename(embedding_file)  # Gives: all_docs_v2_test_Sonelgaz.npy
model_name = f"model_{filename_only.replace('.npy', '')}.keras"
model_path = os.path.join(folder_path, model_name)

print(f"Processing {embedding_file}...")
    
    # Load embeddings
embeddings = np.load(embedding_file)

    # # Skip empty files or corrupted embeddings
    # if embeddings.size == 0:
    #     print(f"Skipping {filename} (empty)")
    #     continue

# Split embeddings: 80% for training, 20% for normal test
x_train, x_test_normal = train_test_split(embeddings, test_size=0.2, random_state=42)

# ========== MODEL SETUP ==========
input_dim = x_train.shape[1]
encoder_input = Input(shape=(input_dim,))
x = Dense(256, activation='relu')(encoder_input)
encoded = Dense(32, activation='relu')(x)
x = Dense(256, activation='relu')(encoded)
decoded = Dense(input_dim, activation='sigmoid')(x)
autoencoder = Model(encoder_input, decoded)
autoencoder.compile(optimizer='adam', loss='mse')

# ========== TRAIN MODEL IF NOT EXIST ==========

print("Training new autoencoder...")
autoencoder.fit(
        x_train, x_train,
        epochs=50,
        batch_size=256,
        shuffle=True,
        validation_split=0.1,
        verbose=1
    )
autoencoder.save(model_path)
print(f"Saved model  to {model_path}")

# ========== ANOMALY EMBEDDINGS ==========
anomaly_sources = [
    'all_docs_v2_test_AADL.npy',
    'all_docs_v2_test_Air_Algérie.npy',
    'all_docs_v2_test_Algérie_Poste.npy',
    'all_docs_v2_test_Algérie_Télécom.npy',
    'all_docs_v2_test_Crédit_Populaire_dAlgérie.npy',
    'all_docs_v2_test_Emploitic.npy',
    'all_docs_v2_test_ICOSNET.npy',
    'all_docs_v2_test_Ooredoo.npy',
    'all_docs_v2_test_Sonelgaz.npy',
    'all_docs_v2_test_Sonatrach.npy',
    'all_docs_v2_test_Yassir.npy'
]

anomaly_embeddings = np.concatenate([
    np.load(os.path.join(folder_path, src))[0:1] for src in anomaly_sources
], axis=0)

anomaly_embeddings_2 = np.concatenate([
    np.load(os.path.join(folder_path, src))[1:2] for src in anomaly_sources
], axis=0)

# ========== TEST SET CONSTRUCTION ==========
X_tmp = np.concatenate([x_test_normal, anomaly_embeddings], axis=0)
y_tmp = np.concatenate([
    np.zeros(len(x_test_normal)),  # normal
    np.ones(len(anomaly_embeddings))  # anomaly
])

# Split into validation and test (50-50)
X_val, X_test, y_val, y_test = train_test_split(
    X_tmp, y_tmp, test_size=0.5, stratify=y_tmp, random_state=42
)

# Append additional anomalies to test set
X_test_normal_only = X_test[y_test == 0]
y_test_normal_only = y_test[y_test == 0]  

# Step 4: Append new anomalies to test set
X_test = np.concatenate([X_test_normal_only, anomaly_embeddings_2], axis=0)
y_test = np.concatenate([
    y_test_normal_only,                      # normal (0s)
    np.ones(len(anomaly_embeddings_2))       # new anomalies (1s)
])
# ========== PREDICTIONS ==========
recon_val = autoencoder.predict(X_val)
recon_test = autoencoder.predict(X_test)

mse_val = np.mean(np.square(X_val - recon_val), axis=1)
mse_test = np.mean(np.square(X_test - recon_test), axis=1)

# ========== THRESHOLD SELECTION ==========
sorted_mse = np.sort(mse_val)
f1_scores = []
for thresh in sorted_mse:
    preds = (mse_val > thresh).astype(int)
    f1_scores.append(f1_score(y_val, preds))

best_thresh = sorted_mse[np.argmax(f1_scores)]
print(f"Best threshold: {best_thresh:.6f}")
print(f"Best F1 score (val): {max(f1_scores):.4f}")

# ========== FINAL EVALUATION ==========
y_pred_test = (mse_test > best_thresh).astype(int)

print("\nTest Evaluation:")
print(f"F1 Score   : {f1_score(y_test, y_pred_test):.4f}")
print(f"Precision  : {precision_score(y_test, y_pred_test):.4f}")
print(f"Recall     : {recall_score(y_test, y_pred_test):.4f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_test))
print("Classification Report:")
print(classification_report(y_test, y_pred_test, target_names=['Normal', 'Anomaly']))


Processing C:/Users/rammo/Desktop/Data sensitivity discovery/Anomaly Detection in docs/Scaping dataset/documents and embeddings/all_docs_v2_test_Ouedkniss.npy...
Training new autoencoder...
Epoch 1/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - loss: 0.2516 - val_loss: 0.2510
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 180ms/step - loss: 0.2511 - val_loss: 0.2503
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 143ms/step - loss: 0.2505 - val_loss: 0.2494
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 153ms/step - loss: 0.2497 - val_loss: 0.2482
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 178ms/step - loss: 0.2484 - val_loss: 0.2464
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 115ms/step - loss: 0.2467 - val_loss: 0.2439
Epoch 7/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 138ms/step - loss: 0.2442 - val_

Sonelgaz

In [24]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model, load_model
from sklearn.metrics import confusion_matrix, classification_report, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
# Path to folder containing embeddings
folder_path = 'C:/Users/rammo/Desktop/Data sensitivity discovery/Anomaly Detection in docs/Scaping dataset/scribd_test_2'

# Filter for .npy files
embedding_file ='C:/Users/rammo/Desktop/Data sensitivity discovery/Anomaly Detection in docs/Scaping dataset/documents and embeddings/all_docs_v2_test_Sonelgaz.npy'

# for filename in embedding_files:
filename_only = os.path.basename(embedding_file)  # Gives: all_docs_v2_test_Sonelgaz.npy
model_name = f"model_{filename_only.replace('.npy', '')}.keras"
model_path = os.path.join(folder_path, model_name)

print(f"Processing {embedding_file}...")
    
    # Load embeddings
embeddings = np.load(embedding_file)

  
# Split embeddings: 80% for training, 20% for normal test
x_train, x_test_normal = train_test_split(embeddings, test_size=0.2, random_state=42)

# ========== MODEL SETUP ==========
input_dim = x_train.shape[1]
encoder_input = Input(shape=(input_dim,))
x = Dense(256, activation='relu')(encoder_input)
encoded = Dense(32, activation='relu')(x)
x = Dense(256, activation='relu')(encoded)
decoded = Dense(input_dim, activation='sigmoid')(x)
autoencoder = Model(encoder_input, decoded)
autoencoder.compile(optimizer='adam', loss='mse')

# ========== TRAIN MODEL IF NOT EXIST ==========

print("Training new autoencoder...")
autoencoder.fit(
        x_train, x_train,
        epochs=50,
        batch_size=256,
        shuffle=True,
        validation_split=0.1,
        verbose=1
    )
autoencoder.save(model_path)
print(f"Saved model  to {model_path}")

# ========== ANOMALY EMBEDDINGS ==========
anomaly_sources = [
    'all_docs_v2_test_AADL.npy',
    'all_docs_v2_test_Air_Algérie.npy',
    'all_docs_v2_test_Algérie_Poste.npy',
    'all_docs_v2_test_Algérie_Télécom.npy',
    'all_docs_v2_test_Crédit_Populaire_dAlgérie.npy',
    'all_docs_v2_test_Emploitic.npy',
    'all_docs_v2_test_ICOSNET.npy',
    'all_docs_v2_test_Ooredoo.npy',
    'all_docs_v2_test_Ouedkniss.npy',
    'all_docs_v2_test_Sonatrach.npy',
    'all_docs_v2_test_Yassir.npy'
]

anomaly_embeddings = np.concatenate([
    np.load(os.path.join(folder_path, src))[0:1] for src in anomaly_sources
], axis=0)

anomaly_embeddings_2 = np.concatenate([
    np.load(os.path.join(folder_path, src))[1:2] for src in anomaly_sources
], axis=0)

# ========== TEST SET CONSTRUCTION ==========
X_tmp = np.concatenate([x_test_normal, anomaly_embeddings], axis=0)
y_tmp = np.concatenate([
    np.zeros(len(x_test_normal)),  # normal
    np.ones(len(anomaly_embeddings))  # anomaly
])

# Split into validation and test (50-50)
X_val, X_test, y_val, y_test = train_test_split(
    X_tmp, y_tmp, test_size=0.5, stratify=y_tmp, random_state=42
)

# Append additional anomalies to test set
X_test_normal_only = X_test[y_test == 0]
y_test_normal_only = y_test[y_test == 0]  

# Step 4: Append new anomalies to test set
X_test = np.concatenate([X_test_normal_only, anomaly_embeddings_2], axis=0)
y_test = np.concatenate([
    y_test_normal_only,                      # normal (0s)
    np.ones(len(anomaly_embeddings_2))       # new anomalies (1s)
])

# ========== PREDICTIONS ==========
recon_val = autoencoder.predict(X_val)
recon_test = autoencoder.predict(X_test)

mse_val = np.mean(np.square(X_val - recon_val), axis=1)
mse_test = np.mean(np.square(X_test - recon_test), axis=1)

# ========== THRESHOLD SELECTION ==========
sorted_mse = np.sort(mse_val)
f1_scores = []
for thresh in sorted_mse:
    preds = (mse_val > thresh).astype(int)
    f1_scores.append(f1_score(y_val, preds))

best_thresh = sorted_mse[np.argmax(f1_scores)]
print(f"Best threshold: {best_thresh:.6f}")
print(f"Best F1 score (val): {max(f1_scores):.4f}")

# ========== FINAL EVALUATION ==========
y_pred_test = (mse_test > best_thresh).astype(int)

print("\nTest Evaluation:")
print(f"F1 Score   : {f1_score(y_test, y_pred_test):.4f}")
print(f"Precision  : {precision_score(y_test, y_pred_test):.4f}")
print(f"Recall     : {recall_score(y_test, y_pred_test):.4f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_test))
print("Classification Report:")
print(classification_report(y_test, y_pred_test, target_names=['Normal', 'Anomaly']))


Processing C:/Users/rammo/Desktop/Data sensitivity discovery/Anomaly Detection in docs/Scaping dataset/documents and embeddings/all_docs_v2_test_Sonelgaz.npy...
Training new autoencoder...
Epoch 1/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - loss: 0.2516 - val_loss: 0.2511
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 118ms/step - loss: 0.2511 - val_loss: 0.2505
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 121ms/step - loss: 0.2504 - val_loss: 0.2495
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 131ms/step - loss: 0.2495 - val_loss: 0.2482
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 118ms/step - loss: 0.2481 - val_loss: 0.2463
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 310ms/step - loss: 0.2462 - val_loss: 0.2437
Epoch 7/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 126ms/step - loss: 0.2435 - val_l

sonatrach

In [8]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model, load_model
from sklearn.metrics import confusion_matrix, classification_report, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
# Path to folder containing embeddings
folder_path = 'C:/Users/rammo/Desktop/Data sensitivity discovery/Anomaly Detection in docs/Scaping dataset/scribd_test_2'

# Filter for .npy files
embedding_file ='C:/Users/rammo/Desktop/Data sensitivity discovery/Anomaly Detection in docs/Scaping dataset/documents and embeddings/all_docs_v2_test_Sonatrach.npy'

# for filename in embedding_files:
filename_only = os.path.basename(embedding_file)  # Gives: all_docs_v2_test_Sonelgaz.npy
model_name = f"model_{filename_only.replace('.npy', '')}.keras"
model_path = os.path.join(folder_path, model_name)

print(f"Processing {embedding_file}...")
    
    # Load embeddings
embeddings = np.load(embedding_file)

    # # Skip empty files or corrupted embeddings
    # if embeddings.size == 0:
    #     print(f"Skipping {filename} (empty)")
    #     continue

# Split embeddings: 80% for training, 20% for normal test
x_train, x_test_normal = train_test_split(embeddings, test_size=0.2, random_state=42)

# ========== MODEL SETUP ==========
input_dim = x_train.shape[1]
encoder_input = Input(shape=(input_dim,))
x = Dense(256, activation='relu')(encoder_input)
encoded = Dense(32, activation='relu')(x)
x = Dense(256, activation='relu')(encoded)
decoded = Dense(input_dim, activation='sigmoid')(x)
autoencoder = Model(encoder_input, decoded)
autoencoder.compile(optimizer='adam', loss='mse')

# ========== TRAIN MODEL IF NOT EXIST ==========

print("Training new autoencoder...")
autoencoder.fit(
        x_train, x_train,
        epochs=50,
        batch_size=256,
        shuffle=True,
        validation_split=0.1,
        verbose=1
    )
autoencoder.save(model_path)
print(f"Saved model  to {model_path}")

# ========== ANOMALY EMBEDDINGS ==========
anomaly_sources = [
    'all_docs_v2_test_AADL.npy',
    'all_docs_v2_test_Air_Algérie.npy',
    'all_docs_v2_test_Algérie_Poste.npy',
    'all_docs_v2_test_Algérie_Télécom.npy',
    'all_docs_v2_test_Crédit_Populaire_dAlgérie.npy',
    'all_docs_v2_test_Emploitic.npy',
    'all_docs_v2_test_ICOSNET.npy',
    'all_docs_v2_test_Ooredoo.npy',
    'all_docs_v2_test_Ouedkniss.npy',
    'all_docs_v2_test_Sonelgaz.npy',
    'all_docs_v2_test_Yassir.npy'
]

anomaly_embeddings = np.concatenate([
    np.load(os.path.join(folder_path, src))[0:1] for src in anomaly_sources
], axis=0)

anomaly_embeddings_2 = np.concatenate([
    np.load(os.path.join(folder_path, src))[1:2] for src in anomaly_sources
], axis=0)

# ========== TEST SET CONSTRUCTION ==========
X_tmp = np.concatenate([x_test_normal, anomaly_embeddings], axis=0)
y_tmp = np.concatenate([
    np.zeros(len(x_test_normal)),  # normal
    np.ones(len(anomaly_embeddings))  # anomaly
])

# Split into validation and test (50-50)
X_val, X_test, y_val, y_test = train_test_split(
    X_tmp, y_tmp, test_size=0.5, stratify=y_tmp, random_state=42
)


# # Append additional anomalies to test set
# X_test = np.concatenate([X_test-anomaly_embeddings, anomaly_embeddings_2], axis=0)
# y_test = np.concatenate([y_test, np.ones(len(anomaly_embeddings_2))], axis=0)
X_test_normal_only = X_test[y_test == 0]
y_test_normal_only = y_test[y_test == 0]  

# Step 4: Append new anomalies to test set
X_test = np.concatenate([X_test_normal_only, anomaly_embeddings_2], axis=0)
y_test = np.concatenate([
    y_test_normal_only,                      # normal (0s)
    np.ones(len(anomaly_embeddings_2))       # new anomalies (1s)
])
# Keep only normal samples in test set


# ========== PREDICTIONS ==========
recon_val = autoencoder.predict(X_val)
recon_test = autoencoder.predict(X_test)

mse_val = np.mean(np.square(X_val - recon_val), axis=1)
mse_test = np.mean(np.square(X_test - recon_test), axis=1)

# ========== THRESHOLD SELECTION ==========
sorted_mse = np.sort(mse_val)
f1_scores = []
for thresh in sorted_mse:
    preds = (mse_val > thresh).astype(int)
    f1_scores.append(f1_score(y_val, preds))

best_thresh = sorted_mse[np.argmax(f1_scores)]
print(f"Best threshold: {best_thresh:.6f}")
print(f"Best F1 score (val): {max(f1_scores):.4f}")

# ========== FINAL EVALUATION ==========
y_pred_test = (mse_test > best_thresh).astype(int)

print("\nTest Evaluation:")
print(f"F1 Score = ", f1_score(y_test, y_pred_test))
print(f"recall_score Score = ", recall_score(y_test, y_pred_test))
print(f"precision_score Score = ", precision_score(y_test, y_pred_test))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_test))
print(classification_report(y_test, y_pred_test, target_names=['Normal', 'Anomaly']))


Processing C:/Users/rammo/Desktop/Data sensitivity discovery/Anomaly Detection in docs/Scaping dataset/documents and embeddings/all_docs_v2_test_Sonatrach.npy...
Training new autoencoder...
Epoch 1/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - loss: 0.2516 - val_loss: 0.2510
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 186ms/step - loss: 0.2510 - val_loss: 0.2503
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 248ms/step - loss: 0.2503 - val_loss: 0.2493
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 121ms/step - loss: 0.2492 - val_loss: 0.2478
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 172ms/step - loss: 0.2476 - val_loss: 0.2457
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 154ms/step - loss: 0.2453 - val_loss: 0.2427
Epoch 7/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 199ms/step - loss: 0.2420 - val_

In [26]:
y_test.sum(), len(y_test)-y_test.sum()

(11.0, 10.0)

In [3]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_test).ravel()
tn, fp, fn, tp

(1, 10, 5, 28)

yassir 

In [22]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model, load_model
from sklearn.metrics import confusion_matrix, classification_report, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
# Path to folder containing embeddings
folder_path = 'C:/Users/rammo/Desktop/Data sensitivity discovery/Anomaly Detection in docs/Scaping dataset/scribd_test_2'

# Filter for .npy files
embedding_file ='C:/Users/rammo/Desktop/Data sensitivity discovery/Anomaly Detection in docs/Scaping dataset/documents and embeddings/all_docs_v2_test_Yassir.npy'

# for filename in embedding_files:
filename_only = os.path.basename(embedding_file)  # Gives: all_docs_v2_test_Sonelgaz.npy
model_name = f"model_{filename_only.replace('.npy', '')}.keras"
model_path = os.path.join(folder_path, model_name)

print(f"Processing {embedding_file}...")
    
    # Load embeddings
embeddings = np.load(embedding_file)

    
# Split embeddings: 80% for training, 20% for normal test
x_train, x_test_normal = train_test_split(embeddings, test_size=0.2, random_state=42)

# ========== MODEL SETUP ==========
input_dim = x_train.shape[1]
encoder_input = Input(shape=(input_dim,))
x = Dense(256, activation='relu')(encoder_input)
encoded = Dense(32, activation='relu')(x)
x = Dense(256, activation='relu')(encoded)
decoded = Dense(input_dim, activation='sigmoid')(x)
autoencoder = Model(encoder_input, decoded)
autoencoder.compile(optimizer='adam', loss='mse')

# ========== TRAIN MODEL IF NOT EXIST ==========

print("Training new autoencoder...")
autoencoder.fit(
        x_train, x_train,
        epochs=50,
        batch_size=256,
        shuffle=True,
        validation_split=0.1,
        verbose=1
    )
autoencoder.save(model_path)
print(f"Saved model  to {model_path}")

# ========== ANOMALY EMBEDDINGS ==========
anomaly_sources = [
    'all_docs_v2_test_AADL.npy',
    'all_docs_v2_test_Air_Algérie.npy',
    'all_docs_v2_test_Algérie_Poste.npy',
    'all_docs_v2_test_Algérie_Télécom.npy',
    'all_docs_v2_test_Crédit_Populaire_dAlgérie.npy',
    'all_docs_v2_test_Emploitic.npy',
    'all_docs_v2_test_ICOSNET.npy',
    'all_docs_v2_test_Ooredoo.npy',
    'all_docs_v2_test_Ouedkniss.npy',
    'all_docs_v2_test_Sonelgaz.npy',
    'all_docs_v2_test_Sonatrach.npy'
]

anomaly_embeddings = np.concatenate([
    np.load(os.path.join(folder_path, src))[0:1] for src in anomaly_sources
], axis=0)

anomaly_embeddings_2 = np.concatenate([
    np.load(os.path.join(folder_path, src))[1:2] for src in anomaly_sources
], axis=0)

# ========== TEST SET CONSTRUCTION ==========
X_tmp = np.concatenate([x_test_normal, anomaly_embeddings], axis=0)
y_tmp = np.concatenate([
    np.zeros(len(x_test_normal)),  # normal
    np.ones(len(anomaly_embeddings))  # anomaly
])

# Split into validation and test (50-50)
X_val, X_test, y_val, y_test = train_test_split(
    X_tmp, y_tmp, test_size=0.5, stratify=y_tmp, random_state=42
)

X_test_normal_only = X_test[y_test == 0]
y_test_normal_only = y_test[y_test == 0]  

# Step 4: Append new anomalies to test set
X_test = np.concatenate([X_test_normal_only, anomaly_embeddings_2], axis=0)
y_test = np.concatenate([
    y_test_normal_only,                      # normal (0s)
    np.ones(len(anomaly_embeddings_2))       # new anomalies (1s)
])
# ========== PREDICTIONS ==========
recon_val = autoencoder.predict(X_val)
recon_test = autoencoder.predict(X_test)

mse_val = np.mean(np.square(X_val - recon_val), axis=1)
mse_test = np.mean(np.square(X_test - recon_test), axis=1)

# ========== THRESHOLD SELECTION ==========
sorted_mse = np.sort(mse_val)
f1_scores = []
for thresh in sorted_mse:
    preds = (mse_val > thresh).astype(int)
    f1_scores.append(f1_score(y_val, preds))

best_thresh = sorted_mse[np.argmax(f1_scores)]
print(f"Best threshold: {best_thresh:.6f}")
print(f"Best F1 score (val): {max(f1_scores):.4f}")

# ========== FINAL EVALUATION ==========
y_pred_test = (mse_test > best_thresh).astype(int)

print("\nTest Evaluation:")
print(f"F1 Score   : {f1_score(y_test, y_pred_test):.4f}")
print(f"Precision  : {precision_score(y_test, y_pred_test):.4f}")
print(f"Recall     : {recall_score(y_test, y_pred_test):.4f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_test))
print(classification_report(y_test, y_pred_test))


Processing C:/Users/rammo/Desktop/Data sensitivity discovery/Anomaly Detection in docs/Scaping dataset/documents and embeddings/all_docs_v2_test_Yassir.npy...
Training new autoencoder...
Epoch 1/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - loss: 0.2515 - val_loss: 0.2510
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 143ms/step - loss: 0.2509 - val_loss: 0.2502
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 143ms/step - loss: 0.2501 - val_loss: 0.2492
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 154ms/step - loss: 0.2488 - val_loss: 0.2477
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 206ms/step - loss: 0.2470 - val_loss: 0.2457
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 147ms/step - loss: 0.2445 - val_loss: 0.2429
Epoch 7/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 118ms/step - loss: 0.2411 - val_los

In [19]:
print(len(x_test_normal))
print(len(anomaly_embeddings))
print(len(anomaly_embeddings_2))
print(len(x_train))

13
2
2
51


In [None]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import tensorflow as tf
import matplotlib.pyplot as plt

# Path to folder containing embeddings
folder_path = '/content/drive/MyDrive/documents'

# Filter for .npy files
embedding_files = [f for f in os.listdir(folder_path) if f.endswith('.npy')]

for filename in embedding_files:
    npy_path = os.path.join(folder_path, filename)
    model_name = f"model_{filename.replace('.npy', '')}.weights.h5"
    model_path = os.path.join(folder_path, model_name)

    print(f"Processing {filename}...")

    # Load embeddings
    embeddings = np.load(npy_path)

    # Skip empty files or corrupted embeddings
    if embeddings.size == 0:
        print(f"Skipping {filename} (empty)")
        continue

    with tf.device('/GPU:0' if tf.config.list_physical_devices('GPU') else '/CPU:0'):
        x_train = embeddings
        input_dim = x_train.shape[1]

        # Build autoencoder
        encoder_input = Input(shape=(input_dim,))
        x = Dense(256, activation='relu')(encoder_input)
        encoded = Dense(32, activation='relu')(x)
        x = Dense(256, activation='relu')(encoded)
        decoded = Dense(input_dim, activation='sigmoid')(x)

        autoencoder = Model(encoder_input, decoded)
        autoencoder.compile(optimizer='adam', loss='mse')

        # Train
        autoencoder.fit(
            x_train, x_train,
            epochs=50,
            batch_size=256,
            shuffle=True,
            validation_split=0.1,
            verbose=1
        )


        # Save model
        autoencoder.save_weights(model_path, overwrite=True)
        print(f"Saved model to {model_path}")


# Load model
model_path = 'C:/Users/rammo/Desktop/Data sensitivity discovery/Anomaly Detection in docs/Scaping dataset/scribd_test_2/Sonelgaz.keras'

model = tf.keras.models.load_model(model_path)

# Load embeddings
normal_embeddings = np.load('C:/Users/rammo/Desktop/Data sensitivity discovery/Anomaly Detection in docs/Scaping dataset/scribd_test_2/all_docs_v2_test_Sonelgaz.npy')

anomaly_sources = [
    'all_docs_v2_test_AADL.npy',
    'all_docs_v2_test_Air_Algérie.npy',
    'all_docs_v2_test_Algérie_Poste.npy',
    'all_docs_v2_test_Algérie_Télécom.npy', 
    'all_docs_v2_test_Crédit_Populaire_dAlgérie.npy',
    'all_docs_v2_test_Emploitic.npy',
    'all_docs_v2_test_ICOSNET.npy',
    'all_docs_v2_test_Ooredoo.npy',
    'all_docs_v2_test_Ouedkniss.npy',
    'all_docs_v2_test_Sonatrach.npy',
    'all_docs_v2_test_Yassir.npy'
]

anomaly_embeddings = np.concatenate([
    np.load(f'C:/Users/rammo/Desktop/Data sensitivity discovery/Anomaly Detection in docs/Scaping dataset/scribd_test_2/{src}')[0:2]
    for src in anomaly_sources
], axis=0)

# Create labels
X = np.concatenate([normal_embeddings, anomaly_embeddings], axis=0)
y = np.zeros(len(normal_embeddings))

# Train/val/test split
# Step 1: Split normal embeddings
_, X_tmp = train_test_split(normal_embeddings, test_size=0.2, random_state=42)

# Step 2: Add anomalies for evaluation
X_tmp = np.concatenate([X_tmp, anomaly_embeddings], axis=0)
y_tmp = np.concatenate([
    np.zeros(len(X_tmp) - len(anomaly_embeddings)),np.ones(len(anomaly_embeddings))], axis=0)


X_val, X_test, y_val, y_test = train_test_split(
    X_tmp, y_tmp, test_size=0.5, stratify=y_tmp, random_state=42
)

# Instead of slicing out normal samples, just append additional anomalies
anomaly_embeddings_2 = np.concatenate([
    np.load(f'C:/Users/rammo/Desktop/Data sensitivity discovery/Anomaly Detection in docs/Scaping dataset/scribd_test_2/{src}')[2:4]
    for src in anomaly_sources
], axis=0)

X_test = np.concatenate([X_test, anomaly_embeddings_2], axis=0)
y_test = np.concatenate([y_test, np.ones(len(anomaly_embeddings_2))], axis=0)


# Predictions
recon_val = model.predict(X_val)
recon_test = model.predict(X_test)
mse_val = np.mean(np.square(X_val - recon_val), axis=1)
mse_test = np.mean(np.square(X_test - recon_test), axis=1)

# Find optimal threshold
sorted_mse = np.sort(mse_val)
f1_scores = []
for thresh in sorted_mse:
    y_pred_val = (mse_val > thresh).astype(int)
    f1_scores.append(f1_score(y_val, y_pred_val))

best_idx = np.argmax(f1_scores)
best_thresh = sorted_mse[best_idx]
print(f"Best threshold: {best_thresh:.6f}")
print(f"Best F1 score (val): {f1_scores[best_idx]:.4f}")

# Test evaluation
y_pred_test = (mse_test > best_thresh).astype(int)

f1 = f1_score(y_test, y_pred_test)
precision = precision_score(y_test, y_pred_test)
recall = recall_score(y_test, y_pred_test)

print("\nTest Evaluation:")
print(f"F1 Score   : {f1:.4f}")
print(f"Precision  : {precision:.4f}")
print(f"Recall     : {recall:.4f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_test))
