## Import

In [1]:
!pip install sentence-transformers datasets



In [2]:
from sentence_transformers import SentenceTransformer
from pathlib import Path
from datasets import load_from_disk
import pandas as pd
import numpy as np
import json
import time
from tqdm import tqdm

## Dataset prepare

In [6]:
# df = pd.read_json("../data/merged_email_dataset.json")
df = pd.read_json("hf://datasets/0tt00t/PI-EmailGuard/merged_email_dataset.json")

In [None]:
output_column = df["output"].tolist()
sentences = output_column

## Embedding

In [None]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [None]:
batch_size = 1000
n = len(sentences)
emb_list = []
encode_time = 0.0

In [None]:
with tqdm(total=n, desc="Encoding", unit="items") as pbar:
    for i in range(0, n, batch_size):
        batch = sentences[i : i + batch_size]
        t0 = time.time()
        batch_emb = model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
        t1 = time.time()
        encode_time += (t1 - t0)
        emb_list.append(batch_emb)
        pbar.update(len(batch))

if len(emb_list) > 0:
    embeddings = np.vstack(emb_list)
else:
    embeddings = np.empty((0, model.get_sentence_embedding_dimension()))

print(f"Time taken to generate embeddings (encode only): {encode_time:.2f} seconds")

In [None]:
vector = np.array(embeddings)

np.save('vectors.npy', vector)


## Load Vector for Embedding

In [None]:
loaded_vector = np.load('vectors.npy')

In [5]:
from huggingface_hub import hf_hub_download

# Download the file
file_path = hf_hub_download(repo_id="0tt00t/PI-EmailGuard", filename="vectors.npy", repo_type="dataset")

# Load the vector file using numpy
loaded_vector = np.load(file_path)

print("Vector file loaded successfully.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vectors.npy:   0%|          | 0.00/1.53G [00:00<?, ?B/s]

Vector file loaded successfully.


## Prepare data for modeling

In [7]:
y = df['is_injected']
X = loaded_vector

## Split data

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Train a random forest classifier

In [12]:
pip install tqdm tqdm-joblib

Collecting tqdm-joblib
  Downloading tqdm_joblib-0.0.5-py3-none-any.whl.metadata (1.4 kB)
Downloading tqdm_joblib-0.0.5-py3-none-any.whl (1.9 kB)
Installing collected packages: tqdm-joblib
Successfully installed tqdm-joblib-0.0.5


In [13]:
from sklearn.ensemble import RandomForestClassifier
from tqdm import tqdm
from tqdm_joblib import tqdm_joblib

model = RandomForestClassifier(
    n_estimators=300,
    n_jobs=-1,
    random_state=42
)

with tqdm_joblib(tqdm(desc="Training RF", total=model.n_estimators)) as progress_bar:
    model.fit(X_train, y_train)


Training RF:   0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [15]:
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)


print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")
print(f"Confusion Matrix:\n{conf_matrix}")

Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1-score: 1.0000
Confusion Matrix:
[[106974      0]
 [     3  92492]]


## Train an SVM classifier

In [18]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

svm = SVC(random_state=42)

cv = 5
with tqdm_joblib(tqdm(desc="CV (SVC)", total=cv)):
    scores = cross_val_score(svm, X_train, y_train, cv=cv, n_jobs=-1, scoring="f1")


CV (SVC):   0%|          | 0/5 [00:00<?, ?it/s][A

  0%|          | 0/5 [00:00<?, ?it/s]

## Evaluate the svm model


In [22]:
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)

accuracy_svm = accuracy_score(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm)
recall_svm = recall_score(y_test, y_pred_svm)
f1_svm = f1_score(y_test, y_pred_svm)
conf_matrix_svm = confusion_matrix(y_test, y_pred_svm)

print(f"SVM Accuracy: {accuracy_svm:.4f}")
print(f"SVM Precision: {precision_svm:.4f}")
print(f"SVM Recall: {recall_svm:.4f}")
print(f"SVM F1-score: {f1_svm:.4f}")
print(f"SVM Confusion Matrix:\n{conf_matrix_svm}")

SVM Accuracy: 1.0000
SVM Precision: 1.0000
SVM Recall: 1.0000
SVM F1-score: 1.0000
SVM Confusion Matrix:
[[106974      0]
 [     1  92494]]


## Save Models

In [23]:
import joblib

# Save the Random Forest model
joblib.dump(model, 'random_forest_model.pkl')

# Save the SVM model
joblib.dump(svm, 'svm_model.pkl')

print("Models saved successfully.")

CV (SVC):   0%|          | 0/5 [09:34<?, ?it/s]

Models saved successfully.



