This notebook only contains code for the SecureBERT hybrid embeddings and NOT the clustering.

In [None]:
import re # parsing for converting strings of tensors to arrays
import pandas as pd
%pip install torch
import torch
import torch.nn.functional as F
from torch import Tensor
import transformers
from transformers import RobertaTokenizerFast, RobertaModel
import numpy as np
# Initialize the SecureBERT tokenizer and encoder.
model_name = "ehsanaghaei/SecureBERT"
tokenizer = RobertaTokenizerFast.from_pretrained(model_name)
# The pooling layer is not created (add_pooling_layer=False); sentence vectors
# are pooled manually from the hidden states in SecureBERT_embed.
model = RobertaModel.from_pretrained(model_name, add_pooling_layer=False)
# Assign the result (instead of a bare call) so the cell displays no extra output.
model = model.eval()



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Read the per-article event templates; the file is decoded as latin-1.
event_templates_path = "article_event_templates.csv"
df = pd.read_csv(event_templates_path, encoding="latin1")
print(df.head(n=5))

   article_id                                         event_text
0           0  attain semblance ; be desire ; be group ; be s...
1           1  access license ; ai move ; build team ; certif...
2           2  2020 be ; actor cripple time ; actor move void...
3           3  adapt threat ; anticipate detect ; anticipate ...
4           4  apply patch ; be dependency ; charge waste ; c...


In [None]:
# Split each article's event text into a list of chunks.
# Whitespace is stripped before AND after the optional braces: a bare
# .str.strip("{}") stops at the first non-brace character, so padded input such
# as " { a ; b } " would otherwise keep its braces and stray spaces.
df["chunks"] = (
    df["event_text"]
      .str.strip()
      .str.strip("{}")
      .str.strip()
      .str.split(r"\s*;\s*", regex=True)
)

# Explode to one row per chunk and add within-row ids (based on the original row index).
out = (
    df[["article_id", "chunks"]]
      .explode("chunks")
      .rename(columns={"chunks": "split_text"})
      .dropna()
      # Drop empty chunks (e.g. produced by a trailing ';') so they are never embedded.
      .loc[lambda d: d["split_text"].str.len() > 0]
      .assign(within_row_id=lambda d: d.groupby(level=0).cumcount())  # 0,1,2,...
      .reset_index(names="orig_row")  # original row number
)

print(out.head())

# Streamlined data frame used by the rest of the notebook.
# NOTE: `df` is rebound to the exploded frame here, so re-running this cell
# requires reloading `df` first (it needs the "event_text" column).
df = out.rename(columns={"within_row_id": "triple_id"}).drop(columns=["orig_row"])

print(df.head())

   orig_row  article_id        split_text  within_row_id
0         0           0  attain semblance              0
1         0           0         be desire              1
2         0           0          be group              2
3         0           0         be source              3
4         0           0        benefit be              4
   article_id        split_text  triple_id
0           0  attain semblance          0
1           0         be desire          1
2           0          be group          2
3           0         be source          3
4           0        benefit be          4


# Loading SecureBERT and applying the embedding to every text chunk

In [None]:



#This function will embed the text using SecureBERT

## Commented-out original embedding function (kept for reference):
# def SecureBERT_embed(article_text):
#     # This puts the text into tokens without padding or truncation! So chunk sizes need to be correct beforehand.
#     batch = tokenizer(article_text, return_tensors="pt", padding=False, truncation=False)
#     with torch.no_grad():
#         out = model(**batch)  # last_hidden_state: [1, T, H]
#     last_hidden = out.last_hidden_state         # this is the list of per-token contextual embeddings
#     sent_emb = last_hidden.mean(dim=1)          # averaging all token embeddings...
#     # Did NOT normalize here since Anya does it in the HDBSCAN code.
#     return sent_emb

def SecureBERT_embed(article_text):
    """Embed text with SecureBERT and return one mean-pooled vector.

    The text is tokenized without padding or truncation. For every token the
    hidden states of the last four layers are averaged, and the tokens are then
    mean-pooled using the attention mask. The result is NOT normalized.

    Relies on the module-level ``tokenizer`` and ``model``.

    Args:
        article_text: Text handed straight to the tokenizer.

    Returns:
        The pooled embedding tensor, annotated as ``[1, H]`` for one input text.
    """
    encoded = tokenizer(article_text, return_tensors="pt", padding=False, truncation=False)
    with torch.no_grad():
        outputs = model(**encoded, output_hidden_states=True)

    # hidden_states is a tuple (embeddings, layer1, ..., layerN); keep the final four
    # and average them layer-wise -> [1, T, H].
    final_four = outputs.hidden_states[-4:]
    per_token = torch.stack(final_four, dim=0).mean(dim=0)

    # Attention-mask-weighted mean over the token axis.
    mask = encoded["attention_mask"].unsqueeze(-1).to(per_token.dtype)  # [1, T, 1]
    token_total = (per_token * mask).sum(dim=1)                         # pads (if any) contribute zero
    token_count = mask.sum(dim=1).clamp(min=1e-9)                       # avoid division by zero
    return token_total / token_count                                    # [1, H], no normalization


# The original cell used a variable `text` that no cell in this notebook
# defines, so it only ran with leftover kernel state. Use the first chunk of
# the exploded frame as the demo input instead.
sample_text = df["split_text"].iloc[0]

# Embed once and reuse the result for both the shape and the full display
# (the original ran the model twice on the same input).
example_embedding = SecureBERT_embed(sample_text)

print(f"Here is the final shape of the embedding object: {example_embedding.shape}")

print("Here is an example of the embedding object in full")
example_embedding

Here is the final shape of the embedding object: torch.Size([1, 768])
Here is an example of the embedding object in full


tensor([[-1.5009e-02, -2.7689e-02, -4.9516e-03, -6.9602e-03,  3.7658e-02,
          4.5381e-02, -1.4163e-02,  9.1673e-03, -6.7716e-03, -1.5828e-02,
         -1.7022e-02,  2.4589e-02, -8.7878e-03,  1.5037e-02,  2.7817e-02,
          7.2541e-02,  2.5042e-02, -1.4167e-02,  1.1876e-02,  9.4069e-02,
         -4.3695e-02,  3.2139e-02, -1.5925e-02, -7.8477e-03,  2.3173e-02,
          3.3872e-02, -3.3681e-02, -5.1435e-03, -1.4858e-02,  1.7103e-02,
         -1.1382e-02,  7.3876e-03, -1.2634e-02,  1.9167e-02,  2.0603e-02,
          2.6289e-02, -1.6248e-02, -5.4231e-03,  8.8549e-02,  3.7792e-03,
          3.0797e-02, -8.2246e-02,  1.1658e-03, -1.9300e-03, -6.9374e-02,
          1.3297e-02,  3.9672e-03, -2.4561e-02,  7.6257e-03,  5.7704e-03,
         -2.2682e-02,  3.2992e-02, -2.8369e-03,  2.5972e-02, -5.3120e-02,
         -1.0593e-02, -2.3154e-03,  3.7218e-02, -1.1401e-02,  2.8248e-02,
          1.9864e-02,  1.2499e-01, -6.6121e-02,  3.1625e-02,  3.0383e-03,
         -4.1395e-02, -1.5580e-02, -1.

In [None]:
# Embed every chunk with SecureBERT; each cell of the new column holds the
# object returned by SecureBERT_embed.
df["triple_embeddings"] = df["split_text"].apply(SecureBERT_embed)
print(df.head(n=5))

# Safeguard in case the per-article averaging does not work: export the
# un-averaged, per-chunk embeddings and download the file from Colab.
# NOTE(review): to_csv presumably writes each tensor as its text repr
# ('tensor([[...]])') — confirm that is what downstream code expects.
no_average_csv = "SecureBERT_dependency_embeddings_no_average.csv"
article_securebert_embeddings_no_average_df = df  # same object as df, not a copy
article_securebert_embeddings_no_average_df.to_csv(no_average_csv, index=False)

from google.colab import files
files.download(no_average_csv)

KeyboardInterrupt: 

In [None]:
# Average over each article to get one embedding.
# First, preview the per-chunk frame that is about to be aggregated.
print(df.head(n=5))


def mean_vec(series):
    """Average the NumPy-array entries of ``series`` element-wise.

    Entries that are not ``np.ndarray`` instances are skipped. The remaining
    arrays are cast to float, stacked row-wise and averaged along axis 0.

    NOTE: a second ``mean_vec`` defined further down in this cell replaces
    this definition at run time.
    """
    rows = []
    for item in series:
        if isinstance(item, np.ndarray):
            rows.append(np.asarray(item, dtype=float))
    return np.vstack(rows).mean(axis=0)



## Anya added a new function to convert str --> tensor embeddings --> numpy arrays so the mean can be taken ##
# Why? triple_embeddings was saved as 'tensor[((# # # #....))]'
# Just to be safe, go through each case -->



def parse_embedding(x):
    """Convert one stored embedding to a NumPy array.

    Args:
        x: A ``torch.Tensor``, an ``np.ndarray``, or a string holding a nested
            numeric list, optionally wrapped as ``tensor(...)`` with trailing
            keyword text such as ``dtype=...``, ``device=...`` or ``grad_fn=...``.

    Returns:
        np.ndarray: the tensor's data for tensors, the same object for arrays,
        and a float array for strings.

    Raises:
        ValueError: if a string does not contain a parsable numeric list.
        TypeError: if ``x`` is none of the supported types.
    """
    import ast  # local import: only the string branch needs it

    # If it's already a tensor -> convert to numpy
    if isinstance(x, torch.Tensor):
        return x.detach().cpu().numpy()

    # If it's a numpy array -> return as is
    if isinstance(x, np.ndarray):
        return x

    # If it's a string -> parse the outermost bracketed literal
    if isinstance(x, str):
        text = x.strip()
        # Slicing from the first '[' to the last ']' drops the 'tensor(' wrapper
        # and any trailing keyword text after the data, which the previous
        # replace/rstrip cleanup could not handle.
        start = text.find("[")
        end = text.rfind("]")
        if start == -1 or end < start:
            raise ValueError(f"No bracketed numeric data found in embedding string: {text[:60]!r}")
        try:
            # SECURITY: this used to be eval(), which executes arbitrary code
            # found in the CSV. literal_eval only accepts Python literals.
            values = ast.literal_eval(text[start:end + 1])
        except (ValueError, SyntaxError) as exc:
            raise ValueError(f"Could not parse embedding string: {text[:60]!r}") from exc
        return np.array(values, dtype=float)

    raise TypeError(f"Unknown embedding type: {type(x)}")


def mean_vec(series):
    """Return the element-wise mean of every entry in ``series``.

    Each entry is cast to a float array, the arrays are stacked row-wise with
    ``np.vstack``, and the mean is taken along axis 0.
    """
    as_float = []
    for entry in series:
        as_float.append(np.asarray(entry, dtype=float))
    stacked = np.vstack(as_float)
    return stacked.mean(axis=0)

# Convert every stored embedding to a NumPy array via parse_embedding.
df["triple_embeddings"] = df["triple_embeddings"].apply(parse_embedding)

# Collapse to one mean vector per article. The grouped column is
# "triple_embeddings" (Anya changed it from the earlier "split_articles").
per_article_mean = df.groupby("article_id")["triple_embeddings"].apply(mean_vec)
article_securebert_embeddings_df = per_article_mean.reset_index(name="article_securebert_mean")


# Check that the length matches how many articles we have.
print(article_securebert_embeddings_df.head(n=5))
len(article_securebert_embeddings_df)


In [None]:
# Save the per-article mean embeddings to CSV, then download the file from Colab.
mean_embeddings_csv = "SecureBERT_dependency_embeddings.csv"
article_securebert_embeddings_df.to_csv(mean_embeddings_csv, index=False)

from google.colab import files
files.download(mean_embeddings_csv)