# Approach 1: Dependency Parsing -> sBERT

In [None]:

!pip install --quiet transformers torch pandas numpy tqdm

# --- Imports ---
import torch
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import numpy as np
from tqdm import tqdm
from google.colab import drive
import os

# Mount Google Drive so the project folder is reachable from the Colab VM.
drive.mount('/content/drive', force_remount=True)

# Input / output locations, all under the shared project folder on Drive.
BASE_DIR   = "/content/drive/MyDrive/5640 Capstone/Dependency Parsing"
EVENT_PATH = os.path.join(BASE_DIR, "article_event_templates.csv")
OUT_NPY    = os.path.join(BASE_DIR, "hybrid_embeddings.npy")
OUT_CSV    = os.path.join(BASE_DIR, "hybrid_embeddings.csv")

# Encoder checkpoint and the number of texts handed to embed_texts per step.
MODEL_NAME = "intfloat/e5-base-v2"
BATCH_SIZE = 16
os.makedirs(BASE_DIR, exist_ok=True)


# Load the event templates and validate the schema explicitly: a plain
# `assert` is stripped when Python runs with -O and does not say which
# column is missing.
df = pd.read_csv(EVENT_PATH)
required_cols = {"article_id", "event_text"}
missing_cols = required_cols - set(df.columns)
if missing_cols:
    raise ValueError(f"Missing required columns: {sorted(missing_cols)}")

# Normalise the text column: NaN -> "", force str, trim surrounding whitespace.
df["event_text"] = df["event_text"].fillna("").astype(str).str.strip()
texts = df["event_text"].tolist()



# Load tokenizer and encoder; eval() switches off dropout for inference.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model.eval()


def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    """Mean-pool token vectors over the attended positions of each row.

    Positions where ``attention_mask`` is zero are zeroed out before the sum
    over dimension 1, and the sum is divided by the number of attended
    positions in that row.

    Args:
        last_hidden_states: Token-level hidden states; dimension 1 is the
            one being pooled (assumed ``(batch, seq, hidden)``).
        attention_mask: Mask with one entry per token (assumed
            ``(batch, seq)``); non-zero marks a real token.

    Returns:
        One pooled vector per row. A row with no attended tokens yields a
        zero vector rather than NaN.
    """
    keep = attention_mask[..., None].bool()
    masked = last_hidden_states.masked_fill(~keep, 0.0)
    # Clamp the divisor so an all-padding row divides by 1 instead of 0;
    # its masked sum is already all zeros, so the result is a zero vector.
    token_counts = attention_mask.sum(dim=1).clamp(min=1)[..., None]
    return masked.sum(dim=1) / token_counts

def embed_texts(texts, tokenizer, model, batch_size=16, max_length=512):
    """Embed every text as the mean of its L2-normalised chunk embeddings.

    Each text is prefixed with ``"query: "`` and tokenized. Texts that do not
    fit into ``max_length`` tokens (special tokens included) are split by the
    tokenizer into consecutive windows via ``return_overflowing_tokens``, so
    no tokens are dropped. All windows of a batch go through the model in one
    forward pass; each window is mean-pooled with ``average_pool`` and
    L2-normalised, and the windows belonging to one text are averaged.

    Compared with chunking on raw token ids and re-tokenizing the decoded
    chunks, this avoids silently truncating the tokens displaced by the
    special tokens added on re-encoding, and it makes ``batch_size`` control
    the actual forward-pass batch.

    Args:
        texts: Sequence of strings to embed.
        tokenizer: Hugging Face *fast* tokenizer (the ``AutoTokenizer``
            default); overflow windowing of batched input needs it.
        model: Hugging Face encoder whose output exposes
            ``last_hidden_state``.
        batch_size: Number of texts tokenized and encoded per step.
        max_length: Maximum tokens per window, special tokens included.

    Returns:
        ``numpy.ndarray`` with one row per input text.

    Raises:
        ValueError: If ``tokenizer`` reports that it is not a fast tokenizer.
    """
    if not getattr(tokenizer, "is_fast", True):
        raise ValueError(
            "embed_texts requires a fast tokenizer "
            "(return_overflowing_tokens on batched input)"
        )

    # torch.cat([]) raises on an empty list, so short-circuit empty input.
    if len(texts) == 0:
        # NOTE(review): assumes the model config exposes `hidden_size`.
        embeddings = np.empty((0, model.config.hidden_size), dtype=np.float32)
        print("Embedding matrix shape:", embeddings.shape)
        return embeddings

    # Run on whatever device the caller placed the model on (CPU or GPU).
    device = next(model.parameters()).device

    embeddings = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Embedding"):
        # NOTE: only the first window of a long text carries the prefix.
        batch = ["query: " + t for t in texts[start:start + batch_size]]

        # Over-long texts come back as several rows; the mapping tensor says
        # which input text each row belongs to.
        encoded = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_overflowing_tokens=True,
            return_tensors="pt",
        )
        owner = encoded.pop("overflow_to_sample_mapping").to(device)
        inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

        with torch.no_grad():
            outputs = model(**inputs)
            chunk_embs = average_pool(
                outputs.last_hidden_state, inputs["attention_mask"]
            )
            chunk_embs = F.normalize(chunk_embs, p=2, dim=1)

            # Average the window embeddings of each text: scatter-add rows
            # into their owning text, then divide by the window count.
            doc_sums = chunk_embs.new_zeros((len(batch), chunk_embs.shape[1]))
            doc_sums.index_add_(0, owner, chunk_embs)
            counts = torch.bincount(owner, minlength=len(batch))
            doc_embs = doc_sums / counts.to(doc_sums.dtype)[:, None]

        # Move results off the device right away to keep device memory flat.
        embeddings.append(doc_embs.cpu())

    embeddings = torch.cat(embeddings, dim=0).numpy()
    print("Embedding matrix shape:", embeddings.shape)
    return embeddings



# Run the embedding pass over every event template.
embs = embed_texts(texts, tokenizer, model, batch_size=BATCH_SIZE)

# Store the raw matrix as .npy.
np.save(OUT_NPY, embs)

# Build a flat table: id/text metadata followed by one column per dimension
# (v0, v1, ...), and write it out as CSV.
n_dims = embs.shape[1]
emb_cols = ["v" + str(k) for k in range(n_dims)]
emb_df = pd.DataFrame(data=embs, columns=emb_cols)
meta_df = df[["article_id", "event_text"]].reset_index(drop=True)
merged = pd.concat([meta_df, emb_df], axis=1)
merged.to_csv(OUT_CSV, index=False, encoding="utf-8-sig")

# Report what was written and preview the first rows.
print(f"\n Saved NumPy → {OUT_NPY}")
print(f"\n Saved CSV → {OUT_CSV}  ({len(merged)} rows, {n_dims}-D)")
print(merged.head(3))


Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Embedding:   1%|          | 8/818 [01:21<2:17:33, 10.19s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (550 > 512). Running this sequence through the model will result in indexing errors
Embedding: 100%|██████████| 818/818 [1:33:29<00:00,  6.86s/it]


Embedding matrix shape: (13075, 768)

 Saved NumPy → /content/drive/MyDrive/5640 Capstone/Dependency Parsing/hybrid_embeddings.npy

 Saved CSV → /content/drive/MyDrive/5640 Capstone/Dependency Parsing/hybrid_embeddings.csv  (13075 rows, 768-D)
   article_id                                         event_text        v0  \
0           0  attain semblance ; be desire ; be group ; be s... -0.003964   
1           1  access license ; ai move ; build team ; certif... -0.004098   
2           2  2020 be ; actor cripple time ; actor move void... -0.029652   

         v1        v2        v3        v4        v5        v6        v7  ...  \
0 -0.024365 -0.033322 -0.016421  0.050739  0.007004  0.066727  0.041740  ...   
1 -0.006827 -0.016355  0.029361  0.061504  0.006962  0.023147  0.045593  ...   
2 -0.023817 -0.034449  0.008106  0.055640 -0.004190  0.040200  0.018770  ...   

       v758      v759      v760      v761      v762      v763      v764  \
0  0.036360 -0.029659  0.044942 -0.049500  0.02

Mounted at /content/drive
Loading event templates...
Loaded 13075 dependency-parsed articles
Loading sentence-transformers/all-mpnet-base-v2 ...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Encoding with chunking ...


  1%|          | 129/13075 [02:36<5:44:56,  1.60s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (550 > 512). Running this sequence through the model will result in indexing errors
 42%|████▏     | 5541/13075 [58:48<1:29:33,  1.40it/s]

In [None]:


# ---- CONFIG ----
OUT_DIR    = "/content/drive/MyDrive/5640 Capstone/Dependency Parsing"
drive.mount("/content/drive", force_remount=True)
# Inputs are the embedding matrix and its metadata table stored under
# OUT_DIR; derive both paths from it instead of repeating the literal.
EMB_PATH = os.path.join(OUT_DIR, "hybrid_embeddings.npy")
META_CSV = os.path.join(OUT_DIR, "hybrid_embeddings.csv")
OUT_CSV  = "/content/drive/MyDrive/5640 Capstone/Embeddings/hybrid_clustered.csv"
MIN_CLUSTER_SIZE = 5
# ----------------

print("Loading embeddings...")
embs = np.load(EMB_PATH)
df = pd.read_csv(META_CSV)
# Explicit check instead of `assert`, which is stripped under `python -O`.
if len(df) != len(embs):
    raise ValueError(
        "Mismatch between embeddings and metadata length "
        f"({len(embs)} embeddings vs {len(df)} metadata rows)"
    )

print("Running HDBSCAN ...")
clusterer = hdbscan.HDBSCAN(min_cluster_size=MIN_CLUSTER_SIZE,
                            metric='cosine',
                            cluster_selection_epsilon = 0.0,
                            cluster_selection_method='eom')
labels = clusterer.fit_predict(embs)
df["cluster_label"] = labels

# Save results
os.makedirs(os.path.dirname(OUT_CSV), exist_ok=True)
df.to_csv(OUT_CSV, index=False, encoding="utf-8-sig")
print(f"Saved clustered data → {OUT_CSV}")

# Basic stats (label -1 is HDBSCAN's noise label, not a cluster)
noise = labels == -1
n_clusters = len(set(labels)) - (1 if noise.any() else 0)
print(f"Clusters found: {n_clusters}")
print("Noise points:", int(noise.sum()))

# Optional silhouette, computed on non-noise points only. Use the same
# distance as the clustering itself (cosine); the default is Euclidean,
# which would score the clusters in a different geometry.
valid = ~noise
if len(set(labels[valid])) > 1:
    sil = silhouette_score(embs[valid], labels[valid], metric="cosine")
    print(f"Silhouette score (excluding noise): {sil:.3f}")
else:
    # Silhouette needs at least two clusters; this also covers zero clusters.
    print("Silhouette not computed (fewer than two clusters)")


  axis.set_ylabel('$\lambda$ value')
  $max \{ core_k(a), core_k(b), 1/\alpha d(a,b) \}$.


Mounted at /content/drive
Loading embeddings...
Running HDBSCAN ...




✅ Saved clustered data → /content/drive/MyDrive/5640 Capstone/Embeddings/hybrid_clustered.csv
Clusters found: 50
Noise points: 7168
Silhouette score (excluding noise): 0.191


Loading embeddings ...


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/5640 Capstone/Dependency Parsing/hybrid_embeddings.npy'