In [1]:
!pip -q install datasets sentence-transformers faiss-cpu scikit-learn numpy pandas



[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m50.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os, torch
import faiss
from datasets import load_dataset
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, fowlkes_mallows_score
from collections import defaultdict, deque
import random, math

# ---- Experiment configuration (tune these) ----
# Sentence-embedding checkpoint used to embed every question.
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# Cap on the number of sampled question pairs; None keeps every row.
MAX_ROWS = None
# Minimum component size (in nodes) to include when reporting some stats.
MIN_CLUSTER_FOR_EVAL = 2

# ---- FAISS (IVF-IP ~ cosine) parameters ----
# nlist (number of IVF cells) is not configured here; it will be set ≈ sqrt(N) automatically.
K_NEIGHBORS = 10   # neighbours retrieved per query
TAU = 0.85         # cosine threshold for linking neighbours into the graph
NPROBE = 16        # IVF probes; values to try: {8, 16, 32}

# ---- Reproducibility ----
SEED = 42
random.seed(SEED)
np.random.seed(SEED)


In [3]:
# Load GLUE/QQP from HuggingFace. Each split carries question1, question2 and
# label (1 = duplicate, 0 = not duplicate).
ds = load_dataset("glue", "qqp")  # train/validation/test

# Keep only the three columns we need, drop incomplete rows, and give the
# label column a self-explanatory name.
df = (
    ds["train"]
    .to_pandas()[["question1", "question2", "label"]]
    .dropna()
    .rename(columns={"label": "is_duplicate"})
)

# Optional down-sampling for quick experiments (seeded, so repeatable).
if MAX_ROWS is not None and MAX_ROWS < len(df):
    df = df.sample(n=MAX_ROWS, random_state=SEED).reset_index(drop=True)

# Every distinct question string becomes a graph node; its integer id is its
# position in `questions`.
questions = pd.Index(
    pd.unique(pd.concat([df["question1"], df["question2"]], ignore_index=True))
)
qid = {text: idx for idx, text in enumerate(questions)}
N = len(questions)
print(f"Unique questions (nodes): {N}")

# Ground-truth positive edges: one undirected edge per duplicate pair, stored
# as (smaller_id, larger_id) so each pair appears at most once.
edges_true = set()
for q1, q2, y in zip(df["question1"], df["question2"], df["is_duplicate"]):
    if y != 1:
        continue
    a, b = qid[q1], qid[q2]
    if a == b:
        # A question paired with itself adds no edge.
        continue
    edges_true.add((min(a, b), max(a, b)))

# Symmetric adjacency list of the ground-truth graph.
adj_true = [[] for _ in range(N)]
for a, b in edges_true:
    adj_true[a].append(b)
    adj_true[b].append(a)

print(f"edges_true:{len(edges_true)}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

qqp/train-00000-of-00001.parquet:   0%|          | 0.00/33.6M [00:00<?, ?B/s]

qqp/validation-00000-of-00001.parquet:   0%|          | 0.00/3.73M [00:00<?, ?B/s]

qqp/test-00000-of-00001.parquet:   0%|          | 0.00/36.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/363846 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/40430 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/390965 [00:00<?, ? examples/s]

Unique questions (nodes): 493874
edges_true:134378


In [4]:
# Sanity check: `questions` holds one entry per unique question, so its length equals N.
questions.shape

(493874,)

In [5]:


def connected_components(adj):
    """Label the connected components of a graph using breadth-first search.

    Parameters
    ----------
    adj : sequence of iterables of int
        Adjacency list: ``adj[u]`` yields the neighbours of node ``u``, with
        nodes identified by their position ``0 .. len(adj) - 1``. The result
        is only a true connected-component labelling when the adjacency is
        symmetric (undirected graph).

    Returns
    -------
    labels : numpy.ndarray of int64, shape ``(len(adj),)``
        ``labels[u]`` is the component id of node ``u``. Ids start at 0 and
        are handed out in order of each component's lowest-numbered node.
    n_components : int
        Number of components found (0 for an empty graph).
    """
    n_nodes = len(adj)
    comp_id = [-1] * n_nodes  # -1 marks "not visited yet"
    cid = 0
    for s in range(n_nodes):
        if comp_id[s] != -1:
            continue
        # `s` is the first unvisited node: flood-fill its whole component.
        comp_id[s] = cid
        dq = deque([s])
        while dq:
            u = dq.popleft()
            for v in adj[u]:
                if comp_id[v] == -1:
                    comp_id[v] = cid
                    dq.append(v)
        cid += 1
    # Explicit dtype: without it an empty graph would produce a float64 array,
    # and the integer width would depend on the platform default.
    return np.array(comp_id, dtype=np.int64), cid

from collections import Counter

# Assign every node its ground-truth component (cluster) id.
true_labels, n_true_comps = connected_components(adj_true)
print(f"Ground-truth components: {n_true_comps}")

# Size histogram: first count the nodes per component, then count how many
# components share each size.
sizes_true = Counter(true_labels.tolist())
hist_true = Counter(sizes_true.values())
# Sorted by component size ascending, so this shows the ten smallest sizes.
print(
    "Ground-truth component size histogram (size:count) [top 10]:",
    {size: count for size, count in sorted(hist_true.items())[:10]},
)


Ground-truth components: 410520
Ground-truth component size histogram (size:count) [top 10]: {1: 354909, 2: 43986, 3: 6829, 4: 2157, 5: 972, 6: 510, 7: 302, 8: 197, 9: 152, 10: 76}


This means we have 410,520 ground-truth clusters (connected components), and the number of clusters that contain only a single node is quite large (354,909).

In [6]:
# Embed with a single model (fixing the embedding isolates the indexing/hash methods)
model = SentenceTransformer(EMBED_MODEL)
texts = list(questions)
X = model.encode(texts, batch_size=256, show_progress_bar=True, convert_to_numpy=True, normalize_embeddings=True)
X = X.astype("float32")
# Now dot product == cosine similarity (because normalized)
d = X.shape[1]
print("Embeddings shape:", X.shape)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1930 [00:00<?, ?it/s]

Embeddings shape: (493874, 384)


In [8]:
import json
# Persist the embeddings, question texts, id mapping and ground-truth edges
# under OUT_DIR.
OUT_DIR       = "/content/qqp_cache"
os.makedirs(OUT_DIR, exist_ok=True)
np.save(os.path.join(OUT_DIR, "X_float32.npy"), X)  # embedding matrix, one row per question
pd.Series(questions).to_frame("text").to_parquet(os.path.join(OUT_DIR, "questions.parquet"))
# save qid mapping (string->int) only if you need it later
with open(os.path.join(OUT_DIR, "qid.json"), "w", encoding="utf-8") as f:
    json.dump(qid, f, ensure_ascii=False)

# Ground-truth edges as a two-column int64 array, one (a, b) pair per row.
# reshape(-1, 2) keeps the shape at (0, 2) when there are no duplicate pairs;
# without it an empty edge set would be saved as a 1-D array.
gt = np.array(list(edges_true), dtype=np.int64).reshape(-1, 2)
np.save(os.path.join(OUT_DIR, "gt_edges.npy"), gt)

print("Saved files in", OUT_DIR)

Saved files in /content/qqp_cache
