## 1. Imports and data loading

- Load the raw train/test JSONL files.
- Define small helper functions for reading data and building a context string for each mention.
- Initialize the sentence-transformer model that will be used to embed these contexts.

In [None]:
import json
import re
import numpy as np
import pandas as pd
from tqdm import tqdm 
from sentence_transformers import SentenceTransformer 
from sklearn.preprocessing import normalize 
import faiss  
from sklearn.cluster import HDBSCAN 

# Input files, resolved relative to the notebook's working directory.
# The two *.jsonl files are read line by line with load_jsonl() below.
TRAIN_DATA_PATH = "train_data.jsonl"
TEST_DATA_PATH = "test_data.jsonl"
# Gold clusters for the train set; read in one go with json.load() below.
TRAIN_LABELS_PATH = "train_labels.json"


def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Path of a UTF-8 encoded file holding one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as f:
        # Blank / whitespace-only lines (typically a trailing newline at the end
        # of the file) are not records; json.loads("") would raise on them.
        return [json.loads(line) for line in f if line.strip()]


def create_context(row):
    """Build the text that is embedded for one mention.

    The result is the space-joined sequence: lowercased mention (twice, to give
    it extra weight), the entity type, then the `mention` string of every
    related entry.

    Args:
        row: A mapping-like object with a `.get` method (a dict or a DataFrame
            row) that may provide the keys "mention", "type" and "relations".
            "relations", when present, is an iterable of mapping-like entries.

    Returns:
        The context string.
    """
    mention = str(row.get("mention", "")).lower()
    # Duplicate the mention to give it extra weight, then add type and related mentions
    parts = [mention, mention, str(row.get("type", ""))]

    relations = row.get("relations", [])
    # When rows come from a DataFrame, a record without "relations" holds NaN
    # (a float) instead of triggering the default above, and a JSON null yields
    # None. Neither is iterable, so treat both as "no relations".
    if relations is None or isinstance(relations, float):
        relations = []

    for rel in relations:
        parts.append(str(rel.get("mention", "")))
    return " ".join(parts)


# Sentence-transformer encoder used for every embedding in this notebook
model = SentenceTransformer("all-MiniLM-L6-v2")

# Raw train / test examples, one decoded record per line of each file
train_data, test_data = (
    load_jsonl(jsonl_path) for jsonl_path in (TRAIN_DATA_PATH, TEST_DATA_PATH)
)

# Gold clusters for the train set
with open(TRAIN_LABELS_PATH, "r", encoding="utf-8") as f:
    train_labels = json.load(f)

# Tabular views of the same records
df_train, df_test = pd.DataFrame(train_data), pd.DataFrame(test_data)

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


## 2. Build KB centroids from train clusters

- Create a knowledge base of entities from the training data
- Encode each training mention into an embedding using its textual context.
- For every gold cluster in `train_labels`, average the embeddings of its members to obtain a centroid.
- Collect all centroids into `kb_vectors`, which we will query later when linking test mentions.

Each centroid represents one gold entity (cluster) in embedding space.

In [None]:
# 2. Build KB centroids from train clusters

# Build context strings for each train mention
df_train["context"] = df_train.apply(create_context, axis=1)

# Encode all train contexts and L2-normalize embeddings
train_embs = normalize(model.encode(df_train["context"].tolist(), show_progress_bar=True))
id_to_emb = dict(zip(df_train["mention_id"], train_embs))

# Compute one centroid embedding per gold cluster of the *train* set.
# A cluster none of whose members has an embedding is skipped, so row r of
# `kb_vectors` is the r-th cluster that survived, not necessarily
# `train_labels[r]`.
kb_vectors = []
skipped_clusters = 0
for group in train_labels:
    group_embs = [id_to_emb[m_id] for m_id in group if m_id in id_to_emb]
    if group_embs:
        centroid = np.mean(group_embs, axis=0)
        # Re-normalize: the mean of unit vectors is generally shorter than 1
        kb_vectors.append(centroid / np.linalg.norm(centroid))
    else:
        skipped_clusters += 1

# Fail here with a clear message instead of later on `kb_vectors.shape[1]`
if not kb_vectors:
    raise ValueError(
        "No KB centroid could be built: none of the mention ids in train_labels "
        "was found in the train data."
    )

kb_vectors = np.array(kb_vectors)
if skipped_clusters:
    print(f"Skipped {skipped_clusters} train clusters without any embedded member")
print(f"Gold standard {len(kb_vectors)} clusters created")

Batches:   0%|          | 0/90 [00:00<?, ?it/s]

Gold standard 699 clusters created


## 3. Match test mentions to KB centroids (FAISS + exact string match)

Try to attach as many **test** mentions as possible directly to existing KB clusters
- Encode test mentions into embeddings.
- Use a FAISS inner-product index over `kb_vectors` to retrieve the most similar centroid for each test mention.
- Combine semantic similarity (embedding score) with an exact surface-form lookup:
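  - If the lowercased, whitespace-stripped test surface string occurs among the train mentions, we **always** attach to that mention's cluster (when the same string appears in several train clusters, the last one wins).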
  - Otherwise we attach based on similarity if the score is above `THRESHOLD`.
  - All remaining mentions are marked as unmatched and will be clustered later with HDBSCAN.

In [None]:
# 3. Match test mentions to KB centroids (FAISS + exact string match)

# Build context strings and embeddings for test mentions
df_test["context"] = df_test.apply(create_context, axis=1)
test_embs = normalize(model.encode(df_test["context"].tolist(), show_progress_bar=True))

THRESHOLD = 0.7

# Build FAISS index over KB centroids (cosine similarity via inner product on normalized vectors)
dim = kb_vectors.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(kb_vectors.astype("float32"))

# Translate a row of `kb_vectors` back to its position in `train_labels`.
# Centroids exist only for clusters with at least one embedded member, so the
# two numberings differ as soon as one cluster was skipped. Both matching paths
# below must emit the same kind of id, otherwise one entity gets two labels.
kb_row_to_cluster = [
    cluster_idx
    for cluster_idx, member_ids in enumerate(train_labels)
    if any(member_id in id_to_emb for member_id in member_ids)
]
assert len(kb_row_to_cluster) == len(kb_vectors), "kb_vectors rows do not line up with train_labels"

print(f"Searching KB index for {len(test_embs)} mentions...")
scores, indices = index.search(test_embs.astype("float32"), 1)
best_match_indices = indices.flatten()
max_scores = scores.flatten()

# assignments[i]: train cluster index for test mention i, or -1 if unmatched
assignments = []
# unmatched_indices: positions (into the test set) of the mentions left unmatched
unmatched_indices = []

# KB cluster mapping from train data: normalized surface string -> train cluster index.
# The loop variables are deliberately NOT called `m_ids` / `m_id`: section 4 keeps the
# array of test mention ids in a notebook-level `m_ids`, and re-running this cell
# afterwards would silently overwrite it.
id_to_text_map = {row["mention_id"]: row["mention"] for row in train_data}
string_to_cluster = {}
for cluster_idx, member_ids in enumerate(train_labels):
    for member_id in member_ids:
        text = id_to_text_map.get(member_id, "").lower().strip()
        if text:
            # A string present in several clusters maps to the last one seen
            string_to_cluster[text] = cluster_idx

# Lowercased canonical names for test mentions
test_mentions = df_test["mention"].str.lower().str.strip().values

for i in range(len(test_embs)):
    current_text = test_mentions[i]
    best_match_idx = best_match_indices[i]
    max_score = max_scores[i]

    # 1) Prefer exact surface-form match if present in KB
    if current_text in string_to_cluster:
        assignments.append(string_to_cluster[current_text])

    # 2) Otherwise, attach by high semantic similarity
    elif max_score >= THRESHOLD:
        assignments.append(kb_row_to_cluster[best_match_idx])

    # 3) Low similarity or unseen name: mark as unmatched
    else:
        assignments.append(-1)
        unmatched_indices.append(i)

print(f"Matched {len(assignments) - len(unmatched_indices)} mentions to existing clusters.")
print(f"{len(unmatched_indices)} mentions marked as unmatched")

## 4. Create extra clusters for unmatched mentions

- Normalize mention surface forms and build an abbreviation map (short form to long form).
- Derive a `canonical_name` for each mention using this map.
- Run HDBSCAN on the unmatched embeddings, one block per entity type; a type with more than 20,000 unmatched mentions is further split by the first character of the canonical name to keep each block computable.
- Merge the resulting HDBSCAN clusters with the KB-linked mentions into a final clustering of the test set.

In [None]:
# 4.1 Normalize names and build abbreviation map

def clean_name(name):
    """Lowercase `name` and drop every character outside a-z / 0-9.

    Anything that is not a `str` yields the empty string.
    """
    return re.sub(r'[^a-z0-9]', '', name.lower()) if isinstance(name, str) else ""


print("Mapping abbreviations...")
# Normalize with clean_name() so this column and the abbreviation keys built
# below follow exactly the same rule (non-string mentions become "").
df_test['clean_name'] = df_test['mention'].map(clean_name)

# abbrev_map: cleaned short form -> cleaned name of the row that lists it as an
# 'Abbreviation' relation. A short form listed by several rows keeps the last one.
abbrev_map = {}
# Only rows holding a non-empty list of relations can contribute; a missing
# value (NaN / None) is treated as "no relations" instead of crashing len().
rel_mask = df_test['relations'].map(lambda rels: isinstance(rels, list) and len(rels) > 0).astype(bool)
for row in tqdm(df_test.loc[rel_mask, ['clean_name', 'relations']].to_dict('records'), desc="Mapping Abbrevs"):
    long_n = row['clean_name']
    for rel in row['relations']:
        if rel.get('type') == 'Abbreviation':
            short_n = clean_name(rel.get('mention', ''))
            if long_n and short_n:
                abbrev_map[short_n] = long_n

# Rows whose cleaned name is a known short form take the long form; all others keep their own
df_test['canonical_name'] = df_test['clean_name'].map(abbrev_map).fillna(df_test['clean_name'])

# Positional arrays over the test set, used by the clustering and merge steps
m_ids = df_test['mention_id'].values
c_names = df_test['canonical_name'].values

# One label per unmatched mention; -1 means "not in any HDBSCAN cluster"
cluster_labels_final = np.full(len(unmatched_indices), -1, dtype=int)


Cleaning names and mapping abbreviations...


Mapping Abbrevs: 100%|██████████| 122031/122031 [00:00<00:00, 1180268.21it/s]


In [66]:

# 4.2 Cluster unmatched mentions with HDBSCAN

# Start from "everything is noise" each time this cell runs, so the result never
# depends on labels written by an earlier execution of the cell.
cluster_labels_final = np.full(len(unmatched_indices), -1, dtype=int)

if unmatched_indices:
    unmatched_embs = test_embs[unmatched_indices].astype('float32')
    unmatched_types = df_test.iloc[unmatched_indices]['type'].fillna('UNKNOWN').values
    unmatched_names = [c_names[idx] for idx in unmatched_indices]

    # Offset added to each block's local labels so ids stay unique across blocks
    next_cluster_id = 0
    unique_types = np.unique(unmatched_types)

    print(f"Clustering {len(unmatched_indices)} mentions...")
    for ent_type in tqdm(unique_types, desc="HDBSCAN Blocks"):
        # Positions (within the unmatched subset) of the mentions of this type
        type_indices = np.where(unmatched_types == ent_type)[0]
        if len(type_indices) < 2:
            continue

        if len(type_indices) > 20000:
            # Very large type: split by the first character of the canonical
            # name ('#' when the name is empty) to keep each block small.
            first_letters = np.array(
                [str(unmatched_names[i])[0] if unmatched_names[i] else '#' for i in type_indices]
            )
            blocks = [type_indices[first_letters == letter] for letter in np.unique(first_letters)]
        else:
            blocks = [type_indices]

        for sub_idx in blocks:
            if len(sub_idx) < 2:
                continue

            block_model = HDBSCAN(
                min_cluster_size=2,
                min_samples=1,
                metric='euclidean',
                cluster_selection_epsilon=0.15,
                n_jobs=-1
            )
            block_labels = block_model.fit_predict(unmatched_embs[sub_idx])

            # Shift this block's non-noise labels into the global id space
            clustered = block_labels != -1
            cluster_labels_final[sub_idx[clustered]] = block_labels[clustered] + next_cluster_id

            if block_labels.max() != -1:
                next_cluster_id += int(block_labels.max()) + 1


# Containers filled by the merge step (section 4.3)
final_clusters = {}
name_to_cluster_id = {}
hdb_to_cluster_id = {}
next_singleton_id = 0

print("Finalizing clusters...")


Clustering 115711 mentions...


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
HDBSCAN Blocks: 100%|██████████| 28/28 [01:38<00:00,  3.53s/it]

Finalizing clusters...





In [None]:

# 4.3 Merge KB assignments and HDBSCAN clusters into final labels

# (Re)initialize the merge state here so that re-running this cell starts from
# scratch instead of appending every mention a second time.
final_clusters = {}        # output cluster key -> list of test mention ids
name_to_cluster_id = {}    # canonical name -> output cluster key
hdb_to_cluster_id = {}     # HDBSCAN label -> output cluster key
next_singleton_id = 0

# Pass 1: mentions already linked to a KB cluster
for i in tqdm(range(len(assignments)), desc="Pass 1: KB"):
    kb_idx = assignments[i]
    if kb_idx != -1:
        target_cluster = f"kb_{kb_idx}"
        name = c_names[i]
        final_clusters.setdefault(target_cluster, []).append(m_ids[i])
        # An empty canonical name (the mention had no a-z / 0-9 character)
        # identifies nothing; registering it would later pull every other
        # empty-named mention into this cluster.
        if isinstance(name, str) and name:
            name_to_cluster_id[name] = target_cluster

# Pass 2: unmatched mentions, in order of priority:
#   same canonical name as an already placed mention > HDBSCAN cluster > singleton
if unmatched_indices:
    u_names = [c_names[idx] for idx in unmatched_indices]
    u_ids = [m_ids[idx] for idx in unmatched_indices]

    for i in tqdm(range(len(unmatched_indices)), desc="Pass 2: New"):
        label = cluster_labels_final[i]
        name = u_names[i]
        m_id = u_ids[i]
        usable_name = isinstance(name, str) and name != ""

        if usable_name and name in name_to_cluster_id:
            target_cluster = name_to_cluster_id[name]
        elif label != -1 and label in hdb_to_cluster_id:
            target_cluster = hdb_to_cluster_id[label]
        elif label != -1:
            target_cluster = f"new_{label}"
            hdb_to_cluster_id[label] = target_cluster
        else:
            target_cluster = f"singleton_{next_singleton_id}"
            next_singleton_id += 1

        if usable_name:
            name_to_cluster_id[name] = target_cluster
        # NOTE(review): this redirects the HDBSCAN label for *later* members only;
        # members already placed under the previous key stay there, so the
        # result depends on iteration order.
        if label != -1: hdb_to_cluster_id[label] = target_cluster
        final_clusters.setdefault(target_cluster, []).append(m_id)

test_labels_output = list(final_clusters.values())
with open("test_labels.json", "w", encoding="utf-8") as f:
    json.dump(test_labels_output, f, indent=2)

print(f"Total Clusters: {len(test_labels_output)}")

Pass 1: KB: 100%|██████████| 219950/219950 [00:00<00:00, 1784532.29it/s]
Pass 2: New: 100%|██████████| 115711/115711 [00:00<00:00, 827126.30it/s]


Success! Total Clusters: 33002
