1. Imports + Loading Data

- Import all required libraries.
- Define file paths for the train/test data and labels.
- Implement small helper functions.
- Load the raw JSONL data into Pandas dataframes.

In [None]:

import json
import re
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize
import faiss
import hdbscan
from tqdm import tqdm

# Paths to the shared task data (relative to the working directory)
TRAIN_DATA_PATH = "train_data.jsonl"
TEST_DATA_PATH = "test_data.jsonl"
# Clustering (labels) file for the train set
TRAIN_LABELS_PATH = "train_labels.json"


def load_jsonl(path: str):
    """Load a JSONL file into a list of dicts.

    Blank or whitespace-only lines (for example an extra empty line at the
    end of the file) are skipped instead of raising ``json.JSONDecodeError``.

    Args:
        path: Location of a UTF-8 encoded file with one JSON value per line.

    Returns:
        A list holding the decoded value of every non-blank line, in file order.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


def create_context(row: pd.Series) -> str:
    """Build a simple textual context for a mention from its fields and relations.

    The result is the lowercased mention (repeated twice), the entity type and
    the ``mention`` of every relation, joined by single spaces.

    Args:
        row: A record supporting ``.get`` with the optional keys ``mention``,
            ``type`` and ``relations`` (a list of dicts).

    Returns:
        The space-joined context string.
    """
    mention = str(row.get('mention', '')).lower()
    # Duplicate the mention to give it extra weight, then add type and related mentions
    parts = [mention, mention, str(row.get('type', ''))]
    relations = row.get('relations', [])
    # A record without "relations" shows up as None or NaN once it is a
    # DataFrame row; neither can be iterated, so treat both as "no relations".
    if isinstance(relations, (list, tuple)):
        parts.extend(
            str(rel.get('mention', ''))
            for rel in relations
            if isinstance(rel, dict)
        )
    return " ".join(parts)


def clean_name(name: str) -> str:
    """Return a normalized surface form of *name*.

    The value is lowercased and every character outside ``a-z`` / ``0-9`` is
    dropped. Anything that is not a string yields the empty string.
    """
    lowered = name.lower() if isinstance(name, str) else ""
    return re.sub(r'[^a-z0-9]', '', lowered)


# Load sentence-transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Load the raw mention records (one dict per JSONL line)
train_data = load_jsonl(TRAIN_DATA_PATH)
test_data = load_jsonl(TEST_DATA_PATH)

# Load clustering for the train set. The labels path is derived from
# TRAIN_DATA_PATH by swapping the file name. The file is read explicitly as
# UTF-8 (as load_jsonl does) so decoding does not depend on the platform default.
with open(TRAIN_DATA_PATH.replace('train_data.jsonl', 'train_labels.json'), 'r', encoding='utf-8') as f:
    train_labels = json.load(f)

# Convert to data frame
df_train = pd.DataFrame(train_data)
df_test = pd.DataFrame(test_data)

2. Centroid Representation

- Build a textual context for each training mention.
- Encode all train contexts with the sentence-transformer model.
- For each gold cluster (from `train_labels`), average member embeddings to get a centroid.
- Collect these centroids into an array of knowledge-base (KB) vectors.

In [None]:
# Attach a context string to every training mention
df_train['context'] = df_train.apply(create_context, axis=1)

# Embed the train contexts, then pass the embeddings through `normalize`
train_contexts = df_train['context'].tolist()
train_embs = normalize(model.encode(train_contexts, show_progress_bar=True))

# Lookup table: mention_id -> embedding vector
id_to_emb = dict(zip(df_train['mention_id'], train_embs))

# One centroid per train cluster; a cluster none of whose members has an
# embedding contributes no centroid at all.
kb_vectors = []
for group in train_labels:
    member_vecs = [id_to_emb[m_id] for m_id in group if m_id in id_to_emb]
    if not member_vecs:
        continue
    mean_vec = np.mean(member_vecs, axis=0)
    # Scale the averaged vector back to unit length
    kb_vectors.append(mean_vec / np.linalg.norm(mean_vec))

kb_vectors = np.array(kb_vectors)
print(f"Gold standard {len(kb_vectors)} clusters created")

3. Matching test mentions to KB centroids with FAISS

- Build context strings and embeddings for **test** mentions.
- Index all KB centroids in a FAISS inner-product index.
- For each test mention, retrieve its nearest KB centroid and score.
- Combine semantic similarity (embedding score) with exact name matching to decide whether to attach to an existing KB cluster or mark as unmatched.

In [3]:
# Create context strings and embeddings for the test mentions
df_test['context'] = df_test.apply(create_context, axis=1)
test_embs = normalize(model.encode(df_test['context'].tolist(), show_progress_bar=True))

# Similarity at or above which a mention is attached to its nearest centroid
THRESHOLD = 0.7
# Lower bound of the "medium similarity" band, in which an exact surface-form
# match against the KB is additionally required
EXACT_MATCH_MIN_SCORE = 0.50

# Build FAISS index over centroids (cosine similarity via inner product on normalized vectors)
dim = kb_vectors.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(kb_vectors.astype('float32'))

print(f"Searching KB index for {len(test_embs)} mentions")
# For each test embedding, get top centroid and similarity score
scores, indices = index.search(test_embs.astype('float32'), 1)

best_match_indices = indices.flatten()
max_scores = scores.flatten()

# assignments[i] = chosen KB cluster index (row of kb_vectors) or -1 if we create a new cluster later
assignments = []
# unmatched_indices = positions in the test array that do not attach to any KB cluster
unmatched_indices = []

# Precompute exact name -> KB cluster mapping from train data.
# kb_vectors only has a row for gold clusters with at least one embedded
# member, so FAISS row numbers and positions in train_labels drift apart as
# soon as one cluster is skipped. Count rows with the same rule here so that
# name-based and FAISS-based assignments refer to the same index space.
id_to_text_map = {row['mention_id']: row['mention'] for row in train_data}
kb_exact_names = {}
kb_row = 0
for m_ids in train_labels:
    if not any(m_id in id_to_emb for m_id in m_ids):
        # This cluster produced no centroid, hence no row in the index
        continue
    for m_id in m_ids:
        name = id_to_text_map.get(m_id, "").lower()
        if name:
            # If several clusters share a surface form, the last one wins
            kb_exact_names[name] = kb_row
    kb_row += 1

# Lowercased surface forms for test mentions
test_mentions = df_test['mention'].str.lower().values

for i in range(len(test_embs)):
    max_score = max_scores[i]
    best_idx = best_match_indices[i]
    current_text = test_mentions[i]

    # High semantic similarity: directly attach to best KB cluster
    if max_score >= THRESHOLD:
        assignments.append(int(best_idx))
    # Medium similarity: allow a match if exact surface form exists in the KB
    elif max_score > EXACT_MATCH_MIN_SCORE and current_text in kb_exact_names:
        assignments.append(kb_exact_names[current_text])
    # Very low similarity or unseen name: mark as unmatched to be clustered later
    else:
        assignments.append(-1)
        unmatched_indices.append(i)

print(f"Matched {len(assignments) - len(unmatched_indices)} mentions to existing clusters.")
print(f"{len(unmatched_indices)} mentions need new clusters")

4. Create extra clusters from remaining mentions 

- Normalize mention strings and build an abbreviation map (short form → long form).
- Derive a canonical name for each test mention.
- Run HDBSCAN on **only the unmatched** mentions, grouped by entity type (and optionally by first letter) to keep blocks manageable.
- Merge KB assignments and HDBSCAN clusters into a single set of final clusters and write them to `test_labels.json`.

In [None]:
# -------------------------------------------------------------
# Step 1: Normalize names and build abbreviation map
# -------------------------------------------------------------
print("Mapping abbreviations...")
# Use clean_name so mention names and relation names are normalized by the
# same rule; a non-string mention becomes "" instead of NaN.
df_test['clean_name'] = df_test['mention'].map(clean_name)

abbrev_map = {}
# A record without "relations" holds None/NaN, which has no len(); only
# non-empty lists count as "has relations".
rel_mask = df_test['relations'].map(
    lambda rels: isinstance(rels, list) and len(rels) > 0
).astype(bool)
# For mentions that have relations, collect all Abbreviation relations
for row in tqdm(df_test[rel_mask][['clean_name', 'relations']].to_dict('records'), desc="Mapping Abbrevs"):
    long_n = row['clean_name']
    for rel in row['relations']:
        if not isinstance(rel, dict):
            continue
        if rel.get('type') == 'Abbreviation':
            short_n = clean_name(rel.get('mention', ''))
            # Both sides must be non-empty; a later row overwrites an earlier
            # mapping for the same short form.
            if long_n and short_n:
                abbrev_map[short_n] = long_n

# Canonical name prefers the long form if we know an abbreviation mapping
df_test['canonical_name'] = df_test['clean_name'].map(abbrev_map).fillna(df_test['clean_name'])


In [None]:
# -------------------------------------------------------------
# Step 2: Cluster unmatched mentions with HDBSCAN
# -------------------------------------------------------------

m_ids = df_test['mention_id'].values
c_names = df_test['canonical_name'].values

# cluster_labels_final[i] is the cluster label of unmatched_indices[i];
# -1 means "not placed in any HDBSCAN cluster"
cluster_labels_final = np.full(len(unmatched_indices), -1, dtype=int)

if unmatched_indices:
    unmatched_embs = test_embs[unmatched_indices].astype('float32')
    unmatched_types = df_test.iloc[unmatched_indices]['type'].fillna('UNKNOWN').values
    unmatched_names = [c_names[idx] for idx in unmatched_indices]

    # Running offset that keeps labels from different blocks distinct
    next_cluster_id = 0
    unique_types = np.unique(unmatched_types)

    print(f"Clustering {len(unmatched_indices)} mentions...")
    for ent_type in tqdm(unique_types, desc="HDBSCAN Blocks"):
        # Positions (within the unmatched arrays) that carry this entity type
        type_indices = np.where(unmatched_types == ent_type)[0]
        if len(type_indices) < 2:
            continue

        if len(type_indices) > 20000:
            # Very large type: split it further by the first character of the
            # canonical name ('#' stands in for an empty name)
            first_letters = np.array([str(unmatched_names[i])[0] if unmatched_names[i] else '#' for i in type_indices])
            blocks = [type_indices[first_letters == letter] for letter in np.unique(first_letters)]
        else:
            # Small enough to cluster the whole type in one go
            blocks = [type_indices]

        for sub_idx in blocks:
            if len(sub_idx) < 2:
                continue

            # Cluster this block of unmatched mentions
            block_model = hdbscan.HDBSCAN(
                min_cluster_size=2,
                min_samples=1,
                metric='euclidean',
                cluster_selection_epsilon=0.5,
                core_dist_n_jobs=-1,
                prediction_data=False
            )
            block_labels = block_model.fit_predict(unmatched_embs[sub_idx])

            # Shift every non-noise label by the running offset and store it
            clustered = block_labels != -1
            cluster_labels_final[sub_idx[clustered]] = block_labels[clustered] + next_cluster_id

            # Advance the offset past the labels this block used
            highest_label = block_labels.max()
            if highest_label != -1:
                next_cluster_id += highest_label + 1


In [None]:

# -------------------------------------------------------------
# Step 3: Merge KB matches and HDBSCAN clusters into final labels
# -------------------------------------------------------------
final_clusters = {}       # cluster key -> list of mention ids
name_to_cluster_id = {}   # canonical name -> cluster key it was last placed in
hdb_to_cluster_id = {}    # HDBSCAN label -> cluster key it was last placed in
next_singleton_id = 0


def _json_id(value):
    """Return numpy scalars as plain Python values so json.dump can write them."""
    return value.item() if isinstance(value, np.generic) else value


def _has_name(value):
    """Tell whether *value* is a non-empty string usable as a merge key."""
    return isinstance(value, str) and value != ""


print("Finalizing clusters...")
# 1: KB Matches
for i in tqdm(range(len(assignments)), desc="Pass 1: KB"):
    kb_idx = assignments[i]
    if kb_idx != -1:
        target_cluster = f"kb_{kb_idx}"
        name = c_names[i]
        final_clusters.setdefault(target_cluster, []).append(_json_id(m_ids[i]))
        # An empty or missing name carries no identity: registering it would
        # later pull every other nameless mention into this cluster.
        if _has_name(name):
            name_to_cluster_id[name] = target_cluster

#2: Unmatched
# use HDBSCAN labels and canonical names
if unmatched_indices:
    u_names = [c_names[idx] for idx in unmatched_indices]
    u_ids = [m_ids[idx] for idx in unmatched_indices]

    for i in tqdm(range(len(unmatched_indices)), desc="Pass 2: New"):
        label = cluster_labels_final[i]
        name = u_names[i]
        m_id = u_ids[i]
        named = _has_name(name)

        # Reuse existing cluster if we have already seen this canonical name
        if named and name in name_to_cluster_id:
            target_cluster = name_to_cluster_id[name]
        # Or reuse a cluster ID already assigned to this HDBSCAN label
        elif label != -1 and label in hdb_to_cluster_id:
            target_cluster = hdb_to_cluster_id[label]
        # Otherwise create a new HDBSCAN-based cluster
        elif label != -1:
            target_cluster = f"new_{label}"
            hdb_to_cluster_id[label] = target_cluster
        # Finally, fall back to a singleton cluster for true outliers
        else:
            target_cluster = f"singleton_{next_singleton_id}"
            next_singleton_id += 1

        if named:
            name_to_cluster_id[name] = target_cluster
        # Later members of the same HDBSCAN cluster follow this mention
        if label != -1:
            hdb_to_cluster_id[label] = target_cluster
        final_clusters.setdefault(target_cluster, []).append(_json_id(m_id))

# Only the member lists are written; the cluster keys are internal
test_labels_output = list(final_clusters.values())
with open("test_labels.json", "w", encoding="utf-8") as f:
    json.dump(test_labels_output, f, indent=2)

print(f"Success! Total Clusters: {len(test_labels_output)}")