In [None]:
# Generates a graph based on semantic similarities between tokens
# The goal is to compare the quality of graphs generated by a language model against graphs built from sentence embeddings
# TODO: use similarity cutoffs rather than hardcoded number of connections
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import json
import os

# Name of the sentence-embedding model handed to SentenceTransformer; also embedded in the output file name
MODEL_NAME = 'all-MiniLM-L6-v2'
# Number of most-similar entries requested for each vocabulary entry; also embedded in the output file name
MAX_CONNECTIONS = 12
# Vocabulary file to read; load_vocab treats each non-blank line as one entry
IN_PATH = "path/to/input/vocab.txt" 

def load_vocab(path):
    """Read a vocabulary file and return its entries as a list of strings.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are skipped.

    Args:
        path: Location of the text file, one vocabulary entry per line.

    Returns:
        The non-blank, stripped lines in file order.
    """
    # Decode explicitly as UTF-8 so non-ASCII tokens are read the same way
    # regardless of the platform's default locale encoding.
    with open(path, 'r', encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        return [entry for entry in stripped_lines if entry]

def generate_semantic_graph(vocab, model_name, max_connections):
    """Link every vocabulary entry to its most similar entries.

    Entries are embedded with a SentenceTransformer model and compared by
    cosine similarity of their embeddings.

    Args:
        vocab: Sequence of strings to embed and connect.
        model_name: Model identifier passed to SentenceTransformer.
        max_connections: Maximum number of neighbours to keep per entry.
            Values below zero are treated as zero.

    Returns:
        A dict mapping each entry to a list of at most ``max_connections``
        other entries, ordered from most to least similar. An empty
        vocabulary yields an empty dict. If the same string appears more
        than once in ``vocab``, the last occurrence's neighbours win because
        the string is the dict key.
    """
    # Nothing to connect: skip loading the model just to embed nothing.
    if len(vocab) == 0:
        return {}

    neighbour_limit = max(max_connections, 0)

    model = SentenceTransformer(model_name)

    # Calculate embeddings for all words at once
    embeddings = model.encode(vocab, show_progress_bar=False)

    # Calculate pairwise similarities using matrix operations
    similarity_matrix = cosine_similarity(embeddings)

    semantic_graph = {}
    for i, word in enumerate(vocab):
        # Indices of all entries, most similar first
        ranked = np.argsort(similarity_matrix[i])[::-1]

        # Remove this entry's own index explicitly. Assuming it always sorts
        # first breaks on ties (duplicate entries, zero-norm embeddings,
        # float rounding), which would list the word as its own neighbour
        # and drop a genuine one.
        neighbours = ranked[ranked != i][:neighbour_limit]

        semantic_graph[word] = [vocab[j] for j in neighbours]

    return semantic_graph

# Read the vocabulary entries from the configured input file
vocab = load_vocab(IN_PATH)

# Build the neighbour graph from sentence-embedding similarities
semantic_graph = generate_semantic_graph(
    vocab,
    model_name=MODEL_NAME,
    max_connections=MAX_CONNECTIONS,
)

# Name the output after the input file's stem, the model and the connection count
base_name = os.path.basename(os.path.splitext(IN_PATH)[0])
OUT_PATH = f"{base_name}_semantic_M{MODEL_NAME.replace('-', '_')}_C{MAX_CONNECTIONS}.json"

# Write the graph to disk as JSON
with open(OUT_PATH, mode="w") as out_file:
    json.dump(semantic_graph, fp=out_file)