Note: all code is written in Python (https://www.python.org/) and should be compatible with Python 3.10 and later.

Code author: Matteo Eustachio Peluso (the-puzzler@github)

In [None]:
# imports
from transformers import pipeline
import pandas as pd
import random
import json
import os
from tqdm import tqdm
import warnings
import logging

# This Section Queries LLM For Semantic Clusters

In [None]:
def run_semantic_clustering(
    input_csv_path,
    output_json_path,
    model_name="meta-llama/Llama-3.2-3B-Instruct",
    num_samples=1_000_000,
    sample_size=20,
    batch_size=128,
    save_interval=10_000,
    max_new_tokens=256,
    device=0
):
    """
    Generate semantic clusters from word samples using a language model.

    Every cell of the input CSV is treated as one word. Random samples of
    ``sample_size`` words are drawn, wrapped in a chat prompt, and sent to a
    text-generation pipeline in batches. Raw pipeline outputs are appended to
    ``output_json_path`` whenever at least ``save_interval`` of them have
    accumulated, and once more at the end.

    Args:
        input_csv_path (str): Path to input CSV file containing words
        output_json_path (str): Path to save output JSON file
        model_name (str): Hugging Face model name
        num_samples (int): Number of random samples to generate
        sample_size (int): Number of words per sample
        batch_size (int): Processing batch size
        save_interval (int): Save outputs every N samples
        max_new_tokens (int): Maximum tokens for model generation
        device (int): GPU device ID (use -1 for CPU)

    Raises:
        ValueError: If the CSV holds fewer usable words than ``sample_size``.
    """

    # Setup logging
    logging.getLogger("transformers").setLevel(logging.ERROR)
    warnings.filterwarnings("ignore")

    # Initialize pipeline
    pipe = pipeline(
        "text-generation",
        model=model_name,
        device=device if device >= 0 else None,
        max_new_tokens=max_new_tokens
    )
    # Batched generation needs a pad token; reuse the end-of-sequence token.
    pipe.tokenizer.pad_token_id = pipe.tokenizer.eos_token_id

    # Load and prepare data
    print("Loading data...")
    df = pd.read_csv(input_csv_path)
    # Empty cells come back from pandas as NaN floats, which would break the
    # ", ".join() below; skip them and make sure every word is a string.
    word_list = [
        str(item)
        for row in df.values.tolist()
        for item in row
        if pd.notna(item)
    ]
    if sample_size > len(word_list):
        raise ValueError(
            f"sample_size ({sample_size}) exceeds the number of words "
            f"available in {input_csv_path} ({len(word_list)})"
        )

    print(f"Generating {num_samples} random samples...")
    random_samples = [
        random.sample(word_list, sample_size)
        for _ in range(num_samples)
    ]

    # Format prompts
    print("Formatting prompts...")
    formatted_prompts = []
    for sample in random_samples:
        sample_text = ", ".join(sample)
        prompt = [
            {
                "role": "system",
                "content": "respond with nothing else other than a numbered list of comma seperated words, no other words, no explanations, nothing else. Return the words exactly as they come, no captialisaion or anything else."
            },
            {
                "role": "user",
                "content": f"List of words: {sample_text}. Cluster these random words semantically:"
            }
        ]
        formatted_prompts.append(prompt)

    # Process in batches
    outputs = []
    # Ceiling division: a trailing partial batch still counts as one batch.
    total_batches = (len(formatted_prompts) + batch_size - 1) // batch_size

    for i in tqdm(range(0, len(formatted_prompts), batch_size), total=total_batches, desc="Processing batches"):
        batch = formatted_prompts[i:i+batch_size]
        batch_outputs = pipe(
            batch,
            batch_size=batch_size,
            max_new_tokens=max_new_tokens
        )
        outputs.extend(batch_outputs)

        # Save periodically, then drop the saved outputs from memory
        if len(outputs) >= save_interval:
            _save_outputs(outputs, output_json_path)
            outputs = []

    # Save remaining outputs
    if outputs:
        _save_outputs(outputs, output_json_path)

    print("Processing complete")


def _save_outputs(new_outputs, output_path):
    """Append ``new_outputs`` to the JSON list stored at ``output_path``.

    The existing file (if any) is loaded, the new outputs are concatenated to
    it, and the combined list is written back. The write goes to a temporary
    sibling file that is then swapped in with ``os.replace``, so a crash or
    serialization error mid-write cannot destroy previously saved results.

    Args:
        new_outputs (list): JSON-serializable outputs to append.
        output_path (str): Path of the JSON file to create or extend.
    """
    existing_outputs = []
    if os.path.exists(output_path):
        with open(output_path, 'r') as f:
            existing_outputs = json.load(f)

    all_outputs = existing_outputs + new_outputs

    temp_path = f"{output_path}.tmp"
    try:
        with open(temp_path, 'w') as f:
            json.dump(all_outputs, f, indent=2)
        # Atomic swap: the destination is either the old or the new file,
        # never a half-written one.
        os.replace(temp_path, output_path)
    finally:
        # Only still present if the dump or the swap failed.
        if os.path.exists(temp_path):
            os.remove(temp_path)

    print(f"Saved {len(all_outputs)} total outputs to {output_path}")


# Example usage: one million 20-word samples, saved every 10,000 outputs.
if __name__ == "__main__":
    run_semantic_clustering(
        "../data/input_words.csv",
        "semantic_clusters.json",
        batch_size=128,
        sample_size=20,
        num_samples=1_000_000,
        save_interval=10_000,
    )

# This Section Calculates Semantic Distances Based on Cluster Membership

#### This is multiprocessing (MP) code and should be called from a separate file

In [None]:
from difflib import SequenceMatcher
from multiprocessing import Pool
import tqdm

def similarity_ratio(a, b):
    """Return the case-insensitive difflib similarity ratio of two strings."""
    matcher = SequenceMatcher(None, a.lower(), b.lower())
    return matcher.ratio()

def find_best_matching_input_word(output_word, input_words, threshold=0.85):
    """Return the input word most similar to ``output_word``, if similar enough.

    Args:
        output_word (str): Word to look up.
        input_words (iterable of str): Candidate words to match against.
        threshold (float): Minimum similarity ratio for a match to count.

    Returns:
        The candidate with the highest similarity ratio (the first one on
        ties) when that ratio is at least ``threshold``; otherwise ``None``.
        ``None`` is also returned when ``input_words`` is empty.
    """
    scored_candidates = (
        (word, similarity_ratio(output_word, word)) for word in input_words
    )
    # ``default`` keeps max() from raising ValueError when there are no candidates.
    best_word, best_score = max(
        scored_candidates,
        key=lambda pair: pair[1],
        default=(None, 0.0),
    )
    if best_word is None:
        return None
    return best_word if best_score >= threshold else None

def process_cluster_sample(sample):
    """Parse one saved generation into clusters of the original input words.

    The input words are read from the user message (the text between
    ``"words: "`` and the first ``"."``). Each line of the assistant message
    that contains a digit is treated as one cluster: the leading number is
    stripped, the rest is split on commas, and every entry is mapped back to
    the closest input word. Entries with no sufficiently close input word,
    and repeats within the same cluster, are dropped.

    Args:
        sample (list): One saved output; ``sample[0]["generated_text"]`` must
            be a list of ``{"role": ..., "content": ...}`` messages.

    Returns:
        list[list[str]]: The non-empty clusters, in order of appearance.
        An empty list is returned (after printing the error) if the sample
        cannot be parsed.
    """
    try:
        messages = sample[0]["generated_text"]

        # Extract input words
        input_content = next(
            msg["content"] for msg in messages if msg["role"] == "user"
        )
        _, marker, after_marker = input_content.partition("words: ")
        if not marker:
            # Without the marker there is no reliable word list to match
            # against, so treat the sample as failed instead of guessing.
            raise ValueError("user prompt does not contain a 'words: ' section")
        words_section = after_marker.split(".")[0]
        input_words = [w.strip() for w in words_section.split(",")]

        # Extract and process clusters
        output_content = next(
            msg["content"] for msg in messages if msg["role"] == "assistant"
        )

        clusters = []

        for line in output_content.split('\n'):
            # Cluster lines are numbered; skip blanks and unnumbered text.
            if not line.strip() or not any(c.isdigit() for c in line):
                continue

            # Drop the "N." prefix (or leading digits when there is no period).
            words_part = line.split('.', 1)[1].strip() if '.' in line else line.lstrip("0123456789 .")
            output_words = [w.strip() for w in words_part.split(",")]

            valid_words = []
            cluster_used_words = set()

            for output_word in output_words:
                matched_word = find_best_matching_input_word(output_word, input_words)
                if matched_word and matched_word not in cluster_used_words:
                    valid_words.append(matched_word)
                    cluster_used_words.add(matched_word)

            if valid_words:
                clusters.append(valid_words)

        return clusters

    except Exception as e:
        # Best-effort parsing: one malformed sample must not abort the batch.
        print(f"Error processing sample: {str(e)}")
        return []

def process_all_clusters_parallel(sample_data, num_processes=None, verbose=False):
    """Parse every sample with ``process_cluster_sample`` in a process pool.

    Args:
        sample_data (list): Saved outputs, one entry per sample.
        num_processes (int | None): Worker count passed to ``Pool``.
        verbose (bool): Show a progress bar and print a success summary.

    Returns:
        list: The parse results in input order, with empty results
        (samples that failed or produced no clusters) removed.
    """
    with Pool(processes=num_processes) as pool:
        parsed = pool.imap(process_cluster_sample, sample_data)
        if verbose:
            # Imported here because the bare name ``tqdm`` is bound to the
            # module in one place in this file and to the class in others.
            from tqdm import tqdm as progress_bar

            parsed = progress_bar(
                parsed,
                total=len(sample_data),
                desc="Processing samples"
            )
        # The iterator must be drained while the pool is still alive.
        results = [r for r in parsed if r]

    if verbose:
        print(f"\nSuccessfully processed {len(results)} out of {len(sample_data)} samples")
    return results




#### This code takes the parsed data from the MP code and calculates the distances

In [None]:
import numpy as np
from tqdm import tqdm

def compute_term_relationships(contexts):
    """
    Compute relationship scores between terms based on their cluster co-occurrences.

    For every ordered pair of distinct terms that appear in the same context,
    the score is ``(same - different) / together``, where ``together`` is the
    number of contexts containing both terms, ``same`` the number of those in
    which they share a cluster, and ``different`` the rest. Scores therefore
    lie in [-1, 1]; pairs that never share a context, and the diagonal, are 0.
    If a term occurs in several clusters of one context, the last cluster wins.

    Args:
        contexts (list): List of contexts, where each context is a list of clusters,
                        and each cluster is a collection of terms.

    Returns:
        dict: Dictionary containing:
            - 'relationship_matrix': numpy array of relationship scores
            - 'term_to_idx': mapping of terms to matrix indices
            - 'all_terms': list of all unique terms, in first-seen order
    """
    # Collect unique terms in first-seen order so indices are reproducible
    # across runs (set iteration order for strings is not).
    all_terms = list(dict.fromkeys(
        term
        for context in contexts
        for cluster in context
        for term in cluster
    ))
    term_to_idx = {term: idx for idx, term in enumerate(all_terms)}

    # Initialize matrices
    n_terms = len(all_terms)
    same_cluster_count = np.zeros((n_terms, n_terms))
    same_context_count = np.zeros((n_terms, n_terms))

    # Count co-occurrences
    for context in tqdm(contexts, desc="Processing contexts", unit="context"):
        # First pass: record each term's cluster (later clusters overwrite).
        cluster_membership = {}
        for cluster_idx, cluster in enumerate(context):
            for term in cluster:
                cluster_membership[term] = cluster_idx

        # Second pass: update counts for every ordered pair of distinct terms.
        members = [
            (term_to_idx[term], cluster_idx)
            for term, cluster_idx in cluster_membership.items()
        ]
        for idx1, cluster1 in members:
            for idx2, cluster2 in members:
                if idx1 == idx2:
                    continue
                same_context_count[idx1, idx2] += 1
                if cluster1 == cluster2:
                    same_cluster_count[idx1, idx2] += 1

    # Each co-occurrence is either same-cluster or different-cluster, so the
    # latter is the remainder and needs no matrix of its own in the loop.
    different_cluster_count = same_context_count - same_cluster_count

    # Calculate relationship scores; entries with no shared context stay 0.
    relationship_matrix = np.zeros((n_terms, n_terms))
    np.divide(
        same_cluster_count - different_cluster_count,
        same_context_count,
        out=relationship_matrix,
        where=same_context_count > 0,
    )

    return {
        'relationship_matrix': relationship_matrix,
        'term_to_idx': term_to_idx,
        'all_terms': all_terms
    }


#### This code puts it all together, calls the multiprocessing parsing and cleaning step, then calculates distances between terms.

In [None]:
import json
import numpy as np
import multiprocessing
import h5py

def analyze_clustering_results(
    json_file_path,
    num_processes=None
):
    """
    Load saved clustering outputs, parse them in parallel, and score term pairs.

    Args:
        json_file_path (str): Path to the JSON file containing clustering results
        num_processes (int): Number of worker processes for the parsing step.
                           When None, one less than the CPU count is used
                           (but never fewer than 1).

    Returns:
        dict: Dictionary containing:
            - relationship_matrix: Matrix of term relationships
            - term_to_idx: Mapping from terms to matrix indices
            - all_terms: List of all unique terms
    """
    # Read the raw generations from disk
    with open(json_file_path, 'r') as handle:
        sample_data = json.load(handle)

    # Default worker count: leave one core free
    worker_count = num_processes
    if worker_count is None:
        worker_count = max(1, multiprocessing.cpu_count() - 1)

    # Parse the generations into clusters, then derive the relationship scores
    parsed_contexts = process_all_clusters_parallel(
        sample_data,
        num_processes=worker_count
    )
    results = compute_term_relationships(parsed_contexts)

    return {
        key: results[key]
        for key in ('relationship_matrix', 'term_to_idx', 'all_terms')
    }


if __name__ == "__main__":
    # Run the full analysis and expose the three result pieces by name.
    results = analyze_clustering_results('../data/cluster_words3.json')
    relationship_matrix, term_to_idx, all_terms = (
        results['relationship_matrix'],
        results['term_to_idx'],
        results['all_terms'],
    )

    print(f"Loaded {len(all_terms)} unique terms")
    print(f"Relationship matrix shape: {relationship_matrix.shape}")