# Topic Coherence Evaluation

## Libraries

In [None]:
# Install required libraries
!pip install --quiet numpy==1.26.4
!pip install --quiet bertopic==0.16.3 safetensors==0.4.5 plotly==5.15.0 gensim==4.3.3
import pickle
from bertopic import BERTopic
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel
import warnings

# Suppress DeprecationWarning messages for the rest of the session so they do
# not clutter the cell outputs.
warnings.filterwarnings("ignore", category=DeprecationWarning)

## Load Topic Model

In [None]:
# --- 1. Load Model, Abstracts, and Labels ---
# This cell loads only the topic model; the labels and the abstracts are
# loaded by the cells that follow.
print("Loading topic model, abstracts, and labels...")

# Load the saved BERTopic model.
# The path is relative to the working directory; ensure it is correct for
# your environment.
topic_model_path = "Topic Model/09OCT24/topic_model"
topic_model = BERTopic.load(topic_model_path)
print(f"Topic model loaded from: {topic_model_path}")

Loading topic model, abstracts, and labels...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Topic model loaded from: Topic Model/09OCT24/topic_model


In [None]:
# Load the GPT-4o labels from a pickle file.
# NOTE(review): pickle.load can execute arbitrary code while deserializing —
# only point this at a file you produced yourself or otherwise trust.
gpt4o_labels_path = 'Topic Model/09OCT24/gpt4o_labels.pickle'
with open(gpt4o_labels_path, 'rb') as handle:
    gpt4o_labels = pickle.load(handle)
print(f"GPT-4o labels loaded from: {gpt4o_labels_path}")

GPT-4o labels loaded from: Topic Model/09OCT24/gpt4o_labels.pickle


In [None]:
# Read the stored dataframe and pull out the abstracts as a plain Python list.
data_path = 'Files/embeddings_full_tSNE_uMAP_01MAR2024.h5'
df = pd.read_hdf(data_path, key='embeddings')

# Keep only the metadata / projection columns used by this analysis.
columns_to_keep = [
    'title',
    'abstract',
    'journal_title',
    'pub_year',
    'authors',
    'tsne_2D_x',
    'tsne_2D_y',
]
df_filtered = df[columns_to_keep]

# One entry per row of the dataframe, in row order.
abstracts = df_filtered["abstract"].tolist()
print(f"Abstracts loaded. Total abstracts: {len(abstracts)}")

Abstracts loaded. Total abstracts: 80656


In [None]:
# Display the first abstract as a quick sanity check of the loaded text.
abstracts[0]

'Escherichia coli growing on glucose in minimal medium controls its metabolite pools in response to environmental conditions. The extent of pool changes was followed through two-dimensional thin-layer chromatography of all 14C-glucose labelled compounds extracted from bacteria. The patterns of metabolites and spot intensities detected by phosphorimaging were found to reproducibly differ depending on culture conditions. Clear trends were apparent in the pool sizes of several of the 70 most abundant metabolites extracted from bacteria growing in glucose-limited chemostats at different growth rates. The pools of glutamate, aspartate, trehalose, and adenosine as well as UDP-sugars and putrescine changed markedly. The data on pools observed by two-dimensional thin-layer chromatography were confirmed for amino acids by independent analysis. Other unidentified metabolites also displayed different spot intensities under various conditions, with four trend patterns depending on growth rate. As 

In [None]:
# Set custom labels on the model.
# Intended to make get_topic_info() expose a 'CustomName' column aligned with
# gpt4o_labels; that column is read when the topics are extracted further down.
# NOTE(review): the structure of gpt4o_labels is not visible in this cell —
# confirm it is in the form set_topic_labels expects.
topic_model.set_topic_labels(gpt4o_labels)
print("Custom labels set on the topic model.")

Custom labels set on the topic model.


## Extract Topics

In [None]:
# --- 2. Extract Topics (Top N Words per Topic) ---
# Builds two parallel lists:
#   topics_to_evaluate          -> list of word lists, one per usable topic
#   custom_topic_names_ordered  -> the matching 'CustomName' for each word list
print("\nExtracting topics (top words)...")
topic_info_df = topic_model.get_topic_info()

# Filter out the outlier topic (-1). .copy() yields an independent frame so
# later modifications cannot trigger a SettingWithCopyWarning.
valid_topics_info = topic_info_df[topic_info_df.Topic != -1].copy()

topics_to_evaluate = []
custom_topic_names_ordered = []

for topic_id, custom_name in zip(valid_topics_info["Topic"], valid_topics_info["CustomName"]):
    # get_topic yields (word, weight) pairs; only the words are needed here.
    words = [word for word, _ in topic_model.get_topic(topic_id)]
    if len(words) > 1:  # Coherence needs at least 2 words to form a pair
        topics_to_evaluate.append(words)
        custom_topic_names_ordered.append(custom_name)
    else:
        print(f"Topic {topic_id} ('{custom_name}') has insufficient words for coherence calculation or is empty.")

if not topics_to_evaluate:
    # Raise instead of calling exit(): inside a Jupyter/IPython kernel exit()
    # shuts the kernel down, whereas an exception stops execution at this cell
    # and leaves the session available for inspection.
    raise RuntimeError("No valid topics found to evaluate for coherence.")

print(f"Extracted {len(topics_to_evaluate)} topics for coherence evaluation.")



Extracting topics (top words)...
Extracted 20 topics for coherence evaluation.


## Preprocess Documents (Tokenization)

In [None]:
# --- 3. Preprocess Documents (Tokenization) ---
print("\nPreprocessing documents (tokenizing abstracts)...")
# Apply gensim's simple_preprocess (basic tokenization: lowercase, remove
# punctuation, etc.) to every abstract, producing one token list per document
# in the same order as `abstracts`.
tokenized_abstracts = list(map(simple_preprocess, abstracts))
print("Tokenization complete.")


Preprocessing documents (tokenizing abstracts)...
Tokenization complete.


## Create Gensim Dictionary



In [None]:
# --- 4. Create Gensim Dictionary ---
print("\nCreating Gensim dictionary...")
# Build the dictionary from the tokenized abstracts; it is later passed to
# CoherenceModel together with those same tokenized texts.
dictionary = Dictionary(tokenized_abstracts)
print(f"Dictionary created with {len(dictionary)} unique tokens.")


Creating Gensim dictionary...
Dictionary created with 111772 unique tokens.


## Calculate Coherence


In [None]:
# --- 5. Calculate Coherence using Gensim's CoherenceModel ---
def _build_coherence_model(measure):
    """Build a CoherenceModel for `measure` over the extracted topics.

    Args:
        measure: Name of the gensim coherence measure, e.g. 'c_v' or 'c_npmi'.

    Returns:
        A CoherenceModel configured with the notebook-level
        `topics_to_evaluate`, `tokenized_abstracts` and `dictionary`.
    """
    return CoherenceModel(topics=topics_to_evaluate,
                          texts=tokenized_abstracts,
                          dictionary=dictionary,
                          coherence=measure)


print("\nCalculating Coherence Scores...")

if topics_to_evaluate and dictionary and tokenized_abstracts:
    # Calculate C_v coherence
    print("Calculating C_v coherence...")
    coherence_model_cv = _build_coherence_model('c_v')
    coherence_cv_avg = coherence_model_cv.get_coherence()
    per_topic_coherence_cv = coherence_model_cv.get_coherence_per_topic()
    print(f"Average Coherence Score (C_v): {coherence_cv_avg:.4f}")

    # Calculate C_npmi coherence
    print("\nCalculating C_npmi coherence...")
    coherence_model_npmi = _build_coherence_model('c_npmi')
    coherence_npmi_avg = coherence_model_npmi.get_coherence()
    per_topic_coherence_npmi = coherence_model_npmi.get_coherence_per_topic()
    print(f"Average Coherence Score (C_npmi): {coherence_npmi_avg:.4f}")

    # Recover the topic IDs that were actually evaluated. Topics with fewer
    # than two words are left out of topics_to_evaluate, so indexing
    # valid_topics_info positionally would attach the wrong ID to every topic
    # after a skipped one. Re-apply the same "more than one word" filter here.
    evaluated_topic_ids = [
        topic_id
        for topic_id in valid_topics_info['Topic']
        if len(topic_model.get_topic(topic_id)) > 1
    ]
    if len(evaluated_topic_ids) != len(topics_to_evaluate):
        raise RuntimeError(
            "Topic IDs could not be aligned with the evaluated topics: "
            f"{len(evaluated_topic_ids)} IDs vs {len(topics_to_evaluate)} topics. "
            "Re-run the topic extraction cell before this one."
        )

    # --- 6. Report Per-Topic Scores ---
    print("\n--- Per-Topic Coherence Scores ---")
    results_data = []
    per_topic_rows = zip(evaluated_topic_ids,
                         custom_topic_names_ordered,
                         topics_to_evaluate,
                         per_topic_coherence_cv,
                         per_topic_coherence_npmi)
    for original_topic_id, custom_name, topic_words, cv_score, npmi_score in per_topic_rows:
        print(f"Topic ID {original_topic_id}: '{custom_name}' -> C_v = {cv_score:.4f}, C_npmi = {npmi_score:.4f}")
        results_data.append({
            "Topic ID": original_topic_id,
            "CustomName": custom_name,
            "C_v": cv_score,
            "C_npmi": npmi_score,
            "Words": topic_words[:10]  # Show top 10 words used for coherence
        })

    results_df = pd.DataFrame(results_data)
    print("\n--- Coherence Results Summary (DataFrame) ---")
    print(results_df.to_string())

    # Optional: Save results to CSV
    # results_df.to_csv("topic_coherence_results.csv", index=False)
    # print("\nResults saved to topic_coherence_results.csv")

else:
    print("Could not calculate coherence. Check if topics were extracted and dictionary/texts are available.")

print("\nCoherence evaluation finished.")


Calculating Coherence Scores...
Calculating C_v coherence...
Average Coherence Score (C_v): 0.7311

Calculating C_npmi coherence...
Average Coherence Score (C_npmi): 0.1698

--- Per-Topic Coherence Scores ---
Topic ID 0: 'Plant Stress Response Mechanisms' -> C_v = 0.5633, C_npmi = 0.0360
Topic ID 1: 'Metabolic Profiles and Dysregulation' -> C_v = 0.7129, C_npmi = 0.1095
Topic ID 2: 'Cancer Metabolism and Therapy Resistance' -> C_v = 0.7195, C_npmi = 0.1225
Topic ID 3: 'Metabolomics Data Analysis and Integration' -> C_v = 0.5995, C_npmi = 0.1241
Topic ID 4: 'Gut Microbiota and Metabolomic Interactions' -> C_v = 0.8627, C_npmi = 0.2206
Topic ID 5: 'Metabolomics in Neurodegenerative Disorders' -> C_v = 0.7195, C_npmi = 0.1419
Topic ID 6: 'Environmental Toxicology and Metabolism' -> C_v = 0.7347, C_npmi = 0.1282
Topic ID 7: 'Metabolomics in Animal Nutrition' -> C_v = 0.5604, C_npmi = 0.0768
Topic ID 8: 'Microbiota-Gut-Brain Axis Interactions' -> C_v = 0.6184, C_npmi = 0.0403
Topic ID 9: '