In [1]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import wordnet
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Limit OpenMP Threads to Avoid Conflicts (Try adjusting to `1` or `2` for performance)
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"
os.environ["VECLIB_MAXIMUM_THREADS"] = "1"
os.environ["TBB_NUM_THREADS"] = "1"

# Initialize BERT-based embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2',device="cpu")

# Load cleaned Reddit data
df = pd.read_csv("../Dataset/cleaned_healthcare_reddit_data_praw.csv")
# Ensure 'Cleaned_Text' column is valid
df["Cleaned_Text"] = df["Cleaned_Text"].fillna("").astype(str)

# Train BERTopic model
topic_model = BERTopic(embedding_model=embedding_model)
topics, probs = topic_model.fit_transform(df["Cleaned_Text"])

# Get topics and words
topics_dict = topic_model.get_topics()

# Function to get WordNet synonyms
def get_wordnet_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name().lower())
    return synonyms

# Function to get associated topics using WordNet + BERT similarity
def get_associated_topics(keyword, topics_dict, top_n=5):
    keyword_lower = keyword.lower()
    keyword_embedding = embedding_model.encode([keyword_lower])[0]

    associated_topics = {}

    for topic_num, topic_words in topics_dict.items():
        topic_word_embeddings = []

        for word, weight in topic_words:
            word_lower = word.lower()
            if word_lower == keyword_lower:  # Avoid self-matching
                continue

            word_embedding = embedding_model.encode([word_lower])[0]
            cosine_sim = cosine_similarity([keyword_embedding], [word_embedding])[0][0]

            # Merge if similarity is high OR if they are synonyms
            for existing_topic in associated_topics:
                if word in get_wordnet_synonyms(existing_topic) or cosine_similarity(
                        [embedding_model.encode([existing_topic])[0]], [word_embedding])[0][0] > 0.6:
                    associated_topics[existing_topic].append((word, weight, cosine_sim))
                    break
            else:
                associated_topics[word] = [(word, weight, cosine_sim)]

    # Format results
    formatted_topics = []
    for key, values in associated_topics.items():
        grouped_words = ", ".join([f"{w} (Weight: {wt:.4f}, CosSim: {sim:.4f})" for w, wt, sim in values])
        formatted_topics.append(f"{key}: {grouped_words}")

    return formatted_topics[:top_n] if formatted_topics else [f"No associated topics found for '{keyword}'"]

# Test
test_keyword = "overdose"
associated_topics = get_associated_topics(test_keyword, topics_dict)
print(f"🔍 Associated topics for '{test_keyword}':")
for topic in associated_topics:
    print(topic)

# Save topics to CSV
topics_list = []
for topic_num, topic_words in topics_dict.items():
    for word, weight in topic_words:
        word_embedding = embedding_model.encode([word.lower()])[0]
        keyword_embedding = embedding_model.encode(["overdose"])[0]

        cos_sim = cosine_similarity([keyword_embedding], [word_embedding])[0][0]
        topics_list.append({"Topic": topic_num, "Word": word, "Weight": weight, "CosSim": cos_sim})

topics_df = pd.DataFrame(topics_list)
topics_df.to_csv("../Dataset/topics.csv", index=False)

print("✅ Topic modeling complete: Topics saved with cosine similarity.")


AttributeError: module 'numpy' has no attribute '__version__'

In [2]:
!pip uninstall numpy -y


Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4


In [3]:
!pip install numpy==1.24.4 --no-cache-dir

Collecting numpy==1.24.4
  Downloading numpy-1.24.4.tar.gz (10.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m45.8 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mGetting requirements to build wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[33 lines of output][0m
  [31m   [0m Traceback (most recent call last):
  [31m   [0m   File "/opt/anaconda3/lib/python3.12/site-packages/pip/_vendor/pyproject_hooks/_in_process/_in_process.py", line 389, in <module>
  [31m   [0m     main()
  [31m   [0m   File "/opt/anaconda3/lib/python3.12/site-packages/pip/_vendor/pyproject_hooks/_in_process/_in_process.py", line 373, in main
  [31m   [0m     json_out["return_val"] = hook(**hook_input["kwargs"])
  [31m   [0m   

In [None]:
conda install python=3.11 -y


Retrieving notices: ...working... done
Collecting package metadata (current_repodata.json): done
Solving environment: - 
The environment is inconsistent, please check the package plan carefully
The following packages are causing the inconsistency:

  - defaults/noarch::conda-pack==0.6.0=pyhd3eb1b0_0
  - defaults/osx-arm64::jupyterlab_server==2.22.0=py311hca03da5_0
  - defaults/osx-arm64::aiohttp==3.8.5=py311h80987f9_0
  - defaults/osx-arm64::dask-core==2023.6.0=py311hca03da5_0
  - defaults/osx-arm64::scikit-image==0.20.0=py311h313beb8_0
  - defaults/noarch::requests-file==1.5.1=pyhd3eb1b0_0
  - defaults/osx-arm64::black==23.3.0=py311hca03da5_0
  - defaults/osx-arm64::bokeh==3.2.1=py311hb6e6a13_0
  - defaults/osx-arm64::_anaconda_depends==2023.09=py311_openblas_1
  - defaults/osx-arm64::anaconda-project==0.11.1=py311hca03da5_0
  - defaults/osx-arm64::python-lsp-black==1.2.1=py311hca03da5_0
  - defaults/noarch::asttokens==2.0.5=pyhd3eb1b0_0
  - defaults/osx-arm64::pyqt==5.15.7=py311h313b

unsuccessful initial attempt using frozen solve. Retrying with flexible solve.
Solving environment: / 

In [None]:
conda remove numpy scipy pandas -y
conda clean --all --yes
conda install numpy=1.24.4 scipy=1.10.1 pandas=1.5.3 -c conda-forge
