**Topic Modeling**

In [None]:
#import libraries
import pandas as pd


In [None]:
# Read the source CSV into the main DataFrame `df`
# (file name suggests Reddit data with RoBERTa sentiment scores — TODO confirm schema)
csv_path = "reddit_with_roberta_sentiment.csv"
df = pd.read_csv(csv_path)

#### Install CorEx for Topic Modeling


In [None]:
!pip install corextopic

Collecting corextopic
  Downloading corextopic-1.1-py3-none-any.whl.metadata (12 kB)
Downloading corextopic-1.1-py3-none-any.whl (27 kB)
Installing collected packages: corextopic
Successfully installed corextopic-1.1


In [None]:
##import Required Libraries for Topic Modeling
from corextopic import corextopic as ct
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Build the list of comment documents for vectorization:
# rows without lemmas are skipped and the remaining values are coerced to str.
comment_lemmas = df['lemmas_comment'].dropna()
texts = comment_lemmas.astype(str).tolist()


In [None]:
# Binary bag-of-words over the comments: each entry records whether a term occurs
# in a comment. English stop words are removed and the vocabulary is capped at
# 5000 terms.
vectorizer = CountVectorizer(
    binary=True,
    stop_words='english',
    max_features=5000,
)
X = vectorizer.fit_transform(texts)

# Vocabulary terms in the vectorizer's feature (column) order
words = list(vectorizer.get_feature_names_out())


In [None]:
# Unsupervised CorEx run on the comment matrix: 10 latent topics, no anchors, fixed seed
model = ct.Corex(n_hidden=10, words=words, seed=42)
model.fit(X, words=words)

# List the 10 leading words of every discovered topic, numbering topics from 1
for topic_number, topic in enumerate(model.get_topics(n_words=10), start=1):
    leading_words = [entry[0] for entry in topic]  # first element of each entry is the word
    print(f"Topic {topic_number}: {', '.join(leading_words)}")



Topic 1: eat, fruit, food, meat, healthy, vegetable, sugar, veggie, chicken, bean
Topic 2: day, like, time, week, work, thing, start, lot, try, want
Topic 3: fat, protein, calorie, diet, high, weight, carb, intake, low, muscle
Topic 4: body, need, long, nutrient, think, term, different, benefit, example, increase
Topic 5: magnesium, vitamin, mg, d3, k2, form, glycinate, dose, zinc, acid
Topic 6: people, risk, disease, say, bad, nutrition, mean, cancer, fact, science
Topic 7: study, health, know, research, link, year, base, evidence, result, actually
Topic 8: help, cause, anxiety, feel, issue, effect, sleep, gut, symptom, energy
Topic 9: brand, product, buy, good, quality, company, thorne, amazon, ingredient, price
Topic 10: blood, test, level, supplement, doctor, iron, deficiency, testing, check, lab


In [None]:
from corextopic import corextopic as ct

# Seed (anchor) words grouped by the theme they are meant to steer a topic towards
anchor_themes = {
    'Fatigue / Energy': ['fatigue', 'tired', 'energy', 'exhausted'],
    'Sleep': ['sleep', 'melatonin', 'insomnia', 'rest'],
    'Anxiety / Stress Relief': ['anxiety', 'stress', 'calm', 'relax'],
    'Cognitive / Focus': ['focus', 'adhd', 'clarity', 'attention'],
    'Gut Health': ['gut', 'digestion', 'bloating', 'probiotic'],
    'Skin / Beauty': ['acne', 'skin', 'glow', 'clear'],
    'Immunity': ['immunity', 'cold', 'sick', 'flu'],
}

# CorEx takes the anchors as a list of word lists (one list per anchored topic)
anchors = list(anchor_themes.values())

# Anchored CorEx run: 10 topics in total, of which 7 have anchor words.
# This replaces the unanchored `model` fitted earlier.
n_topics = 10
model = ct.Corex(n_hidden=n_topics, words=words, seed=42)
model.fit(X, words=words, anchors=anchors, anchor_strength=3)  # anchor_strength weights the anchor words

# List the 10 leading words of every topic, numbering topics from 1
for topic_number, topic in enumerate(model.get_topics(n_words=10), start=1):
    leading_words = [entry[0] for entry in topic]
    print(f"Topic {topic_number}: {', '.join(leading_words)}")


Topic 1: energy, fat, weight, body, calorie, diet, need, carb, tired, intake
Topic 2: sleep, rest, mg, day, magnesium, glycinate, insomnia, melatonin, week, time
Topic 3: anxiety, stress, cause, help, calm, level, effect, blood, acid, issue
Topic 4: focus, people, health, study, adhd, say, think, nutrition, attention, risk
Topic 5: gut, probiotic, digestion, bacteria, microbiome, bloating, ferment, strain, prebiotic, digestive
Topic 6: skin, clear, thing, year, acne, lot, bad, ago, change, live
Topic 7: sick, cold, come, process, want, way, source, probably, small, especially
Topic 8: eat, fruit, food, protein, meat, healthy, vegetable, sugar, veggie, bean
Topic 9: supplement, vitamin, test, product, thorne, company, research, multivitamin, know, look
Topic 10: like, good, high, oil, brand, buy, quality, use, ingredient, fish


In [None]:
# Per-document topic probabilities. With details=True CorEx returns the
# probabilities as the first element; the default call returns thresholded
# True/False labels, on which argmax cannot pick the strongest topic (it returns
# the first True column, and 0 for documents that match no topic at all).
topic_matrix = model.transform(X, details=True)[0]

# Assign each comment to the topic with the highest probability
dominant_topics = topic_matrix.argmax(axis=1)

# X was built only from rows whose 'lemmas_comment' is not NaN, so the
# assignments are aligned on the index of those rows rather than assuming one
# row of X per row of df. Rows without a comment end up with NaN in the new column.
comment_index = df['lemmas_comment'].dropna().index
if len(comment_index) != len(dominant_topics):
    raise ValueError(
        f"{len(dominant_topics)} topic assignments for {len(comment_index)} "
        "non-null comments; X is out of sync with df"
    )

# Add this back to the original DataFrame
df['cor_ex_topic'] = pd.Series(dominant_topics, index=comment_index)


**Title-Based Topic Modeling**

In [None]:
# One row per post: keep the first row seen for each Post_ID, with its title lemmas.
# .copy() makes this an independent frame, so columns added to it in later cells
# are not treated as writes to a selection of df (SettingWithCopyWarning).
unique_titles_df = (
    df.drop_duplicates(subset='Post_ID')[['Post_ID', 'lemmas_title']]
    .copy()
)


In [None]:
# Turn each title's lemma list into plain text by stripping the list punctuation
# (square brackets, single quotes and commas).
# Missing titles are replaced with '' first: astype(str) would otherwise turn NaN
# into the literal token "nan", which would then enter the title vocabulary.
unique_titles_df['lemmas_title_str'] = (
    unique_titles_df['lemmas_title']
    .fillna('')
    .astype(str)
    .str.replace(r"[\[\]',]", '', regex=True)
)

# Documents for the title vectorizer, in the row order of unique_titles_df
texts_title = unique_titles_df['lemmas_title_str'].tolist()



In [None]:
# Vectorize the lemmatized post titles
from sklearn.feature_extraction.text import CountVectorizer

# Binary term-presence matrix over the titles: English stop words removed,
# vocabulary capped at 3000 terms.
vectorizer_title = CountVectorizer(
    binary=True,
    stop_words='english',
    max_features=3000,
)
X_title = vectorizer_title.fit_transform(texts_title)

# Title vocabulary in the vectorizer's feature (column) order
words_title = list(vectorizer_title.get_feature_names_out())


In [None]:
# CorEx topic model on the post titles (unanchored, 8 topics, fixed seed)
from corextopic import corextopic as ct

n_topics = 8
corex_title = ct.Corex(n_hidden=n_topics, words=words_title, seed=42)
corex_title.fit(X_title, words=words_title)

# List the 10 leading words of every title topic, numbering topics from 1
for topic_number, topic in enumerate(corex_title.get_topics(n_words=10), start=1):
    leading_words = [entry[0] for entry in topic]
    print(f"Topic {topic_number}: {', '.join(leading_words)}")


Topic 1: depression, try, anxiety, male, reaction, week, old, stress, caffeine, goodbye
Topic 2: pure, brand, encapsulation, extract, calcium, doctor, bottle, purchase, flavor, like
Topic 3: d3, iu, powder, k2, heart, blood, protein, creatine, brain, pressure
Topic 4: weight, gain, loss, naturemade, start, berberine, constipation, focus, lose, quality
Topic 5: eat, day, meal, food, calorie, fat, body, vegetable, process, water
Topic 6: supplement, multivitamin, help, vs, man, advice, gardenoflife, vitamin, need, centrum
Topic 7: magnesium, oil, glycinate, fish, cause, stop, pain, disease, stomach, citrate
Topic 8: look, probiotic, think, work, pill, source, right, ingredient, change, ritual


In [None]:
# Make explicit that this column holds the topics derived from the comments
df = df.rename(columns={'cor_ex_topic': 'cor_ex_topic_comments'})


**Print and Compare Title vs Comment Topics**

In [None]:
# Top 10 words per topic of the title model (`corex_title`), topics numbered from 0
print("Title Topics:")
for i, topic in enumerate(corex_title.get_topics(n_words=10)):
    # Deliberately not named `words`: that global is the comment vocabulary passed
    # to CorEx, and overwriting it would break any re-run of the comment model cells.
    topic_words = [t[0] for t in topic]
    print(f"Topic {i}: {', '.join(topic_words)}")

# Same listing for the comment model (`model`)
print("\nComment Topics:")
for i, topic in enumerate(model.get_topics(n_words=10)):
    topic_words = [t[0] for t in topic]
    print(f"Topic {i}: {', '.join(topic_words)}")


Title Topics:
Topic 0: depression, try, anxiety, male, reaction, week, old, stress, caffeine, goodbye
Topic 1: pure, brand, encapsulation, extract, calcium, doctor, bottle, purchase, flavor, like
Topic 2: d3, iu, powder, k2, heart, blood, protein, creatine, brain, pressure
Topic 3: weight, gain, loss, naturemade, start, berberine, constipation, focus, lose, quality
Topic 4: eat, day, meal, food, calorie, fat, body, vegetable, process, water
Topic 5: supplement, multivitamin, help, vs, man, advice, gardenoflife, vitamin, need, centrum
Topic 6: magnesium, oil, glycinate, fish, cause, stop, pain, disease, stomach, citrate
Topic 7: look, probiotic, think, work, pill, source, right, ingredient, change, ritual

Comment Topics:
Topic 0: energy, fat, weight, body, calorie, diet, need, carb, tired, intake
Topic 1: sleep, rest, mg, day, magnesium, glycinate, insomnia, melatonin, week, time
Topic 2: anxiety, stress, cause, help, calm, level, effect, blood, acid, issue
Topic 3: focus, people, heal

In [None]:
# Human-readable topic names, keyed by zero-based topic index.

# Names for the 8 title topics
title_topic_labels = dict(enumerate([
    "Mental Health / Mood",
    "Brand / Product Quality",
    "Vitamin D3 / Brain / Creatine",
    "Weight / Gut / Metabolism",
    "Diet / Food Habits",
    "General Supplement Advice",
    "Magnesium / Pain",
    "Probiotics / Effectiveness",
]))

# Names for the 10 comment topics
comment_topic_labels = dict(enumerate([
    "Fatigue / Weight / Nutrition",
    "Sleep / Magnesium",
    "Stress / Anxiety",
    "Focus / ADHD",
    "Gut Health",
    "Skin / Acne",
    "Immunity / Cold",
    "Food / Diet",
    "Brands / Product Research",
    "Chatter / General Advice",
]))


In [None]:
#### Title topic columns
# 'cor_ex_topic' on df holds the *comment* topics, so renaming it cannot produce a
# title-topic column. The title topics are computed here from the title model instead.

# Score every unique title against the title model and keep the most probable
# topic (details=True returns the probabilities as the first element, rather
# than the default thresholded True/False labels).
title_topic_probs = corex_title.transform(X_title, details=True)[0]

# Rows of X_title follow the row order of unique_titles_df (texts_title was built
# from it), so the assignments can be attached positionally. The label column
# is the one the supplement-per-topic cells group by.
unique_titles_df = unique_titles_df.assign(
    cor_ex_topic_title=title_topic_probs.argmax(axis=1),
    title_topic_label=lambda frame: frame['cor_ex_topic_title'].map(title_topic_labels),
)

# Propagate each post's title topic to every row of df belonging to that post
title_topic_by_post = unique_titles_df.set_index('Post_ID')['cor_ex_topic_title']
df['cor_ex_topic_title'] = df['Post_ID'].map(title_topic_by_post)


In [None]:
# Supplement brand names (single lowercase tokens) to look for in post titles
supplement_whitelist = set([
    "thorne",
    "oneaday",
    "megafood",
    "pureencapsulations",
    "kleanathlete",
    "ritual",
    "gardenoflife",
    "mykind",
    "smartypants",
    "olly",
    "centrum",
    "naturemade",
    "renzo",
    "viridian",
    "purolabs",
    "puro",
    "vitafusion",
    "weightworld",
])

In [None]:
from collections import Counter

# NOTE(review): this cell needs the 'title_topic_label' and 'supplements_in_title'
# columns to exist on unique_titles_df already; 'supplements_in_title' is assigned
# further down the notebook, so check the cell order before a Restart & Run All.


def extract_top_supplement(counter):
    """Return the most frequent (supplement, count) pair of a Counter.

    Returns (None, 0) when `counter` is not a Counter or is empty.
    """
    if isinstance(counter, Counter) and len(counter) > 0:
        return counter.most_common(1)[0]
    return (None, 0)


# Step 1: count supplement mentions within each title topic.
# The Counters are built in plain Python, one per topic. Returning a Counter from
# groupby(...).apply(...) makes pandas expand the dict-like result into one row per
# (topic, supplement), so no Counter ever reached extract_top_supplement and every
# row came out as (None, 0).
topic_counters = {
    topic_label: Counter(
        item
        for sublist in supplement_lists
        if isinstance(sublist, list)  # skip cells that are not lists (e.g. NaN)
        for item in sublist
    )
    for topic_label, supplement_lists
    in unique_titles_df.groupby('title_topic_label')['supplements_in_title']
}

# Long format, one row per (topic, supplement) pair that was actually observed.
# Column names match what the expanded apply() result used to have, because the
# later "top supplement per topic" cell sorts and groups on them.
supplements_by_topic = pd.DataFrame(
    [
        (topic_label, supplement, count)
        for topic_label, counter in topic_counters.items()
        for supplement, count in counter.items()
    ],
    columns=['title_topic_label', 'level_1', 'supplement_counts'],
)

# Step 2: most frequent supplement (and its count) for every topic
top_by_topic = {
    topic_label: extract_top_supplement(counter)
    for topic_label, counter in topic_counters.items()
}
top_name_by_topic = {label: pair[0] for label, pair in top_by_topic.items()}
top_count_by_topic = {label: pair[1] for label, pair in top_by_topic.items()}

# Each long-format row also carries the top supplement of its topic
supplements_by_topic['top_supplement'] = supplements_by_topic['title_topic_label'].map(top_name_by_topic)
supplements_by_topic['mention_count'] = supplements_by_topic['title_topic_label'].map(top_count_by_topic)

# Final result: one row per topic (topics without any mention show None / 0)
top_supplement_per_topic = pd.DataFrame(
    [(topic_label, name, count) for topic_label, (name, count) in top_by_topic.items()],
    columns=['title_topic_label', 'top_supplement', 'mention_count'],
)
top_supplement_per_topic



Unnamed: 0,title_topic_label,top_supplement,mention_count
0,Brand / Product Quality,,0.0
1,Brand / Product Quality,,0.0
2,Brand / Product Quality,,0.0
3,Brand / Product Quality,,0.0
4,Brand / Product Quality,,0.0
...,...,...,...
99,Weight / Gut / Metabolism,,0.0
100,Weight / Gut / Metabolism,,0.0
101,Weight / Gut / Metabolism,,0.0
102,Weight / Gut / Metabolism,,0.0


In [None]:
# Normalise the cleaned title text to lowercase (whitelist matching is lowercase)
unique_titles_df = unique_titles_df.assign(
    lemmas_title_str=unique_titles_df['lemmas_title_str'].str.lower()
)


In [None]:
# Inspect the per-topic supplement counts; leaving the frame as the last expression
# uses the notebook's rich table display instead of print()'s wrapped plain text.
supplements_by_topic

             title_topic_label             level_1  supplement_counts  \
0      Brand / Product Quality              thorne                9.0   
1      Brand / Product Quality            megafood                1.0   
2      Brand / Product Quality  pureencapsulations                3.0   
3      Brand / Product Quality        gardenoflife                8.0   
4      Brand / Product Quality             centrum                9.0   
..                         ...                 ...                ...   
99   Weight / Gut / Metabolism         weightworld                NaN   
100  Weight / Gut / Metabolism          naturemade                8.0   
101  Weight / Gut / Metabolism          vitafusion                NaN   
102  Weight / Gut / Metabolism         smartypants                1.0   
103  Weight / Gut / Metabolism              mykind                NaN   

     top_supplement  mention_count  
0               NaN            0.0  
1               NaN            0.0  
2           

In [None]:
# Most frequently mentioned supplement per title topic.
# Rows are ranked by mention count (highest first); within each topic the first
# row of that ranking is kept.
ranked_counts = supplements_by_topic.sort_values('supplement_counts', ascending=False)
top_rows = ranked_counts.groupby('title_topic_label').first()
top_supplement_per_topic = top_rows.reset_index()

# Preview the result
top_supplement_per_topic


Unnamed: 0,title_topic_label,level_1,supplement_counts
0,Brand / Product Quality,thorne,9.0
1,Diet / Food Habits,centrum,2.0
2,General Supplement Advice,thorne,5.0
3,Magnesium / Pain,olly,5.0
4,Mental Health / Mood,gardenoflife,32.0
5,Probiotics / Effectiveness,ritual,6.0
6,Vitamin D3 / Brain / Creatine,olly,3.0
7,Weight / Gut / Metabolism,thorne,13.0


In [None]:
# Normalise both sides of the match to lowercase.

# Whitelist entries
supplement_whitelist = set(map(str.lower, supplement_whitelist))

# Cleaned title strings
title_strings = unique_titles_df['lemmas_title_str']
unique_titles_df['lemmas_title_str'] = title_strings.str.lower()


In [None]:
def find_supplements(text, whitelist=None):
    """Return the whitelisted supplement names that occur in `text` as whole words.

    Matching is done on whitespace-separated tokens rather than on substrings, so
    "olly" is not found inside "holly", "ritual" is not found inside "spiritual",
    and "puro" is not counted a second time for every "purolabs".

    Parameters
    ----------
    text : str
        Space-separated title text. Any non-string value (e.g. NaN) yields [].
    whitelist : iterable of str, optional
        Names to look for; defaults to the notebook-level `supplement_whitelist`.

    Returns
    -------
    list of str
        Matching names, lowercased and sorted so the result is deterministic.
    """
    if whitelist is None:
        whitelist = supplement_whitelist
    if not isinstance(text, str):
        return []
    tokens = set(text.lower().split())
    return sorted(tokens & {name.lower() for name in whitelist})

# For every title, store the list of whitelisted supplements found in it
title_strings = unique_titles_df['lemmas_title_str']
unique_titles_df['supplements_in_title'] = title_strings.map(find_supplements)


In [None]:
# Count supplement mentions per title topic.
# The Counters are built in plain Python, one per topic: returning a Counter from
# groupby(...).apply(...) makes pandas expand the dict-like result into one row per
# (topic, supplement), so the "most common" step never received a Counter.
# The isinstance filter sits before the inner loop so that cells which are not
# lists (e.g. NaN) are skipped instead of being iterated.
topic_counters = {
    topic_label: Counter(
        item
        for sublist in supplement_lists
        if isinstance(sublist, list)
        for item in sublist
    )
    for topic_label, supplement_lists
    in unique_titles_df.groupby('title_topic_label')['supplements_in_title']
}

# Long format: one row per (topic, supplement) pair that was actually observed
supplements_by_topic = pd.DataFrame(
    [
        (topic_label, supplement, count)
        for topic_label, counter in topic_counters.items()
        for supplement, count in counter.items()
    ],
    columns=['title_topic_label', 'level_1', 'supplement_counts'],
)

# Extract most common supplement per topic; (None, 0) for topics without mentions
top_by_topic = {
    topic_label: (counter.most_common(1)[0] if counter else (None, 0))
    for topic_label, counter in topic_counters.items()
}
top_name_by_topic = {label: pair[0] for label, pair in top_by_topic.items()}
top_count_by_topic = {label: pair[1] for label, pair in top_by_topic.items()}

# Each row also carries the top supplement of its topic and that supplement's count
supplements_by_topic['top_supplement'] = supplements_by_topic['title_topic_label'].map(top_name_by_topic)
supplements_by_topic['mention_count'] = supplements_by_topic['title_topic_label'].map(top_count_by_topic)


In [None]:
# Spot-check the extraction on 5 random titles; random_state is fixed (same seed
# as the CorEx models) so the displayed rows are the same on every run.
unique_titles_df[['lemmas_title_str', 'supplements_in_title']].sample(5, random_state=42)


Unnamed: 0,lemmas_title_str,supplements_in_title
30733,2 magnesium glycinate dose,[]
20851,lose weight eat calorie day fast multiple smal...,[]
4648,boron supplementation,[]
12414,zinc gummie taste awful,[]
2626,think thorne blood test,[thorne]
