# Topic Detection

## Description
Perform topic detection.

## Setup

### Libraries (Pyhton)

In [1]:
from bertopic import BERTopic
from bertopic.representation import OpenAI
from huggingface_hub import InferenceClient
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from getpass import getpass
import community as community_louvain
from dotenv import load_dotenv
import hdbscan
import igraph as ig
import json
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import openai
import os
import pandas as pd
from pathlib import Path
import sys
from tqdm import tqdm

### Directories

In [2]:
try:
    BASE_DIR = Path(__file__).resolve().parent
except NameError:
    BASE_DIR = Path().resolve()

SRC_DIR = BASE_DIR / 'src'
if str(SRC_DIR) not in sys.path:
    sys.path.append(str(SRC_DIR))

DATA_DIR = BASE_DIR / 'data'
DATA_DIR.mkdir(exist_ok=True)
POSTS_DIR = DATA_DIR / 'posts'
POSTS_ALL_DIR = POSTS_DIR / 'all'
POSTS_FILTERED_DIR = POSTS_DIR / 'filtered'
POSTS_FILTERED_CLEAN_DIR = POSTS_FILTERED_DIR / 'clean'
COMMENTS_DIR = DATA_DIR / 'comments'
COMMENTS_CLEAN_DIR = COMMENTS_DIR / 'clean'
RESULTS_DIR = BASE_DIR / 'results'
RESULTS_GRAPHS_DIR = RESULTS_DIR / 'graphs'
RESULTS_MODELS_DIR = RESULTS_DIR / 'models'
RESULTS_MODELS_DIR.mkdir(exist_ok=True)

### Libraries (Custom)

In [3]:
import preprocess as prep

## Import Data

In [4]:
# Graph
filename = RESULTS_GRAPHS_DIR / 'g_dd.graphml'
g_dd_nx = nx.read_graphml(str(filename))
g_dd = ig.Graph.from_networkx(g_dd_nx)

print(g_dd.summary())

IGRAPH UNW- 15819 924974 -- 
+ attr: edge_default (g), node_default (g), _nx_name (v), name (v), weight (e)


In [5]:
# Original documents
filename_df = DATA_DIR / 'docs_dd_giant.json'
df_dd = pd.read_json(str(filename_df), lines=True)

In [None]:
# Check basic stats
print("--- Document-Projected Network (Giant Component) ---")
print(f"Size: {g_dd.vcount()}")
print(f"Average degree: {np.mean(g_dd.degree()):.2f}")
print(f"Median degree: {np.median(g_dd.degree()):.2f}")
print(f"Density: {g_dd.density():.5f}")
print(f"Diameter: {g_dd.diameter(directed=False, unconn=False):.2f}")
print(f"Average path length: {g_dd.average_path_length():.5f}")

--- Document-Projected Network (Giant Component) ---
Size: 15819
Average degree: 116.94
Median degree: 50.00
Density: 0.00739
Diameter: 6.00
Average path length: 2.51146


### Retrieve Original Text [NO NEED TO RUN]
Retrieve the original non-processed text from the documents in the network (giant component of Pdd).

In [None]:
# --- Prep posts ---
filename_posts = POSTS_FILTERED_CLEAN_DIR / 'all_posts_clean.json'
df_posts = pd.read_json(str(filename_posts))

# Concantenate original test into a single column
df_posts["og_text"] = df_posts["title"].fillna('') + " " + df_posts["selftext"].fillna('')

# Rename document ID column
df_posts.rename(columns={"id": "document_id"}, inplace=True)

In [None]:
# --- Prep comments ---
filename_comments = COMMENTS_CLEAN_DIR / 'all_comments_clean.json'
df_comments = pd.read_json(str(filename_comments))

# Rename comment_body column and comment_id column for compatibility with posts
df_comments.rename(columns={"comment_body": "og_text"}, inplace=True)
df_comments.rename(columns={"comment_id": "document_id"}, inplace=True)

In [None]:
# Extract common columns and concatenate dfs
common_cols = df_posts.columns.intersection(df_comments.columns)
print(f"Common columns: {common_cols}")

df_merged = pd.concat([df_posts[common_cols], df_comments[common_cols]], ignore_index=True)

Common columns: Index(['subreddit', 'document_id', 'filtered_pos', 'og_text'], dtype='object')


In [None]:
# Extract document IDs and filter by them
giant_dd_idx = g_dd.vs['name']
docs_in_giant_dd = df_merged.loc[giant_dd_idx, "document_id"]
filtered_df = df_merged[df_merged["document_id"].isin(docs_in_giant_dd)]

In [None]:
# Save to JSON
# df_merged.to_json(DATA_DIR / 'docs_dd.json', orient='records', lines=True)
# filtered_df.to_json(DATA_DIR / 'docs_dd_giant.json', orient='records', lines=True)

## Louvain
Run Louvain on the Pdd giant component.

## BERTopic
Run BERTopic on the Pdd giant component.

In [6]:
# Prepare documents
tqdm.pandas()
df_dd['clean_text'] = df_dd['og_text'].progress_apply(prep.bert_preprocess)
documents = list(df_dd['clean_text'])

100%|██████████| 15819/15819 [00:15<00:00, 1021.75it/s]


In [None]:
# Model parameters
# sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
# bert_model = BERTopic(embedding_model=sentence_model, min_topic_size=50, nr_topics='auto')

In [None]:
# Model fit
# topics, probabilities = bert_model.fit_transform(documents)
# topics = bert_model.reduce_outliers(documents, topics)

In [None]:
# Save model
# filepath_bert = RESULTS_MODELS_DIR / 'bert'
# embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
# bert_model.save(str(filepath_bert), serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model)


In [7]:
# Load model
filepath_bert = RESULTS_MODELS_DIR / 'bert'
bert_model = BERTopic.load(str(filepath_bert))

# Load topics
with open(filepath_bert / 'topics.json') as f:
    data_bert = json.load(f)

topics = data_bert["topics"]
df_dd["bertopic_topic"] = topics

In [93]:
# Topic info
print(bert_model.get_topic_info())

    Topic  Count                                   Name  \
0      -1   8835             -1_trans_people_like_would   
1       0    723   0_thank_thanks_happy_congratulations   
2       1    697                  1_cis_men_trans_women   
3       2    625     2_estrogen_hrt_growth_progesterone   
4       3    620     3_democrats_vote_republicans_party   
5       4    398       4_christians_christian_god_jesus   
6       5    334               5_state_states_blue_safe   
7       6    318                6_wear_dress_cold_pants   
8       7    308            7_life_transition_feel_time   
9       8    282                8_parents_mom_dad_child   
10      9    226        9_people_trans_hate_transphobic   
11     10    225         10_japanese_name_white_culture   
12     11    225      11_puberty_blockers_kids_children   
13     12    167       12_like_friend_relationship_talk   
14     13    144           13_neutral_guys_bro_gendered   
15     14    136  14_disney_characters_character_better 

In [8]:
# Get representative documents for each topic
doc_vectors = bert_model.vectorizer_model.transform(df_dd['clean_text'])
topic_vectors = bert_model.c_tf_idf_
similarity_matrix = cosine_similarity(doc_vectors, topic_vectors)

n_representatives = 5
representative_docs_idx = {}

topics = df_dd['bertopic_topic'].values
df_dd['repr_doc_rank'] = np.nan

for topic in np.unique(topics):
    idxs = np.where(topics == topic)[0]
    topic_similarities = similarity_matrix[idxs, topic]
    sorted_idx = idxs[np.argsort(topic_similarities)[::-1]]
    for rank, doc_idx in enumerate(sorted_idx, start=1):
        df_dd.at[doc_idx, 'repr_doc_rank'] = rank
    top_n = sorted_idx[:n_representatives]
    representative_docs_idx[topic] = top_n

In [15]:
bert_model.visualize_barchart(top_n_topics=32, title='Topic Word Scores', width=250, height=250, autoscale=True)

In [None]:
# Fine-tune representations
top_docs_per_topic = {}

for topic in df_dd['bertopic_topic'].unique():
    if topic == -1:
        continue  # skip outliers
    top_docs = df_dd[df_dd['bertopic_topic'] == topic].sort_values('repr_doc_rank').head(5)['og_text']
    top_docs_per_topic[topic] = top_docs.tolist()

# Store the new labels
topic_labels = {
    '0': 'Transgender Identity and Affirmation',
    '1': '',
    '2': '',
    '3': '',
    '4': '',
    '5': '',
    '6': '',
    '7': '',
    '8': '',
    '9': '',
    '10': '',
    '11': '',
    '12': '',
    '13': '',
    '14': '',
    '15': '',
    '16': '',
    '17': '',
    '18': '',
    '19': '',
    '20': '',
    '21': '',
    '22': '',
    '23': '',
    '24': '',
    '25': '',
    '26': '',
    '27': '',
    '28': '',
    '29': '',
    '30': '',
    '31': '',
    '32': '',
}


In [23]:
top_docs_per_topic[0]

['Honestly this well said and really hits hard, although, I’m not sure how many people would read through this all the way. If it’s an essay and how it feels to be trans and the challenges we face, excellently done! If it’s a coming out post, direct and concise would be best. This would be too long, I’m afraid, so I would recommend making it a paragraph or two. Maybe come out first and then say what you need to say? Then that would encourage more people to read through the whole thing, cuz it gives them the hook first, and then they’re like, “wait, what? Why?” And so they keep reading. Just my recommendation is all, lovely piece, it was an intriguing and educational read, thank you',
 'Congratulations! 😁 I\'ve still never had this happen to me yet, but I am WAITING for it! I think of it as, like, a rite of passage. I imagine it happening, and, in my head fantasy, I just get so giddy, and happy, and excited, and I\'m just totally open about it, and I\'m like, "NIIII!! It finally HAPPENE

## Clear Allocated Memory

In [24]:
# Run before exiting the program to clear memory
%reset -f
import gc
gc.collect()

0