# Topic Detection

## Description
Perform topic detection.

## Setup

### Libraries (Pyhton)

In [1]:
from bertopic import BERTopic
import community as community_louvain
import igraph as ig
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from pathlib import Path
import sys

### Directories

In [2]:
try:
    BASE_DIR = Path(__file__).resolve().parent
except NameError:
    BASE_DIR = Path().resolve()

SRC_DIR = BASE_DIR / 'src'
if str(SRC_DIR) not in sys.path:
    sys.path.append(str(SRC_DIR))

DATA_DIR = BASE_DIR / 'data'
DATA_DIR.mkdir(exist_ok=True)
POSTS_DIR = DATA_DIR / 'posts'
POSTS_ALL_DIR = POSTS_DIR / 'all'
POSTS_FILTERED_DIR = POSTS_DIR / 'filtered'
POSTS_FILTERED_CLEAN_DIR = POSTS_FILTERED_DIR / 'clean'
COMMENTS_DIR = DATA_DIR / 'comments'
COMMENTS_CLEAN_DIR = COMMENTS_DIR / 'clean'
RESULTS_DIR = BASE_DIR / 'results'
RESULTS_GRAPHS_DIR = RESULTS_DIR / 'graphs'

### Libraries (Custom)

## Import Data

In [None]:
# Graph
filename = RESULTS_GRAPHS_DIR / 'g_dd.graphml'
g_dd_nx = nx.read_graphml(str(filename))
g_dd = ig.Graph.from_networkx(g_dd_nx)

print(g_dd.summary())

IGRAPH UNW- 15819 924974 -- 
+ attr: edge_default (g), node_default (g), _nx_name (v), name (v), weight (e)


In [None]:
# Original documents
filename_df = DATA_DIR / 'docs_dd_giant.json'
df_dd = pd.read_json(str(filename_df), lines=True)

In [None]:
# Check basic stats
print("--- Document-Projected Network (Giant Component) ---")
print(f"Size: {g_dd.vcount()}")
print(f"Average degree: {np.mean(g_dd.degree()):.2f}")
print(f"Median degree: {np.median(g_dd.degree()):.2f}")
print(f"Density: {g_dd.density():.5f}")
print(f"Diameter: {g_dd.diameter(directed=False, unconn=False):.2f}")
print(f"Average path length: {g_dd.average_path_length():.5f}")

--- Document-Projected Network (Giant Component) ---
Size: 15819
Average degree: 116.94
Median degree: 50.00
Density: 0.00739
Diameter: 6.00
Average path length: 2.51146


### Retrieve Original Text [NO NEED TO RUN]
Retrieve the original non-processed text from the documents in the network (giant component of Pdd).

In [None]:
# --- Prep posts ---
filename_posts = POSTS_FILTERED_CLEAN_DIR / 'all_posts_clean.json'
df_posts = pd.read_json(str(filename_posts))

# Concantenate original test into a single column
df_posts["og_text"] = df_posts["title"].fillna('') + " " + df_posts["selftext"].fillna('')

# Rename document ID column
df_posts.rename(columns={"id": "document_id"}, inplace=True)

In [None]:
# --- Prep comments ---
filename_comments = COMMENTS_CLEAN_DIR / 'all_comments_clean.json'
df_comments = pd.read_json(str(filename_comments))

# Rename comment_body column and comment_id column for compatibility with posts
df_comments.rename(columns={"comment_body": "og_text"}, inplace=True)
df_comments.rename(columns={"comment_id": "document_id"}, inplace=True)

In [None]:
# Extract common columns and concatenate dfs
common_cols = df_posts.columns.intersection(df_comments.columns)
print(f"Common columns: {common_cols}")

df_merged = pd.concat([df_posts[common_cols], df_comments[common_cols]], ignore_index=True)

Common columns: Index(['subreddit', 'document_id', 'filtered_pos', 'og_text'], dtype='object')


In [None]:
# Extract document IDs and filter by them
giant_dd_idx = g_dd.vs['name']
docs_in_giant_dd = df_merged.loc[giant_dd_idx, "document_id"]
filtered_df = df_merged[df_merged["document_id"].isin(docs_in_giant_dd)]

In [None]:
# Save to JSON
# df_merged.to_json(DATA_DIR / 'docs_dd.json', orient='records', lines=True)
# filtered_df.to_json(DATA_DIR / 'docs_dd_giant.json', orient='records', lines=True)

## Louvain
Run Louvain on the Pdd giant component.

## BERTopic
Run BERTopic on the Pdd giant component.

In [8]:
documents = list(df_dd['og_text'])

In [11]:
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(documents)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [12]:
print(topic_model.get_topic_info())

     Topic  Count                                               Name  \
0       -1   7207                                   -1_to_the_and_of   
1        0    324                           0_state_states_blue_safe   
2        1    269                        1_democrats_vote_dems_party   
3        2    266                                2_she_her_shes_mace   
4        3    225                              3_her_friend_she_shes   
..     ...    ...                                                ...   
198    197     10  197_transphobia_transphobic_experience_underla...   
199    198     10          198_poorly_shocking_shocked_disappointing   
200    199     10               199_diy_chicks_easierless_consulting   
201    200     10                    200_representative_il_list_reps   
202    201     10             201_black_violent_addressing_community   

                                        Representation  \
0    [to, the, and, of, that, trans, they, you, peo...   
1    [state, states

In [34]:
topic_model.reduce_topics(documents, nr_topics=20)

<bertopic._bertopic.BERTopic at 0x2010a46a750>

In [35]:
print(topic_model.get_topic_info())

    Topic  Count                                        Name  \
0      -1   7207                            -1_to_the_and_of   
1       0   3408                             0_to_and_the_of   
2       1   1540                             1_the_to_of_and   
3       2    753                              2_my_and_it_to   
4       3    666                            3_it_is_this_the   
5       4    596                           4_you_your_to_and   
6       5    468                          5_in_state_the_and   
7       6    347                            6_name_it_the_to   
8       7    334                     7_the_christians_of_and   
9       8    175           8_ty_congrats_you_congratulations   
10      9     82                          9_hr_hrt_to_lawyer   
11     10     64            10_money_costco_corporations_the   
12     11     55                      11_cold_winter_warm_my   
13     12     31                      12_whats_your_jedi_you   
14     13     27  13_kilometerock_kilome

In [13]:
import re

def preprocess(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'\@\w+|\#','', text)  # Remove mentions/hashtags
    text = re.sub(r'[^A-Za-z0-9\s]+', '', text)  # Remove special chars
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text

In [25]:
import re
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def preprocess(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'\@\w+|\#', '', text)  # Remove mentions/hashtags
    text = re.sub(r'[^a-z0-9\s]+', '', text)  # Remove special chars
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace

    # Remove stopwords
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    text = ' '.join(tokens)
    
    return text

In [26]:
preprocessed_docs = [preprocess(doc) for doc in documents]

In [39]:
topic_model_preprocessed = BERTopic()
topics_pre, probs_pre = topic_model_preprocessed.fit_transform(preprocessed_docs)

In [40]:
print(topic_model_preprocessed.get_topic_info())

     Topic  Count                                   Name  \
0       -1   8207                -1_trans_people_like_im   
1        0    426       0_christians_christian_jesus_god   
2        1    303               1_state_blue_states_safe   
3        2    292       2_puberty_blockers_kids_children   
4        3    236          3_japanese_name_white_culture   
..     ...    ...                                    ...   
188    187     10  187_school_trouble_backing_guidelines   
189    188     10      188_attracted_preference_grasp_bi   
190    189     10          189_fear_mongering_uh_cavemen   
191    190     10         190_democrats_bill_mustpass_81   
192    191     10                191_ban_bans_mainly_218   

                                        Representation  \
0    [trans, people, like, im, dont, know, us, cis,...   
1    [christians, christian, jesus, god, bible, rel...   
2    [state, blue, states, safe, red, california, c...   
3    [puberty, blockers, kids, children, child,

In [41]:
topic_model_preprocessed.reduce_topics(preprocessed_docs, nr_topics=100)

<bertopic._bertopic.BERTopic at 0x20101da6490>

In [42]:
print(topic_model_preprocessed.get_topic_info())

    Topic  Count                               Name  \
0      -1   8207            -1_trans_people_like_im   
1       0   1009             0_trans_people_cis_men   
2       1    426   1_christians_christian_god_jesus   
3       2    366  2_democrats_vote_republicans_dems   
4       3    315           3_state_states_blue_safe   
..    ...    ...                                ...   
95     94     12        94_tv_glowing_blurry_behind   
96     95     12    95_mold_rooting_mushrooms_nasty   
97     96     12               96_foot_six_dig_hole   
98     97     10            97_fear_mongering_uh_us   
99     98     10             98_ban_bans_mainly_218   

                                       Representation  \
0   [trans, people, like, im, dont, know, us, woul...   
1   [trans, people, cis, men, woman, straight, wom...   
2   [christians, christian, god, jesus, bible, rel...   
3   [democrats, vote, republicans, dems, party, bi...   
4   [state, states, blue, safe, red, california, c... 

In [43]:
topic_model_preprocessed.visualize_topics()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

## Clear Allocated Memory

In [None]:
# Run before exiting the program to clear memory
%reset -f
import gc
gc.collect()

0