# Topic Detection

## Description
Perform topic detection on the document-projected network by running community detection (Louvain) on its giant component.

## Setup

### Libraries (Python)

In [1]:
import community as community_louvain
import igraph as ig
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from pathlib import Path
import sys

### Directories

In [8]:
# Locate the project root. `__file__` only exists when this code runs as a
# script; in an interactive session the lookup raises NameError and the
# current working directory is used instead.
try:
    BASE_DIR = Path(__file__).resolve().parent
except NameError:
    BASE_DIR = Path().resolve()

# Make the project's own modules importable (the entry is registered only once).
SRC_DIR = BASE_DIR.joinpath('src')
_src_entry = str(SRC_DIR)
if _src_entry not in sys.path:
    sys.path.append(_src_entry)

# Input data tree. Only the top-level data folder is created here.
DATA_DIR = BASE_DIR.joinpath('data')
DATA_DIR.mkdir(exist_ok=True)
POSTS_DIR = DATA_DIR.joinpath('posts')
POSTS_ALL_DIR = POSTS_DIR.joinpath('all')
POSTS_FILTERED_DIR = POSTS_DIR.joinpath('filtered')
POSTS_FILTERED_CLEAN_DIR = POSTS_FILTERED_DIR.joinpath('clean')
COMMENTS_DIR = DATA_DIR.joinpath('comments')
COMMENTS_CLEAN_DIR = COMMENTS_DIR.joinpath('clean')

# Output tree.
RESULTS_DIR = BASE_DIR.joinpath('results')
RESULTS_GRAPHS_DIR = RESULTS_DIR.joinpath('graphs')

### Libraries (Custom)

## Import Data

In [10]:
# Read the document table; lines=True makes pandas parse the file as one JSON object per line.
filename_df = DATA_DIR.joinpath('data.json')
df = pd.read_json(path_or_buf=str(filename_df), lines=True)

In [12]:
# Preview the first five rows to check the columns that were loaded.
df.head(n=5)

Unnamed: 0,document_id,root_post_id,document_type,subreddit,filtered_pos
0,1h7okhk,1h7okhk,post,asktransgender,"[this PRON, normal ADJ, this PRON, i PRON, nor..."
1,1ht5ang,1ht5ang,post,asktransgender,"[i PRON, just ADV, find VERB, i PRON, girl NOU..."
2,1haqyqt,1haqyqt,post,asktransgender,"[trans PROPN, girl NOUN, who PRON, come VERB, ..."
3,1hong9g,1hong9g,post,asktransgender,"[my PRON, partner NOUN, get VERB, want VERB, g..."
4,1hhl0q7,1hhl0q7,post,asktransgender,"[so ADV, i PRON, find VERB, my PRON, trans PRO..."


In [4]:
# Read the saved GraphML file with networkx, then convert it to igraph;
# both representations stay available under their own names.
filename = RESULTS_GRAPHS_DIR.joinpath('g_dd.graphml')
g_dd_nx = nx.read_graphml(path=str(filename))
g_dd = ig.Graph.from_networkx(g_dd_nx)

# One-line overview of the converted graph (type flags, vertex/edge counts, attributes).
print(g_dd.summary())

IGRAPH UNW- 15819 924974 -- 
+ attr: edge_default (g), node_default (g), _nx_name (v), name (v), weight (e)


In [5]:
# Basic descriptive statistics of the loaded graph.
print("--- Document-Projected Network (Giant Component) ---")

n_vertices = g_dd.vcount()
print(f"Size: {n_vertices}")

# Fetch the degree sequence once and reuse it for both summaries.
node_degrees = g_dd.degree()
print(f"Average degree: {np.mean(node_degrees):.2f}")
print(f"Median degree: {np.median(node_degrees):.2f}")

edge_density = g_dd.density()
print(f"Density: {edge_density:.5f}")

# No weights argument is passed to either call below.
# NOTE(review): so both presumably count hops and ignore the 'weight' edge attribute — confirm this is intended.
graph_diameter = g_dd.diameter(directed=False, unconn=False)
print(f"Diameter: {graph_diameter:.2f}")

mean_path_length = g_dd.average_path_length()
print(f"Average path length: {mean_path_length:.5f}")

--- Document-Projected Network (Giant Component) ---
Size: 15819
Average degree: 116.94
Median degree: 50.00
Density: 0.00739
Diameter: 6.00
Average path length: 2.51146


### Retrieve Original Text
Retrieve the original non-processed text from the documents in the network (giant component of Pdd).

In [None]:
# Get node names in a list
# Get df indices in a list
# Select Reddit id (column 'document_id') from documents in df whose index is in the node names list
# Load "all posts clean" and "all comments clean"
# In "all posts clean", create a "og_text" column containing the text in "title" and "selftext", concatenated.
# In "all comments clean", create a "og_text" column containing the text in "comment_body"
# Merge the two dfs so that posts and comments are in the same dataframe: be careful, do not discard "document_id"
# Add an "og_text" column to df, based on "document_id"


## Louvain
Run Louvain on the largest connected component.

In [None]:
# Run Louvain
# NOTE(review): python-louvain's best_partition takes a networkx graph, whereas
# g_dd is the igraph conversion, so the networkx original (g_dd_nx) is passed below.
# Consider also fixing random_state so the partition is reproducible — TODO confirm.
# partition_lcc = community_louvain.best_partition(g_dd_nx, weight='weight')
# print(f"Louvain (LCC) found {len(set(partition_lcc.values()))} communities.")

## Clear Allocated Memory

In [14]:
# Run before exiting the program to clear memory.
# `%reset -f` clears the interactive namespace without asking for confirmation,
# so `gc` has to be imported afterwards, inside this same cell.
%reset -f
import gc
# Force a garbage-collection pass; the value returned by the call is what the cell displays.
gc.collect()

0