# Topic Detection

## Description
Perform topic detection.

## Setup

### Libraries (Python)

In [2]:
import community as community_louvain
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from pathlib import Path
import sys

### Directories

In [3]:
# Locate the project root: the folder holding this file when it is run as a
# script, or the current working directory when __file__ is undefined
# (for example inside an interactive notebook session).
try:
    BASE_DIR = Path(__file__).resolve().parent
except NameError:
    BASE_DIR = Path().resolve()

# Make the local source folder importable, without registering it twice.
SRC_DIR = BASE_DIR.joinpath('src')
if sys.path.count(str(SRC_DIR)) == 0:
    sys.path.append(str(SRC_DIR))

# Data tree; only the top-level data folder is created here.
DATA_DIR = BASE_DIR.joinpath('data')
DATA_DIR.mkdir(exist_ok=True)
POSTS_DIR = DATA_DIR.joinpath('posts')
POSTS_ALL_DIR = POSTS_DIR.joinpath('all')
POSTS_FILTERED_DIR = POSTS_DIR.joinpath('filtered')
POSTS_FILTERED_CLEAN_DIR = POSTS_FILTERED_DIR.joinpath('clean')
COMMENTS_DIR = DATA_DIR.joinpath('comments')
COMMENTS_CLEAN_DIR = COMMENTS_DIR.joinpath('clean')

# Results tree.
RESULTS_DIR = BASE_DIR.joinpath('results')
RESULTS_GRAPHS_DIR = RESULTS_DIR.joinpath('graphs')

### Libraries (Custom)

## Import Data

In [5]:
# Read the graph stored in GEXF format at <results>/graphs/g_proj.gexf into G.
# NOTE(review): "proj" presumably means a projected graph -- confirm against
# the step that wrote this file.
filename = RESULTS_GRAPHS_DIR / 'g_proj.gexf'
G = nx.read_gexf(filename)

In [None]:
def graph_info(G):
    """Return a dictionary of basic structural facts about a graph.

    Args:
        G: A NetworkX graph object.

    Returns:
        A dict with, in this order, the keys ``"type"`` (class name of
        ``G``), ``"directed"``, ``"nodes"``, ``"edges"``, ``"density"``
        and ``"selfloops"``.
    """
    summary = {}
    summary["type"] = type(G).__name__
    summary["directed"] = G.is_directed()
    summary["nodes"] = G.number_of_nodes()
    summary["edges"] = G.number_of_edges()
    summary["density"] = nx.density(G)
    summary["selfloops"] = nx.number_of_selfloops(G)
    return summary


# Show the summary for the graph loaded into G.
print(graph_info(G))

{'type': 'Graph', 'directed': False, 'nodes': 6247, 'edges': 131415, 'density': 0.006735990239772343, 'selfloops': 0}


## Louvain
Run Louvain on the largest connected component.

In [None]:
# Extract the largest connected component (LCC).
# The graph used here is G, the one read from g_proj.gexf earlier in this
# notebook; it is the only graph this notebook loads.
largest_cc_nodes = max(nx.connected_components(G), key=len)
# Copy the induced subgraph so G_lcc is a standalone graph rather than a view.
G_lcc = G.subgraph(largest_cc_nodes).copy()

# Run Louvain on the LCC, using the 'weight' edge attribute as edge weight.
# The result maps each node to the id of the community it was assigned to.
partition_lcc = community_louvain.best_partition(G_lcc, weight='weight')
print(f"Louvain (LCC) found {len(set(partition_lcc.values()))} communities.")