# Topic Detection

## Description
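Detect topics in the document-projected network (giant component) with Louvain community detection, then evaluate the resulting partition with modularity, Ncut, and NMI.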

## Setup

### Libraries (Python)

In [153]:
# Standard library
import copy
import json
import os
import pickle
import sys
import warnings
from collections import Counter
from pathlib import Path

# Third-party
import igraph as ig
import networkx as nx
import numpy as np
import pandas as pd
import scipy.sparse as sps
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm

# Silence DeprecationWarnings raised while importing louvain (this import only)
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    import louvain

# For development only
from importlib import reload

### Directories

In [4]:
# Resolve the project root: the folder of this file when `__file__` is defined,
# otherwise (e.g. inside a notebook kernel, where `__file__` does not exist)
# the current working directory.
try:
    BASE_DIR = Path(__file__).resolve().parent
except NameError:
    BASE_DIR = Path().resolve()

# Make the project-local modules under src/ importable (added only once,
# so re-running this cell does not grow sys.path)
SRC_DIR = BASE_DIR / 'src'
if str(SRC_DIR) not in sys.path:
    sys.path.append(str(SRC_DIR))

# Input data locations
DATA_DIR = BASE_DIR / 'data'
DATA_DIR.mkdir(parents=True, exist_ok=True)
POSTS_DIR = DATA_DIR / 'posts'
POSTS_ALL_DIR = POSTS_DIR / 'all'
POSTS_FILTERED_DIR = POSTS_DIR / 'filtered'
POSTS_FILTERED_CLEAN_DIR = POSTS_FILTERED_DIR / 'clean'
COMMENTS_DIR = DATA_DIR / 'comments'
COMMENTS_CLEAN_DIR = COMMENTS_DIR / 'clean'

# Output locations
RESULTS_DIR = BASE_DIR / 'results'
RESULTS_GRAPHS_DIR = RESULTS_DIR / 'graphs'
RESULTS_MODELS_DIR = RESULTS_DIR / 'models'
# parents=True: results/ itself is never created above, and a plain mkdir()
# raises FileNotFoundError when the parent directory is missing.
RESULTS_MODELS_DIR.mkdir(parents=True, exist_ok=True)
RESULTS_MODELS_FILE = RESULTS_MODELS_DIR / 'model_results.csv'

### Libraries (Custom)

In [None]:
import preprocess as prep
# import topic as t
import metrics as met

In [23]:
# Development only: re-import the project-local `metrics` module (alias `met`)
# so that edits to its source are picked up without restarting the kernel.
reload(met)

<module 'metrics' from 'C:\\Backup\\000_DS\\courses\\00-network-science\\ns-project\\src\\metrics.py'>

## Import Data

In [6]:
# Read the stored GraphML file with networkx, then convert the graph to igraph
filename = RESULTS_GRAPHS_DIR.joinpath('g_dd.graphml')
g_dd_nx = nx.read_graphml(os.fspath(filename))
g_dd = ig.Graph.from_networkx(g_dd_nx)

# Short overview of the converted graph
print(g_dd.summary())

IGRAPH UNW- 15819 924974 -- 
+ attr: edge_default (g), node_default (g), _nx_name (v), name (v), weight (e)


In [7]:
# Matrices stored next to the graph; the 'Pwd' and 'Pdd' entries are read later on
filename_pickle_gdd = RESULTS_GRAPHS_DIR.joinpath('g_dd.pickle')
# NOTE: unpickling can execute arbitrary code - only load files produced by this project
with filename_pickle_gdd.open('rb') as pickle_file:
    g_dd_matrices = pickle.load(pickle_file)

In [8]:
# Source documents, read as JSON Lines (lines=True: one JSON record per line)
filename_df = DATA_DIR.joinpath('docs_dd_giant.json')
df_dd = pd.read_json(os.fspath(filename_df), lines=True)

In [5]:
# Sanity-check the loaded graph with a few summary statistics.
# The degree sequence is fetched once and shared by the mean and the median.
degrees = g_dd.degree()

print("--- Document-Projected Network (Giant Component) ---")
print(f"Size: {g_dd.vcount()}")
print(f"Average degree: {np.mean(degrees):.2f}")
print(f"Median degree: {np.median(degrees):.2f}")
print(f"Density: {g_dd.density():.5f}")
# No weights are passed to the two path-based measures below
print(f"Diameter: {g_dd.diameter(directed=False, unconn=False):.2f}")
print(f"Average path length: {g_dd.average_path_length():.5f}")

--- Document-Projected Network (Giant Component) ---
Size: 15819
Average degree: 116.94
Median degree: 50.00
Density: 0.00739
Diameter: 6.00
Average path length: 2.51146


## Louvain

In [88]:
# Louvain partition of the graph using the modularity quality function and the
# 'weight' edge attribute; the seed is fixed so repeated runs use the same value.
partition_louvain = louvain.find_partition(
    g_dd,
    louvain.ModularityVertexPartition,
    weights='weight',
    seed=42,
)
print(f"Number of communities: {len(partition_louvain)}")

# Community label of every vertex, also kept as a NumPy array for the metric helpers
membership_louvain = partition_louvain.membership
topics = np.asarray(membership_louvain)

Number of communities: 13


## Metrics
Modularity (non-overlapping and overlapping), NMI, Ncut.

In [155]:
# Rescale each stored matrix by its grand total so that its entries sum to 1
Pwd, Pdd = (
    g_dd_matrices[key] / g_dd_matrices[key].sum()
    for key in ('Pwd', 'Pdd')
)

In [156]:
# Build C from the per-vertex community labels using the project helper met.build_C.
# NOTE(review): C is presumably a sparse vertex-by-community indicator matrix (it is
# column-sliced and passed to sps.find further down) - confirm against met.build_C,
# together with the exact meaning of with_outliers=1.
C = met.build_C(topics, with_outliers=1)

  self._set_arrayXarray(i, j, x)


In [157]:
# Keep only the columns of C that contain at least one non-zero entry.
# sps.find returns (row indices, column indices, values) of the non-zeros.
used_columns = np.unique(sps.find(C)[1])
C = C[:, used_columns]

In [160]:
# Project the normalized matrices onto the classes (columns of C)
Pwc = Pwd.dot(C)  # joint word + class probability
Pcc = C.T.dot(Pdd).dot(C)  # joint class + class probability

In [161]:
# Results container: one entry per model, every metric starting out as NaN
model_name = 'Louvain'
metric_names = ['Modularity', 'Ncut', 'NMI']
metrics = {model_name: dict.fromkeys(metric_names, np.nan)}

In [162]:
# Fill in the metric values for the current model.
# Modularity and Ncut are computed from Pcc, NMI from Pwc.
scores = metrics[model_name]
scores['Modularity'] = met.calculate_modularity(Pcc)
scores['Ncut'] = met.calculate_ncut(Pcc)
scores['NMI'] = met.calculate_nmi(Pwc)

# Report the values, four decimals each
print(f"NMI: {scores['NMI']:.4f}")
print(f"Modularity: {scores['Modularity']:.4f}")
print(f"Ncut: {scores['Ncut']:.4f}")

# Hand the results to the project helper that writes the results CSV
met.save_results_to_csv(RESULTS_MODELS_FILE, metrics, metric_names, model_name)
print("Results saved to .csv")

NMI: 0.1193
Modularity: 0.2704
Ncut: 0.6119
Results saved to .csv


In [146]:
# Community sizes: how many vertices were assigned to each Louvain community.
# `Counter` is imported with the other libraries in the import cell at the top.
Counter(partition_louvain.membership)

Counter({0: 4529,
         1: 3176,
         2: 2563,
         3: 1147,
         4: 1116,
         5: 899,
         6: 746,
         7: 677,
         8: 616,
         9: 176,
         10: 94,
         11: 75,
         12: 5})

## Clear Allocated Memory

In [7]:
# Run before exiting the program to clear memory.
# `%reset -f` is an IPython magic: it clears the interactive namespace without
# asking for confirmation. `gc` is imported afterwards, inside this cell, because
# the reset also removes every previously imported name.
%reset -f
import gc
gc.collect()

0