In [15]:
import sys
sys.path.append('../src')

from graph_model import load_topic_frequencies, keep_top_n_topics, create_user_user_graph, connect_on_IOU
from analyze_graphs import configuration_model, modularity_communities, top_down_communities, extract_topics_from_community
from analyze_graphs import community_topic_evolution, sample_topics, compute_betweenness_graph, compute_community_betweenness, determine_prototype
from utils import plot_graph, get_literal_topics, create_topic_map, load_graph, save_graph

In [16]:
# Define data paths
# Every processed file follows the naming scheme "<name>_<size>.txt", so the
# dataset variant is selected once here instead of (un)commenting whole
# blocks of near-identical paths.
DATASET_SIZE = "medium"  # one of: "long", "small", "medium"
PROCESSED_DIR = "../data/processed"

# Input: full user-topic graph; output: the same graph reduced to top topics
user_topic_graph_path = "{}/author_topic_{}.txt".format(PROCESSED_DIR, DATASET_SIZE)
user_topic_graph_reduced_path = "{}/author_topic_reduced_{}.txt".format(PROCESSED_DIR, DATASET_SIZE)
# Input: topic frequencies used to rank topics
topic_freqs_path = "{}/topic_freq_{}.txt".format(PROCESSED_DIR, DATASET_SIZE)
# Output: derived user-user graph
user_user_graph_path = "{}/user_user_{}.txt".format(PROCESSED_DIR, DATASET_SIZE)
# Input: topic listing used to build the topic map
topics_path = "{}/topics_{}.txt".format(PROCESSED_DIR, DATASET_SIZE)

## Create user-topic and user-user graphs

In [17]:
# Create user-topic graph
# Loads the full user-topic graph and the topic frequencies from the paths
# configured above.
user_topic_graph = load_graph(user_topic_graph_path)
topic_freqs = load_topic_frequencies(topic_freqs_path)
# Keep only top n topics in graph (n=200 here); the reduced graph replaces
# the full one under the same name, so re-running this cell alone reloads
# from disk first.
user_topic_graph = keep_top_n_topics(user_topic_graph, topic_freqs, n=200)
# Persist the reduced graph so later sessions can skip the reduction step.
save_graph(user_topic_graph, user_topic_graph_reduced_path)

# Alternative: load the previously saved reduced graph instead of rebuilding
# it (comment out the lines above and uncomment the line below).
# user_topic_graph = load_graph(user_topic_graph_reduced_path)

Number of nodes: 74902
Number of edges: 1208000


In [18]:
# Create user-user graph
# Derives the user-user graph from the user-topic graph, passing connect_on_IOU
# as the connection rule and writing the result to user_user_graph_path.
# NOTE(review): presumably connects users by intersection-over-union of their
# topics — confirm in graph_model.connect_on_IOU.
user_user_graph = create_user_user_graph(user_topic_graph, connect_on_IOU, out_filename=user_user_graph_path)
# Alternative: load a previously written user-user graph instead of rebuilding it.
# user_user_graph = load_graph(user_user_graph_path)

Number of nodes: 1000
Number of edges: 340079


In [None]:
# Draw crude graph
# Quick visual check of the user-user graph; no communities are passed to
# plot_graph in this call.
plot_graph(user_user_graph)

In [None]:
# Build the configuration-model counterpart of the user-user graph.
# NOTE(review): presumably a randomized null model used as a baseline for the
# community analysis — confirm in analyze_graphs.configuration_model.
config_user_user_graph = configuration_model(user_user_graph)

In [None]:
# Draw crude config graph
# Same plotting call as for the real user-user graph, applied to the
# configuration-model graph so the two can be compared visually.
plot_graph(config_user_user_graph)

### Detect communities in user-user graph

In [19]:
# Compute modularity-maximizing communities for user-user graph
community_size_thresh = 2
# Keep only communities with strictly more than `community_size_thresh`
# members (a community of exactly that size is dropped as well)
mod_communities = [
    members
    for members in modularity_communities(user_user_graph)
    if len(members) > community_size_thresh
]
num_communities = len(mod_communities)
print("{} communities".format(num_communities))
print("Community sizes: {}".format([len(members) for members in mod_communities]))

2 communities
Community sizes: [543, 375]


In [None]:
# Plot communities
# Passes the size-filtered modularity communities to plot_graph along with
# the user-user graph.
# NOTE(review): presumably nodes are coloured by community — confirm in utils.plot_graph.
plot_graph(user_user_graph, mod_communities)

In [None]:
# Compute modularity-maximizing communities for config graph
# Requires config_user_user_graph from the configuration-model cell.
# NOTE(review): unlike the real user-user graph, no minimum-size filter is
# applied to these communities, so the printed count is not directly
# comparable to num_communities — confirm this is intended.
config_mod_communities = modularity_communities(config_user_user_graph)
config_num_communities = len(config_mod_communities)
print("{} communities".format(config_num_communities))

### Analyze Modularity Communities

In [20]:
# Measure betweenness of each community
# Computes the betweenness structure for the user-user graph given the
# size-filtered modularity communities; the result is later passed, one
# community at a time, to compute_community_betweenness.
graph_betweenness = compute_betweenness_graph(user_user_graph, mod_communities)

Samples per community: 250
{1, 5, 6, 7, 8, 12, 16, 21, 22, 23, 25, 26, 27, 28, 33, 34, 36, 38, 40, 41, 42, 43, 44, 45, 46, 48, 49, 50, 51, 54, 57, 60, 65, 66, 69, 70, 71, 72, 73, 80, 81, 83, 87, 88, 89, 90, 91, 92, 94, 96, 99, 100, 101, 102, 105, 107, 108, 109, 110, 118, 123, 128, 131, 132, 133, 138, 139, 141, 142, 144, 146, 147, 148, 150, 151, 152, 154, 157, 159, 160, 161, 163, 164, 165, 166, 169, 170, 171, 172, 175, 179, 183, 186, 187, 188, 189, 190, 191, 194, 208, 209, 210, 212, 213, 214, 215, 218, 220, 221, 222, 225, 227, 228, 236, 237, 239, 240, 244, 245, 246, 248, 258, 259, 261, 264, 266, 267, 269, 271, 272, 275, 277, 278, 279, 280, 282, 283, 284, 285, 287, 289, 290, 292, 294, 295, 297, 300, 302, 303, 304, 306, 307, 308, 309, 311, 312, 314, 315, 316, 317, 319, 321, 323, 324, 327, 329, 330, 332, 336, 337, 341, 344, 345, 348, 351, 353, 354, 355, 359, 360, 361, 365, 368, 369, 370, 372, 373, 374, 375, 380, 384, 387, 388, 389, 390, 393, 394, 395, 396, 398, 401, 402, 407, 410, 412, 413

In [21]:
# Create topic map
# Built from the topics file configured in topics_path; it is handed to
# get_literal_topics in later cells to turn topic nodes into topic literals.
topic_map = create_topic_map(topics_path)

In [None]:
# Report, for every modularity community: its betweenness, the topics of its
# prototype user (if one exists) and a sample of its top topics.
num_top_topics = 20        # how many community topics to sample
num_prototype_topics = 10  # at most this many prototype topics are shown

for community in mod_communities:
    # Compute betweenness of community
    community_betweenness = compute_community_betweenness(graph_betweenness, community)
    print("Community Betweenness: {}".format(community_betweenness))
    # Extract top topics
    topic_scores = extract_topics_from_community(user_topic_graph, community)
    top_topic_nodes = sample_topics(topic_scores, n=num_top_topics)
    top_topics = get_literal_topics(top_topic_nodes, topic_map)
    # Find prototype and get its topics
    prototype = determine_prototype(user_user_graph, community)
    if prototype is None:
        # Without this branch the report would either raise NameError (first
        # community) or silently reuse the previous community's prototype.
        print("No prototype found for this community")
    else:
        prototype_topics = user_topic_graph.neighbors(prototype)
        prototype_literals = get_literal_topics(prototype_topics, topic_map)
        shown_literals = prototype_literals[:num_prototype_topics]
        # Report the number of topics actually listed, which may be fewer
        # than num_prototype_topics.
        print("Prototype has {} topics: {}".format(len(shown_literals), shown_literals))
    # Label with the real number of topics printed (previously hard-coded to
    # "10" while 20 topics were sampled).
    print("Top {} topics: {}".format(len(top_topics), top_topics))

Community Betweenness: 24.779351248161863
Prototype has 10 topics: ["('amount', '+')", "('income', '+')", "('help', '+')", "('support', '+')", "('lot', '+')", "('tax', '+')", "('everyone', '+')", "('law', '+')", "('country', '+')", "('problem', '+')"]
Top 10 topics: ["('please', '+')", "('statement', '+')", "('information', '+')", "('campaign', '+')", "('congress', '-')", "('court', '-')", "('freedom', '+')", "('wage', '-')", "('election', '-')", "('comment', '+')", "('market', '-')", "('bill', '-')", "('force', '-')", "('argument', '+')", "('evidence', '+')", "('society', '-')", "('source', '+')", "('insurance', '-')", "('education', '+')", "('interest', '+')"]
Community Betweenness: 37.066889162124816
Prototype has 10 topics: ["('government', '-')", "('problem', '-')", "('hell', '-')", "('bill', '-')", "('pay', '-')", "('anyone', '-')", "('law', '-')", "('world', '-')", "('business', '-')", "('health', '-')"]
Top 10 topics: ["('thank', '+')", "('feel', '+')", "('gop', '+')", "('amend

### Analyze communities over time

In [None]:
# Compute community levels
# num_communities is the number of modularity communities that survived the
# size filter in the community-detection cell, so that cell must run first.
community_levels = top_down_communities(user_user_graph, num_communities)

In [None]:
# Get community level evolution
# The result is iterated below as levels, each holding per-community topic scores.
# NOTE(review): sample_n=None presumably disables sampling — confirm in
# analyze_graphs.community_topic_evolution.
evolution = community_topic_evolution(community_levels, user_topic_graph, sample_n=None)

In [None]:
# Report evolution: for each level, list 10 sampled topics per community
# (levels and communities are numbered from 1 in the printout)
for level_number, level_scores in enumerate(evolution, start=1):
    print("Level {}:".format(level_number))
    for community_number, scores_for_community in enumerate(level_scores, start=1):
        sampled_nodes = sample_topics(scores_for_community, n=10)
        literal_topics = get_literal_topics(sampled_nodes, topic_map)
        print("\tTopics for community {}: {}".format(community_number, literal_topics))