In [26]:
import sys
sys.path.append('../src')

from graph_model import load_topic_frequencies, keep_top_n_topics, create_user_user_graph, connect_on_IOU
from analyze_graphs import configuration_model, modularity_communities, top_down_communities, extract_topics_from_community
from analyze_graphs import community_topic_evolution, sample_topics, compute_betweenness_graph, compute_community_betweenness, determine_prototype
from utils import plot_graph, get_literal_topics, create_topic_map, load_graph, save_graph

In [27]:
# Define data paths
# Every processed file exists in three variants that differ only by a suffix.
# Pick the variant here instead of commenting/uncommenting whole path groups,
# which makes it easy to end up with paths from two different datasets.
dataset_size = "medium"  # one of: "small", "medium", "long"
if dataset_size not in ("small", "medium", "long"):
    raise ValueError("Unknown dataset_size: {!r}".format(dataset_size))

processed_dir = "../data/processed"

# Full user-topic graph (input)
user_topic_graph_path = "{}/author_topic_{}.txt".format(processed_dir, dataset_size)
# User-topic graph restricted to the most frequent topics (written by this notebook)
user_topic_graph_reduced_path = "{}/author_topic_reduced_{}.txt".format(processed_dir, dataset_size)
# Topic frequencies used to choose which topics to keep (input)
topic_freqs_path = "{}/topic_freq_{}.txt".format(processed_dir, dataset_size)
# User-user graph derived from the user-topic graph (written by this notebook)
user_user_graph_path = "{}/user_user_{}.txt".format(processed_dir, dataset_size)
# Topic listing used to build the topic map (input)
topics_path = "{}/topics_{}.txt".format(processed_dir, dataset_size)

## Create user-topic and user-user graphs

In [28]:
# Create user-topic graph
# Number of most frequent topics to keep; named so it can be tuned in one place.
top_n_topics = 200

user_topic_graph = load_graph(user_topic_graph_path)
topic_freqs = load_topic_frequencies(topic_freqs_path)
# Keep only the top-n topics in the graph, then persist the reduced graph
user_topic_graph = keep_top_n_topics(user_topic_graph, topic_freqs, n=top_n_topics)
save_graph(user_topic_graph, user_topic_graph_reduced_path)

# Shortcut for later runs: once the reduced graph has been saved with the same
# top_n_topics, the lines above can be replaced by loading it directly:
# user_topic_graph = load_graph(user_topic_graph_reduced_path)

Number of nodes: 74902
Number of edges: 1208000


In [29]:
# Create user-user graph from the reduced user-topic graph.
# connect_on_IOU is passed as the rule deciding which users get connected
# (presumably intersection-over-union of their topics — confirm in graph_model).
# NOTE(review): out_filename presumably makes the helper write the graph to disk — confirm.
user_user_graph = create_user_user_graph(user_topic_graph, connect_on_IOU, out_filename=user_user_graph_path)
# Alternative: reuse a previously written user-user graph instead of rebuilding it
# user_user_graph = load_graph(user_user_graph_path)

Number of nodes: 1000
Number of edges: 250268


In [None]:
# Draw crude graph: plot the user-user graph on its own (no community list is passed)
plot_graph(user_user_graph)

In [None]:
# Build (not load) a configuration-model counterpart of the user-user graph.
# NOTE(review): presumably a degree-preserving random null model — confirm in analyze_graphs.
config_user_user_graph = configuration_model(user_user_graph)

In [None]:
# Draw crude config graph: plot the configuration-model graph on its own (no community list is passed)
plot_graph(config_user_user_graph)

### Detect communities in user-user graph

In [30]:
# Compute modularity-maximizing communities for user-user graph
community_size_thresh = 2
# Keep only communities with strictly more than community_size_thresh members
# (with a threshold of 2, the smallest community kept has 3 members)
mod_communities = [
    community
    for community in modularity_communities(user_user_graph)
    if len(community) > community_size_thresh
]
num_communities = len(mod_communities)
print("{} communities".format(num_communities))
print("Community sizes: {}".format([len(community) for community in mod_communities]))

3 communities
Community sizes: [455, 373, 3]


In [None]:
# Plot communities: draw the user-user graph again, this time passing the size-filtered communities
plot_graph(user_user_graph, mod_communities)

In [None]:
# Compute modularity-maximizing communities for config graph
# NOTE(review): unlike mod_communities, these are not filtered by community_size_thresh,
# so config_num_communities and num_communities are not counted the same way — confirm this is intended.
config_mod_communities = modularity_communities(config_user_user_graph)
config_num_communities = len(config_mod_communities)
print("{} communities".format(config_num_communities))

### Analyze Modularity Communities

In [31]:
# Measure betweenness of each community
# The result is computed once here and then handed to compute_community_betweenness
# for every community in the per-community report.
graph_betweenness = compute_betweenness_graph(user_user_graph, mod_communities)

Samples per community: 166
{1, 2, 8, 9, 11, 12, 17, 25, 27, 29, 33, 37, 39, 40, 41, 46, 48, 49, 50, 54, 56, 66, 67, 72, 73, 76, 77, 80, 84, 93, 94, 95, 97, 100, 101, 105, 107, 108, 110, 115, 117, 119, 123, 125, 130, 131, 135, 136, 139, 141, 147, 151, 159, 160, 161, 164, 171, 172, 176, 177, 178, 180, 183, 186, 192, 208, 209, 211, 217, 218, 220, 221, 225, 226, 228, 231, 234, 236, 237, 241, 248, 250, 252, 255, 258, 260, 263, 265, 273, 275, 278, 279, 282, 287, 289, 294, 296, 297, 298, 301, 302, 303, 306, 307, 310, 312, 314, 315, 318, 319, 324, 325, 326, 329, 331, 337, 338, 342, 345, 346, 347, 350, 351, 355, 356, 359, 361, 365, 366, 367, 368, 373, 374, 376, 380, 381, 383, 384, 386, 389, 391, 395, 398, 400, 409, 411, 417, 421, 422, 433, 434, 437, 440, 441, 443, 445, 450, 452, 453, 457, 458, 473, 474, 475, 476, 477, 479, 483, 489, 490, 492, 496, 497, 500, 501, 503, 507, 509, 515, 517, 518, 522, 524, 529, 532, 535, 538, 543, 546, 548, 549, 550, 551, 558, 564, 569, 572, 575, 578, 581, 585, 586,

In [32]:
# Create topic map from the topics file; get_literal_topics uses it to turn topic nodes into literal topics
topic_map = create_topic_map(topics_path)

In [33]:
# How many topics to sample per community, and how many prototype topics to show
num_top_topics = 20
num_prototype_topics = 10

for community in mod_communities:
    # Compute betweenness of community
    community_betweenness = compute_community_betweenness(graph_betweenness, community)
    print("Community Betweenness: {}".format(community_betweenness))
    # Extract top topics
    topic_scores = extract_topics_from_community(user_topic_graph, community)
    top_topic_nodes = sample_topics(topic_scores, n=num_top_topics)
    top_topics = get_literal_topics(top_topic_nodes, topic_map)
    # Find prototype and get its topics
    prototype = determine_prototype(user_user_graph, community)
    if prototype is None:
        # Without a prototype there is nothing to report; printing here avoids
        # a NameError on the first community and stale topics from the previous one.
        print("No prototype found for this community")
    else:
        prototype_topics = user_topic_graph.neighbors(prototype)
        prototype_literals = get_literal_topics(prototype_topics, topic_map)
        shown_prototype_topics = prototype_literals[:num_prototype_topics]
        # Report the number of topics actually shown (may be fewer than the cap)
        print("Prototype has {} topics: {}".format(len(shown_prototype_topics), shown_prototype_topics))
    # Label with the real count instead of a hard-coded "10" (20 topics are sampled)
    print("Top {} topics: {}".format(len(top_topics), top_topics))

Community Betweenness: 13.422016009902322
Prototype has 10 topics: ["('right', '+')", "('income', '+')", "('difference', '+')", "('tax', '+')", "('time', '+')", "('everyone', '+')", "('business', '+')", "('money', '+')", "('fact', '+')", "('wage', '+')"]
Top 10 topics: ["('amendment', '-')", "('amendment', '+')", "('decision', '+')", "('speech', '+')", "('reddit', '+')", "('candidate', '+')", "('bush', '+')", "('wealth', '+')", "('murder', '-')", "('plan', '-')", "('voting', '+')", "('healthcare', '+')", "('labor', '+')", "('please', '+')", "('process', '-')", "('claim', '-')", "('process', '+')", "('population', '-')", "('increase', '+')", "('constitution', '+')"]
Community Betweenness: 23.10831996436017
Prototype has 10 topics: ["('country', '+')", "('look', '+')", "('problem', '-')", "('home', '-')", "('world', '-')", "('use', '-')", "('look', '-')", "('office', '-')", "('reason', '+')", "('today', '+')"]
Top 10 topics: ["('thank', '+')", "('nobody', '-')", "('voter', '-')", "('plea

### Analyze communities over time

In [37]:
# Compute community levels
# num_communities is the count of modularity communities left after the size filter.
# NOTE(review): the recorded output of this cell is "Num levels: 0", which leaves the
# evolution report below with nothing to print — confirm what top_down_communities
# expects as its second argument and whether an empty result is legitimate here.
community_levels = top_down_communities(user_user_graph, num_communities)
print("Num levels: {}".format(len(community_levels)))

Num levels: 0


In [38]:
# Get community level evolution
# The report loop consumes the result as one entry per level, each entry holding
# one set of topic scores per community.
# NOTE(review): sample_n=None presumably disables sampling — confirm in analyze_graphs.
evolution = community_topic_evolution(community_levels, user_topic_graph, sample_n=None)

In [39]:
# Report evolution: for every level, list the sampled topics of each community
for level_number, level_scores in enumerate(evolution, start=1):
    print("Level {}:".format(level_number))
    for community_number, scores_for_community in enumerate(level_scores, start=1):
        sampled_topic_nodes = sample_topics(scores_for_community, n=10)
        literal_topics = get_literal_topics(sampled_topic_nodes, topic_map)
        print("\tTopics for community {}: {}".format(community_number, literal_topics))