In [1]:
import pickle
import networkx as nx
import pandas as pd
import random

In [2]:
def add_nodes_from_user_data(G, user_data):
    """
    Add nodes to the graph from user data.

    Parameters:
    - G (networkx.Graph): The graph to add nodes to.
    - user_data (pandas.DataFrame): The user data containing the 'Id' column.

    Returns:
    None
    """
    for user_id in user_data['Id']:
        G.add_node(user_id)

In [3]:
def add_edges_from_post_data(G, posts_data):
    """
    Add edges to a graph based on post data.

    Parameters:
    - G (networkx.Graph): The graph to add edges to.
    - posts_data (pandas.DataFrame): The post data containing information about posts.

    Returns:
    None
    """
    # Filter to only answers and drop NaNs
    answers_data = posts_data[posts_data['PostTypeId'] == 2].dropna(subset=['ParentId', 'OwnerUserId'])

    # Create a lookup table for question askers
    question_askers = posts_data[posts_data['PostTypeId'] == 1].set_index('Id')['OwnerUserId']

    # Iterate through answers
    for answerer_id, question_id in zip(answers_data['OwnerUserId'], answers_data['ParentId']):
        asker_id = question_askers.get(question_id)
        if asker_id is not None:
            G.add_edge(answerer_id, asker_id)

In [4]:
def add_edges_from_comment_data(G, comments_data, posts_data):
    """
    Add edges to the graph `G` based on the comment data.

    Parameters:
    - G (networkx.Graph): The graph to add edges to.
    - comments_data (pandas.DataFrame): The comment data containing 'UserId' and 'PostId' columns.
    - posts_data (pandas.DataFrame): The post data containing 'Id' and 'OwnerUserId' columns.

    Returns:
    None
    """
    # Drop NaNs
    comments_data = comments_data.dropna(subset=['PostId', 'UserId'])

    # Create a lookup table for post owners
    post_owners = posts_data.set_index('Id')['OwnerUserId']

    # Iterate through comments
    for commenter_id, post_id in zip(comments_data['UserId'], comments_data['PostId']):
        post_owner_id = post_owners.get(post_id)
        if post_owner_id is not None:
            G.add_edge(commenter_id, post_owner_id)

In [5]:
# Create the network
G = nx.DiGraph()

# Add nodes (users)
with open('../data/raw/users_typecasted.pkl', 'rb') as file:
    user_data = pickle.load(file)
# Add nodes to the graph
add_nodes_from_user_data(G, user_data)

# Print number of nodes
print('Number of nodes:', len(G.nodes()))

Number of nodes: 1116805


In [6]:
# Add edges from post (questions and answers) interactions

with open('../data/raw/posts_typecasted_1.pkl', 'rb') as file:
    posts_data_1 = pickle.load(file)
with open('../data/raw/posts_typecasted_2.pkl', 'rb') as file:
    posts_data_2 = pickle.load(file)
with open('../data/raw/posts_typecasted_3.pkl', 'rb') as file:
    posts_data_3 = pickle.load(file)
with open('../data/raw/posts_typecasted_4.pkl', 'rb') as file:
    posts_data_4 = pickle.load(file)

# Combine all post data
posts_data = pd.concat([posts_data_1, posts_data_2, posts_data_3, posts_data_4], ignore_index=True)
# Add edges to the graph
add_edges_from_post_data(G, posts_data)

# Print number of edges
print('Number of edges:', len(G.edges()))

Number of edges: 987196


In [7]:
# Add edges from comment interactions

with open('../data/raw/comments_typecasted_1.pkl', 'rb') as file:
    comments_data_1 = pickle.load(file)
with open('../data/raw/comments_typecasted_2.pkl', 'rb') as file:
    comments_data_2 = pickle.load(file)
with open('../data/raw/comments_typecasted_3.pkl', 'rb') as file:
    comments_data_3 = pickle.load(file)
with open('../data/raw/comments_typecasted_4.pkl', 'rb') as file:
    comments_data_4 = pickle.load(file)
with open('../data/raw/comments_typecasted_5.pkl', 'rb') as file:
    comments_data_5 = pickle.load(file)

# Combine all comment data
comments_data = pd.concat([comments_data_1, comments_data_2, comments_data_3, comments_data_4, comments_data_5], ignore_index=True)
# Add edges to the graph
add_edges_from_comment_data(G, comments_data, posts_data)

# Print number of edges
print('Number of edges:', len(G.edges()))

Number of edges: 4363589


In [9]:
# Save the network
nx.write_gexf(G, '../data/processed/network/network.gexf')