In [26]:
import pickle
import networkx as nx
import pandas as pd

In [27]:
# def add_nodes_from_user_data(G, user_data):
#     for user_id in user_data['Id']:
#         G.add_node(user_id)

def add_nodes_from_user_data(G, user_data):
    """
    Add one node per row of the user data, storing the other columns as node attributes.

    Parameters:
    - G (networkx.Graph): The graph to add nodes to (modified in place).
    - user_data (pandas.DataFrame): The user data containing the 'Id' column; every
      other column becomes a node attribute keyed by its column name.

    Returns:
    None

    Raises:
    KeyError: If `user_data` has rows but no 'Id' column.
    """
    # to_dict('records') keeps each column's own dtype. DataFrame.iterrows() builds one
    # Series per row instead, which upcasts rows mixing int and float columns, so an
    # integer 'Id' would come back as a float (42 -> 42.0) and be used as the node id.
    for node_attributes in user_data.to_dict('records'):
        user_id = node_attributes.pop('Id')  # 'Id' is the node identifier, not an attribute
        G.add_node(user_id, **node_attributes)


def preprocess_user_data(user_data, posts_data, comments_data, threshold):
    """
    Keep only the users whose combined number of posts and comments reaches `threshold`.

    Parameters:
    - user_data (pandas.DataFrame): The user data containing the 'Id' column.
    - posts_data (pandas.DataFrame): The post data containing the 'OwnerUserId' column.
    - comments_data (pandas.DataFrame): The comment data containing the 'UserId' column.
    - threshold (number): Minimum posts + comments (inclusive) for a user to be kept.

    Returns:
    pandas.DataFrame: The rows of `user_data` whose 'Id' meets the threshold.
    """
    # Posts per user plus comments per user; a user missing from one side counts as 0 there
    activity_per_user = (
        posts_data['OwnerUserId']
        .value_counts()
        .add(comments_data['UserId'].value_counts(), fill_value=0)
    )

    # Ids of the users with enough activity
    qualifying_ids = activity_per_user[activity_per_user.ge(threshold)].index

    return user_data[user_data['Id'].isin(qualifying_ids)]


def preprocess_post_data(posts_data, active_user_data):
    """
    Return the posts whose 'OwnerUserId' appears in `active_user_data['Id']`.

    Parameters:
    - posts_data (pandas.DataFrame): The post data containing the 'OwnerUserId' column.
    - active_user_data (pandas.DataFrame): The user data containing the 'Id' column.

    Returns:
    pandas.DataFrame: The matching rows of `posts_data`, with their original index.
    """
    owned_by_active_user = posts_data['OwnerUserId'].isin(active_user_data['Id'])
    return posts_data.loc[owned_by_active_user]


def preprocess_comment_data(comments_data, active_user_data, active_posts_data):
    """
    Return the comments written by an active user on a post from `active_posts_data`.

    Parameters:
    - comments_data (pandas.DataFrame): The comment data containing 'UserId' and 'PostId'.
    - active_user_data (pandas.DataFrame): The user data containing the 'Id' column.
    - active_posts_data (pandas.DataFrame): The post data containing the 'Id' column.

    Returns:
    pandas.DataFrame: The rows of `comments_data` satisfying both conditions.
    """
    written_by_active_user = comments_data['UserId'].isin(active_user_data['Id'])
    on_active_post = comments_data['PostId'].isin(active_posts_data['Id'])

    # A comment is kept only when both its author and its target post qualify
    return comments_data[written_by_active_user & on_active_post]


def add_edges_from_post_data(G, posts_data):
    """
    Add an edge from each answerer to the asker of the question that was answered.

    Parameters:
    - G (networkx.Graph): The graph to add edges to (modified in place).
    - posts_data (pandas.DataFrame): The post data containing the 'Id', 'PostTypeId',
      'ParentId' and 'OwnerUserId' columns. Rows with PostTypeId 1 are treated as
      questions and rows with PostTypeId 2 as answers.

    Returns:
    None
    """
    # Filter to only answers that have both a parent question and an owner
    answers_data = posts_data[posts_data['PostTypeId'] == 2].dropna(subset=['ParentId', 'OwnerUserId'])

    # Questions without an owner are dropped from the lookup: a NaN owner is not None
    # and compares unequal to every answerer, so it would otherwise produce an edge
    # pointing at a NaN node.
    questions_data = posts_data[posts_data['PostTypeId'] == 1].dropna(subset=['OwnerUserId'])

    # Plain dict lookup (question id -> asker id). Unlike Series.get it always yields a
    # scalar, even if a question id is duplicated (the last occurrence wins).
    question_askers = dict(zip(questions_data['Id'], questions_data['OwnerUserId']))

    for answerer_id, question_id in zip(answers_data['OwnerUserId'], answers_data['ParentId']):
        asker_id = question_askers.get(question_id)
        # Skip answers to unknown questions and self-answers
        if asker_id is not None and answerer_id != asker_id:
            G.add_edge(answerer_id, asker_id)


def add_edges_from_comment_data(G, comments_data, posts_data):
    """
    Add an edge from each commenter to the owner of the post that was commented on.

    Parameters:
    - G (networkx.Graph): The graph to add edges to (modified in place).
    - comments_data (pandas.DataFrame): The comment data containing the 'PostId' and
      'UserId' columns.
    - posts_data (pandas.DataFrame): The post data containing the 'Id' and
      'OwnerUserId' columns.

    Returns:
    None
    """
    # Comments need both a target post and an author to form an edge
    usable_comments = comments_data.dropna(subset=['PostId', 'UserId'])

    # Posts without an owner are dropped from the lookup: a NaN owner is not None and
    # compares unequal to every commenter, so it would otherwise produce an edge
    # pointing at a NaN node.
    owned_posts = posts_data.dropna(subset=['OwnerUserId'])

    # Plain dict lookup (post id -> owner id). Unlike Series.get it always yields a
    # scalar, even if a post id is duplicated (the last occurrence wins).
    post_owners = dict(zip(owned_posts['Id'], owned_posts['OwnerUserId']))

    for commenter_id, post_id in zip(usable_comments['UserId'], usable_comments['PostId']):
        post_owner_id = post_owners.get(post_id)
        # Skip comments on unknown posts and comments on one's own post
        if post_owner_id is not None and commenter_id != post_owner_id:
            G.add_edge(commenter_id, post_owner_id)


def convert_timestamps_to_strings(G):
    """
    Replace every pandas Timestamp attribute in the graph data with a string.

    Node and edge attributes whose value is a `pd.Timestamp` are overwritten in place
    with the value formatted as 'YYYY-MM-DD HH:MM:SS'; all other attributes are left
    untouched.

    Parameters:
    - G (networkx.Graph): The graph whose node and edge attributes are converted.

    Returns:
    None
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'

    # Node attributes
    for node, attrs in G.nodes(data=True):
        timestamp_keys = [key for key, value in attrs.items() if isinstance(value, pd.Timestamp)]
        for key in timestamp_keys:
            G.nodes[node][key] = attrs[key].strftime(timestamp_format)

    # Edge attributes
    for source, target, attrs in G.edges(data=True):
        timestamp_keys = [key for key, value in attrs.items() if isinstance(value, pd.Timestamp)]
        for key in timestamp_keys:
            G.edges[source, target][key] = attrs[key].strftime(timestamp_format)


In [28]:
# Start from an empty directed graph
G = nx.DiGraph()

# Read the pickled user table
with open('../data/raw/active_users_with_sentiment.pkl', 'rb') as users_file:
    user_data = pickle.load(users_file)

# Show which columns the user table provides
print('User data columns:', user_data.columns)

User data columns: Index(['Id', 'Reputation', 'CreationDate', 'LastAccessDate', 'Views',
       'UpVotes', 'DownVotes', 'PostCount', 'CommentCount',
       'AcceptedAnswerCount', 'AnswerCount', 'TotalActivity', 'AvgAnswerScore',
       'AvgPostScore', 'AcceptedAnswerFraction', 'AnswerSentiment'],
      dtype='object')


In [29]:
# Load the post (questions and answers) data, which is stored as four pickled chunks.
# NOTE: pickle.load can execute arbitrary code from the file; only load trusted data.
post_chunks = []
for chunk_number in range(1, 5):
    with open(f'../data/raw/posts_typecasted_{chunk_number}.pkl', 'rb') as file:
        post_chunks.append(pickle.load(file))

# Combine all post data into one frame with a fresh 0..n-1 index
posts_data = pd.concat(post_chunks, ignore_index=True)

# The individual chunks are no longer needed once combined; release them
del post_chunks

# Print post data columns
print('Post data columns:', posts_data.columns)

Post data columns: Index(['Id', 'PostTypeId', 'ParentId', 'AcceptedAnswerId', 'CreationDate',
       'Score', 'ViewCount', 'Body', 'OwnerUserId', 'LastActivityDate',
       'Title', 'Tags', 'AnswerCount', 'CommentCount'],
      dtype='object')


In [30]:
# Load the comment data, which is stored as five pickled chunks.
# NOTE: pickle.load can execute arbitrary code from the file; only load trusted data.
comment_chunks = []
for chunk_number in range(1, 6):
    with open(f'../data/raw/comments_typecasted_{chunk_number}.pkl', 'rb') as file:
        comment_chunks.append(pickle.load(file))

# Combine all comment data into one frame with a fresh 0..n-1 index
comments_data = pd.concat(comment_chunks, ignore_index=True)

# The individual chunks are no longer needed once combined; release them
del comment_chunks

# Print comment data columns
print('Comment data columns:', comments_data.columns)

Comment data columns: Index(['Id', 'PostId', 'Score', 'Text', 'CreationDate', 'UserId'], dtype='object')


In [31]:
# Create one node per user, with the remaining user columns as node attributes
add_nodes_from_user_data(G, user_data)

# Report the graph size and the user columns
print('Number of nodes:', G.number_of_nodes())
print('User data columns:', user_data.columns)

Number of nodes: 11810
User data columns: Index(['Id', 'Reputation', 'CreationDate', 'LastAccessDate', 'Views',
       'UpVotes', 'DownVotes', 'PostCount', 'CommentCount',
       'AcceptedAnswerCount', 'AnswerCount', 'TotalActivity', 'AvgAnswerScore',
       'AvgPostScore', 'AcceptedAnswerFraction', 'AnswerSentiment'],
      dtype='object')


In [32]:
# Keep only the posts owned by users present in the user table (the network's nodes)
posts_data = preprocess_post_data(posts_data=posts_data, active_user_data=user_data)

In [33]:
# Link each answerer to the asker of the question they answered
add_edges_from_post_data(G, posts_data)

# Report the edge count after the post interactions
print('Number of edges:', G.number_of_edges())

Number of edges: 2718


In [34]:
# Keep only the comments written by users in the user table on the retained posts
comments_data = preprocess_comment_data(
    comments_data=comments_data,
    active_user_data=user_data,
    active_posts_data=posts_data,
)

In [35]:
# Link each commenter to the owner of the post they commented on
add_edges_from_comment_data(G, comments_data, posts_data)

# Report the edge count after the comment interactions
print('Number of edges:', G.number_of_edges())

Number of edges: 12150


In [37]:
# Replace Timestamp attributes with strings, then export the graph as GEXF
convert_timestamps_to_strings(G)
network_path = '../data/processed/network/network.gexf'
nx.write_gexf(G, network_path)


In [38]:
# Export the graph as GEXF.
# NOTE(review): this repeats the write_gexf call of the previous cell with the same
# graph and path; it only has an effect if G was modified in between.
gexf_output_path = '../data/processed/network/network.gexf'
nx.write_gexf(G, gexf_output_path)