# Network Data Generation

Network data: graphs with retweet and reply networks, where the vertices are accounts and the edges indicate that one account has replied to another, or retweeted a tweet from another. Think about what vertex and/or edge attributes could be useful to store.

In [None]:
import pandas as pd
import numpy as np  
import networkx as nx
import matplotlib.pyplot as plt
import json
import pickle
from datetime import datetime

## Load Data

In [None]:
# Utility function to parse ISO 8601 timestamps
def parse_created_at(ts):
    """Parse an ISO 8601 timestamp string into a ``datetime``.

    A "Z" (UTC) designator is rewritten to "+00:00" before parsing, because
    ``datetime.fromisoformat`` only accepts "Z" from Python 3.11 onwards.
    Example input: "2015-12-12T23:59:59.000Z".

    Args:
        ts: Timestamp string; may be None or empty.

    Returns:
        The parsed ``datetime`` (timezone-aware when the input carries "Z" or
        an offset), or None when ``ts`` is falsy, is not a string, or cannot
        be parsed.
    """
    if not ts:
        return None
    try:
        return datetime.fromisoformat(ts.replace("Z", "+00:00"))
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: ts is not a str; ValueError: not ISO 8601.
        return None

In [None]:
tweets_path = "../data/tweets.dat"
tweets = []

# The file is read as JSON Lines: one tweet object per non-empty line.
with open(tweets_path, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        try:
            tw = json.loads(line)
        except json.JSONDecodeError:
            # skip malformed lines
            continue
        if not isinstance(tw, dict):
            # Valid JSON that is not an object (e.g. a bare list or number)
            # has no tweet fields to read, so skip it like a malformed line.
            continue

        # basic fields (IDs are normalized to stripped strings)
        tweet_id = str(tw.get("id", "")).strip()
        author_id = str(tw.get("author_id", "")).strip()
        created_at = parse_created_at(tw.get("created_at"))
        lang = tw.get("lang")

        # interaction fields
        referenced_tweets = tw.get("referenced_tweets")
        in_reply_to_user_id = tw.get("in_reply_to_user_id")

        # public metrics; an absent or null value becomes an empty dict
        metrics = tw.get("public_metrics") or {}

        # gather tweet record
        tweets.append({
            "id": tweet_id,
            "author_id": author_id,
            "created_at": created_at,
            "date": created_at.date().isoformat() if created_at else None,
            "lang": lang,

            "referenced_tweets": referenced_tweets,
            "in_reply_to_user_id": in_reply_to_user_id,

            # keep the full metrics dict rather than separate count columns,
            # so later steps can pick whichever counts they need
            "public_metrics": metrics,
        })

all_records = pd.DataFrame(tweets)

In [None]:
# Preview the first rows of the parsed tweet records
all_records.head()

In [None]:
# Spot-check a later window of records (an integer slice selects rows 50-99)
all_records[50:100]

In [None]:
# Number of tweet records that were loaded
len(all_records)

In [None]:
# Load the account table (tab-separated). author_id is read as str so the IDs
# are compared as strings, like the str-normalized author_id of the tweets.
accounts = pd.read_csv(r"../data/accounts.tsv", sep='\t', dtype={'author_id': str})
accounts.head()

## Network Graph

* Nodes (accounts.tsv): accounts; attributes: Type, Lang, Stance

* Edges (tweets.dat): tweets; type: reply / retweet; attributes (from public_metrics): 'retweet_count', 'reply_count', 'like_count', 'quote_count'

In [None]:
## initialize graph
# MultiDiGraph: edges are directed and parallel edges are allowed, so every
# reply or retweet between the same two accounts is kept as its own edge.
G = nx.MultiDiGraph()

## Add nodes
known_user_ids = set(accounts['author_id'].unique())  # author IDs in accounts dataset

# One node per account, carrying its type, language and stance.
node_list = [
    (acc_id, {'user_type': acc_type, 'lang': acc_lang, 'stance': acc_stance})
    for acc_id, acc_type, acc_lang, acc_stance in zip(
        accounts['author_id'], accounts['Type'], accounts['Lang'], accounts['Stance']
    )
]
G.add_nodes_from(node_list)


## Add edges
# prepare mapping dataframe (tweet ID -> author ID)
tweet_author_df = all_records[['id', 'author_id']].rename(
    columns={'id': 'target_tweet_id', 'author_id': 'target_author_id'}
)
# Dict form of the same mapping. A dict holds one author per tweet ID, so a
# tweet ID that occurs more than once in the records cannot fan out into
# duplicate retweet edges the way a DataFrame merge on that key would.
tweet_to_author = dict(
    zip(tweet_author_df['target_tweet_id'], tweet_author_df['target_author_id'])
)

# Tweet-level counts copied onto every edge created from that tweet.
metric_keys = ('retweet_count', 'reply_count', 'like_count', 'quote_count')

# Both lists hold (source, target, attribute_dict) tuples.
retweet_edges_data = []
reply_edges_data = []

# extract data from original records
for focal_node, public_metrics, refs, target_node in zip(
    all_records['author_id'],
    all_records['public_metrics'],
    all_records['referenced_tweets'],
    all_records['in_reply_to_user_id'],
):
    # skip if focal_node not in known_user_ids
    if focal_node not in known_user_ids:
        continue

    # The metrics dict can be empty (tweets without public_metrics), so read
    # each count with .get(): a missing count is stored as None instead of
    # aborting the whole build with a KeyError.
    if not isinstance(public_metrics, dict):
        public_metrics = {}
    edge_attrs = {key: public_metrics.get(key) for key in metric_keys}

    # extract retweet edges: retweeting account -> author of the retweeted tweet
    if isinstance(refs, list):
        for ref_tweet in refs:
            if ref_tweet.get('type') != 'retweeted':
                continue
            retweeted_author = tweet_to_author.get(ref_tweet.get('id'))
            # Keep the edge only when the retweeted tweet is present in the
            # records and its author is one of the known accounts.
            if retweeted_author is not None and retweeted_author in known_user_ids:
                retweet_edges_data.append(
                    (focal_node, retweeted_author, {'type': 'retweet', **edge_attrs})
                )

    # extract reply edges: replying account -> account being replied to
    if target_node and target_node in known_user_ids:  # check if target user is in known_user_ids
        reply_edges_data.append(
            (focal_node, target_node, {'type': 'reply', **edge_attrs})
        )


# Retweet edges go in before reply edges; keys of parallel edges between a
# pair of nodes are assigned in insertion order, so this order fixes them.
G.add_edges_from(retweet_edges_data)

# add reply edges to graph
G.add_edges_from(reply_edges_data)

print("Successfully constructed the graph G.")
print(f"Nodes: {G.number_of_nodes()}")
print(f"Edges: {G.number_of_edges()}")

In [None]:
# Show every node together with its attribute dict
G.nodes(data=True)

In [None]:
# List the edges as (source, target) pairs, without their attribute dicts
G.edges(data=False)

In [None]:
# Inspect the parallel edges stored between one ordered pair of accounts
u = '8508262'
v = '88047464'

print(f"Edges between {u} and {v}:")

# For a multigraph, get_edge_data(u, v) maps each parallel edge's key to that
# edge's attribute dict; the result is falsy when there is no such edge.
edge_dict = G.get_edge_data(u, v)

if not edge_dict:
    print("No edges found.")
else:
    for key in edge_dict:
        data = edge_dict[key]
        print(f"Key: {key}, Type: {data.get('type')}, Metrics: {data.get('retweet_count')}, ...")

In [None]:
# save graph data: serialize the whole graph object (nodes, edges, attributes)
file_path_pkl = 'network_data.pkl'
with open(file_path_pkl, 'wb') as f: # note: 'wb' (write binary) - pickle writes bytes
    pickle.dump(G, f)

In [None]:
# Load graph data
# NOTE: unpickling can execute arbitrary code stored in the file, so only load
# pickle files that you created yourself or otherwise trust.
import pickle
file_path_pkl = 'network_data.pkl'
with open(file_path_pkl, 'rb') as f: # note: 'rb' (read binary) - pickle reads bytes
    G_loaded_pkl = pickle.load(f)

In [None]:
# Check that the nodes survived the pickle round trip
G_loaded_pkl.nodes

In [None]:
# Check that the edges survived the pickle round trip
G_loaded_pkl.edges()

In [None]:
## plot the network graph
# plt.figure(figsize=(12,12))
# pos = nx.spring_layout(G_loaded_pkl, k=0.15, iterations=20) # use G_loaded_pkl rather than the old G


# nx.draw(
#     G_loaded_pkl, 
#     pos,
#     with_labels=True, 
#     node_size=500, 
#     node_color='lightblue', 
#     font_size=10, 
#     font_color='black', 
#     font_weight='bold', 
#     edge_color='gray'
# )
# plt.title("Loaded Graph Visualization")
# plt.show()