# Network Data Generation

Network data: graphs with retweet and reply networks, where the vertices are accounts and the edges indicate that one account has replied to another, or retweeted a tweet from another. Think about what vertex and/or edge attributes could be useful to store.

In [17]:
import pandas as pd
import numpy as np  
import networkx as nx
import matplotlib.pyplot as plt
import json
import pickle
from datetime import datetime

## Load Data

In [None]:
# Utility function to parse ISO 8601 timestamps
def parse_created_at(ts):
    if not ts:
        return None
    # "2015-12-12T23:59:59.000Z"
    try:
        return datetime.fromisoformat(ts.replace("Z", "+00:00"))
    except Exception:
        return None

In [40]:
tweets_path = "tweets.dat"
tweets = []

with open(tweets_path, "r", encoding="utf-8") as f:
    for ln, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            continue
        try:
            tw = json.loads(line)
        except json.JSONDecodeError:
            # skip malformed lines
            continue

        # basic fields
        tweet_id = str(tw.get("id", "")).strip()
        author_id = str(tw.get("author_id", "")).strip()
        created_at = parse_created_at(tw.get("created_at"))
        lang = tw.get("lang")
        
        # interaction fields
        referenced_tweets = tw.get("referenced_tweets")
        in_reply_to_user_id = tw.get("in_reply_to_user_id")

        # public metrics
        metrics = tw.get("public_metrics") or {}
        # retweets = metrics.get("retweet_count")
        # replies = metrics.get("reply_count")
        # likes = metrics.get("like_count")
        # quotes = metrics.get("quote_count")

        
        # gather tweet record
        tweets.append({
            "id": tweet_id,
            "author_id": author_id,
            "created_at": created_at,
            "date": created_at.date().isoformat() if created_at else None,
            "lang": lang,
            
            "referenced_tweets": referenced_tweets,
            "in_reply_to_user_id": in_reply_to_user_id,
            
            # "retweet_count": retweets,
            # "reply_count": replies,
            # "like_count": likes,
            # "quote_count": quotes,
            "public_metrics": metrics # 保留完整的 metrics 字典，方便后续操作
        })

all_records = pd.DataFrame(tweets)

In [24]:
all_records.head()

Unnamed: 0,id,author_id,created_at,date,lang,referenced_tweets,in_reply_to_user_id,public_metrics
0,675827469119832066,1011975294,2015-12-12 23:59:59+00:00,2015-12-12,en,"[{'type': 'retweeted', 'id': '6757779674700390...",,"{'retweet_count': 107, 'reply_count': 0, 'like..."
1,675827469006581760,255144027,2015-12-12 23:59:59+00:00,2015-12-12,en,"[{'type': 'retweeted', 'id': '6757815547450572...",,"{'retweet_count': 109, 'reply_count': 0, 'like..."
2,675827468775718912,214748274,2015-12-12 23:59:59+00:00,2015-12-12,en,"[{'type': 'retweeted', 'id': '6757487202442977...",,"{'retweet_count': 1204, 'reply_count': 0, 'lik..."
3,675827465378504705,449273927,2015-12-12 23:59:58+00:00,2015-12-12,en,"[{'type': 'retweeted', 'id': '6757646325990440...",,"{'retweet_count': 110, 'reply_count': 0, 'like..."
4,675827465336434688,1601937732,2015-12-12 23:59:58+00:00,2015-12-12,en,"[{'type': 'retweeted', 'id': '6757735596884541...",,"{'retweet_count': 49, 'reply_count': 0, 'like_..."


In [45]:
all_records[50:100]

Unnamed: 0,id,author_id,created_at,date,lang,referenced_tweets,in_reply_to_user_id,public_metrics
50,675827382591184896,3181309406,2015-12-12 23:59:38+00:00,2015-12-12,fr,"[{'type': 'retweeted', 'id': '6757764980543815...",,"{'retweet_count': 85, 'reply_count': 0, 'like_..."
51,675827381941239808,2865992597,2015-12-12 23:59:38+00:00,2015-12-12,en,"[{'type': 'retweeted', 'id': '6757519537590804...",,"{'retweet_count': 160, 'reply_count': 0, 'like..."
52,675827377893715968,381440854,2015-12-12 23:59:37+00:00,2015-12-12,fr,"[{'type': 'retweeted', 'id': '6757459691416002...",,"{'retweet_count': 1621, 'reply_count': 0, 'lik..."
53,675827377394569216,3413674288,2015-12-12 23:59:37+00:00,2015-12-12,es,"[{'type': 'retweeted', 'id': '6758271067277967...",,"{'retweet_count': 52, 'reply_count': 0, 'like_..."
54,675827376274604035,35806640,2015-12-12 23:59:37+00:00,2015-12-12,en,"[{'type': 'retweeted', 'id': '6757617499084308...",,"{'retweet_count': 44, 'reply_count': 0, 'like_..."
55,675827371279319040,164695435,2015-12-12 23:59:36+00:00,2015-12-12,en,,,"{'retweet_count': 3, 'reply_count': 0, 'like_c..."
56,675827371207979009,2995837649,2015-12-12 23:59:36+00:00,2015-12-12,qme,,,"{'retweet_count': 0, 'reply_count': 0, 'like_c..."
57,675827368242515968,2171487918,2015-12-12 23:59:35+00:00,2015-12-12,en,"[{'type': 'replied_to', 'id': '675765926143221...",2390381622.0,"{'retweet_count': 0, 'reply_count': 0, 'like_c..."
58,675827360357199872,2429536974,2015-12-12 23:59:33+00:00,2015-12-12,en,"[{'type': 'retweeted', 'id': '6757238756201226...",,"{'retweet_count': 228, 'reply_count': 0, 'like..."
59,675827360181145601,229092117,2015-12-12 23:59:33+00:00,2015-12-12,fr,,968080993.0,"{'retweet_count': 1, 'reply_count': 0, 'like_c..."


In [26]:
len(all_records)

2260916

In [25]:
accounts = pd.read_csv(r"accounts.tsv", sep='\t', dtype={'author_id': str})
accounts.head()

Unnamed: 0,author_id,Type,Lang,Stance
0,8508262,Private individuals,fr,For
1,3297659759,Advocacy actors,es,For
2,1351436889316683778,Journalistic actors,en,For
3,259352661,Advocacy actors,en,For
4,17158610,Advocacy actors,en,For


## Network Graph

* Nodes(accounts.tsv): accounts，metrics: Type	Lang	Stance

* Edges(tweets.dat): tweets, type: reply / retweet，metrics: public_metrics - 'retweet_count', 'reply_count', 'like_count', 'quote_count'

In [41]:
## initialize graph
G = nx.MultiDiGraph()

## Add nodes
known_user_ids = set(accounts['author_id'].unique()) # author IDs in accounts dataset

node_list = [
    (row['author_id'], {'user_type': row['Type'], 'lang': row['Lang'], 'stance': row['Stance']}) 
    for idx, row in accounts.iterrows()
]
G.add_nodes_from(node_list)


## Add edges
# prepare mapping dataframe
tweet_author_df = all_records[['id', 'author_id']].rename(columns={'id': 'target_tweet_id', 'author_id': 'target_author_id'})

retweet_edges_data = []
reply_edges_data = []

# extract data from original records
for idx, row in all_records.iterrows():
    focal_node = row['author_id']
    
    # skip if focal_node not in known_user_ids
    if focal_node not in known_user_ids:
        continue
        
    public_metrics = row['public_metrics']
    edge_attrs = {
        'retweet_count': public_metrics['retweet_count'],
        'reply_count': public_metrics['reply_count'],
        'like_count': public_metrics['like_count'],
        'quote_count': public_metrics['quote_count']
    }
    
    # extract retweet edges
    if isinstance(row['referenced_tweets'], list):
        for ref_tweet in row['referenced_tweets']:
            if ref_tweet.get('type') == 'retweeted':
                # gather data for batch processing
                retweet_edges_data.append({
                    'source_author_id': focal_node,
                    'target_tweet_id': ref_tweet['id'], # JOIN key
                    **edge_attrs
                })

    # extract reply edges
    target_node = row.get('in_reply_to_user_id')
    if target_node and target_node in known_user_ids: # check if target user is in known_user_ids
        attrs = {'type': 'reply', **edge_attrs}
        reply_edges_data.append((focal_node, target_node, attrs))


# map retweet edges to author IDs in batch
if retweet_edges_data:
    retweet_df = pd.DataFrame(retweet_edges_data)
    
    # Merge: map target tweet IDs to author IDs
    merged_retweet_df = pd.merge(
        retweet_df, 
        tweet_author_df, 
        on='target_tweet_id', 
        how='left'
    )
    
    # ensure target author ID exists
    final_retweet_edges = merged_retweet_df[merged_retweet_df['target_author_id'].notna()]
    
    # ensure author IDs are in accounts dataset
    final_retweet_edges = final_retweet_edges[
        final_retweet_edges['target_author_id'].isin(known_user_ids)
    ]
    
    # add edges to graph
    batch_retweet_edges = []
    for _, row in final_retweet_edges.iterrows():
        u = row['source_author_id']
        v = row['target_author_id']
        
        # extract edge attributes
        attrs = row.drop(['source_author_id', 'target_tweet_id', 'target_author_id']).to_dict()
        attrs['type'] = 'retweet' 
        
        batch_retweet_edges.append((u, v, attrs))
    
    G.add_edges_from(batch_retweet_edges)

# add reply edges to graph
G.add_edges_from(reply_edges_data)

print(f"Successfully constructed the graph G.")
print(f"Nodes: {G.number_of_nodes()}")
print(f"Edges: {G.number_of_edges()}")

Successfully constructed the graph G.
Nodes: 1936
Edges: 23220


In [31]:
G.nodes(data=True)

NodeDataView({'8508262': {'user_type': 'Private individuals', 'lang': 'fr', 'stance': 'For'}, '3297659759': {'user_type': 'Advocacy actors', 'lang': 'es', 'stance': 'For'}, '1351436889316683778': {'user_type': 'Journalistic actors', 'lang': 'en', 'stance': 'For'}, '259352661': {'user_type': 'Advocacy actors', 'lang': 'en', 'stance': 'For'}, '17158610': {'user_type': 'Advocacy actors', 'lang': 'en', 'stance': 'For'}, '23033425': {'user_type': 'Advocacy actors', 'lang': 'en', 'stance': 'Unclear'}, '955903212': {'user_type': 'Advocacy actors', 'lang': 'en', 'stance': 'For'}, '2482232082': {'user_type': 'Political actors', 'lang': 'es', 'stance': 'Unclear'}, '854703259776409600': {'user_type': 'Advocacy actors', 'lang': 'en', 'stance': 'For'}, '832970418': {'user_type': 'Advocacy actors', 'lang': 'en', 'stance': 'For'}, '915990619603869696': {'user_type': 'Advocacy actors', 'lang': 'en', 'stance': 'For'}, '771238809912520704': {'user_type': 'Advocacy actors', 'lang': 'de', 'stance': 'For'}

In [37]:
G.edges(data=False)

OutMultiEdgeDataView([('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8

In [42]:
# check parallel edges between two nodes
u = '8508262'
v = '88047464'

print(f"Edges between {u} and {v}:")

# require G.get_edge_data(u, v) to return a dict where key is the unique identifier of the edge
edge_dict = G.get_edge_data(u, v)

if edge_dict:
    for key, data in edge_dict.items():
        print(f"Key: {key}, Type: {data.get('type')}, Metrics: {data.get('retweet_count')}, ...")
else:
    print("No edges found.")

Edges between 8508262 and 88047464:
Key: 0, Type: retweet, Metrics: 8, ...
Key: 1, Type: retweet, Metrics: 12, ...
Key: 2, Type: retweet, Metrics: 51, ...
Key: 3, Type: retweet, Metrics: 9, ...
Key: 4, Type: retweet, Metrics: 17, ...
Key: 5, Type: retweet, Metrics: 14, ...
Key: 6, Type: retweet, Metrics: 6, ...
Key: 7, Type: retweet, Metrics: 15, ...
Key: 8, Type: retweet, Metrics: 3, ...
Key: 9, Type: retweet, Metrics: 4, ...
Key: 10, Type: retweet, Metrics: 4, ...
Key: 11, Type: retweet, Metrics: 9, ...
Key: 12, Type: retweet, Metrics: 6, ...
Key: 13, Type: retweet, Metrics: 3, ...
Key: 14, Type: retweet, Metrics: 4, ...
Key: 15, Type: retweet, Metrics: 3, ...
Key: 16, Type: retweet, Metrics: 19, ...
Key: 17, Type: retweet, Metrics: 15, ...
Key: 18, Type: retweet, Metrics: 3, ...
Key: 19, Type: retweet, Metrics: 3, ...
Key: 20, Type: retweet, Metrics: 4, ...
Key: 21, Type: retweet, Metrics: 5, ...
Key: 22, Type: retweet, Metrics: 7, ...
Key: 23, Type: retweet, Metrics: 8, ...
Key: 24

In [43]:
# save graph data
file_path_pkl = 'network_data.pkl'
with open(file_path_pkl, 'wb') as f: # 注意 'wb' (写入二进制)
    pickle.dump(G, f)

In [None]:
# Load graph data
import pickle
file_path_pkl = 'network_data.pkl'
with open(file_path_pkl, 'rb') as f: # 注意 'rb' (读取二进制)
    G_loaded_pkl = pickle.load(f)

In [8]:
G_loaded_pkl.nodes

NodeView(('8508262', '3297659759', '1351436889316683778', '259352661', '17158610', '23033425', '955903212', '2482232082', '854703259776409600', '832970418', '915990619603869696', '771238809912520704', '3119820711', '1497787496', '1447525740421976071', '37271053', '1453753755321708547', '1517801801864593409', '401371394', '1031506566355341312', '2823973398', '77254498', '2898075617', '972177106699472896', '1123591446160973825', '478556898', '2704725908', '4696381484', '2374469136', '1198920176072298496', '2803368067', '1369576148540157954', '49155241', '819312344543166465', '3971883682', '1423477933402849282', '2649991', '910160900497297408', '14266598', '2518143973', '569880272', '1161326804759531520', '1426970262', '4502695341', '752785900078235648', '256572677', '1593463831', '718159641721630721', '3026289274', '173972286', '881933605', '3232924232', '3373831665', '108648075', '2749145822', '156322365', '250849122', '44136279', '4264616105', '1270435562', '127296681', '92707696150307

In [9]:
G_loaded_pkl.edges()

OutMultiEdgeDataView([('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8508262', '88047464'), ('8

In [None]:
## plot the network graph
# plt.figure(figsize=(12,12))
# pos = nx.spring_layout(G_loaded_pkl, k=0.15, iterations=20) # 使用 G_loaded_pkl 而不是旧的 G


# nx.draw(
#     G_loaded_pkl, 
#     pos,
#     with_labels=True, 
#     node_size=500, 
#     node_color='lightblue', 
#     font_size=10, 
#     font_color='black', 
#     font_weight='bold', 
#     edge_color='gray'
# )
# plt.title("Loaded Graph Visualization")
# plt.show()