# Network Data Generation

Network data: graphs with retweet and reply networks, where the vertices are accounts and the edges indicate that one account has replied to another, or retweeted a tweet from another. Think about what vertex and/or edge attributes could be useful to store.

In [None]:
import pandas as pd
import numpy as np  
import networkx as nx
import matplotlib.pyplot as plt
import json
import pickle

## 加载数据

In [None]:
## Read every record from tweets.dat (one JSON object per line).
all_records = []
with open(r"tweets.dat", 'r', encoding='utf-8') as f:
    # Number of records parsed successfully so far.
    record_count = 0
    # enumerate() tracks the physical line number so that a warning points at
    # the real offending line even after blank or malformed lines were skipped.
    for line_no, line in enumerate(f, start=1):
        # Ignore empty or whitespace-only lines.
        if not line.strip():
            continue
        try:
            # Each non-blank line is parsed as a standalone JSON document.
            record = json.loads(line)
        except json.JSONDecodeError as e:
            # Best effort: report the bad line and keep going.
            print(f"⚠️ 警告: 第 {line_no} 行 JSON 解码失败，跳过。错误: {e}")
            continue
        all_records.append(record)
        record_count += 1


In [None]:
# Turn the list of parsed records into a DataFrame, rebinding the same name.
all_records = pd.DataFrame(all_records)
# Preview the first rows.
all_records.head()

In [None]:
# Number of rows currently held in all_records.
len(all_records)

In [None]:
# Load the account table from a tab-separated file; author_id is forced to str.
# NOTE(review): presumably so ids compare equal to the ids found in tweets.dat — confirm.
accounts = pd.read_csv(r"accounts.tsv", sep='\t', dtype={'author_id': str})
# Preview the first rows.
accounts.head()

In [None]:
# Inspect the author_id stored under index label 0.
accounts['author_id'][0]

## 构建网络图

* 节点(accounts.tsv)：账户，属性：Type、Lang、Stance（在图中分别存为 `user_type`、`lang`、`stance`）

* 边(tweets.dat)：回复（reply） / 转发（retweet），方向为发起账户 → 被回复/被转发的账户；属性：`type`（'reply' 或 'retweet'）以及 public_metrics 中的 'retweet_count', 'reply_count', 'like_count', 'quote_count'

In [None]:
# ----------------------------------------------------
# Stage 1: node initialisation (based on accounts only)
# ----------------------------------------------------
G = nx.MultiDiGraph()
# Set of known account ids, used for fast membership checks when filtering edges.
known_user_ids = set(accounts['author_id'].unique())

# One (node_id, attribute dict) pair per account row.
node_list = []
for _, account in accounts.iterrows():
    node_attrs = {
        'user_type': account['Type'],
        'lang': account['Lang'],
        'stance': account['Stance'],
    }
    node_list.append((account['author_id'], node_attrs))
G.add_nodes_from(node_list)


# ----------------------------------------------------
# Stage 2 & 3: edge extraction and preparation for batch matching
# ----------------------------------------------------

# Lookup table (tweet id -> author id); joined later against the tweet ids
# referenced by retweets.
tweet_author_df = all_records[['id', 'author_id']].rename(
    columns={'id': 'target_tweet_id', 'author_id': 'target_author_id'}
)

retweet_edges_data = []
reply_edges_data = []

# Metrics copied from each tweet's public_metrics onto every edge it produces.
metric_keys = ('retweet_count', 'reply_count', 'like_count', 'quote_count')

for _, tweet in all_records.iterrows():
    source_id = tweet['author_id']

    # Tweets whose author is not a known account contribute no edges.
    if source_id not in known_user_ids:
        continue

    metrics = tweet['public_metrics']
    metric_attrs = {key: metrics[key] for key in metric_keys}

    # --- A. Retweet relations ---
    # The author of the referenced tweet is not known yet, so only the
    # referenced tweet id is recorded here; it is resolved by a later merge.
    references = tweet['referenced_tweets']
    if isinstance(references, list):
        retweet_edges_data.extend(
            {
                'source_author_id': source_id,
                'target_tweet_id': ref['id'],
                **metric_attrs,
            }
            for ref in references
            if ref.get('type') == 'retweeted'
        )

    # --- B. Reply relations ---
    # Only keep replies whose target is itself a known account.
    replied_user = tweet.get('in_reply_to_user_id')
    if replied_user and replied_user in known_user_ids:
        reply_edges_data.append(
            (source_id, replied_user, {'type': 'reply', **metric_attrs})
        )


# ----------------------------------------------------
# Stage 4: resolve retweet targets in bulk (left join via pandas merge)
# ----------------------------------------------------

if retweet_edges_data:
    retweet_df = pd.DataFrame(retweet_edges_data)

    # A left merge emits one output row per matching right-hand row, so a
    # tweet id that occurs more than once in the lookup would turn a single
    # retweet into several parallel edges. Keep one author row per tweet id.
    unique_tweet_authors = tweet_author_df.drop_duplicates(subset='target_tweet_id')

    # Attach the author of each referenced tweet to the retweet record.
    merged_retweet_df = pd.merge(
        retweet_df,
        unique_tweet_authors,
        on='target_tweet_id',
        how='left'
    )

    # Keep a retweet only if (1) the referenced tweet was found, i.e. the merge
    # produced an author id, and (2) that author is one of the known accounts.
    target_authors = merged_retweet_df['target_author_id']
    final_retweet_edges = merged_retweet_df[
        target_authors.notna() & target_authors.isin(known_user_ids)
    ]

    # Every column except the join/endpoint columns becomes an edge attribute.
    # Dropping the columns once and exporting all rows in one call avoids
    # building and trimming a Series for every single edge.
    attr_records = final_retweet_edges.drop(
        columns=['source_author_id', 'target_tweet_id', 'target_author_id']
    ).to_dict(orient='records')

    # Edge direction: retweeting account -> author of the retweeted tweet.
    batch_retweet_edges = [
        (u, v, {**attrs, 'type': 'retweet'})
        for u, v, attrs in zip(
            final_retweet_edges['source_author_id'],
            final_retweet_edges['target_author_id'],
            attr_records,
        )
    ]

    G.add_edges_from(batch_retweet_edges)


# ----------------------------------------------------
# Stage 5: add the reply edges in bulk and report the graph size
# ----------------------------------------------------
G.add_edges_from(reply_edges_data)

# Summary: completion banner, node count, edge count (one edge per interaction).
summary_lines = [
    "✅ 图网络构建完成。",
    f"节点数: {G.number_of_nodes()}",
    f"边数 (交互次数): {G.number_of_edges()}",
]
print("\n".join(summary_lines))

In [None]:
# Show the edges reported for node "1314870421" (spot check of one account).
G.edges("1314870421")

In [None]:
# 存储图数据
file_path_pkl = 'network_data.pkl'
with open(file_path_pkl, 'wb') as f: # 注意 'wb' (写入二进制)
    pickle.dump(G, f)

In [None]:
# 读取图数据
with open(file_path_pkl, 'rb') as f: # 注意 'rb' (读取二进制)
    G_loaded_pkl = pickle.load(f)

In [None]:
# Display the nodes of the in-memory graph.
G.nodes()

In [None]:
# Display the nodes of the graph reloaded from disk, for comparison with G.
G_loaded_pkl.nodes

In [None]:
# Number of edges in the reloaded graph.
len(G_loaded_pkl.edges())

In [None]:
# # Plot the graph (disabled)
# plt.figure(figsize=(12,12))
# pos = nx.spring_layout(G_loaded_pkl, k=0.15, iterations=20) # use G_loaded_pkl rather than the old G

# # Draw
# nx.draw(
#     G_loaded_pkl,
#     pos,
#     with_labels=True,
#     node_size=500,
#     node_color='lightblue',
#     font_size=10,
#     font_color='black',
#     font_weight='bold',
#     edge_color='gray'
# )
# plt.title("Loaded Graph Visualization")
# plt.show()