In [1]:
import sys
sys.path.append('clustering/')
from utils import *

In [14]:
import numpy as np
import scipy.sparse as sp
import os
import json

with open(os.path.join(get_data_dir(), '2010s', 'direct_citation', 'vid_to_index.json'), 'r') as f:
    vid_to_index = json.load(f)
print("VID to index mapping loaded successfully!")

# 读取稀疏矩阵
loaded_data = np.load(os.path.join(get_data_dir(), '2010s', 'direct_citation', 'citation_matrix.npz'))
citation_matrix = sp.coo_matrix((loaded_data['data'], (loaded_data['row'], loaded_data['col'])), shape=(len(vid_to_index), len(vid_to_index)))
print("Citation matrix loaded successfully!")

data_dir:  /home/lyuzhuoqi/projects/clustering/data
VID to index mapping loaded successfully!
data_dir:  /home/lyuzhuoqi/projects/clustering/data
Citation matrix loaded successfully!


In [2]:
import numpy as np
import networkx as nx

# 生成小世界网络
num_nodes = 1000  # 节点数
k = 4  # 每个节点连接的邻居数
p = 0.1  # 重连概率
G = nx.watts_strogatz_graph(num_nodes, k, p)

# 获取邻接矩阵的COO格式
citation_matrix = nx.adjacency_matrix(G).tocoo()  # 转换为COO格式

# 打印稀疏矩阵的基本信息
print("Small-World Network Sparse Matrix (COO format):")
print(citation_matrix)

# 生成1到10之间的随机权重，赋值给稀疏矩阵的边
edge_weights = np.random.randint(1, 11, size=citation_matrix.nnz)
citation_matrix.data = edge_weights

# 输出带有权重的稀疏矩阵
print("Sparse Matrix with Edge Weights (COO format):")
print(citation_matrix)

Small-World Network Sparse Matrix (COO format):
  (0, 1)	1
  (0, 2)	1
  (0, 998)	1
  (0, 999)	1
  (1, 0)	1
  (1, 2)	1
  (1, 3)	1
  (1, 999)	1
  (2, 0)	1
  (2, 1)	1
  (2, 3)	1
  (2, 4)	1
  (3, 1)	1
  (3, 2)	1
  (3, 4)	1
  (3, 5)	1
  (4, 2)	1
  (4, 3)	1
  (4, 5)	1
  (4, 6)	1
  (5, 3)	1
  (5, 4)	1
  (5, 6)	1
  (5, 459)	1
  (6, 4)	1
  :	:
  (993, 994)	1
  (993, 995)	1
  (994, 992)	1
  (994, 993)	1
  (994, 995)	1
  (994, 996)	1
  (995, 993)	1
  (995, 994)	1
  (995, 996)	1
  (995, 997)	1
  (996, 378)	1
  (996, 923)	1
  (996, 994)	1
  (996, 995)	1
  (996, 997)	1
  (997, 80)	1
  (997, 995)	1
  (997, 996)	1
  (997, 998)	1
  (998, 0)	1
  (998, 997)	1
  (998, 999)	1
  (999, 0)	1
  (999, 1)	1
  (999, 998)	1
Sparse Matrix with Edge Weights (COO format):
  (0, 1)	2
  (0, 2)	8
  (0, 998)	4
  (0, 999)	8
  (1, 0)	6
  (1, 2)	1
  (1, 3)	1
  (1, 999)	7
  (2, 0)	8
  (2, 1)	8
  (2, 3)	10
  (2, 4)	8
  (3, 1)	4
  (3, 2)	9
  (3, 4)	9
  (3, 5)	4
  (4, 2)	5
  (4, 3)	6
  (4, 5)	9
  (4, 6)	4
  (5, 3)	3
  (5, 4)	8




In [8]:
import torch
from torch_geometric.data import Data
import numpy as np

# 转换稀疏矩阵为 PyTorch Geometric 所需的格式
# edge_index 是一个包含边的矩阵（2, E），其中 E 是边的数量
# edge_attr 是边的权重
edge_index = torch.tensor(np.array(np.nonzero(citation_matrix > 0)), dtype=torch.long)
edge_attr = torch.tensor(citation_matrix.data, dtype=torch.float)
# 构建 edge_set，存储正样本边集
edge_set = set(zip(edge_index[0].cpu().numpy(), edge_index[1].cpu().numpy()))
# 确保 edge_index 和 edge_attr 的一致性
print(f"Edge Index Shape: {edge_index.shape}")
print(f"Edge Attr Shape: {edge_attr.shape}")

# 创建 PyG 的数据对象
data = Data(edge_index=edge_index, edge_attr=edge_attr)

# 假设每个节点的特征是从标准正态分布中随机生成的，维度为 64
x = torch.randn((num_nodes, 64))  # 每个节点的特征为一个随机向量（形状为 [n, 64]）

# 将节点特征加入 Data 对象
data.x = x

# 检查 GPU 是否可用
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# 将数据转移到GPU
data = data.to(device)
edge_index = edge_index.to(device)

# 打印数据的基本信息
print(f"Data x shape: {data.x.shape}")
print(f"Data edge_index shape: {data.edge_index.shape}")

Edge Index Shape: torch.Size([2, 4000])
Edge Attr Shape: torch.Size([4000])
Using device: cuda
Data x shape: torch.Size([1000, 64])
Data edge_index shape: torch.Size([2, 4000])


In [10]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GATConv  # 使用GATConv
import random

# 定义 GAT 模型
class GAT(torch.nn.Module):
    def __init__(self, in_channels, out_channels, num_heads=8):
        super(GAT, self).__init__()
        # 使用 GATConv 层来替代 GCNConv
        self.gat_conv1 = GATConv(in_channels, out_channels, heads=num_heads)
        self.gat_conv2 = GATConv(out_channels * num_heads, out_channels, heads=1)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        # 第一层 GAT 卷积
        x = self.gat_conv1(x, edge_index)
        x = F.relu(x)
        # 第二层 GAT 卷积
        x = self.gat_conv2(x, edge_index)
        return x

# 创建 GAT 模型
model = GAT(in_channels=64, out_channels=128, num_heads=8).to(device)

# 负采样函数：生成负样本
def negative_sampling(edge_set, num_nodes, num_samples, device):
    neg_samples = []
    while len(neg_samples) < num_samples:
        u = random.randint(0, num_nodes - 1)
        v = random.randint(0, num_nodes - 1)
        if u != v:
            # 检查 u 和 v 是否在正样本的边中
            if (u, v) not in edge_set and (v, u) not in edge_set:
                neg_samples.append((u, v))

    # 将负样本转移到指定设备（GPU）
    neg_samples = torch.tensor(neg_samples, dtype=torch.long).t().contiguous().to(device)
    return neg_samples

# 训练函数
def train(data, edge_set, model, optimizer, criterion, epochs=100, num_neg_samples=None):
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        
        # 获取模型的节点嵌入
        out = model(data)

        # 获取正样本的节点对
        pos_edge_index = data.edge_index
        num_pos_samples = pos_edge_index.size(1)  # 正样本数量
        
        # 如果没有给定负样本数量，则设置为与正样本数量相同
        num_neg_samples = num_neg_samples or num_pos_samples

        # 生成负样本
        neg_edge_index = negative_sampling(edge_set, len(data.x), num_neg_samples, device)
        
        # 正样本的嵌入
        pos_emb1 = out[pos_edge_index[0]]
        pos_emb2 = out[pos_edge_index[1]]
        
        # 负样本的嵌入
        neg_emb1 = out[neg_edge_index[0]]
        neg_emb2 = out[neg_edge_index[1]]

        # 计算正样本和负样本的内积作为链接预测的相似度
        pos_score = (pos_emb1 * pos_emb2).sum(dim=1)
        neg_score = (neg_emb1 * neg_emb2).sum(dim=1)

        # 标签：1表示正样本，0表示负样本
        labels = torch.cat([torch.ones(pos_score.size(0)), torch.zeros(neg_score.size(0))], dim=0).to(device)
        scores = torch.cat([pos_score, neg_score], dim=0)

        # 计算损失
        loss = criterion(scores, labels)
        loss.backward()
        optimizer.step()
        
        print(f'Epoch {epoch + 1}/{epochs}, Loss: {loss.item():.4f}')


In [11]:
# 定义损失函数和优化器
optimizer = torch.optim.Adam(model.parameters(), lr=0.05)
criterion = torch.nn.BCEWithLogitsLoss()

# 开始训练
train(data, edge_set, model, optimizer, criterion, epochs=100)

Epoch 1/100, Loss: 0.7512
Epoch 2/100, Loss: 5145.7310
Epoch 3/100, Loss: 62.8023
Epoch 4/100, Loss: 291.1995
Epoch 5/100, Loss: 360.1705
Epoch 6/100, Loss: 331.6060
Epoch 7/100, Loss: 178.7528
Epoch 8/100, Loss: 39.0891
Epoch 9/100, Loss: 51.4761
Epoch 10/100, Loss: 62.8760
Epoch 11/100, Loss: 52.5718
Epoch 12/100, Loss: 36.9284
Epoch 13/100, Loss: 28.9054
Epoch 14/100, Loss: 26.1920
Epoch 15/100, Loss: 20.8785
Epoch 16/100, Loss: 14.5546
Epoch 17/100, Loss: 11.2020
Epoch 18/100, Loss: 11.3542
Epoch 19/100, Loss: 10.2788
Epoch 20/100, Loss: 10.1894
Epoch 21/100, Loss: 7.9771
Epoch 22/100, Loss: 6.5460
Epoch 23/100, Loss: 5.8507
Epoch 24/100, Loss: 5.2182
Epoch 25/100, Loss: 4.3573
Epoch 26/100, Loss: 4.1593
Epoch 27/100, Loss: 3.5584
Epoch 28/100, Loss: 3.0489
Epoch 29/100, Loss: 3.1759
Epoch 30/100, Loss: 2.5134
Epoch 31/100, Loss: 2.6336
Epoch 32/100, Loss: 2.2371
Epoch 33/100, Loss: 2.0251
Epoch 34/100, Loss: 2.0785
Epoch 35/100, Loss: 1.7828
Epoch 36/100, Loss: 1.7306
Epoch 37/100