In [None]:
import numpy as np
np.random.seed(0)
import torch
torch.manual_seed(0)
import matplotlib.pyplot as plt
import torch_geometric.transforms as T # PyG의 그래프 전처리를 담당
from torch_geometric.datasets import Planetoid

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
transform = T.Compose([ # 여러 전처리 함수들을 묶어서 한번에 적용
    T.NormalizeFeatures(),
    T.ToDevice(device),
    T.RandomLinkSplit(num_val=0.05,num_test=0.1,is_undirected=True,split_labels=True,add_negative_train_samples=False)
])
# 링크 예측할 때는 전처리가 표준적임

In [6]:
dataset = Planetoid(root='.',name='Cora',transform=transform)

Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.test.index
Processing...
Done!


In [7]:
dataset[0]

(Data(x=[2708, 1433], edge_index=[2, 8976], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708], pos_edge_label=[4488], pos_edge_label_index=[2, 4488]),
 Data(x=[2708, 1433], edge_index=[2, 8976], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708], pos_edge_label=[263], pos_edge_label_index=[2, 263], neg_edge_label=[263], neg_edge_label_index=[2, 263]),
 Data(x=[2708, 1433], edge_index=[2, 9502], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708], pos_edge_label=[527], pos_edge_label_index=[2, 527], neg_edge_label=[527], neg_edge_label_index=[2, 527]))

In [11]:
train_data, val_data, test_data = dataset[0]

In [19]:
from torch_geometric.nn import GCNConv, VGAE

class Encoder(torch.nn.Module):
    def __init__(self,dim_in,dim_out):
        super().__init__()
        self.conv1 = GCNConv(dim_in,2*dim_out)
        self.conv_mu = GCNConv(2*dim_out,dim_out)
        self.conv_logstd = GCNConv(2*dim_out,dim_out)

    def forward(self,x,edge_index):
        x = self.conv1(x,edge_index).relu()
        return self.conv_mu(x,edge_index),self.conv_logstd(x,edge_index)

In [20]:
model = VGAE(Encoder(dataset.num_features,16)).to(device)
optimizer = torch.optim.Adam(model.parameters(),lr=0.01)

In [21]:
print(model)

VGAE(
  (encoder): Encoder(
    (conv1): GCNConv(1433, 32)
    (conv_mu): GCNConv(32, 16)
    (conv_logstd): GCNConv(32, 16)
  )
  (decoder): InnerProductDecoder()
)


In [22]:
def train():
    model.train()
    optimizer.zero_grad()
    z = model.encode(train_data.x,train_data.edge_index)
    loss = model.recon_loss(z,train_data.pos_edge_label_index) + (1 / train_data.num_nodes) * model.kl_loss()
    loss.backward()
    optimizer.step()
    return float(loss)

In [23]:
@torch.no_grad()
def test(data):
    model.eval()
    z = model.encode(data.x,data.edge_index)
    return model.test(z,data.pos_edge_label_index,data.neg_edge_label_index)

In [24]:
for epoch in range(301):
    loss = train()
    val_auc, val_ap = test(val_data)
    if epoch%50==0:
        print(f'Epoch : {epoch:>2} | Loss : {loss:.4f} | Val_AUC : {val_auc:.4f} | Val_AP : {val_ap:.4f}')

Epoch :  0 | Loss : 3.4739 | Val_AUC : 0.6651 | Val_AP : 0.6776
Epoch : 50 | Loss : 1.3303 | Val_AUC : 0.6397 | Val_AP : 0.6627
Epoch : 100 | Loss : 1.1553 | Val_AUC : 0.7237 | Val_AP : 0.7203
Epoch : 150 | Loss : 1.1107 | Val_AUC : 0.7352 | Val_AP : 0.7329
Epoch : 200 | Loss : 0.9996 | Val_AUC : 0.8336 | Val_AP : 0.8208
Epoch : 250 | Loss : 0.9542 | Val_AUC : 0.8632 | Val_AP : 0.8590
Epoch : 300 | Loss : 0.9525 | Val_AUC : 0.8743 | Val_AP : 0.8721


In [25]:
test_auc, test_ap = test(test_data)
print(f'Test_AUC : {val_auc:.4f} | Test_AP : {val_ap:.4f}')

Test_AUC : 0.8743 | Test_AP : 0.8721


In [None]:
z = model.encode(test_data.x,test_data.edge_index) # Z는 노드 임베딩
Ahat = torch.sigmoid(z @ z.T)
print(Ahat) # 노드 쌍이 연결된 강도, 확률, 가중치

tensor([[0.7769, 0.6252, 0.7670,  ..., 0.5609, 0.8333, 0.7781],
        [0.6252, 0.8309, 0.8431,  ..., 0.5446, 0.6965, 0.6479],
        [0.7670, 0.8431, 0.9042,  ..., 0.5950, 0.8549, 0.7964],
        ...,
        [0.5609, 0.5446, 0.5950,  ..., 0.5456, 0.6046, 0.5855],
        [0.8333, 0.6965, 0.8549,  ..., 0.6046, 0.9012, 0.8498],
        [0.7781, 0.6479, 0.7964,  ..., 0.5855, 0.8498, 0.7963]],
       grad_fn=<SigmoidBackward0>)


SEAL

In [None]:
import numpy as np
from sklearn.metrics import roc_auc_score, average_precision_score # ML 평가지표
from scipy.sparse.csgraph import shortest_path # 희소 행렬에서 최단거리 계산

import torch
import torch.nn.functional as F
from torch.nn import Conv1d,MaxPool1d,Linear,Dropout,BCEWithLogitsLoss 

from torch_geometric.datasets import Planetoid
from torch_geometric.transforms import RandomLinkSplit
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv, aggr
from torch_geometric.utils import k_hop_subgraph, to_scipy_sparse_matrix
# k-hop : 누 노드 기준 k-hop 이웃으로 구성된 서브그래프 생성
# to_scipy : PyG의 엣지 인덱스를 scipy의 sparse_matrix로 변환

In [None]:
transform = RandomLinkSplit(num_val=0.05,num_test=0.1,is_undirected=True,split_labels=True)
# 이웃 기반이므로, 노드를 정규화할 필요 x
dataset = Planetoid(root='.',name='Cora',transform=transform) 
train_data,val_data,test_data = dataset[0]

In [36]:
print(train_data)

Data(x=[2708, 1433], edge_index=[2, 8976], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708], pos_edge_label=[4488], pos_edge_label_index=[2, 4488], neg_edge_label=[4488], neg_edge_label_index=[2, 4488])


In [62]:
def seal_processing(dataset,edge_label_index,y):
    data_list = []
    for src,dst in edge_label_index.T.tolist(): # numpy & Tensor를 list로 변환
        sub_nodes, sub_edge_index, mapping, _ = k_hop_subgraph([src,dst],2,dataset.edge_index,relabel_nodes=True)
        # (src,dst)를 기준으로 2홉 이웃을 가져오고, 리라벨링을 한다.
        # sub_nodes : 서브 그래프 노드 집합
        # sub_edge_index : 서브그래프 기준 새 번호로 엣지 구성
        # mapping : (src,dst)가 서브그래프 기준으로 어떤 인덱스로 바뀌었는지
        # _ = edge_mask = 원래 edge_index 중 어떤 엣지가 서브그래프에 포함됐는지
        src,dst = mapping.tolist()
        mask1 = (sub_edge_index[0]!=src) | (sub_edge_index[1]!=dst)
        mask2 = (sub_edge_index[0]!=dst) | (sub_edge_index[1]!=src)
        sub_edge_index = sub_edge_index[:,mask1&mask2] # 모든 행 기준으로 열 조건을 만족하는 것만 추출

        src,dst = (dst,src) if src>dst else (src,dst)
        adj = to_scipy_sparse_matrix(sub_edge_index,num_nodes=sub_nodes.size(0)).tocsr() # 행렬 생성

        idx = list(range(src)) + list(range(src+1,adj.shape[0])) #src를 제외한 인덱싱
        adj_wo_src = adj[idx,:][:,idx] #제외한 행렬

        idx = list(range(dst)) + list(range(dst+1,adj.shape[0])) #dst를 제외한 인덱싱
        adj_wo_dst = adj[idx,:][:,idx] #제외한 행렬

        d_src = shortest_path(adj_wo_dst, directed=False,unweighted=True,indices = src) # indices = 출발점
        #dst를 제거한 행렬에서, src와의 거리
        d_src = np.insert(d_src,dst,0,axis=0) # 행렬의 dst 위치에 0 삽입
        d_src = torch.from_numpy(d_src)

        d_dst = shortest_path(adj_wo_src, directed=True,unweighted=True,indices= dst-1) # 출발노드가 도착노드보다 앞서있으므로 -1
        d_dst = np.insert(d_dst,src,0,axis=0)
        d_dst = torch.from_numpy(d_dst)

        # 공식에 맞게 노드레이블 계산
        dist = d_src + d_dst
        z = 1+ torch.min(d_src,d_dst) + dist // 2 * (dist // 2 + dist % 2 - 1)
        z[src], z[dst], z[torch.isnan(z)] = 1.,1.,0. # 중심 노드는 1로 라벨링.
        z = z.to(torch.long) # 정수로 표현 (int보다 크게)

        node_labels = F.one_hot(z,num_classes=200).to(torch.float)
        node_emb = dataset.x[sub_nodes]
        node_x = torch.cat([node_emb,node_labels],dim=1)

        data = Data(node_x,z=z,edge_index=sub_edge_index,y=y)
        data_list.append(data)

    return data_list


In [63]:
train_pos_data_list = seal_processing(train_data,train_data.pos_edge_label_index,1)
train_neg_data_list = seal_processing(train_data,train_data.neg_edge_label_index,0)

val_pos_data_list = seal_processing(val_data,val_data.pos_edge_label_index,1)
val_neg_data_list = seal_processing(val_data,val_data.neg_edge_label_index,0)

test_pos_data_list = seal_processing(test_data,test_data.pos_edge_label_index,1)
test_neg_data_list = seal_processing(test_data,test_data.neg_edge_label_index,0)

In [65]:
train_dataset = train_pos_data_list + train_neg_data_list
val_dataset = val_pos_data_list + val_neg_data_list
test_dataset = test_pos_data_list + test_neg_data_list

In [67]:
train_loader = DataLoader(train_dataset,batch_size=32,shuffle=True)
val_loader = DataLoader(val_dataset,batch_size=32,shuffle=True)
test_loader = DataLoader(test_dataset,batch_size=32,shuffle=True)

In [76]:
class DGCNN(torch.nn.Module):
    def __init__(self,dim_in,k=30):
        super().__init__()
        self.gcn1 = GCNConv(dim_in,32)
        self.gcn2 = GCNConv(32,32)
        self.gcn3 = GCNConv(32,32)
        self.gcn4 = GCNConv(32,1)

        self.global_pool = aggr.SortAggregation(k=k) # 중요도가 높은 노드 선택

        self.conv1 = Conv1d(1,16,97,97)
        self.conv2 = Conv1d(16,32,5,1)
        self.maxpool = MaxPool1d(2,2)

        self.linear1 = Linear(352,128)
        self.dropout = Dropout(0.5)
        self.linear2 = Linear(128,1)

    def forward(self,x,edge_index,batch):
        h1 = self.gcn1(x,edge_index).tanh()
        h2 = self.gcn2(h1,edge_index).tanh()
        h3 = self.gcn3(h2,edge_index).tanh()
        h4 = self.gcn4(h3,edge_index).tanh()
        h = torch.cat([h1,h2,h3,h4],dim=-1)

        h = self.global_pool(h,batch)
        h = h.view(h.size(0),1,h.size(-1)) # conv1d 형태로 맞춰주기.
        h = self.conv1(h).relu()
        h = self.maxpool(h)
        h = self.conv2(h).relu()
        h = h.view(h.size(0),-1) # 결과를 1차원으로 펼치고, 선형함수에 넣기 위함.
        h = self.linear1(h).relu()
        h = self.dropout(h)
        h = self.linear2(h).sigmoid()
        return h

In [77]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = DGCNN(train_dataset[0].num_features).to(device)
optimizer = torch.optim.Adam(model.parameters(),lr=0.0001)
criterion = BCEWithLogitsLoss()

In [78]:
def train():
    model.train()
    total_loss = 0

    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        out = model(data.x,data.edge_index,data.batch)
        loss = criterion(out.view(-1),data.y.to(torch.float)) 
        loss.backward()
        optimizer.step()
        total_loss+=float(loss) * data.num_graphs
    
    return total_loss / len(train_dataset)


In [79]:
@torch.no_grad()
def test(loader):
    y_pred,y_true = [],[]

    for data in loader:
        data = data.to(device)
        out = model(data.x,data.edge_index,data.batch)
        y_pred.append(out.view(-1).cpu())
        y_true.append(data.y.view(-1).cpu().to(torch.float))
    
    acc = roc_auc_score(torch.cat(y_true),torch.cat(y_pred))
    ap = average_precision_score(torch.cat(y_true),torch.cat(y_pred))

    return acc,ap

In [80]:
for epoch in range(31):
    loss = train()
    val_auc, val_ap = test(val_loader)
    print(f'Epoch : {epoch:>2} | Loss : {loss:.4f} | val_auc : {val_auc:.4f} | val_ap : {val_ap:.4f}')

Epoch :  0 | Loss : 0.7003 | val_auc : 0.8085 | val_ap : 0.7864
Epoch :  1 | Loss : 0.6235 | val_auc : 0.8574 | val_ap : 0.8670
Epoch :  2 | Loss : 0.5984 | val_auc : 0.8627 | val_ap : 0.8807
Epoch :  3 | Loss : 0.5962 | val_auc : 0.8686 | val_ap : 0.8850
Epoch :  4 | Loss : 0.5947 | val_auc : 0.8549 | val_ap : 0.8729
Epoch :  5 | Loss : 0.5956 | val_auc : 0.8450 | val_ap : 0.8555
Epoch :  6 | Loss : 0.5984 | val_auc : 0.8289 | val_ap : 0.8532
Epoch :  7 | Loss : 0.6018 | val_auc : 0.8088 | val_ap : 0.8355
Epoch :  8 | Loss : 0.5995 | val_auc : 0.8048 | val_ap : 0.8242
Epoch :  9 | Loss : 0.5964 | val_auc : 0.8339 | val_ap : 0.8430
Epoch : 10 | Loss : 0.5938 | val_auc : 0.8189 | val_ap : 0.8248
Epoch : 11 | Loss : 0.5914 | val_auc : 0.8329 | val_ap : 0.8317
Epoch : 12 | Loss : 0.5908 | val_auc : 0.8419 | val_ap : 0.8342
Epoch : 13 | Loss : 0.5894 | val_auc : 0.8481 | val_ap : 0.8423
Epoch : 14 | Loss : 0.5866 | val_auc : 0.8408 | val_ap : 0.8349
Epoch : 15 | Loss : 0.5854 | val_auc : 0

In [81]:
test_auc, test_ap = test(test_loader)
print(f'Test_acu : {test_auc:.4f} | test_ap : {test_ap:.4f}')

Test_acu : 0.8381 | test_ap : 0.8281
