# GraphSAGE를 활용한 링크 예측 (DGL)

In [1]:
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import itertools
import numpy as np
import pandas as pd
import scipy.sparse as sp

Setting the default backend to "pytorch". You can change it in the ~/.dgl/config.json file or export the DGLBACKEND environment variable.  Valid options are: pytorch, mxnet, tensorflow (all lowercase)


DGL backend not selected or invalid.  Assuming PyTorch for now.


In [2]:
# Load the node table and the edge table used to build the graph.
nodes_data = pd.read_csv('../data/dgl_nodes.csv')
edges_data = pd.read_csv('../data/dgl_edges.csv')

# One directed edge per row: 'Src' -> 'Dst'.
src = edges_data['Src'].to_numpy()
dst = edges_data['Dst'].to_numpy()
g = dgl.graph((src, dst))

# Encode club membership as an integer label: 0 for 'Mr. Hi', 1 for 'Officer'.
club = nodes_data['Club'].to_list()
is_officer = [name == 'Officer' for name in club]
club = torch.tensor(is_officer).long()
# One-hot version of the same label.
club_onehot = F.one_hot(club)

# Attach both encodings to the graph as node features.
g.ndata['club'] = club
g.ndata['club_onehot'] = club_onehot


In [4]:
# Preview the first three rows of the node table.
nodes_data.head(3)

Unnamed: 0,Id,Club,Age
0,0,Mr. Hi,45
1,1,Mr. Hi,33
2,2,Mr. Hi,36


In [5]:
# Preview the first three rows of the edge table.
edges_data.head(3)

Unnamed: 0,Src,Dst,Weight
0,0,1,0.318451
1,0,2,0.551215
2,0,3,0.227416


In [6]:
# Display the graph summary (node/edge counts and the registered feature schemes).
g

Graph(num_nodes=34, num_edges=156,
      ndata_schemes={'club': Scheme(shape=(), dtype=torch.int64), 'club_onehot': Scheme(shape=(2,), dtype=torch.int64)}
      edata_schemes={})

## 모델링

### 데이터
- positive, negative 2가지 edge
- positive는 데이터에 실제로 존재하는 edge이고, negative는 그래프에 존재하지 않는(결측) 노드 쌍을 사용한다. 그래프가 작아서 학습 중 별도의 negative sampling 기법은 사용하지 않고, 결측 edge 중 일부를 미리 뽑아 고정해서 사용한다.

In [9]:
# Train/test split of the positive (existing) edges.
u, v = g.edges()
# Shuffle the edge ids so the held-out edges are chosen at random.
eids = np.random.permutation(np.arange(g.number_of_edges()))
# The first 50 shuffled ids are held out for testing; the rest are used for training.
held_out_ids, remaining_ids = eids[:50], eids[50:]
test_pos_u, test_pos_v = u[held_out_ids], v[held_out_ids]
train_pos_u, train_pos_v = u[remaining_ids], v[remaining_ids]

In [10]:
# Enumerate every negative (non-existent) edge, then draw a fixed sample of
# them and split that sample into test and train parts.
num_nodes = g.number_of_nodes()
# Pass the shape explicitly: without it scipy infers the matrix size from the
# largest index present, which would not match the identity matrix below if
# the highest-numbered nodes had no edges.
adj = sp.coo_matrix(
    (np.ones(len(u)), (u.numpy(), v.numpy())),
    shape=(num_nodes, num_nodes),
)
# Non-zero exactly where there is no edge; subtracting the identity removes
# the (i, i) self-pairs from the candidates.
adj_neg = 1 - adj.todense() - np.eye(num_nodes)
neg_u, neg_v = np.where(adj_neg != 0)
# Sample WITHOUT replacement so that no negative pair is duplicated and no
# pair ends up in both the test and the train split. This raises ValueError
# if the graph has fewer than 200 negative pairs.
neg_eids = np.random.choice(len(neg_u), 200, replace=False)
# First 50 sampled pairs are for testing, the remaining 150 for training.
test_neg_u, test_neg_v = neg_u[neg_eids[:50]], neg_v[neg_eids[:50]]
train_neg_u, train_neg_v = neg_u[neg_eids[50:]], neg_v[neg_eids[50:]]

In [11]:
def _join_pairs(pos, neg):
    """Concatenate positive endpoints followed by negative endpoints."""
    return torch.cat([torch.as_tensor(pos), torch.as_tensor(neg)])


def _binary_labels(num_pos, num_neg):
    """Return `num_pos` ones followed by `num_neg` zeros."""
    return torch.cat([torch.ones(num_pos), torch.zeros(num_neg)])


# Training set: positive pairs first, then negative pairs, labelled 1 / 0.
train_u = _join_pairs(train_pos_u, train_neg_u)
train_v = _join_pairs(train_pos_v, train_neg_v)
train_label = _binary_labels(len(train_pos_u), len(train_neg_u))

# Test set, laid out the same way.
test_u = _join_pairs(test_pos_u, test_neg_u)
test_v = _join_pairs(test_pos_v, test_neg_v)
test_label = _binary_labels(len(test_pos_u), len(test_neg_u))

In [16]:
from dgl.nn import SAGEConv

# GraphSAGE model with two layers
class GraphSAGE(nn.Module):
    """Two-layer GraphSAGE encoder over learnable per-node input embeddings.

    The model does not take external node features: every node owns a
    trainable embedding row that is fed to the first convolution.

    Args:
        in_feats: Width of the learnable node embeddings, which is also the
            input size of the first SAGEConv layer.
        h_feats: Output size of both SAGEConv layers.
        num_of_nodes: Number of nodes, i.e. number of embedding rows.
    """

    def __init__(self, in_feats, h_feats, num_of_nodes):
        super().__init__()

        # The embedding width follows `in_feats` (it was previously fixed to
        # 5) so that it always matches the input size conv1 is built with.
        node_embed = nn.Embedding(num_of_nodes, in_feats)
        # Assigning the nn.Parameter as an attribute registers it on this
        # module, so it is returned by `self.parameters()` and gets trained.
        self.inputs = node_embed.weight
        nn.init.xavier_uniform_(self.inputs)
        self.conv1 = SAGEConv(in_feats, h_feats, 'mean')
        self.conv2 = SAGEConv(h_feats, h_feats, 'mean')

    def forward(self, g):
        """Run both convolutions on graph `g` and return the second layer's output.

        The learnable embeddings in `self.inputs` are used as the input node
        features; a ReLU is applied between the two layers.
        """
        h = self.conv1(g, self.inputs)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h
    
# Instantiate the model: in_feats=5, h_feats=16, one embedding row per graph node.
net = GraphSAGE(5, 16, g.number_of_nodes())

In [18]:
# `net.parameters()` already contains the node-embedding table: GraphSAGE
# assigns the embedding weight (an nn.Parameter) to `self.inputs`, which
# registers it on the module. The embedding object itself is local to
# GraphSAGE.__init__, so referring to `node_embed` here raised NameError.
optimizer = torch.optim.Adam(net.parameters(), lr=0.01)

# NOTE(review): message passing below runs on the full graph `g`, which still
# contains the held-out positive test edges — confirm whether that is intended.
all_logits = []
for epoch in range(100):
    # Node representations for every node in the graph.
    logits = net(g)
    # Score each (u, v) pair by the dot product of its endpoint representations,
    # squashed to a probability of the edge existing.
    pred = torch.sigmoid((logits[train_u] * logits[train_v]).sum(dim=1))

    loss = F.binary_cross_entropy(pred, train_label)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # Keep a detached copy of the representations produced in this epoch.
    all_logits.append(logits.detach())

    if epoch % 5 == 0:
        print('In epoch {}, loss: {}'.format(epoch, loss))

In epoch 0, loss: 3.9016380310058594
In epoch 5, loss: 0.8958908319473267
In epoch 10, loss: 0.7144952416419983
In epoch 15, loss: 0.6871557831764221
In epoch 20, loss: 0.6322401165962219
In epoch 25, loss: 0.5926050543785095
In epoch 30, loss: 0.5619769096374512
In epoch 35, loss: 0.536224901676178
In epoch 40, loss: 0.5152873992919922
In epoch 45, loss: 0.4937884211540222
In epoch 50, loss: 0.46755242347717285
In epoch 55, loss: 0.4392321705818176
In epoch 60, loss: 0.41977453231811523
In epoch 65, loss: 0.3972312808036804
In epoch 70, loss: 0.3725888431072235
In epoch 75, loss: 0.35219869017601013
In epoch 80, loss: 0.3317413032054901
In epoch 85, loss: 0.3118561804294586
In epoch 90, loss: 0.2900824546813965
In epoch 95, loss: 0.2702104151248932
