# Link prediction - aleatório com random baselines

In [6]:
import numpy as np
import pandas as pd

dfEdgesToEvaluate = pd.read_csv('edgesToEvaluate.csv')

random_predictions = np.random.choice([0, 1], size=dfEdgesToEvaluate.shape[0])
dfEdgesToEvaluate['link'] = random_predictions

dfEdgesToEvaluate

Unnamed: 0,linkID,venue1,venue2,link
0,9,mJ_ucQ2_3hfTsmCcKb-hgw,qXGKYRwCR9SLgLl0g_9o5g,0
1,135,y19xFolCozaRA-gGmHwkQA,F6c3D1o9Z4Tl6cDorb3WgA,0
2,434,R1GwW4C1gh2Nmue9K0WYVA,Ul6JwluSTm12PVDIqnNaTg,0
3,262,zzBa0pQjM1gov00bXjYYXg,3D6Uck9QSdxZKFstf5DGlg,1
4,383,U2d-meX4sVq0kiqcrpHt1w,vuDL_d3GYAtbvX9EJQqVog,0
...,...,...,...,...
495,225,aSb4vkaMh7K2lHhnV2UIag,bQ-sXUqPSr4-iJfB764Nzw,1
496,288,6p39JCOx1L054G9jM10-5g,xwEYTGJ_82ScbpXcheqqQw,0
497,348,6WItftahZ9lNFJxfDPSJ0Q,XPmZnhnx0YeN8Xvo7y2xsA,0
498,187,GUriQoD_GHo6DNJlR1_CrA,1w6_xrdhVD-y-DBYpv0YCQ,0


In [7]:
dfEdgesToEvaluate.to_csv("randomTeste.csv", columns=['linkID','link'],index=False)

# Link prediction - com GAE (Graph AutoEncoder)

In [41]:
from torch import nn
import torch_geometric.transforms as T
import torch.optim as optim
from torch_geometric.datasets import Planetoid
import torch 
from torch_geometric.nn import GCNConv
from torch_geometric.utils import negative_sampling
from torch.nn import ReLU
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
import numpy as np
import networkx as nx
from torch_geometric.utils import from_networkx

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Importando o grafo do desafio como um dataset
G = nx.read_gml('GraphMissingEdges.gml')

for n in G.nodes( ):
    G.nodes[n]['stars'] = float(G.nodes[n]['stars' ])
    G.nodes[n]['reviewCount'] = int(G.nodes[n]['reviewCount'])

data = from_networkx(G,group_node_attrs=['longitude','latitude','stars','reviewCount'])

print(data)
print()


Data(edge_index=[2, 37575], categories=[4575], name=[4575], weight=[37575], x=[4575, 4])



In [42]:
data.train_mask = data.val_mask = data.test_mask = None

transform = T.Compose([
    T.NormalizeFeatures(),
    T.ToDevice(device),
    T.RandomLinkSplit(num_val=0.10, num_test=0.10, neg_sampling_ratio = 1.0,
                  is_undirected=True, add_negative_train_samples=False)
])

train_data, val_data, test_data = transform(data)
train_data = train_data.to(device)
val_data=val_data.to(device)
test_data = test_data.to(device)


print(train_data)
print('-----')
print(test_data)
print()
print(train_data)
print(val_data)

Data(edge_index=[2, 30386], categories=[4575], name=[4575], weight=[30386], x=[4575, 4], edge_label=[15193], edge_label_index=[2, 15193])
-----
Data(edge_index=[2, 34184], categories=[4575], name=[4575], weight=[34184], x=[4575, 4], edge_label=[3798], edge_label_index=[2, 3798])

Data(edge_index=[2, 30386], categories=[4575], name=[4575], weight=[30386], x=[4575, 4], edge_label=[15193], edge_label_index=[2, 15193])
Data(edge_index=[2, 30386], categories=[4575], name=[4575], weight=[30386], x=[4575, 4], edge_label=[3798], edge_label_index=[2, 3798])


In [43]:
# Define encoder

class GCNEncoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super(GCNEncoder, self).__init__()
        self.conv1 = GCNConv(in_channels, 2 * out_channels) 
        self.conv2 = GCNConv(2 * out_channels, out_channels)

    def encode(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        return self.conv2(x, edge_index)
    
    def decode(self, z, edge_label_index):
        return (z[edge_label_index[0]] * z[edge_label_index[1]]).sum(dim=-1)

In [44]:
# parâmetros
out_channels = 64
num_features = data.num_node_features

epochs = 100

# modelo - Graph Auto-Encoder (GAE)
model = GCNEncoder(num_features, out_channels)
model = model.to(device)

criterion = torch.nn.BCEWithLogitsLoss()

print(model)


# inicialização o optimizador
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

GCNEncoder(
  (conv1): GCNConv(4, 128)
  (conv2): GCNConv(128, 64)
)


In [45]:
def train(data):
    model.train()
    optimizer.zero_grad()
    z = model.encode(data.x, data.edge_index)

    # Nova rodada de amostragem negativa para cada época de treinamento:
    neg_edge_index = negative_sampling(
        edge_index=train_data.edge_index, num_nodes=train_data.num_nodes,
        num_neg_samples=train_data.edge_label_index.size(1))

    edge_label_index = torch.cat(
        [train_data.edge_label_index, neg_edge_index],
        dim=-1,
    )
    edge_label = torch.cat([
        train_data.edge_label,
        train_data.edge_label.new_zeros(neg_edge_index.size(1))
    ], dim=0)

    out = model.decode(z, edge_label_index).view(-1)
    loss = criterion(out, edge_label)
    loss.backward()
    optimizer.step()
    return loss


def test(data):
    model.eval()
    with torch.no_grad():
        z = model.encode(data.x, data.edge_index)
    
    out = model.decode(z, data.edge_label_index).sigmoid()
    return roc_auc_score(data.edge_label.cpu().numpy(), out.cpu().numpy())

In [46]:
for epoch in range(1, epochs + 1):
    loss = train(train_data)

    auc = test(val_data)
    print('Epoch: {:03d}, loss: {:.4f}, AUC val: {:.4f}'.format(epoch, loss, auc))

Epoch: 001, loss: 0.6500, AUC val: 0.8357
Epoch: 002, loss: 0.6562, AUC val: 0.8418
Epoch: 003, loss: 0.6238, AUC val: 0.8491
Epoch: 004, loss: 0.6283, AUC val: 0.8542
Epoch: 005, loss: 0.6257, AUC val: 0.8559
Epoch: 006, loss: 0.6149, AUC val: 0.8556
Epoch: 007, loss: 0.6058, AUC val: 0.8569
Epoch: 008, loss: 0.6054, AUC val: 0.8608
Epoch: 009, loss: 0.5963, AUC val: 0.8666
Epoch: 010, loss: 0.5854, AUC val: 0.8718
Epoch: 011, loss: 0.5817, AUC val: 0.8741
Epoch: 012, loss: 0.5766, AUC val: 0.8745
Epoch: 013, loss: 0.5687, AUC val: 0.8743
Epoch: 014, loss: 0.5646, AUC val: 0.8756
Epoch: 015, loss: 0.5609, AUC val: 0.8767
Epoch: 016, loss: 0.5520, AUC val: 0.8599
Epoch: 017, loss: 0.5490, AUC val: 0.8545
Epoch: 018, loss: 0.5435, AUC val: 0.8649
Epoch: 019, loss: 0.5350, AUC val: 0.8703
Epoch: 020, loss: 0.5382, AUC val: 0.8516
Epoch: 021, loss: 0.5321, AUC val: 0.8246
Epoch: 022, loss: 0.5296, AUC val: 0.8248
Epoch: 023, loss: 0.5237, AUC val: 0.8440
Epoch: 024, loss: 0.5239, AUC val:

In [47]:
#Teste
auc = test(test_data)
print('test: {:.4f}'.format( auc))

test: 0.8517


In [48]:
# Decodificação do teste expandida

z = model.encode(test_data.x, test_data.edge_index)

temp = model.decode(z, test_data.edge_label_index).sigmoid()

threshold = torch.tensor([0.5]).to(device)

#Atribui 0 ou 1 de acordo com o threshold
results = (temp>threshold).float()

print(threshold)
print(f1_score(test_data.edge_label.cpu().numpy() ,results.cpu().numpy() ))

print()
print("Testando com mais limites de corte:")
print()

for i in np.arange(0.1, 0.9, 0.01):
    threshold = torch.tensor([i]).to(device)
    results = (temp>threshold).float()

    
    if (f1_score(test_data.edge_label.cpu().numpy() ,results.cpu().numpy() ) > 0.80):
        print(threshold)
        print(f1_score(test_data.edge_label.cpu().numpy() ,results.cpu().numpy() ))
        print('---')


tensor([0.5000])
0.7416958041958042

Testando com mais limites de corte:



In [15]:
results

tensor([0., 0., 0.,  ..., 0., 0., 0.])