## Importando os grafos e csv necessários e bibliotecas

In [57]:
import networkx as nx
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm, trange
import random
from torch_geometric.nn import GAE, GCNConv
import torch.nn as nn
from torch import nn
import torch_geometric.transforms as T
import torch.optim as optim
import torch 
from torch_geometric import seed_everything
from torch_geometric.utils.convert import from_networkx
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import precision_recall_curve
from torch_geometric.utils import negative_sampling

G = nx.read_gml('GraphMissingEdges.gml')

edges_to_evaluate = pd.read_csv('edgesToEvaluate.csv')

In [58]:
seed_everything(69)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

categories = pd.read_csv('categories.csv', index_col='CategoryId')
num_categorias = len(categories)

def bag_of_words(categories_list):
    # print(categories_list)
    categorias = categories_list.split(',')
    bag = np.zeros(num_categorias, dtype=np.float32)

    for i, categoria in enumerate(categorias):
        try:
            bag[i] = 1
        except IndexError:
            pass
    return bag

G_pyg = G.copy()


In [None]:
# Medidas de centralidade para serem usadas como features - piorou os resultados

# Centralidade de intermediação, vemos o quanto um nó atua como uma ponte ou intermediário entre outros pares de nós 
# no grafo (pontos de estrangulamento).
betweenness = nx.betweenness_centrality(G)

# Centralidade de grau verifica quais nós são aqueles que tem muitas conexões diretas com outros nós.
degree_centrality = nx.degree_centrality(G)

# Centralidade de proximidade mostra o quão próximo um nó está dos demais (sua distância média para todos 
# os outros nós no grafo)
closeness = nx.closeness_centrality(G)


In [None]:
# Teste usando comunidades como feature - resultados pioraram muito, não usar

# import community.community_louvain as community_louvain
# import matplotlib.cm as cm
# from collections import defaultdict

# partition = community_louvain.best_partition(G)

## Criação do dataset

In [59]:
# Criando o dataset a partir do grafo do desafio
# Aqui definimos os parametros que serão relevantes (rating, reviewCount e categories)

key_to_index = {}
for index, (n, data) in enumerate(G_pyg.nodes.data()):
    key_to_index[n] = index
    G_pyg.nodes[n]['stars'] = np.array([float(data['stars'])])
    G_pyg.nodes[n]['reviewCount'] = np.array([float(data['reviewCount'])])
    G_pyg.nodes[n]['categories'] = bag_of_words(data['categories'])
    # G_pyg.nodes[n]['betweenness'] = betweenness[n]
    # G_pyg.nodes[n]['degree_centrality'] = degree_centrality[n]

# dataset = from_networkx(G_pyg, group_edge_attrs=all,group_node_attrs=['categories','reviewCount','stars','closeness','betweenness','degree_centrality'])
dataset = from_networkx(G_pyg, group_edge_attrs=all,group_node_attrs=['categories','reviewCount','stars'])

## Definição da GAE de duas camadas, ativação com Relu na GCN, otimizador Adam, learning rate = 0.001 e 100 épocas

In [60]:
# edge_index - representa a conectividade do grafo no formato [2, num_edges]
# edge_attr - matriz de features de aresta com o formato [num_edges, num_edges_features]

# parâmetros
out_channels = 128
num_features = dataset.num_features

epochs = 100

class GCNEncoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super(GCNEncoder, self).__init__()
        self.conv1 = GCNConv(in_channels, 2 * out_channels, cached=True)
        self.conv2 = GCNConv(2 * out_channels, out_channels, cached=True)

    def forward(self, x, edge_index, edge_weight=None):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x

# Instancia o modelo - Graph Auto-Encoder (GAE)
model = GAE(GCNEncoder(num_features, out_channels))
model = model.to(device)

# otimizador e critério
# criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)

In [61]:
dataset.train_mask = None
dataset.val_mask = None
dataset.test_mask = None

# Usando add_negative_train_samples=True para adicionar exemplos negativos (exemplos onde nao tem link)
transform = T.Compose([
    T.NormalizeFeatures(),
    T.RandomLinkSplit(is_undirected=True, add_negative_train_samples=True, split_labels=True),
])

train_data, val_data, test_data = transform(dataset)
train_data = train_data.to(device)
val_data = val_data.to(device)
test_data = test_data.to(device)

print(train_data)
print('-----')
print(test_data)
print()
print(train_data)
print(val_data)

Data(edge_index=[2, 26588], longitude=[4575], latitude=[4575], name=[4575], x=[4575, 895], edge_attr=[26588, 1], pos_edge_label=[13294], pos_edge_label_index=[2, 13294], neg_edge_label=[13294], neg_edge_label_index=[2, 13294])
-----
Data(edge_index=[2, 30386], longitude=[4575], latitude=[4575], name=[4575], x=[4575, 895], edge_attr=[30386, 1], pos_edge_label=[3798], pos_edge_label_index=[2, 3798], neg_edge_label=[3798], neg_edge_label_index=[2, 3798])

Data(edge_index=[2, 26588], longitude=[4575], latitude=[4575], name=[4575], x=[4575, 895], edge_attr=[26588, 1], pos_edge_label=[13294], pos_edge_label_index=[2, 13294], neg_edge_label=[13294], neg_edge_label_index=[2, 13294])
Data(edge_index=[2, 26588], longitude=[4575], latitude=[4575], name=[4575], x=[4575, 895], edge_attr=[26588, 1], pos_edge_label=[1899], pos_edge_label_index=[2, 1899], neg_edge_label=[1899], neg_edge_label_index=[2, 1899])


In [62]:
x = train_data.x.to(device).to(torch.float)
train_pos_edge_index = train_data.pos_edge_label_index.to(device)

def train():
    model.train()
    optimizer.zero_grad()
    z = model.encode(x, train_pos_edge_index)

    loss = model.recon_loss(z, train_pos_edge_index)
    loss.backward()
    optimizer.step()
    
    return float(loss)

def test(pos_edge_index, neg_edge_index):
    model.eval()
    with torch.no_grad():
        z = model.encode(x, train_pos_edge_index)
        out = model.decode(z, train_pos_edge_index)
        return model.test(z, pos_edge_index, neg_edge_index)

# Treino
for epoch in range(1, epochs + 1):
    loss = train()
    auc, ap = test(val_data.pos_edge_label_index, val_data.neg_edge_label_index)
    print('Época: {:}, Perda: {:}, AUC: {:}, AP: {:}'.format(epoch, loss, auc, ap))   

# Teste
auc, ap = test(test_data.pos_edge_label_index, test_data.neg_edge_label_index)
print('Area Under the Curve: {:}, AP: {:}'.format( auc, ap))

Época: 1, Perda: 1.243363618850708, AUC: 0.8577430930777292, AP: 0.8673333922124224
Época: 2, Perda: 1.2208515405654907, AUC: 0.8578520720281538, AP: 0.8673924131398186
Época: 3, Perda: 1.2049674987792969, AUC: 0.8581920419854578, AP: 0.8676584675957029
Época: 4, Perda: 1.200727939605713, AUC: 0.8585966228726575, AP: 0.8680539555991215
Época: 5, Perda: 1.1999049186706543, AUC: 0.859037252776537, AP: 0.868554123725101
Época: 6, Perda: 1.1988229751586914, AUC: 0.859436010361042, AP: 0.8690432270721793
Época: 7, Perda: 1.1893103122711182, AUC: 0.8595747990752596, AP: 0.869369513251424
Época: 8, Perda: 1.1807363033294678, AUC: 0.8593840165869844, AP: 0.8695271630272985
Época: 9, Perda: 1.1740825176239014, AUC: 0.8588165218744046, AP: 0.8695496219526158
Época: 10, Perda: 1.1695878505706787, AUC: 0.8575976491604324, AP: 0.8692254014146725
Época: 11, Perda: 1.165425181388855, AUC: 0.8557773124681625, AP: 0.8686651251597115
Época: 12, Perda: 1.1623303890228271, AUC: 0.8538210987130224, AP: 0.8

In [63]:
# Decodificação do teste expandida
temp_positivos = model.decode(test_data.x, test_data.pos_edge_label_index)
temp_negativos = model.decode(test_data.x, test_data.neg_edge_label_index)

# resultados
results = torch.cat([temp_positivos, temp_negativos]).cpu()
labels = torch.cat([test_data.pos_edge_label, test_data.neg_edge_label]).cpu()

precision, recall, thresholds = precision_recall_curve(labels, results)
f1_scores = 2*recall*precision/(recall+precision)
best_threshold = thresholds[np.argmax(f1_scores)]
threshold = best_threshold

evaluation_edges_u = []
evaluation_edges_v = []
for _, linkID, u, v in edges_to_evaluate.to_records():
    u_idx = key_to_index[u]
    v_idx = key_to_index[v]
    evaluation_edges_u.append(key_to_index[u])
    evaluation_edges_v.append(key_to_index[v])

edges_to_evaluate_tensor = torch.tensor([evaluation_edges_u, evaluation_edges_v])

# decoder
temp = model.decode(test_data.x, edges_to_evaluate_tensor)

In [64]:
# Atribui 0 ou 1 de acordo com o threshold
resultados = (temp > threshold).int()
# detach pra cpu pra poder usar o pandas
resultados = resultados.cpu().detach()

concatenados = pd.concat([edges_to_evaluate, pd.Series(resultados, name='link')], axis=1)
links = concatenados[['linkID', 'link']]
links.to_csv('teste_gae.csv', columns=['linkID', 'link'],index=False)