## Importando os grafos e csv necessários e bibliotecas

In [106]:
import networkx as nx
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm, trange
import random
from torch_geometric.nn import GAE, GCNConv
import torch.nn as nn
from torch import nn
import torch_geometric.transforms as T
import torch.optim as optim
import torch 
from torch_geometric import seed_everything
from torch_geometric.utils.convert import from_networkx
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import precision_recall_curve
from torch_geometric.utils import negative_sampling

G = nx.read_gml('GraphMissingEdges.gml')

edges_to_evaluate = pd.read_csv('edgesToEvaluate.csv')

In [107]:
seed_everything(69)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

categories = pd.read_csv('categories.csv', index_col='CategoryId')
num_categorias = len(categories)

def bag_of_words(categories_list):
    # print(categories_list)
    categorias = categories_list.split(',')
    bag = np.zeros(num_categorias, dtype=np.float32)

    for i, categoria in enumerate(categorias):
        try:
            bag[i] = 1
        except IndexError:
            pass
    return bag

G_pyg = G.copy()

## Criação do dataset

In [108]:
# Criando o dataset a partir do grafo do desafio
# Aqui definimos os parametros que serão relevantes (rating, reviewCount e categories)

key_to_index = {}
for index, (n, data) in enumerate(G_pyg.nodes.data()):
    key_to_index[n] = index
    G_pyg.nodes[n]['stars'] = np.array([float(data['stars'])])
    G_pyg.nodes[n]['reviewCount'] = np.array([float(data['reviewCount'])])
    G_pyg.nodes[n]['categories'] = bag_of_words(data['categories'])

dataset = from_networkx(G_pyg, group_edge_attrs=all,group_node_attrs=['categories','reviewCount','stars'])

## Definição da GAE de duas camadas, ativação com Relu na GCN, otimizador Adam, learning rate = 0.001 e 100 épocas

In [109]:
# edge_index - representa a conectividade do grafo no formato [2, num_edges]
# edge_attr - matriz de features de aresta com o formato [num_edges, num_edges_features]

# parâmetros
out_channels = 128
num_features = dataset.num_features

epochs = 100

class GCNEncoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super(GCNEncoder, self).__init__()
        self.conv1 = GCNConv(in_channels, 2 * out_channels, cached=True)
        self.conv2 = GCNConv(2 * out_channels, out_channels, cached=True)

    def forward(self, x, edge_index, edge_weight=None):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x

# Instancia o modelo - Graph Auto-Encoder (GAE)
model = GAE(GCNEncoder(num_features, out_channels))
model = model.to(device)

# otimizador e critério
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)

In [110]:
dataset.train_mask = None
dataset.val_mask = None
dataset.test_mask = None

# Usando add_negative_train_samples=True para adicionar exemplos negativos (exemplos onde nao tem link)
transform = T.Compose([
    T.NormalizeFeatures(),
    T.RandomLinkSplit(is_undirected=True, add_negative_train_samples=True, split_labels=True),
])

# Usar esse caso queira negativos mas com valores fixos de teste
# transform = T.Compose([
#     T.NormalizeFeatures(),
#     T.RandomLinkSplit(num_val=0.10, num_test=0.10, neg_sampling_ratio = 1.0,
#                   add_negative_train_samples=True, is_undirected=True, split_labels=True),
# ])

# Usar esse caso nao queira negativos
# transform = T.Compose([
#     T.NormalizeFeatures(),
#     T.RandomLinkSplit(num_val=0.10, num_test=0.10, neg_sampling_ratio = 1.0,
#                   is_undirected=True, split_labels=True),
# ])

train_data, val_data, test_data = transform(dataset)
train_data = train_data.to(device)
val_data = val_data.to(device)
test_data = test_data.to(device)

print(train_data)
print('-----')
print(test_data)
print()
print(train_data)
print(val_data)

Data(edge_index=[2, 26588], longitude=[4575], latitude=[4575], name=[4575], x=[4575, 895], edge_attr=[26588, 1], pos_edge_label=[13294], pos_edge_label_index=[2, 13294], neg_edge_label=[13294], neg_edge_label_index=[2, 13294])
-----
Data(edge_index=[2, 30386], longitude=[4575], latitude=[4575], name=[4575], x=[4575, 895], edge_attr=[30386, 1], pos_edge_label=[3798], pos_edge_label_index=[2, 3798], neg_edge_label=[3798], neg_edge_label_index=[2, 3798])

Data(edge_index=[2, 26588], longitude=[4575], latitude=[4575], name=[4575], x=[4575, 895], edge_attr=[26588, 1], pos_edge_label=[13294], pos_edge_label_index=[2, 13294], neg_edge_label=[13294], neg_edge_label_index=[2, 13294])
Data(edge_index=[2, 26588], longitude=[4575], latitude=[4575], name=[4575], x=[4575, 895], edge_attr=[26588, 1], pos_edge_label=[1899], pos_edge_label_index=[2, 1899], neg_edge_label=[1899], neg_edge_label_index=[2, 1899])


In [111]:
x = train_data.x.to(device).to(torch.float)
train_pos_edge_index = train_data.pos_edge_label_index.to(device)

def train():
    model.train()
    optimizer.zero_grad()
    z = model.encode(x, train_pos_edge_index)

    # *** Nova rodada de amostragem negativa para cada época de treinamento (tirar junto com o negativo do random split se piorar):
    neg_edge_index = negative_sampling(
        edge_index=train_data.edge_index, num_nodes=train_data.num_nodes,
        num_neg_samples=train_data.neg_edge_label_index.size(1))

    edge_label_index = torch.cat(
        [train_data.neg_edge_label_index, neg_edge_index],
        dim=-1,
    )
    edge_label = torch.cat([
        train_data.pos_edge_label,
        train_data.pos_edge_label.new_zeros(neg_edge_index.size(1))
    ], dim=0)

    out = model.decode(z, edge_label_index).view(-1)
    loss = criterion(out, edge_label)
    # ***

    # Usar esse caso nao queira negativos
    # loss = model.recon_loss(z, train_pos_edge_index)
    loss.backward()
    optimizer.step()
    
    return float(loss)

def test(pos_edge_index, neg_edge_index):
    model.eval()
    with torch.no_grad():
        z = model.encode(x, train_pos_edge_index)
        out = model.decode(z, train_pos_edge_index)
        return model.test(z, pos_edge_index, neg_edge_index)

# Treino
for epoch in range(1, epochs + 1):
    loss = train()
    auc, ap = test(val_data.pos_edge_label_index, val_data.neg_edge_label_index)
    print('Época: {:}, Perda: {:}, AUC: {:}, AP: {:}'.format(epoch, loss, auc, ap))   

# Teste
auc, ap = test(test_data.pos_edge_label_index, test_data.neg_edge_label_index)
print('Area Under the Curve: {:}, AP: {:}'.format( auc, ap))

Época: 1, Perda: 0.7287930250167847, AUC: 0.85856098980617, AP: 0.8681677631587922
Época: 2, Perda: 0.7272034883499146, AUC: 0.8589632136422789, AP: 0.8687165944036637
Época: 3, Perda: 0.7260039448738098, AUC: 0.8586518055981904, AP: 0.8689394869403557
Época: 4, Perda: 0.7252641916275024, AUC: 0.8560071942745288, AP: 0.8682272972283761
Época: 5, Perda: 0.7247818112373352, AUC: 0.847363887925271, AP: 0.864861781212596
Época: 6, Perda: 0.7244954109191895, AUC: 0.8332683064532455, AP: 0.8572163360839347
Época: 7, Perda: 0.7243427038192749, AUC: 0.810529418631962, AP: 0.838751642106423
Época: 8, Perda: 0.724311888217926, AUC: 0.7819030331365334, AP: 0.8083266730320153
Época: 9, Perda: 0.7243171334266663, AUC: 0.7572186076150498, AP: 0.7769839709595946
Época: 10, Perda: 0.7243549227714539, AUC: 0.7419690416590756, AP: 0.7550283920188957
Época: 11, Perda: 0.7243688106536865, AUC: 0.7352345307430173, AP: 0.7444079328588403
Época: 12, Perda: 0.7243887782096863, AUC: 0.7340921928644577, AP: 0.7

In [112]:
# Decodificação do teste expandida
temp_positivos = model.decode(test_data.x, test_data.pos_edge_label_index)
temp_negativos = model.decode(test_data.x, test_data.neg_edge_label_index)

# resultados
results = torch.cat([temp_positivos, temp_negativos]).cpu()
labels = torch.cat([test_data.pos_edge_label, test_data.neg_edge_label]).cpu()

precision, recall, thresholds = precision_recall_curve(labels, results)
f1_scores = 2*recall*precision/(recall+precision)
best_threshold = thresholds[np.argmax(f1_scores)]
threshold = best_threshold

evaluation_edges_u = []
evaluation_edges_v = []
for _, linkID, u, v in edges_to_evaluate.to_records():
    u_idx = key_to_index[u]
    v_idx = key_to_index[v]
    evaluation_edges_u.append(key_to_index[u])
    evaluation_edges_v.append(key_to_index[v])

edges_to_evaluate_tensor = torch.tensor([evaluation_edges_u, evaluation_edges_v])

# decoder
temp = model.decode(test_data.x, edges_to_evaluate_tensor)

In [113]:
# Atribui 0 ou 1 de acordo com o threshold
resultados = (temp > threshold).int()
# detach pra cpu pra poder usar o pandas
resultados = resultados.cpu().detach()

concatenados = pd.concat([edges_to_evaluate, pd.Series(resultados, name='link')], axis=1)
links = concatenados[['linkID', 'link']]
links.to_csv('results_gae_teste5.csv', columns=['linkID', 'link'],index=False)