In [72]:
from functools import cached_property

import networkx as nx
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from networkx.classes.graph import NodeView
from geopy.distance import geodesic
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from tqdm import trange

## Carregar a rede

In [73]:
# Load the graph used by every later cell from its GML file.
GRAPH_PATH = "data/GraphMissingEdges.gml"
graph = nx.read_gml(GRAPH_PATH)

## Funções complementares

In [74]:
def extract_features(node1, node2, label, weight_threshold=4):
    """Build the feature vector describing a candidate link between two nodes.

    Node attributes and edge weights are read from the module-level ``graph``.
    The returned vector has 8 entries, in this order:

    0. geodesic distance between the nodes in kilometres, rounded to 2 decimals
    1. number of category ids the two nodes share
    2. number of distinct category ids across both nodes
    3. ``stars`` of ``node1``
    4. ``stars`` of ``node2``
    5. ``reviewCount`` of ``node1``
    6. ``reviewCount`` of ``node2``
    7. 1.0 when ``label`` is truthy and the edge weight is at least
       ``weight_threshold``, otherwise 0.0

    NOTE(review): entry 7 is derived from ``label``. Any pair scored with
    ``label=0`` always gets 0.0 there, so during training this entry carries
    information about the target that is not available when predicting
    unknown links — confirm this is intended.

    Args:
        node1: Key of the first node in ``graph``.
        node2: Key of the second node in ``graph``.
        label: Truthy if the pair is a known edge of ``graph``; falsy otherwise.
        weight_threshold: Minimum edge weight for entry 7 to be 1.0.

    Returns:
        A 1-D float ``np.ndarray`` with the 8 entries listed above.

    Raises:
        KeyError: If a node is not in ``graph`` or lacks one of the attributes
            ``latitude``, ``longitude``, ``categories``, ``stars``,
            ``reviewCount``.
    """

    def _distance_km(attrs_a: dict, attrs_b: dict) -> float:
        # geodesic expects (latitude, longitude) tuples.
        point_a = (float(attrs_a["latitude"]), float(attrs_a["longitude"]))
        point_b = (float(attrs_b["latitude"]), float(attrs_b["longitude"]))
        return round(geodesic(point_a, point_b).kilometers, 2)

    def _category_ids(attrs: dict) -> set[int]:
        # "categories" is a comma-separated string of integer ids; an empty
        # token (e.g. from an empty string) is mapped to id 0.
        return {
            int(token) if token else 0 for token in attrs["categories"].split(",")
        }

    def _heavy_edge_flag(first: str, second: str) -> float:
        if not label:
            return 0.0
        # Tolerate a missing edge or a missing "weight" attribute instead of
        # raising KeyError: both are treated as weight 0.
        edge_data = graph.get_edge_data(first, second, default=None)
        edge_weight = edge_data.get("weight", 0) if edge_data else 0
        return 1.0 if edge_weight >= weight_threshold else 0.0

    node1_attrs = graph.nodes[node1]
    node2_attrs = graph.nodes[node2]

    categories1 = _category_ids(node1_attrs)
    categories2 = _category_ids(node2_attrs)

    return np.array(
        [
            _distance_km(node1_attrs, node2_attrs),
            float(len(categories1 & categories2)),
            float(len(categories1 | categories2)),
            float(node1_attrs["stars"]),
            float(node2_attrs["stars"]),
            float(node1_attrs["reviewCount"]),
            float(node2_attrs["reviewCount"]),
            _heavy_edge_flag(node1, node2),
        ],
        dtype=float,
    )

## Definição do dataset

In [75]:
class LinkPredictionDataset(Dataset):
    """Link-prediction dataset with one sampled non-edge per existing edge.

    Indices ``0 .. len(edges) - 1`` return the edges of ``graph`` (label 1).
    Indices ``len(edges) .. 2 * len(edges) - 1`` return a randomly drawn pair
    of distinct nodes that are not connected in ``graph`` (label 0). Negative
    pairs are drawn with ``np.random`` on every access, so the same index can
    yield a different pair each time unless the NumPy RNG is seeded.
    """

    def __init__(self, graph):
        """Store ``graph`` and snapshot its edge list."""
        self.graph = graph
        self.edges = list(graph.edges())

    def __len__(self):
        """Return twice the number of edges (positives plus negatives)."""
        return len(self.edges) * 2

    def __getitem__(self, idx):
        """Return ``(features, label)`` for sample ``idx``.

        Raises:
            IndexError: If ``idx`` is past the end of the dataset. Without
                this, plain iteration over the dataset would never stop.
        """
        if idx >= len(self):
            raise IndexError(f"index {idx} out of range for dataset of size {len(self)}")

        if idx < len(self.edges):
            node1, node2 = self.edges[idx]
            label = int(self.graph.has_edge(node1, node2))
        else:
            node1, node2 = self._sample_negative_pair()
            label = 0

        features = extract_features(node1=node1, node2=node2, label=label)
        return features, label

    def _sample_negative_pair(self):
        """Draw two distinct nodes that are not connected by an edge.

        Rejects pairs where both draws are the same node, which would
        otherwise be returned as a "non-edge" of a node with itself.
        Loops until a valid pair is found, so it does not terminate if every
        pair of distinct nodes is connected.

        Raises:
            ValueError: If the graph has fewer than two nodes.
        """
        if len(self.nodes_numpy) < 2:
            raise ValueError("negative sampling needs at least two nodes")
        while True:
            node1, node2 = self.random_node, self.random_node
            if node1 != node2 and not self.graph.has_edge(node1, node2):
                return node1, node2

    @cached_property
    def nodes_numpy(self) -> np.ndarray:
        """All node keys of the graph as a NumPy array (computed once)."""
        return np.array(list(self.graph.nodes()))

    @property
    def random_node(self) -> str:
        """A node key drawn uniformly at random; a new draw on every access."""
        return np.random.choice(self.nodes_numpy)

## Definição do modelo de predição

In [76]:
class LinkPredictionModel(nn.Module):
    """Feed-forward binary classifier: three ReLU hidden layers and a sigmoid output."""

    def __init__(self, input_size, hidden_size):
        """Create the activations and the four linear layers.

        Args:
            input_size: Number of features per sample.
            hidden_size: Width of each of the three hidden layers.
        """
        super().__init__()
        # Activations are registered first, then the linear layers in order.
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.fc3 = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.fc4 = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, x):
        """Map a batch of feature vectors to values in (0, 1), one per sample."""
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = self.relu(layer(hidden))
        logits = self.fc4(hidden)
        return self.sigmoid(logits)

## Inicializar modelo


In [77]:
# Hyperparameters
SEED = 42  # same value the train/test split uses as random_state
input_size = 8  # length of the vector returned by extract_features
hidden_size = 32
learning_rate = 0.001
num_epochs = 1000

# Seed NumPy (used for random negative sampling in the dataset) and PyTorch
# (used for weight initialisation) so a fresh run reproduces the same results.
np.random.seed(SEED)
torch.manual_seed(SEED)

model = LinkPredictionModel(input_size, hidden_size)

# The model ends in a sigmoid, so plain binary cross-entropy is applied to
# its outputs.
criterion = nn.BCELoss()
optimizer = optim.RMSprop(model.parameters(), lr=learning_rate)

## Treinar o modelo e testar sua acurácia

In [78]:
X = []  # feature vectors, one per sample
y = []  # labels, one per sample

# Materialise the whole dataset once; negative pairs are sampled at this point.
dataset = LinkPredictionDataset(graph)
for idx in range(len(dataset)):
    features, label = dataset[idx]
    X.append(features)
    y.append(label)

features_train, features_test, labels_train, labels_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Stack each split into a single float32 array before handing it to torch:
# building a tensor directly from a list of ndarrays is very slow.
features_train_tensor = torch.from_numpy(np.asarray(features_train, dtype=np.float32))
labels_train_tensor = torch.tensor(labels_train, dtype=torch.float32)

features_test_tensor = torch.from_numpy(np.asarray(features_test, dtype=np.float32))
labels_test_tensor = torch.tensor(labels_test, dtype=torch.float32)

# Training loop: one full-batch gradient step per epoch.
model.train()
t = trange(num_epochs, leave=True)
for epoch in t:
    # Forward pass
    outputs = model(features_train_tensor)

    # Drop only the trailing unit dimension so (N, 1) becomes (N,) even when
    # N == 1; a bare squeeze() would collapse that case to a 0-d tensor.
    loss = criterion(outputs.squeeze(-1), labels_train_tensor)

    # Backward pass and parameter update
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    t.set_description(desc=f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}")

# Accuracy on the held-out split, thresholding the outputs at 0.5 via round().
model.eval()
with torch.no_grad():
    outputs = model(features_test_tensor)
    predicted_labels = outputs.squeeze(-1).round().numpy()

accuracy = accuracy_score(labels_test, predicted_labels)
print(f"Acurácia: {accuracy}")

Epoch [1000/1000], Loss: 0.3998: 100%|██████████| 1000/1000 [00:09<00:00, 105.97it/s]


Acurácia: 0.8230880610767408


## Carregamento do dataset a ser avaliado

In [79]:
# Candidate links to score; the prediction cell reads the columns
# "linkID", "venue1" and "venue2" from this table.
EDGES_TO_EVALUATE_PATH = "data/edgesToEvaluate.csv"
edges_to_evaluate = pd.read_csv(EDGES_TO_EVALUATE_PATH)

## Predição

In [80]:
# Link ids in file order, so they line up with the predictions below.
link_ids = edges_to_evaluate["linkID"].tolist()

# One feature vector per candidate pair. label=0 is passed because whether
# these pairs are linked is exactly what is being predicted.
venue_pairs = zip(edges_to_evaluate["venue1"], edges_to_evaluate["venue2"])
feature_matrix = np.asarray(
    [extract_features(node1, node2, 0) for node1, node2 in venue_pairs],
    dtype=np.float32,
)

model.eval()
with torch.no_grad():
    # Build the tensor from one stacked array rather than a list of ndarrays.
    all_features = torch.from_numpy(feature_matrix)
    # squeeze(-1) keeps a 1-D result even for a single row; a bare squeeze()
    # would return a 0-d array that cannot be iterated when writing the CSV.
    predictions = model(all_features).squeeze(-1).numpy()

print(predictions)

[0.9874107  0.6181429  0.9746782  0.61966985 0.06629679 0.984844
 0.36490288 0.97806346 0.17099808 0.16915114 0.9899166  0.7645481
 0.04866929 0.32022107 0.38143677 0.7599189  0.4146239  0.06229569
 0.07674591 0.04104792 0.984814   0.98470944 0.6712623  0.9609078
 0.7631246  0.05218807 0.34813616 0.7058132  0.7450561  0.9991074
 0.8721216  0.91363555 0.12207973 0.98179173 0.03050127 0.90042627
 0.9921124  0.62511253 0.09017267 0.80409163 0.08857162 0.16705869
 0.913858   0.9143726  0.23212096 0.2719546  0.10080393 0.04132877
 0.29090163 0.92222387 0.3676265  0.06104619 0.94534415 0.97010595
 0.77164936 0.16733882 0.7910885  0.13733433 0.13744931 0.10616941
 0.8822416  0.3988203  0.9106541  0.88013244 0.07554998 0.9004024
 0.8947046  0.4384843  0.995169   0.11647782 0.05748113 0.24233992
 0.13659549 0.01865614 0.9971296  0.7088828  0.99976486 0.8506812
 0.14838369 0.4824343  0.6327867  0.03690485 0.97230035 0.4936516
 0.0236379  0.9834937  0.95413184 0.83994615 1.         0.25002974
 0.

## Escrita das predições no CSV

In [58]:
# Turn each predicted probability into a 0/1 decision by rounding.
predicted_links = [int(probability) for probability in np.round(predictions)]

# One row per evaluated link: its id and the predicted 0/1 value.
results = pd.DataFrame(data={"linkID": link_ids, "link": predicted_links})
results.to_csv("resultados.csv", index=False)