In [None]:
# Print the installed PyTorch build. The wheel index URL used by the install
# cell that follows is pinned to "torch-2.5.1+cu124", so presumably this value
# should match it (TODO confirm on the target runtime).
import torch
print(torch.__version__)

In [None]:
!pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.5.1+cu124.html

In [None]:
!pip install torch_geometric

In [None]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from torch_geometric.nn import Node2Vec
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from node_embeddings import train
from torch_geometric.datasets import Actor
from tqdm import tqdm
from torch.optim import SparseAdam

In [None]:
# Instantiate the Actor dataset (imported from torch_geometric.datasets) with
# '../data' as its root argument.
# NOTE(review): `data` holds the dataset object, while the element taken from
# it is later bound to `dataset` -- the two names read as swapped.
data = Actor('../data')
data

In [None]:
# Take the first element of the dataset; presumably this is the single graph
# object (TODO confirm). Its `y` and `edge_index` attributes are used below.
dataset = data[0]
dataset

In [None]:
# Labels as a CPU NumPy array (presumably one class id per node -- TODO confirm).
# They are reused later to stratify the train/test split and to colour the
# t-SNE scatter plot.
labels = dataset.y.detach().cpu().numpy()
labels

In [None]:
def train(model, epochs=5, batch_size=32, lr=0.01, device='cpu', num_workers=1):
  """Train a random-walk embedding model with SparseAdam and report the loss.

  NOTE(review): this definition rebinds the name ``train`` that the import
  cell also brings in via ``from node_embeddings import train``; the version
  defined here is the one the notebook ends up calling.

  Args:
    model: Object exposing ``to(device)``, ``train()``, ``parameters()``,
      ``loader(batch_size=..., shuffle=..., num_workers=...)`` and
      ``loss(pos_rw, neg_rw)`` (the notebook passes a Node2Vec instance).
    epochs: Number of full passes over the loader.
    batch_size: Forwarded to ``model.loader``.
    lr: Learning rate handed to SparseAdam.
    device: Device the model and every batch are moved to.
    num_workers: Forwarded to ``model.loader``; the default keeps the value
      that used to be hard-coded.

  Returns:
    list[float]: Mean batch loss of each epoch, in training order, so the
    caller can plot or inspect convergence instead of parsing printed output.
  """
  model = model.to(device)
  loader = model.loader(batch_size=batch_size, shuffle=True,
                        num_workers=num_workers)
  # SparseAdam works on sparse gradients, which is why the notebook builds the
  # model with sparse=True.
  optimizer = SparseAdam(list(model.parameters()), lr=lr)

  model.train()

  loss_history = []
  for epoch in range(epochs):
    running_loss = 0.0

    for pos_rw, neg_rw in tqdm(loader):
      optimizer.zero_grad()

      loss = model.loss(pos_rw.to(device), neg_rw.to(device))
      loss.backward()

      optimizer.step()

      # .item() detaches the scalar so no autograd graph is kept alive.
      running_loss += loss.item()

    epoch_loss = running_loss / len(loader)
    loss_history.append(epoch_loss)

    print(f'Epoch: {epoch:02d}, Loss: {epoch_loss:.4f}')

  return loss_history

In [None]:
# Build the Node2Vec model on the graph's edge list with 64-dimensional
# embeddings. sparse=True goes together with the SparseAdam optimizer used in
# train(). The walk/context/negative-sampling values and p/q are hand-picked
# hyperparameters; NOTE(review): no tuning rationale is recorded in the notebook.
model = Node2Vec(dataset.edge_index,
                 embedding_dim=64,
                 walk_length=30,
                 context_size=15,
                 walks_per_node=20,
                 num_negative_samples=5,
                 p=2, q=0.5, sparse=True)

In [None]:
# Train on the GPU when one is present and fall back to the CPU otherwise, so
# the cell also runs on CPU-only machines. The trailing ';' keeps the cell
# output limited to the training log.
train(model, epochs=100, batch_size=64, lr=0.001,
      device="cuda" if torch.cuda.is_available() else "cpu");

In [None]:
# Calling the model with no arguments returns the embedding tensor (presumably
# one row per node -- TODO confirm). Detach it and move it to the CPU so it can
# be handed to scikit-learn as a NumPy array.
node_embeddings = model().detach().cpu().numpy()

In [None]:
# Hold out 20% of the samples for evaluation, keeping the class proportions
# equal in both parts (stratify). A fixed random_state makes the split -- and
# therefore the reported metrics -- repeatable across runs.
train_x, test_x, train_y, test_y = train_test_split(node_embeddings,
                                                    labels,
                                                    test_size=0.2,
                                                    stratify=labels,
                                                    random_state=42)

In [None]:
# Fit a random forest with default hyperparameters on the training embeddings.
# The seed is fixed so that re-running the cell on the same data yields the
# same fitted model.
random_forest = RandomForestClassifier(random_state=42)
random_forest.fit(train_x, train_y)

In [None]:
# Predicted class for every held-out sample; compared against test_y below.
preds = random_forest.predict(test_x)

In [None]:
# scikit-learn metrics take (y_true, y_pred). Accuracy happens to be symmetric,
# so the value is unchanged, but the documented order keeps this cell
# consistent with the other metric calls.
accuracy_score(test_y, preds)

In [None]:
# classification_report expects (y_true, y_pred). With the arguments reversed,
# precision and recall trade places and "support" counts predictions instead of
# true labels.
print(classification_report(test_y, preds))

In [None]:
# Project the embeddings to 2-D for visual inspection. t-SNE is stochastic, so
# the seed is fixed to get the same layout on every run.
tsne = TSNE(n_components=2, random_state=42)
node_embeddings_2d = tsne.fit_transform(node_embeddings)
# One point per embedding row, coloured by its label; the trailing ';'
# suppresses the PathCollection repr in the cell output.
plt.scatter(node_embeddings_2d[:, 0], node_embeddings_2d[:, 1],
            c=labels, cmap='jet', alpha=0.7);