# Embeddings

In this notebook, we train 2 types of embeddings (node2vec and Laplacian eigenmaps) and fit a logistic regression for the `ogbl-ddi` link prediction task. 

##### **It is recommended to download the pre-computed random walks [here](https://drive.google.com/file/d/1H8o26Yztwc3IRNDo6Ir8-qeZaG49vIqC/view?usp=sharing) to save time.**

In [1]:
import torch_geometric
import networkx as nx
import random
import os
import numpy as np
import torch
import pickle

from karateclub import LaplacianEigenmaps
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from torch_geometric.utils import negative_sampling

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
load_walks = os.path.exists('walks.pkl')

### Load dataset

In [2]:
from ogb.linkproppred import PygLinkPropPredDataset, Evaluator
dataset = PygLinkPropPredDataset(name = 'ogbl-ddi')
evaluator = Evaluator(name='ogbl-ddi')

split_edge = dataset.get_edge_split()
train_edge, valid_edge, test_edge = split_edge["train"], split_edge["valid"], split_edge["test"]
graph = dataset[0]

In [3]:
print("Number of edges in the training set: ", train_edge['edge'].shape[0])
print("Number of edges in the validation set: ")
print("\t positive: ", valid_edge['edge'].shape[0])
print("\t negative: ", valid_edge['edge_neg'].shape[0])
print("Number of edges in the test set: ")
print("\t positive: ", test_edge['edge'].shape[0])
print("\t negative: ", test_edge['edge_neg'].shape[0])

Number of edges in the training set:  1067911
Number of edges in the validation set: 
	 positive:  133489
	 negative:  101882
Number of edges in the test set: 
	 positive:  133489
	 negative:  95599


In [4]:
dataset.meta_info

eval metric                                                        hits@20
task type                                                  link prediction
download_name                                                          ddi
version                                                                  1
url                      http://snap.stanford.edu/ogb/data/linkproppred...
add_inverse_edge                                                      True
has_node_attr                                                        False
has_edge_attr                                                        False
split                                                               target
additional node files                                                 None
additional edge files                                                 None
is hetero                                                            False
binary                                                               False
Name: ogbl-ddi, dtype: ob

### Node2Vec

In [5]:
from gensim.models import Word2Vec
from stellargraph.data import BiasedRandomWalk
from stellargraph import StellarGraph

2022-06-09 13:06:03.197948: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
def Node2Vec(G, dimensions, walk_length, num_walks, p=1.0, q=1.0, seed=0, load_walks=True):
  '''G: NetworkX graph object
      dimensions: embedding dimensions
      walk_length: maximum length of each random walk
      num_walks: total number of random walks per root node
      q (float, > 0): In-out parameter; q controls the likelihood of moving away from source node.
        If q > 1, the RW is biased towards closeby points (BFS), better for capturing structural nodes
        If q < 1, the RW is biased towards faraway points (DFS), better for capturing communities
      p (float, > 0): Return parameter; p controls the likelihood of immediately revisiting a node in the walk.
        If p > max(q, 1)) makes it less likely to revisit a node in the next two steps (unless there is no other node available). 
        If p < min(q, 1)), it keeps the walk “local” (i.e. close to the starting node)
  '''
  if load_walks:
      print("Loading walks...")
      with open('walks.pkl', 'rb') as f:
        walks = pickle.load(f)
  else:
    rw = BiasedRandomWalk(G, seed=seed)

    walks = rw.run(nodes=list(G.nodes()), length=walk_length, n=num_walks, p=p, q=q, seed=seed)

    with open('walks.pkl', 'wb') as f:
        pickle.dump(walks, f)

    print("Done walking...")

  str_walks = [[str(n) for n in walk] for walk in walks]
  model = Word2Vec(
      str_walks, vector_size=dimensions, min_count=0, sg=1, workers=-1, seed=seed, epochs=100
  )

  return model

### Helpers

In [7]:
# Build the features: concatenation of node embeddings
def embedded_edgelist(model, edgelist, type='node2vec'):
    """ Returns the features which are the Hadamard product of the node embeddings.

    Params:
    model: Embeddings (gensim or numpy).
    edgelist (np.array): Node pairs.
    type (str): If 'node2vec', embeddings should be from `gensim`.
    
    Returns:
    np.array: `Nxd` feature vector.
    """
    edgelist = edgelist.numpy()
    if type == 'node2vec':
        return np.stack([(model.wv.get_vector(x[0])*model.wv.get_vector(x[1])).transpose() for x in edgelist])
    else:
        return np.stack([(model[x[0]]*model[x[1]]).transpose() for x in edgelist])
    
    
def get_train_data(graph, train_edges, d=32, type='node2vec', load_walks=True):
    """Prepares the training data.

    d (int): Embedding dimension.
    type (str): Which embedding model to fit.
    load_walks (bool): Use precomputed random walks from `walks.pkl`.
    
    Returns:
    emb (numpy or gensim): Embeddings
    feats (np.array): Training features.
    labels (np.array): Training labels.
    """
    g = torch_geometric.utils.to_networkx(graph, to_undirected=True)
    
    # Train the embedding
    if type == 'node2vec':
        emb = Node2Vec(StellarGraph.from_networkx(g), dimensions=d, walk_length=5, 
            num_walks=1000, p=1.0, q=0.5, load_walks=load_walks) 
    else:
        model = LaplacianEigenmaps(dimensions=d)
        model.fit(g) 
        emb = model.get_embedding()

    # Extract positive and negative samples
    num_edges = len(train_edges)
    neg_edges = negative_sampling(graph.edge_index, num_nodes=len(g.nodes),
                                    num_neg_samples=num_edges, method='dense')

    feats_pos = embedded_edgelist(emb, train_edges, type)
    feats_neg = embedded_edgelist(emb, neg_edges, type)
    feats = np.concatenate((feats_pos, feats_neg))
    labels = np.concatenate((np.repeat(1, feats_pos.shape[0]), np.repeat(0, feats_neg.shape[0])))
    return emb, feats, labels

In [8]:
def eval(clf, emb, type='node2vec', split_edge=split_edge, evaluator=evaluator):
    """ Evaluates the model.

    clf: Fitted regression model (scikit)
    emb: Trained embeddings (numpy or gensim)
    type: Which model we're evaluating. Defaults to 'node2vec'.
        If 'node2vec', embeddings should be  from gensim.
    split_edge: Train/validation/test split of the data.  
    evaluator: Evaluator for the `ogbl-ddi` task.  
    """
    feats_pos_val = embedded_edgelist(emb, split_edge['valid']['edge'], type)
    feats_neg_val = embedded_edgelist(emb, split_edge['valid']['edge_neg'], type)
    feats_pos_test = embedded_edgelist(emb, split_edge['test']['edge'], type)
    feats_neg_test = embedded_edgelist(emb, split_edge['test']['edge_neg'], type)
    feats_pos_train = embedded_edgelist(emb, split_edge['train']['edge'], type)

    preds_pos_val = clf.predict_proba(feats_pos_val).max(axis=1)
    preds_neg_val = clf.predict_proba(feats_neg_val).max(axis=1)
    preds_pos_test = clf.predict_proba(feats_pos_test).max(axis=1)
    preds_neg_test = clf.predict_proba(feats_neg_test).max(axis=1)
    preds_pos_train = clf.predict_proba(feats_pos_train).max(axis=1)

    for K in [10, 20, 30]:
        evaluator.K = K
        valid_hits = evaluator.eval({
                    'y_pred_pos': preds_pos_val,
                    'y_pred_neg': preds_neg_val,
                })[f'hits@{K}']
        print("val hits@{}: {:.5f}".format(K, valid_hits))
        test_hits = evaluator.eval({
                    'y_pred_pos': preds_pos_test,
                    'y_pred_neg': preds_neg_test,
                })[f'hits@{K}']
        print("test hits@{}: {:.5f}".format(K, test_hits))
        train_hits = evaluator.eval({
                'y_pred_pos': preds_pos_train,
                'y_pred_neg': preds_neg_val,
            })[f'hits@{K}']
        print("train hits@{}: {:.5f}".format(K, train_hits))
        print("\n")

#### Node2vec, $d=32$

In [9]:
emb, feats_train, labels_train = get_train_data(dataset[0], split_edge['train']['edge'], load_walks=load_walks)
clf = make_pipeline(StandardScaler(), LogisticRegression()).fit(feats_train, labels_train)
eval(clf, emb)

Loading walks...
val hits@10: 0.00010
test hits@10: 0.00016
train hits@10: 0.00010


val hits@20: 0.00022
test hits@20: 0.00025
train hits@20: 0.00019


val hits@30: 0.00034
test hits@30: 0.00033
train hits@30: 0.00031




#### Laplacian eigenmaps
##### $d=32$

In [10]:
emb, feats_train, labels_train = get_train_data(dataset[0], split_edge['train']['edge'], type='eigenmaps')
clf = make_pipeline(StandardScaler(), LogisticRegression()).fit(feats_train, labels_train)
eval(clf, emb, type="eigenmaps")

val hits@10: 0.01781
test hits@10: 0.04971
train hits@10: 0.02011


val hits@20: 0.02569
test hits@20: 0.05865
train hits@20: 0.02913


val hits@30: 0.03481
test hits@30: 0.07049
train hits@30: 0.03955




##### $d=64$

In [11]:
emb, feats_train, labels_train = get_train_data(dataset[0], split_edge['train']['edge'], d=64, type='eigenmaps')
clf = make_pipeline(StandardScaler(), LogisticRegression()).fit(feats_train, labels_train)
eval(clf, emb, type="eigenmaps")

val hits@10: 0.00000
test hits@10: 0.00000
train hits@10: 0.00000


val hits@20: 0.01463
test hits@20: 0.03833
train hits@20: 0.01660


val hits@30: 0.02428
test hits@30: 0.05552
train hits@30: 0.02758




##### $d=16$

In [13]:
emb, feats_train, labels_train = get_train_data(dataset[0], split_edge['train']['edge'], d=16, type='eigenmaps')
clf = make_pipeline(StandardScaler(), LogisticRegression()).fit(feats_train, labels_train)
eval(clf, emb, type="eigenmaps")

val hits@10: 0.02084
test hits@10: 0.04959
train hits@10: 0.02345


val hits@20: 0.02594
test hits@20: 0.05868
train hits@20: 0.02936


val hits@30: 0.03549
test hits@30: 0.07048
train hits@30: 0.04047




##### $d=8$

In [14]:
emb, feats_train, labels_train = get_train_data(dataset[0], split_edge['train']['edge'], d=8, type='eigenmaps')
clf = make_pipeline(StandardScaler(), LogisticRegression()).fit(feats_train, labels_train)
eval(clf, emb, type="eigenmaps")

val hits@10: 0.02104
test hits@10: 0.04643
train hits@10: 0.02381


val hits@20: 0.02534
test hits@20: 0.05521
train hits@20: 0.02872


val hits@30: 0.03291
test hits@30: 0.06858
train hits@30: 0.03757


