Implementation of the DeepWalk algorithm to generate node embeddings.

In [13]:
import networkx as nx
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
%matplotlib inline

In [14]:
# Input edge list and output embedding file.
# NOTE(review): these are absolute, machine-specific paths; adjust them before
# running the notebook anywhere else.
dataset = "/Users/vanshgupta/Desktop/AI and ML reading material/GraphGAN_Project/GraphGAN/bio-grid-human/bio-grid-human_dataset.txt1"
embedding_filename = "/Users/vanshgupta/Desktop/AI and ML reading material/GraphGAN_Project/Emb and Data/Emb/biogrid-human/DeepWalk/emb.txt"

# Hyper-parameters.
vector_size = 50   # dimensionality of each node embedding
epochs = 1         # Word2Vec training passes over the walk corpus
window = 10        # Word2Vec context window size
walk_length = 100  # maximum number of nodes in a single random walk
num_walks = 80     # number of walks started from every node

# The walks are driven by the `random` module (shuffle / choice), so seed it
# once here to make a fresh "Restart & Run All" produce the same walk corpus.
# NumPy's global generator is seeded as well for any NumPy-based randomness.
# NOTE: multi-threaded Word2Vec training may still not be bit-for-bit reproducible.
seed = 42
random.seed(seed)
np.random.seed(seed)

In [15]:
# Read the tab-separated edge list: one edge per row, two unnamed columns
# that are labelled here as source and target node ids.
edge_columns = ["NodeIDfrom", "NodeIDto"]
df = pd.read_csv(dataset, sep="\t", names=edge_columns)
df

Unnamed: 0,NodeIDfrom,NodeIDto
0,0,1
1,1,0
2,2,3
3,0,28
4,3,2
...,...,...
62359,9431,1153
62360,9432,4234
62361,9433,4234
62362,9434,4234


In [16]:
# Build an undirected NetworkX graph whose edges are the rows of `df`,
# then show how many distinct nodes it contains.
G = nx.from_pandas_edgelist(
    df,
    source="NodeIDfrom",
    target="NodeIDto",
    create_using=nx.Graph(),
)
G.number_of_nodes()

9436

In [17]:
def partition_num(num, workers):
    """Split ``num`` units of work into at most ``workers`` near-equal chunks.

    The chunk sizes differ by at most one and always sum to ``num``. Chunks
    of size zero are omitted, so fewer than ``workers`` chunks are returned
    when ``num < workers`` (and an empty list when ``num`` is 0).

    :param num: total number of work units to distribute (e.g. walks per node).
    :param workers: number of parallel workers; must be a positive integer.
        Special values such as ``-1`` are not understood here.
    :return: list of positive chunk sizes, largest first.
    :raises ValueError: if ``workers`` is smaller than 1.
    """
    if workers < 1:
        raise ValueError(
            "workers must be a positive integer, got {!r}".format(workers))

    # Give every worker the base share and hand the remainder out one unit at
    # a time, instead of creating an extra (workers + 1)-th straggler chunk.
    base, extra = divmod(num, workers)
    sizes = [base + 1] * extra + [base] * (workers - extra)
    return [size for size in sizes if size > 0]

In [18]:
import itertools
import math
import random

import pandas as pd
from joblib import Parallel, delayed


class RandomWalker:
    """Generate truncated, uniform random walks over a graph (DeepWalk style).

    Only the unbiased case ``p == 1 and q == 1`` is implemented in this class.
    ``p``, ``q`` and ``use_rejection_sampling`` are stored on the instance, but
    no biased (node2vec) walk is available; asking for walks with any other
    ``p``/``q`` raises ``NotImplementedError`` instead of returning nothing.
    """

    def __init__(self, G, p=1, q=1, use_rejection_sampling=False):
        """
        :param G: graph to walk on; must provide ``neighbors(node)`` and ``nodes()``.
        :param p: Return parameter, controls the likelihood of immediately revisiting a node in the walk.
        :param q: In-out parameter, allows the search to differentiate between "inward" and "outward" nodes.
        :param use_rejection_sampling: Whether to use the rejection sampling strategy in node2vec.
            Stored only; no method of this class reads it.
        """
        self.G = G
        self.p = p
        self.q = q
        self.use_rejection_sampling = use_rejection_sampling

    def _ensure_uniform_walk(self):
        """Raise if a biased walk was requested, since only p == q == 1 is supported."""
        if not (self.p == 1 and self.q == 1):
            raise NotImplementedError(
                "RandomWalker only implements uniform DeepWalk walks (p=1, q=1); "
                "got p={!r}, q={!r}".format(self.p, self.q))

    def deepwalk_walk(self, walk_length, start_node):
        """Run one uniform random walk.

        :param walk_length: maximum number of nodes in the walk (including the start).
        :param start_node: node the walk starts from.
        :return: list of visited nodes; shorter than ``walk_length`` if a node
            without neighbours is reached.
        """
        walk = [start_node]

        while len(walk) < walk_length:
            cur_nbrs = list(self.G.neighbors(walk[-1]))
            if not cur_nbrs:
                # Dead end: nothing to step to, so the walk is truncated here.
                break
            walk.append(random.choice(cur_nbrs))
        return walk

    def simulate_walks(self, num_walks, walk_length, workers=1, verbose=0):
        """Generate ``num_walks`` walks from every node, optionally in parallel.

        :param num_walks: number of walks started from each node.
        :param walk_length: maximum number of nodes per walk.
        :param workers: number of joblib jobs the work is split across.
        :param verbose: verbosity level forwarded to ``joblib.Parallel``.
        :return: flat list of walks (each walk is a list of nodes).
        :raises NotImplementedError: if ``p`` or ``q`` differs from 1.
        """
        # Fail before any job is dispatched rather than inside a worker.
        self._ensure_uniform_walk()

        G = self.G

        nodes = list(G.nodes())

        results = Parallel(n_jobs=workers, verbose=verbose, )(
            delayed(self._simulate_walks)(nodes, num, walk_length) for num in
            partition_num(num_walks, workers))

        walks = list(itertools.chain(*results))

        return walks

    def _simulate_walks(self, nodes, num_walks, walk_length, ):
        """Worker routine: ``num_walks`` passes over ``nodes``, one walk per node per pass.

        The node order is reshuffled before every pass. The shuffling is done
        on a private copy so the caller's list is left untouched.

        :raises NotImplementedError: if ``p`` or ``q`` differs from 1.
        """
        self._ensure_uniform_walk()

        nodes = list(nodes)
        walks = []
        for _ in range(num_walks):
            random.shuffle(nodes)
            walks.extend(
                self.deepwalk_walk(walk_length=walk_length, start_node=v)
                for v in nodes)
        return walks

In [19]:
import gensim

In [20]:


class DeepWalk:
    """DeepWalk pipeline: random-walk corpus plus skip-gram Word2Vec training.

    The walk corpus is generated when the object is constructed and kept in
    ``self.sentences``; ``train`` fits a gensim Word2Vec model on it.
    """

    def __init__(self, graph, walk_length, num_walks, workers=1):
        """
        :param graph: graph handed to ``RandomWalker``.
        :param walk_length: maximum number of nodes per walk.
        :param num_walks: number of walks started from each node.
        :param workers: number of parallel jobs used to simulate the walks.
        """
        self.graph = graph
        self.w2v_model = None
        self._embeddings = {}

        # p = q = 1 selects the plain, unbiased DeepWalk walk.
        self.walker = RandomWalker(graph, p=1, q=1)
        self.sentences = self.walker.simulate_walks(
            num_walks=num_walks,
            walk_length=walk_length,
            workers=workers,
            verbose=1,
        )

    def train(self, embed_size=128, window_size=5, workers=3, iter=5, **kwargs):
        """Fit Word2Vec on the stored walks and return the trained model.

        :param embed_size: embedding dimensionality (gensim ``vector_size``).
        :param window_size: context window (gensim ``window``).
        :param workers: number of gensim training threads.
        :param iter: number of training epochs (gensim ``epochs``).
        :param kwargs: extra ``Word2Vec`` arguments; ``min_count`` defaults to 0,
            while ``sentences``, ``vector_size``, ``sg``, ``hs``, ``workers``,
            ``window`` and ``epochs`` are always overwritten.
        :return: the trained ``gensim.models.Word2Vec`` model (also stored in
            ``self.w2v_model``).
        """
        kwargs.setdefault("min_count", 0)
        kwargs.update(
            sentences=self.sentences,
            vector_size=embed_size,
            sg=1,  # skip gram
            hs=1,  # deepwalk use Hierarchical Softmax
            workers=workers,
            window=window_size,
            epochs=iter,
        )

        print("Learning embedding vectors...")
        trained = gensim.models.Word2Vec(**kwargs)
        print("Learning embedding vectors done!")

        self.w2v_model = trained
        return trained

In [21]:
# Constructing DeepWalk runs the random-walk simulation; the resulting walks
# are kept as the "sentences" used to train Word2Vec.
deepwalk = DeepWalk(graph=G, num_walks=num_walks, walk_length=walk_length)
sentences = deepwalk.sentences

In [22]:

import gensim
# Train skip-gram (sg=1) embeddings on the walk corpus; hs=0 turns hierarchical
# softmax off, so gensim's negative sampling is used.
# min_count=0 keeps every node in the vocabulary: gensim's default of 5 drops
# tokens seen fewer than five times, which would make the per-node lookup
# `model.wv[node]` raise KeyError for rarely visited nodes.
model = gensim.models.Word2Vec(sentences=sentences,
                 vector_size=vector_size,
                 epochs=epochs,
                 window=window,
                 min_count=0,
                 compute_loss=True,
                 sg=1,
                 hs=0)

In [23]:
   
# Collect the learned vector of every node, one row per node in G.nodes() order,
# then show the first row and the largest coordinate as a quick sanity check.
embeddings = np.array([model.wv[node] for node in G.nodes()])
embeddings[0], embeddings.max()

(array([ 2.96153158e-01, -1.04267165e-01,  4.26950939e-02,  2.57463127e-01,
         2.28103939e-02, -3.89591932e-01,  3.12267214e-01, -6.11948036e-02,
         1.72627553e-01,  5.45855463e-02,  9.65009779e-02, -2.11285546e-01,
         3.75170738e-01,  8.74809623e-01,  3.51438999e-01, -1.10729681e-02,
        -5.69812596e-01,  1.95666507e-01, -1.31305292e-01, -2.16847882e-01,
        -5.35295159e-03,  2.96301037e-01,  9.84260857e-01,  7.08460286e-02,
        -2.94414908e-01,  1.28813297e-01,  6.32580519e-01,  1.09612811e+00,
        -2.93216616e-01, -2.38761678e-01,  2.97940582e-01,  9.42010581e-02,
         2.20003352e-01, -2.16920540e-01, -6.82325304e-01,  1.78416550e-01,
         2.23131299e-01,  9.49125648e-01, -8.14040244e-01, -1.34721234e-01,
        -2.41122469e-02,  1.00027840e-03, -4.34153602e-02, -3.41843992e-01,
        -4.00350630e-01,  4.55939084e-01,  2.38143608e-01, -7.07993925e-01,
         4.89831299e-01,  3.21701944e-01], dtype=float32),
 4.219175)

In [24]:
import os

# Create the output directory if needed so that open() below cannot fail on a
# missing folder (skipped when the filename has no directory component).
output_dir = os.path.dirname(embedding_filename)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Prepend the node id as the first column of each embedding row.
index = np.array(G.nodes()).reshape(-1, 1)
embedding_matrix = np.hstack([index, embeddings])
embedding_list = embedding_matrix.tolist()
# One text line per node: "<node id> <v1> <v2> ...".
embedding_str = [str(int(emb[0])) + " " + " ".join([str(x) for x in emb[1:]]) + "\n"
                for emb in embedding_list]
with open(embedding_filename, "w+") as f:
    # Header line: "<number of nodes>\t<embedding dimension>". The dimension is
    # taken from the matrix itself rather than hard-coded, so it stays correct
    # when vector_size changes.
    lines = [str(G.number_of_nodes()) + "\t" + str(embeddings.shape[1]) + "\n"] + embedding_str
    f.writelines(lines)
