# 图游走算法

### Deepwalk

In [1]:
import networkx as nx
import numpy as np
from tqdm import tqdm
from gensim.models import word2vec
import matplotlib.pyplot as plt



In [2]:
def walkOneTime(g, start_node, walk_length):
    """Generate one uniform random walk over ``g`` starting at ``start_node``.

    Args:
        g: Graph-like object exposing ``successors(node)``, which yields the
            nodes reachable from ``node`` in one step.
        start_node: Node at which the walk begins.
        walk_length: Maximum number of steps to take after the start node.

    Returns:
        list[str]: ``str()`` of every visited node, in visiting order. The
        list holds ``walk_length + 1`` entries unless the walk reaches a node
        without successors, in which case it ends at that node.
    """
    walk = [str(start_node)]
    # Track the node object itself instead of parsing it back out of its
    # string form, so node labels do not have to be integers.
    current_node = start_node
    for _ in range(walk_length):
        successors = list(g.successors(current_node))
        if not successors:
            # Dead end: no later iteration could extend the walk.
            break
        # Pick by index so the chosen element keeps its original type.
        current_node = successors[np.random.randint(len(successors))]
        walk.append(str(current_node))
    return walk

In [3]:
def getDeepwalkSeqs(g, walk_length, num_walks):
    """Sample ``num_walks`` random walks from ``g``.

    Every walk starts at a node drawn uniformly at random (with replacement)
    from ``g.nodes`` and is produced by ``walkOneTime``.

    Args:
        g: Graph whose ``nodes`` attribute is iterable and which
            ``walkOneTime`` can traverse.
        walk_length: Forwarded to ``walkOneTime`` as the step limit per walk.
        num_walks: Total number of walks to generate.

    Returns:
        list: One entry per walk, each being the value returned by
        ``walkOneTime``.

    Raises:
        ValueError: If at least one walk is requested from a graph that has
            no nodes.
    """
    # Materialise the node collection once; it does not change between walks.
    nodes = list(g.nodes)
    if num_walks > 0 and not nodes:
        raise ValueError("cannot sample a start node from a graph with no nodes")
    seqs = []
    for _ in tqdm(range(num_walks)):
        # Pick by index so the start node keeps its original label type.
        start_node = nodes[np.random.randint(len(nodes))]
        seqs.append(walkOneTime(g, start_node, walk_length))
    return seqs

In [4]:
def deepwalk(g, dimensions=10, walk_length=80, num_walks=10, min_count=3):
    """Train a Word2Vec model on random-walk sequences sampled from ``g``.

    Args:
        g: Graph handed to ``getDeepwalkSeqs``.
        dimensions: Forwarded to Word2Vec as ``vector_size``.
        walk_length: Forwarded to ``getDeepwalkSeqs``.
        num_walks: Forwarded to ``getDeepwalkSeqs``.
        min_count: Forwarded to Word2Vec as ``min_count``.

    Returns:
        The ``word2vec.Word2Vec`` instance built from the sampled walks.
    """
    walk_corpus = getDeepwalkSeqs(g, num_walks=num_walks, walk_length=walk_length)
    return word2vec.Word2Vec(walk_corpus, min_count=min_count, vector_size=dimensions)

In [5]:
# Randomly generate a directed graph
g = nx.fast_gnp_random_graph(n=100, p=0.5, directed=True)

In [6]:
def img(graph):
    """Draw ``graph`` with its node labels and show the resulting figure."""
    draw_options = {"with_labels": True}
    nx.draw(graph, **draw_options)
    plt.show()

In [7]:
# Graph inspection snippets kept inside a bare string so they are not executed;
# the notebook only echoes the string itself as this cell's output.
"""
g.nodes() # 节点
g.edges() # 边
g.number_of_edges() # 边数
"""

'\ng.nodes() # 节点\ng.edges() # 边\ng.number_of_edges() # 边数\n'

In [8]:
model = deepwalk(g, dimensions=10, walk_length=20, num_walks=100, min_count=3)
print(model.wv.most_similar("2", topn=3)) # The three nodes closest to "2"

100%|███████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 871.88it/s]

[('15', 0.7530400156974792), ('56', 0.7324971556663513), ('28', 0.7142529487609863)]





In [9]:
# Snippets for saving the embeddings and the model, kept inside a bare string
# so nothing is written to disk when this cell runs.
"""
# 把 emd 存储下来用于下游任务
model.wv.save_word2vec_format('e.emd')
# 存储模型
model.save("m.model")
"""

'\n# 把 emd 存储下来用于下游任务\nmodel.wv.save_word2vec_format(\'e.emd\')\n# 存储模型\nmodel.save("m.model")\n'

### Node2Vec

In [10]:
from node2vec import Node2Vec

In [13]:
# Generate an undirected graph
graph = nx.fast_gnp_random_graph(n=100, p=0.5)

In [29]:
node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=100, p=0.5, q=0.7, workers=1) # Initialise the model
model = node2vec.fit() # Train the model (rebinds `model`, replacing the DeepWalk model assigned earlier)

HBox(children=(FloatProgress(value=0.0, description='Computing transition probabilities', style=ProgressStyle(…

Generating walks (CPU: 1):   2%|█                                                      | 2/100 [00:00<00:08, 11.66it/s]




Generating walks (CPU: 1): 100%|█████████████████████████████████████████████████████| 100/100 [00:18<00:00,  5.53it/s]


- 节点相似度

In [30]:
print(model.wv.most_similar("2",
                            # Nodes most similar to "2". `topn` is not passed, so the
                            # library default applies (the recorded output lists ten nodes).
                            # Uncomment the next line to limit the result to three nodes:
#                             topn=3
                         ))

[('28', 0.6456018090248108), ('15', 0.5819611549377441), ('37', 0.559927225112915), ('67', 0.5542974472045898), ('61', 0.5529149174690247), ('99', 0.5485485792160034), ('30', 0.5466107130050659), ('19', 0.5418100953102112), ('59', 0.540520191192627), ('52', 0.5404849648475647)]


- 单节点编码

In [55]:
# The loop below is kept inside a bare string, so it is not executed.
"""
for node in graph.nodes():
    print(model.wv[node].shape)
"""
    
# Look up the embedding vector stored under the key "1"
model.wv["1"]

array([-0.05670424, -0.11776148,  0.16466549,  0.17130235,  0.14641602,
        0.10605874,  0.0537626 , -0.14051244, -0.1390336 ,  0.08914851,
        0.05887314, -0.3729583 , -0.22055124,  0.04805889, -0.0421535 ,
        0.2025673 , -0.0648834 ,  0.05988329,  0.08195539,  0.22886425,
        0.1707751 ,  0.19753553,  0.07320962, -0.13405655, -0.28439248,
       -0.12541465, -0.09076144, -0.10836741, -0.14354342,  0.11156682,
        0.02959628, -0.03083695, -0.13827212, -0.08467235, -0.0039972 ,
        0.07067358,  0.04685776, -0.16219348,  0.10639705,  0.14086273,
       -0.09025347, -0.10162745,  0.12622674, -0.13905063,  0.14360055,
       -0.15669487,  0.15933496,  0.0879701 ,  0.13707766, -0.10728742,
        0.06012687, -0.16697957,  0.22834174,  0.09986109,  0.11671104,
        0.06285142, -0.01544343,  0.06305712, -0.00293846,  0.3721547 ,
       -0.01034816, -0.07619656, -0.17395972, -0.12744948], dtype=float32)

- 寻找与节点“1”和“2”相似、但与节点“3”不相似的节点

In [44]:
# Rank nodes with "1" and "2" as positive examples and "3" as a negative example; keep only the top hit
model.wv.most_similar(positive=['1', '2'], negative=['3'], topn=1)

[('28', 0.4699561595916748)]

- 在集合“1 2 3 4”中找出与其余节点最不相似的节点

In [57]:
# Ask the model which of the keys "1", "2", "3", "4" fits least well with the others
model.wv.doesnt_match("1 2 3 4".split())

'3'

- 节点“1”和“2”的相似度

In [47]:
# Similarity score between the embeddings of nodes "1" and "2"
model.wv.similarity('1', '2')

0.37870058

- 两个集合“list1”和“list2”的余弦相似度

In [54]:
# Two groups of node keys to compare
list1 = ['1', '2', '3', '4']
list2 = ['43', '23', '12']
# Single similarity score between the two groups, as computed by n_similarity
list_siml = model.wv.n_similarity(list1, list2)
print(list_siml)

0.81964993
