In [1]:
import networkx as nx
import random
random.seed(0)
import numpy as np
np.random.seed(0)

G = nx.erdos_renyi_graph(10,0.3,seed=1,directed=False)

In [None]:
def next_node(previous,current,p,q):
    neighbors = list(G.neighbors(current)) #현재 노드의 이웃노드 목록들
    alphas = []

    for neighbor in neighbors:
        if neighbor == previous: #이전 노드로 되돌아가는 경우
            alpha = 1/p
        elif G.has_edge(neighbor,previous): #다음 노드가 이전 노드와 이웃(BFS)
            alpha = 1
        else: #더 깊게 들어가는 경우(DFS)
            alpha = 1/q
        alphas.append(alpha)

    probs = [alpha/sum(alphas) for alpha in alphas]
    next = np.random.choice(neighbors,size=1,p=probs)[0]
    return next


In [8]:
def random_walk(start,length,p,q):
    walk = [start]

    for _ in range(length):
        current = walk[-1]
        previous = walk[-2] if len(walk)>1 else None
        next = next_node(previous,current,p,q)
        walk.append(next)
    
    return [str(x) for x in walk]

In [9]:
random_walk(0,8,p=1,q=1)

['0', '4', '7', '6', '4', '5', '4', '5', '6']

In [10]:
random_walk(0,8,1,10)

['0', '9', '1', '9', '1', '9', '1', '0', '1']

In [11]:
random_walk(0,8,10,1)

['0', '1', '9', '4', '7', '8', '7', '4', '6']

In [12]:
from gensim.models.word2vec import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [13]:
G = nx.karate_club_graph()

In [14]:
labels = []
for node in G.nodes:
    label = G.nodes[node]['club']
    labels.append(1 if label=='Officer' else 0)

walks = []
for node in G.nodes:
    for _ in range(80):
        walks.append(random_walk(node,10,3,2))

In [15]:
node2vec = Word2Vec(walks,
                    hs=1,
                    sg=1,
                    vector_size=100,
                    window=10,
                    workers=2,
                    min_count=1,
                    seed=0)

In [16]:
node2vec.train(walks,total_examples=node2vec.corpus_count,epochs=30,report_delay=1)

(185807, 897600)

In [17]:
train_mask = [i for i in range(2,25,2)]
train_mask_str = [str(x) for x in train_mask]
test_mask = [0,1,3,5,7,9,11,13,15,17,19,21,23,25,26,27,28,29,30,31,32,33]
test_mask_str = [str(x) for x in test_mask]
labels = np.array(labels)

In [18]:
clf = RandomForestClassifier(random_state=0)
clf.fit(node2vec.wv[train_mask_str],labels[train_mask])

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [22]:
y_pred = clf.predict(node2vec.wv[test_mask_str])
acc = accuracy_score(y_pred,labels[test_mask])
print(f'Node2Vec accuracy score = {acc*100:.2f}%')

Node2Vec accuracy score = 95.45%


In [27]:
#영화 추천시스템

from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile

url = 'https://files.grouplens.org/datasets/movielens/ml-100k.zip'
with urlopen(url) as zurl:
    with ZipFile(BytesIO(zurl.read())) as zfile:
        zfile.extractall('.')

In [28]:
import pandas as pd
ratings = pd.read_csv('ml-100k/u.data',sep='\t',names=['user_id','movie_id',
                                                       'rating','unix_timestamp'])
ratings

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [29]:
movies = pd.read_csv('ml-100k/u.item',sep='|',usecols=range(2),
                     names=['movie_id','title'],
                     encoding='latin-1')

In [30]:
movies

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
...,...,...
1677,1678,Mat' i syn (1997)
1678,1679,B. Monkey (1998)
1679,1680,Sliding Doors (1998)
1680,1681,You So Crazy (1994)


In [34]:
ratings = ratings[ratings.rating>=4]
ratings

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
5,298,474,4,884182806
7,253,465,5,891628467
11,286,1014,5,879781125
12,200,222,5,876042340
16,122,387,5,879270459
...,...,...,...,...
99988,421,498,4,892241344
99989,495,1091,4,888637503
99990,806,421,4,882388897
99991,676,538,4,892685437


In [None]:
from collections import defaultdict

pairs = defaultdict(int)
for group in ratings.groupby('user_id'):
    user_movies = list(group[1]['movie_id'])
    #groupby를 진행하면, group[0] = user_id, group[1]은 기준키에 해당하는 모든 데이터를 의미
    for i in range(len(user_movies)):
        for j in range(i+1,len(user_movies)):
            pairs[(user_movies[i],user_movies[j])]+=1

In [58]:
G = nx.Graph()
for pair in pairs:
    movie1,movie2 = pair
    score = pairs[pair]

    if score >= 20:
        G.add_edge(movie1,movie2,weight=score)

In [59]:
!pip install node2vec

Collecting node2vec
  Downloading node2vec-0.5.0-py3-none-any.whl.metadata (849 bytes)
Collecting numpy<2.0.0,>=1.24.0 (from node2vec)
  Using cached numpy-1.26.4-cp310-cp310-macosx_11_0_arm64.whl.metadata (61 kB)
Downloading node2vec-0.5.0-py3-none-any.whl (7.2 kB)
Using cached numpy-1.26.4-cp310-cp310-macosx_11_0_arm64.whl (14.0 MB)
Installing collected packages: numpy, node2vec
[2K  Attempting uninstall: numpy
[2K    Found existing installation: numpy 2.2.6
[2K    Uninstalling numpy-2.2.6:
[2K      Successfully uninstalled numpy-2.2.6
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [node2vec]
[1A[2K[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opencv-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
tensorflow-macos 2.16.2 requires tensorflow==2.16.2; platfo

In [66]:
from node2vec import Node2Vec

In [None]:
n2v = Node2Vec(G,dimensions=64,walk_length=20,num_walks=200,p=2,q=1,workers=1)
#그래프를 입력받고 랜덤워크를 생성하는 단계

Computing transition probabilities:   0%|          | 0/410 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 200/200 [00:13<00:00, 14.48it/s]


In [None]:
model = n2v.fit(window=10,min_count=1,batch_words=4)
#생성된 랜덤워크를 워드투벡(스킵그램) 방식으로 학습함

Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


In [114]:
def recommend(movie):
    movie_id = str(movies[movies.title==movie].movie_id.values[0])

    for id in model.wv.most_similar(movie_id)[:5]:
        title = movies.loc[movies.movie_id == int(id[0]),'title'].iloc[0]
        print(f'{title} : {id[1]:.2f}')

In [116]:
recommend('Star Wars (1977)')

Return of the Jedi (1983) : 0.63
Raiders of the Lost Ark (1981) : 0.60
Silence of the Lambs, The (1991) : 0.51
Toy Story (1995) : 0.50
Indiana Jones and the Last Crusade (1989) : 0.47
