In [1]:
# Load the predefined casting-weight graph.
# For details, see data/movie-actor/make_casting_graph.py

import pickle
from pprint import pprint 
import sys
sys.path.append('/mnt/lovit/git/pagerank/data/movie-actor/')
sys.path.append('/mnt/lovit/git/pagerank/')

# Load the pickled casting graph. Judging by the lookup below, it maps a
# movie id (string) to a dict of {id: casting weight}.
# NOTE(review): pickle.load can execute arbitrary code; only load a pickle
# that comes from a trusted source.
with open('/mnt/lovit/git/pagerank/data/movie-actor/casting_graph.pkl', 'rb') as f:
    graph = pickle.load(f)

# Casting weights of movie '10001', largest weight first.
pprint(sorted(graph['10001'].items(), key=lambda pair: pair[1], reverse=True))

[('4374', 0.1711229946524064),
 ('178', 0.15040106951871657),
 ('3241', 0.13101604278074866),
 ('47952', 0.11296791443850267),
 ('47953', 0.0962566844919786),
 ('19538', 0.08088235294117647),
 ('18991', 0.06684491978609626),
 ('47954', 0.05414438502673797),
 ('6038', 0.0427807486631016),
 ('24102', 0.032754010695187165),
 ('47955', 0.02406417112299465),
 ('16903', 0.016711229946524065),
 ('47956', 0.0106951871657754),
 ('47957', 0.006016042780748663),
 ('47958', 0.00267379679144385),
 ('47959', 0.0006684491978609625)]


In [2]:
# Build the actor-id -> actor-name lookup from the tab-separated actors file.
with open('/mnt/lovit/git/pagerank/data/movie-actor/actors.csv', encoding='utf-8') as f:
    next(f, None)  # skip the first line (presumably a header); tolerate an empty file
    # Strip only the line terminator: a [:-1] slice would cut a real character
    # off the last line when the file does not end with a newline.
    docs = [line.rstrip('\n').split('\t') for line in f]
    # English name if it exists, else Korean name
    _idx2actor = {doc[0]:(doc[2] if doc[2] else doc[1]) for doc in docs}


def idx2actor(idx):
    """Return the name stored for actor id `idx`, or 'Unknown' if it is absent."""
    return _idx2actor.get(idx, 'Unknown')

# create idx to movie name function
def append_year_countries(year, countries):
    """Build the parenthesised suffix that follows a movie title.

    Falsy arguments are left out. With both present the result is
    ' (year, countries)'; with only one it is ' (value)'; with neither it
    is the empty string.
    """
    present = [value for value in (year, countries) if value]
    if not present:
        return ''
    return ' ({})'.format(', '.join('{}'.format(value) for value in present))

# Build the movie-id -> display-title lookup from the tab-separated movies file.
with open('/mnt/lovit/git/pagerank/data/movie-actor/movies.csv', encoding='utf-8') as f:
    next(f, None)  # skip the first line (presumably a header); tolerate an empty file
    # Strip only the line terminator: a [:-1] slice would cut a real character
    # off the last line when the file does not end with a newline.
    docs = [line.rstrip('\n').split('\t') for line in f]
    # Fields 4 and 5 are read below, so keep only rows with at least six
    # fields. The guard must test the row (`doc`); testing the whole list
    # (`docs`) is always true inside this loop and filters nothing.
    _idx2movie = {
        doc[0]: '{}{}'.format(doc[1], append_year_countries(doc[4], doc[5]))
        for doc in docs if len(doc) > 5
    }


def idx2movie(idx):
    """Return the display title stored for movie id `idx`, or 'Unknown' if it is absent."""
    return _idx2movie.get(idx, 'Unknown')

In [3]:
# Relies on the sys.path entries appended in the first cell to locate make_casting_graph.
from make_casting_graph import oneway_to_bidirected_graph
# Convert the loaded casting graph; the cell output below shows keys such as
# 'movie 22682' mapping to weight dicts such as {'actor 4009': 0.2, ...}.
g = oneway_to_bidirected_graph(graph)

In [4]:
# Peek at one (node, {neighbor: weight}) entry to check the converted graph's layout.
list(g.items())[0]

('movie 22682', {'actor 4009': 0.2, 'actor 7995': 0.8})

In [64]:
# Number of keys in the converted graph.
len(g)

265607

In [5]:
# Gather every node name: each key of g plus each neighbor named in its
# weight dict. These names index the sparse matrix built further down.
nodes = set(g)
for neighbors in g.values():
    nodes.update(neighbors)
print(len(nodes))

265607


In [6]:
# Fix a deterministic node order (sorted names) and build the inverse
# lookup from node name to its position in that order.
idx2node = sorted(nodes)
node2idx = dict(zip(idx2node, range(len(idx2node))))

In [56]:
import numpy as np  # imported here so this cell also runs on a fresh kernel

# Build the movie-id -> number-of-comments lookup.
with open('/mnt/lovit/git/pagerank/data/movie-actor/num_comments.txt', encoding='utf-8') as f:
    # Strip only the line terminator (a [:-1] slice would eat the last digit
    # when the file does not end with a newline) and ignore blank lines.
    docs = [line.rstrip('\n').split('\t') for line in f if line.strip()]
    _idx2numcomments = {movie_idx:int(num) for movie_idx, num in docs}

# One entry per node, in matrix index order: the comment count for movie
# nodes (named 'movie <id>'), 0 for every other node.
_counts = [
    _idx2numcomments.get(node.split()[1], 0) if 'movie' in node else 0
    for node in sorted(node2idx, key=node2idx.get)
]
_sum = sum(_counts)
if _sum <= 0:
    # Without this guard the division below fails with a bare ZeroDivisionError.
    raise ValueError('comments_bias has no positive mass; cannot normalize it')
# Scale so the bias sums to 1.
comments_bias = np.asarray(_counts, dtype=float) / _sum

In [7]:
from scipy.sparse import csr_matrix
from scipy.sparse import csc_matrix

# Collect (row, column, value) triplets for every weighted edge stored as
# g[from_node][to_node]; the row is the from-node, the column the to-node.
rows = []
cols = []
data = []

for from_node, to_dict in g.items():
    from_idx = node2idx[from_node]
    for to_node, weight in to_dict.items():
        rows.append(from_idx)
        cols.append(node2idx[to_node])
        data.append(weight)

# Pass the shape explicitly: when omitted it is inferred from the largest
# index present on each axis, so a high-index node missing from one axis
# would yield a matrix that is smaller than the node count or not square.
n_nodes = len(node2idx)
A = csc_matrix((data, (rows, cols)), shape=(n_nodes, n_nodes))

In [8]:
# Sanity check of the matrix dimensions against the node count printed earlier.
A.shape

(265607, 265607)

In [None]:
import numpy as np
from sklearn.preprocessing import normalize

In [37]:
# np.dot(A, rank)

In [21]:
# Start from the uniform distribution so this cell does not depend on a
# `rank` left in the kernel by a cell that appears later in the notebook.
rank = np.full(A.shape[0], 1 / A.shape[0])
rank_new = A.dot(rank)  # sparse matrix-vector product via the matrix's own dot()

In [22]:
# Compare the total mass after and before one propagation step.
rank_new.sum(), rank.sum()

(0.9997310802270433, 0.9999999999999993)

In [23]:
# L1-normalize the propagated vector: it is reshaped to a single row for
# normalize() and flattened back to 1-D afterwards.
rank_new = normalize(rank_new.reshape(1, -1), norm='l1').reshape(-1)
# Display the total after normalization.
rank_new.sum()

1.0000000000000002

In [25]:
# Confirm both vectors have the same 1-D shape.
rank.shape, rank_new.shape

((265607,), (265607,))

In [65]:
import time

In [66]:
# Biased PageRank by power iteration; `t` ends up holding the elapsed seconds.
t = time.time()

max_iter = 30
df = 0.85  # weight of the propagated rank; (1 - df) goes to the bias vector

# Every node starts with the same share of rank.
ir = 1 / A.shape[0]
rank = np.full(A.shape[0], ir)
# Uniform alternative: bias = np.asarray([ir] * A.shape[0])
bias = comments_bias.copy()

for n_iter in range(1, max_iter + 1):
    # Propagate through the sparse matrix, then rescale to unit L1 norm.
    propagated = A.dot(rank).reshape(1, -1)
    propagated = normalize(propagated, norm='l1').reshape(-1)
    # Mix the propagated rank with the bias distribution.
    rank_new = df * propagated + (1 - df) * bias
    # L1 distance between successive iterates, printed to monitor convergence.
    diff = np.abs(rank - rank_new).sum()
    rank = rank_new
    print('iter {} : diff = {}'.format(n_iter, diff))

t = time.time() - t
print(t)

iter 1 : diff = 0.281436840863022
iter 2 : diff = 0.3970505378527365
iter 3 : diff = 0.17598404082440902
iter 4 : diff = 0.14807658516192676
iter 5 : diff = 0.09962315062320268
iter 6 : diff = 0.0765602968138843
iter 7 : diff = 0.058114626092355424
iter 8 : diff = 0.042949603335787775
iter 9 : diff = 0.03410703651803252
iter 10 : diff = 0.024663500399523496
iter 11 : diff = 0.020053770053409213
iter 12 : diff = 0.014331538003882077
iter 13 : diff = 0.011801592719125686
iter 14 : diff = 0.008385166921666137
iter 15 : diff = 0.0069544954062276815
iter 16 : diff = 0.004925645467125705
iter 17 : diff = 0.004106103285551124
iter 18 : diff = 0.002901720075500885
iter 19 : diff = 0.002430137343265774
iter 20 : diff = 0.0017143036073690432
iter 21 : diff = 0.001442295384774437
iter 22 : diff = 0.0010158693712554695
iter 23 : diff = 0.0008586564597530771
iter 24 : diff = 0.0006037068986667392
iter 25 : diff = 0.0005127185595059035
iter 26 : diff = 0.00035977812698531186
iter 27 : diff = 0.00030

In [62]:
# Label each rank value with its node name, then split the result into
# movie nodes and actor nodes by the prefix contained in the name.
rank_ = dict((idx2node[idx], value) for idx, value in enumerate(rank))
movierank = {node: rank_[node] for node in rank_ if 'movie' in node}
actorrank = {node: rank_[node] for node in rank_ if 'actor' in node}

In [68]:
# Print the titles of the 50 highest-ranked movies, highest rank first.
for movie in sorted(movierank, key=movierank.get, reverse=True)[:50]:
    movie_idx = movie.split()[1]  # node names look like 'movie <id>'
    print(idx2movie(movie_idx))

26년 (2012, 한국)
부산행 (2016, 한국)
디 워 (2007, 한국)
곡성(哭聲) (2016, 한국)
7번방의 선물 (2013, 한국)
인터스텔라 (2014, 미국 영국)
인천상륙작전 (2016, 한국)
국제시장 (2014, 한국)
괴물 (2006, 한국)
국가대표 (2009, 한국)
암살 (2015, 한국)
베테랑 (2015, 한국)
아바타 (2009, 미국)
연평해전 (2015, 한국)
설국열차 (2013, 한국)
말할 수 없는 비밀 (2008, 대만)
겨울왕국 (2014, 미국)
왕의 남자 (2005, 한국)
캡틴 아메리카: 시빌 워 (2016, 미국)
님아, 그 강을 건너지 마오 (2014, 한국)
늑대소년 (2012, 한국)
귀향 (2016, 한국)
과속스캔들 (2008, 한국)
어벤져스: 에이지 오브 울트론 (2015, 미국)
세 얼간이 (2011, 인도)
다세포 소녀 (2006, 한국)
검사외전 (2016, 한국)
아저씨 (2010, 한국)
군도:민란의 시대 (2014, 한국)
광해, 왕이 된 남자 (2012, 한국)
해적: 바다로 간 산적 (2014, 한국)
해운대 (2009, 한국)
터널 (2016, 한국)
화려한 휴가 (2007, 한국)
아가씨 (2016, 한국)
럭키 (2016, 한국)
다크 나이트 라이즈 (2012, 미국 영국)
다이빙벨 (2014, 한국)
덕혜옹주 (2016, 한국)
아수라 (2016, 한국)
다크 나이트 (2008, 미국)
밀정 (2016, 한국)
인셉션 (2010, 미국 영국)
포화 속으로 (2010, 한국)
전우치 (2009, 한국)
검은 사제들 (2015, 한국)
히말라야 (2015, 한국)
트랜스포머 (2007, 미국)
7광구 (2011, 한국)
좋은 놈, 나쁜 놈, 이상한 놈 (2008, 한국)
