In [1]:
import sys
import pickle
from pprint import pprint 
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from data.make_casting_graph import oneway_to_bidirected_graph
from scipy.sparse import csc_matrix
import time
from pagerank import pagerank
from sklearn.preprocessing import normalize
from pyvis.network import Network

In [2]:
# Build the lookup table: movie index -> number of comments
with open('./data/ratings.csv', encoding='utf-8') as f:
    lines = f.readlines()
    # The first line is the header; every other line is split on commas
    docs = []
    for line in lines[1:]:
        docs.append(line.strip().split(','))
    # Each row must hold exactly two fields: the count first, then the movie index
    _idx2numcomments = {movie_idx: int(num) for num, movie_idx in docs}

In [3]:
# pre defined casting weight graph
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so only
# load casting_graph.pkl when the file comes from a trusted source.
with open('./data/casting_graph.pkl', 'rb') as f:
    graph = pickle.load(f)

In [4]:
# Build the lookup table: actor index -> actor name
with open('./data/actors.csv', encoding='utf-8') as f:
    # Skip the header exactly once. Calling next(f) and then slicing
    # readlines()[1:] skips two lines, which drops the first data row.
    # The default argument keeps an empty file from raising StopIteration.
    next(f, None)
    # Strip the line ending so the last field does not carry a trailing newline
    docs = [line.strip().split(',') for line in f]
    # Blank or single-field lines have no name column; skip them instead of
    # failing with IndexError
    docs = [doc for doc in docs if len(doc) > 1]
    # NOTE(review): the intent is "English name if it exists, else Korean name",
    # but the code simply takes the second column -- confirm the column layout.
    _idx2actor = {doc[0]: doc[1] for doc in docs}

In [5]:
# Build the lookup table: movie index -> second column of movies.csv
with open('./data/movies.csv', encoding='utf-8') as f:
    # Skip the header exactly once. Calling next(f) and then slicing
    # readlines()[1:] skips two lines, which drops the first data row.
    # The default argument keeps an empty file from raising StopIteration.
    next(f, None)
    # Strip the line ending so the last field does not carry a trailing newline
    docs = [line.strip().split(',') for line in f]
    # Filter per row: testing the whole list (len(docs)) is always true inside
    # the comprehension and never protected doc[1] from short or blank lines.
    _idx2movie = {doc[0]: doc[1] for doc in docs if len(doc) > 1}

In [6]:
def idx2movie(idx):
    """Return the entry of ``_idx2movie`` for ``idx``, or 'Unknown' when absent."""
    return _idx2movie.get(idx, 'Unknown')


def idx2actor(idx):
    """Return the entry of ``_idx2actor`` for ``idx``, or 'Unknown' when absent."""
    return _idx2actor.get(idx, 'Unknown')


def idx2numcomments(idx):
    """Return the entry of ``_idx2numcomments`` for ``idx``, or 0 when absent."""
    return _idx2numcomments.get(idx, 0)

In [7]:
# Convert the loaded casting graph with the project helper; `g` is the graph
# consumed by the ranking and visualisation cells.
# NOTE(review): presumably the helper adds the reverse of every edge -- confirm
# in data.make_casting_graph.
g = oneway_to_bidirected_graph(graph)

In [8]:
# Print the ten entries with the highest comment count, largest first
top_commented = list(_idx2numcomments.items())
top_commented.sort(key=lambda pair: pair[1], reverse=True)
for movie in top_commented[:10]:
    movie_idx, num_comments = movie
    print(idx2movie(movie_idx), num_comments)

기생충 40
극한직업 15
마약왕 15
인터스텔라 14
어벤져스: 엔드게임 12
걸캅스 12
마녀 12
택시운전사 11
배심원들 11
신과함께-죄와 벌 11


In [9]:
def _normalize(G):
    """It returns outbound normalized graph
    Arguments
    ---------
    G: inbound graph dict of dict
    """
    # Sum of outbound weight
    # t: to node, f: from node, w: weight
    W_sum = {}    
    for t, f_dict in G.items():
        for f, w in f_dict.items():
            W_sum[f] = W_sum.get(f, 0) + w
    A = {t:{f:w/W_sum[f] for f,w in f_dict.items()} for t, f_dict in G.items()}    
    nodes = set(G.keys())
    nodes.update(W_sum)
    return A, nodes

In [10]:
def pagerank(G, bias=None, df=0.15,
             max_iter=50, converge_error=0.001,verbose=0):
    """Compute (optionally biased) PageRank by power iteration on a dict graph.

    Arguments
    ---------
    G: Inbound graph, dict of dict
        G[to_node][from_node] = weight (float)
    bias: dict, node -> bias weight. Nodes missing from the dict get 0.
        When None or empty, every node gets the uniform value 1 / N.
    df: damping factor, float. default 0.15
        Used here as the weight of the bias term; the rank propagated along
        the edges is weighted by 1 - df.
    max_iter: int, maximum number of iterations
    converge_error: float, iteration stops once the summed absolute change
        between two consecutive rank vectors is below this value
    verbose: truthy to print progress for every iteration

    Returns
    -------
    rank: dict, node -> rank. Empty dict when G contains no nodes.
    """

    A, nodes = _normalize(G)
    N = len(nodes) # number of nodes
    if N == 0:
        # Nothing to rank; also avoids dividing by zero for the initial rank
        return {}
    sr = 1 - df # survival rate (1 -  damping factor)
    ir = 1 / N # initial rank

    # Initialization
    rank = {n:ir for n in nodes}

    # Initialization of bias
    if not bias:
        bias = {node:ir for node in nodes}

    # Iteration
    for _iter in range(1, max_iter + 1):
        rank_new = {}

        # t: to node, f: from node, w: weight
        for t in nodes:
            f_dict = A.get(t, {})
            # sum() over an empty dict is 0, so nodes without inbound edges
            # receive only the bias term
            rank_t = sum(w*rank[f] for f, w in f_dict.items())
            rank_new[t] = sr * rank_t + df * bias.get(t, 0)

        # convergence check
        diff = sum(abs(rank[n] - rank_new[n]) for n in nodes)

        # Adopt the new iterate before testing for convergence, so that an
        # early stop returns the most recent values rather than the previous ones
        rank = rank_new

        if diff < converge_error:
            if verbose:
                print('Early stopped at iter = {}'.format(_iter))
            break

        if verbose:
            sum_ = sum(rank_new.values())
            print('Iteration = {}, diff = {}, sum = {}'.format(_iter, diff, sum_))

    return rank

In [11]:
# Raw bias per node of g: nodes whose key starts with 'm' get the comment count
# looked up by the second whitespace-separated token of the key; all others get 0
bias = {}
for node in g:
    if node[0] == 'm':
        bias[node] = idx2numcomments(node.split()[1])
    else:
        bias[node] = 0

# Rescale so that the bias values sum to 1
_sum = sum(bias.values())
bias = {node: b / _sum for node, b in bias.items()}

In [12]:
# Run the dict-based PageRank and report how long it took.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() follows the wall clock and can jump when the
# system clock is adjusted.
starttime = time.perf_counter()
dict_rank = pagerank(g, bias=bias, df=0.15, max_iter=30, converge_error=0.001, verbose=1)
print('computation time : {} sec'.format(time.perf_counter() - starttime))

Iteration = 1, diff = 0.6745935594038649, sum = 1.0000000000000067
Iteration = 2, diff = 0.5133755765513079, sum = 1.0000000000000042
Iteration = 3, diff = 0.4070843471025297, sum = 1.0000000000000075
Iteration = 4, diff = 0.32881145690448793, sum = 1.0000000000000009
Iteration = 5, diff = 0.26900006261697257, sum = 1.000000000000007
Iteration = 6, diff = 0.22172923044566537, sum = 0.9999999999999926
Iteration = 7, diff = 0.18372765496993088, sum = 0.9999999999999928
Iteration = 8, diff = 0.1529064807765564, sum = 1.0000000000000049
Iteration = 9, diff = 0.127563916243621, sum = 0.9999999999999911
Iteration = 10, diff = 0.10676563571706423, sum = 0.9999999999999952
Iteration = 11, diff = 0.08947335545631432, sum = 1.0000000000000075
Iteration = 12, diff = 0.07517014319662835, sum = 1.0000000000000102
Iteration = 13, diff = 0.0631852881114483, sum = 0.9999999999999936
Iteration = 14, diff = 0.05320609097840679, sum = 0.9999999999999902
Iteration = 15, diff = 0.044830477927067305, sum = 

In [13]:
# Keep only the entries of the dict-based ranking whose key starts with 'm'.
# The first character is compared directly: calling .split()[0] on a single
# character adds nothing and raises IndexError when that character is
# whitespace. The loop variable no longer shadows dict_rank.
movie_dict_rank = {node: score for node, score in dict_rank.items() if node[0] == 'm'}

# Print id, title and score of the ten highest-ranked entries
for movie in sorted(movie_dict_rank.items(), key=lambda x: x[1], reverse=True)[:10]:
    movieId = movie[0].split()[1]
    print('{}, {}, {}'.format(movieId, idx2movie(movieId), movie[1]))

161967, 기생충, 0.0032033878121671224
167651, 극한직업, 0.0014303471787626468
175322, 마녀, 0.0011565783119412997
156464, 보헤미안 랩소디, 0.0011527961465662747
130966, 부산행, 0.001098819013448319
177483, 배심원들, 0.0009469824923736168
174065, 걸캅스, 0.0009354687095915042
37886, 클레멘타인, 0.000918249213245038
154449, 리틀 포레스트, 0.00091821747845663
163788, 알라딘, 0.0007997936563664337


In [14]:
#node index
# Collect every node that appears as an outer key or inside a neighbor dict,
# so the node2idx lookups below cannot fail on a node missing from g's keys.
nodes = set(g.keys())
for to_dict in g.values():
    nodes.update(to_dict)
idx2node = list(sorted(nodes))
node2idx = {node:idx for idx, node in enumerate(idx2node)}

#bias
# Lay the bias dict out in matrix index order; nodes without an entry get 0.
# NOTE: this rebinds `bias` from a dict to an array, so the cell that builds
# the bias dict has to be re-run before this cell is executed a second time.
bias = np.array([bias.get(node, 0) for node in idx2node])

#transform g to sparse matrix
rows = []
cols = []
data = []

for from_node, to_dict in g.items() :
    from_idx = node2idx[from_node]
    for to_node, weight in to_dict.items() :
        to_idx = node2idx[to_node]
        rows.append(from_idx)
        cols.append(to_idx)
        data.append(weight)

# Pass the shape explicitly. Without it the shape is inferred from the largest
# index present, which shrinks the matrix when the last nodes have no edges and
# then no longer matches the length of the bias vector.
n_nodes = len(idx2node)
A = csc_matrix((data, (rows, cols)), shape=(n_nodes, n_nodes))

In [15]:
max_iter = 30
# Here df weights the propagated rank and (1 - df) weights the bias, so
# df = 0.85 in this cell plays the role of df = 0.15 in pagerank().
df = 0.85

# Start from the uniform rank vector
n_nodes = A.shape[0]
ir = 1/n_nodes
num_rank = np.full(n_nodes, ir)

# perf_counter() is a monotonic clock meant for measuring elapsed time;
# time.time() follows the wall clock and can jump when it is adjusted.
starttime = time.perf_counter()
for n_iter in range(1, max_iter + 1) :
    rank_new = A.dot(num_rank) # call scipy.sparse safe_sparse_dot()
    # Rescale the propagated vector to unit L1 norm before mixing in the bias
    rank_new = normalize(rank_new.reshape(1,-1), norm='l1').reshape(-1)
    rank_new = df * rank_new + (1-df) * bias
    # Summed absolute change against the previous iterate (reported only;
    # the loop always runs max_iter times)
    diff = np.abs(num_rank - rank_new).sum()
    num_rank = rank_new
    print('iter {} : diff = {}'.format(n_iter, diff))
print('computation time : {} sec'.format(time.perf_counter() - starttime))

iter 1 : diff = 0.1685245368865779
iter 2 : diff = 0.123534416788289
iter 3 : diff = 0.11717242074154521
iter 4 : diff = 0.08676250638774644
iter 5 : diff = 0.08106650827175174
iter 6 : diff = 0.06044614044638538
iter 7 : diff = 0.05589952786903922
iter 8 : diff = 0.04188475454126574
iter 9 : diff = 0.038452782327255894
iter 10 : diff = 0.0289095171904886
iter 11 : diff = 0.026405522194198443
iter 12 : diff = 0.01994486388644759
iter 13 : diff = 0.01811137289916391
iter 14 : diff = 0.013753287448751986
iter 15 : diff = 0.012408911428306675
iter 16 : diff = 0.009469243738374537
iter 17 : diff = 0.008494000468005527
iter 18 : diff = 0.006511648928942716
iter 19 : diff = 0.005809774127703195
iter 20 : diff = 0.004473307017566352
iter 21 : diff = 0.0039712967053357525
iter 22 : diff = 0.0030704578506105173
iter 23 : diff = 0.0027152845982687866
iter 24 : diff = 0.002106149459828414
iter 25 : diff = 0.0018577039374234091
iter 26 : diff = 0.0014438021951808503
iter 27 : diff = 0.001270456142

In [16]:
# Map every position of the rank vector back to its node key
m_rank = {idx2node[idx]: score for idx, score in enumerate(num_rank)}
# Keep only the entries whose key starts with 'm'
movie_num_rank = {node: score for node, score in m_rank.items() if node[0] == 'm'}

# Print id, title and score of the ten highest-ranked entries
top_movies = sorted(movie_num_rank.items(), key=lambda pair: pair[1], reverse=True)[:10]
for movie in top_movies:
    movieId = movie[0].split()[1]
    print('{}, {}, {}'.format(movieId, idx2movie(movieId), movie[1]))

161967, 기생충, 0.0015437432925532173
156464, 보헤미안 랩소디, 0.0010864984266341052
175322, 마녀, 0.0008946794759721638
174065, 걸캅스, 0.0008564445054703045
167651, 극한직업, 0.0007648489380972874
37886, 클레멘타인, 0.000728929546919159
157297, 마약왕, 0.0007133104346250872
71509, 아저씨, 0.0006938076365826392
136900, 어벤져스: 엔드게임, 0.0006567566198412949
163788, 알라딘, 0.000638759850450271


In [17]:
# Ten highest-ranked entries of the dict-based ranking and their neighbor dicts in g
movie_df = pd.DataFrame(sorted(movie_dict_rank.items(), key=lambda x: x[1], reverse=True)[:10], columns=['movie', 'weight'])
movie_list = {node:g[node] for node in movie_df['movie']}
movie_node = []      # [movieId, actorId] pairs, one per neighbor of a top movie

neighbor_movie = {}  # actorId -> ids of every node of g whose neighbor dict contains that actor
seen_actors = set()

for movie, actor_list in movie_list.items():
    movie_id = movie.split()[1]
    for actor in actor_list:
        actor_id = actor.split()[1]
        movie_node.append([movie_id, actor_id])
        # The lookup below scans all of g, so run it once per actor even when
        # the actor appears in several of the top movies.
        if actor in seen_actors:
            continue
        seen_actors.add(actor)
        neighbor_movie[actor_id] = [node.split()[1] for node, neighbors in g.items() if actor in neighbors]

movie_node = pd.DataFrame(movie_node, columns=['movieId', 'actorId'])

In [18]:
# Interactive network for the top movies of the dict-based ranking
dict_net = Network(height='750px', width='100%')
dict_net.barnes_hut()
dict_top_movie = movie_node['movieId']
dict_targets = movie_node['actorId']

edge_dict_data = zip(dict_top_movie, dict_targets)

# One edge per (movieId, actorId) row; each id is passed as label and title too
for src, dst in edge_dict_data:
    for node_id in (src, dst):
        dict_net.add_node(node_id, node_id, title=node_id)
    dict_net.add_edge(src, dst)

# Connect every actor id to each id collected for it in neighbor_movie
for dict_r, dst_list in neighbor_movie.items():
    for dst in dst_list:
        dict_net.add_node(dst, dst, title=dst)
        dict_net.add_edge(dict_r, dst)

neighbor_map = dict_net.get_adj_list()

# add neighbor data to node hover data
for node in dict_net.nodes:
    adjacent = neighbor_map[node["id"]]
    node["title"] += " Neighbors:<br>" + "<br>".join(adjacent)
    node["value"] = len(adjacent)

dict_net.show("dict_node.html")

In [19]:
# Ten highest-ranked entries of the matrix-based ranking and their neighbor dicts in g
movie_df = pd.DataFrame(sorted(movie_num_rank.items(), key=lambda x: x[1], reverse=True)[:10], columns=['movie', 'weight'])
movie_list = {node:g[node] for node in movie_df['movie']}
movie_node = []      # [movieId, actorId] pairs, one per neighbor of a top movie

neighbor_movie = {}  # actorId -> ids of every node of g whose neighbor dict contains that actor
seen_actors = set()

for movie, actor_list in movie_list.items():
    movie_id = movie.split()[1]
    for actor in actor_list:
        actor_id = actor.split()[1]
        movie_node.append([movie_id, actor_id])
        # The lookup below scans all of g, so run it once per actor even when
        # the actor appears in several of the top movies.
        if actor in seen_actors:
            continue
        seen_actors.add(actor)
        neighbor_movie[actor_id] = [node.split()[1] for node, neighbors in g.items() if actor in neighbors]

movie_node = pd.DataFrame(movie_node, columns=['movieId', 'actorId'])

In [20]:
# Interactive network for the top movies of the matrix-based ranking
num_net = Network(height='750px', width='100%')
num_net.barnes_hut()
num_top_movie = movie_node['movieId']
num_targets = movie_node['actorId']

edge_num_data = zip(num_top_movie, num_targets)

# One edge per (movieId, actorId) row; each id is passed as label and title too
for src, dst in edge_num_data:
    for node_id in (src, dst):
        num_net.add_node(node_id, node_id, title=node_id)
    num_net.add_edge(src, dst)

# Connect every actor id to each id collected for it in neighbor_movie
for num_r, dst_list in neighbor_movie.items():
    for dst in dst_list:
        num_net.add_node(dst, dst, title=dst)
        num_net.add_edge(num_r, dst)

neighbor_map = num_net.get_adj_list()

# add neighbor data to node hover data
for node in num_net.nodes:
    adjacent = neighbor_map[node["id"]]
    node["title"] += " Neighbors:<br>" + "<br>".join(adjacent)
    node["value"] = len(adjacent)

num_net.show("num_node.html")