# PPRを動かしたい

In [3]:
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle
import time

import torch
import optuna

# データロード

In [2]:
# データ読み込み
triplet_df = pd.read_csv('./data/triplet.csv')
edges = [[r[0], r[1]] for r in triplet_df.values]

entity_list = []
user_list =[]
item_list = []
with open('./data/entity_list.txt', 'r') as f:
    for l in f:
        entity_list.append(l.replace('\n', ''))
        
with open('./data/user_list.txt', 'r') as f:
    for l in f:
        user_list.append(l.replace('\n', ''))
        
with open('./data/item_list.txt', 'r') as f:
    for l in f:
        item_list.append(l.replace('\n', ''))
        
        
user_items_test_dict = pickle.load(open('./data/user_items_test_dict.pickle', 'rb'))

# ページランク

In [18]:
# グラフを作る
G = nx.DiGraph()
G.add_nodes_from([i for i in range(len(entity_list))])
G.add_edges_from(edges)

# tripletに重複が存在する

print('edges: {}'.format(len(G.edges)))
print('nodes: {}'.format(len(G.nodes)))


user_idx = [entity_list.index(u) for u in user_list]


def item_ppr(user, alpha):
    val = np.zeros(len(G.nodes))
    val[user] = 1
    k = [i for i in range(len(G.nodes))]
    personal_vec = dict(zip(k, val))
    #print(personal_vec)
    ppr = nx.pagerank_scipy(G, alpha=alpha)
    
    # random 後で消す
    #val = np.random.dirichlet([1 for i in range(len(G.nodes))], 1)[0]
    #k = [i for i in range(len(G.nodes))]
    #ppr = dict(zip(k, val))
    
    pred = []
    item_idx = [entity_list.index(i) for i in item_list]
    for i in item_idx:
        pred.append(ppr[i])
    
    return pred


def get_ranking_mat(alpha=0.85):
    ranking_mat = []
    count = 0
    for u in user_idx:
        pred = item_ppr(u, alpha)
        #print(pred)
        sorted_idx = np.argsort(np.array(pred))[::-1]
        ranking_mat.append(sorted_idx)
        
        #count += 1
        #if count > 100:
        #    break
            
    return ranking_mat

edges: 15899
nodes: 5407


# Evaluation

In [19]:
user_idx = [entity_list.index(u) for u in user_list]


def topn_precision(ranking_mat, user_items_dict, n=10):
    not_count = 0
    precision_sum = 0
        
    for i in range(len(ranking_mat)):
        if len(user_items_dict[user_idx[i]]) == 0:
            not_count += 1
            continue
        sorted_idx = ranking_mat[i]
        topn_idx = sorted_idx[:n]  
        hit = len(set(topn_idx) & set(user_items_dict[user_idx[i]]))
        precision = hit / len(user_items_dict[user_idx[i]])
        precision_sum += precision
        
    return precision_sum / (len(user_idx) - not_count)

# Optuna

In [24]:
def time_since(runtime):
    mi = int(runtime / 60)
    sec = int(runtime - mi * 60)
    return (mi, sec)


def objective(trial):
    start = time.time()
    alpha = trial.suggest_uniform('alpha', 0, 1)
    ranking_mat = get_ranking_mat(alpha)
    score = topn_precision(ranking_mat, user_items_test_dict)
    mi, sec = time_since(time.time() - start)
    print('{}m{}s'.format(mi, sec))
    return -1 * score

In [None]:
study = optuna.create_study()
study.optimize(objective, n_trials=10)

6m19s


[I 2020-07-03 19:10:39,563] Finished trial#0 with value: -0.14020407472100876 with parameters: {'alpha': 0.23707059694512522}. Best is trial#0 with value: -0.14020407472100876.


6m41s


[I 2020-07-03 19:17:20,689] Finished trial#1 with value: -0.018151265023270503 with parameters: {'alpha': 0.8501888823232407}. Best is trial#0 with value: -0.14020407472100876.


In [30]:
if __name__ == '__main__':
    ranking_mat = get_ranking_mat()
    score = topn_precision(ranking_mat, user_items_test_dict)
    print(score)

0.005537243893912877


In [None]:
# パラメータを保存しておく
df = study.trials_dataframe() # pandasのDataFrame形式
df.to_csv('hyparams_result.csv')