# PPRを動かしたい

In [1]:
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle
import time

import torch
import optuna

# データロード

In [4]:
# Load the data
triplet_df = pd.read_csv('../data_luxury_5core/triplet.csv')
# Only the first two columns of every triplet row are used as a directed edge.
edges = [[r[0], r[1]] for r in triplet_df.values]


def _read_lines(path):
    """Return every line of the text file at *path* with newlines removed."""
    with open(path, 'r') as f:
        return [line.replace('\n', '') for line in f]


entity_list = _read_lines('../data_luxury_5core/entity_list.txt')
user_list = _read_lines('../data_luxury_5core/user_list.txt')
item_list = _read_lines('../data_luxury_5core/item_list.txt')

# Open the pickle through a context manager so the file handle is closed
# as soon as the object has been loaded.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('../data_luxury_5core/user_items_test_dict.pickle', 'rb') as f:
    user_items_test_dict = pickle.load(f)

# ページランク

In [21]:
# Build the graph
G = nx.DiGraph()
# Register one node per entity index first, so entities that never appear in
# an edge are still part of the graph.
G.add_nodes_from(range(len(entity_list)))
G.add_edges_from(edges)

# The triplet data contains duplicates.

print('edges: {}'.format(G.number_of_edges()))
print('nodes: {}'.format(G.number_of_nodes()))


# Position of every user inside entity_list.
user_idx = list(map(entity_list.index, user_list))


def item_ppr(alpha, max_sources=4):
    """Compute personalized PageRank (PPR) vectors, one row per source entity.

    Args:
        alpha: Damping factor passed to the PageRank solver as ``alpha``.
        max_sources: How many leading entity indices (0, 1, 2, ...) are used
            as personalization sources. The default of 4 reproduces the
            early exit that used to be hard-coded in the loop; pass ``None``
            to process every entry of ``entity_list``.

    Returns:
        A NumPy array with one row per processed source. Row ``r`` holds the
        PageRank score of every node of ``G`` (in ``G.nodes`` order) when all
        of the personalization weight is placed on node ``r``.
    """
    nodes = list(G.nodes)
    n_sources = len(entity_list)
    if max_sources is not None:
        n_sources = min(n_sources, max_sources)

    # Newer NetworkX releases no longer ship pagerank_scipy; fall back to
    # nx.pagerank there and keep the original solver where it still exists.
    pagerank = getattr(nx, 'pagerank_scipy', nx.pagerank)

    ppr_mat = []
    for source in range(n_sources):
        # All restart probability sits on the current source node.
        personal_vec = dict.fromkeys(nodes, 0.0)
        personal_vec[source] = 1.0
        # The personalization vector must actually be handed to the solver;
        # without it every row is the same global PageRank.
        ppr = pagerank(G, alpha=alpha, personalization=personal_vec)
        # Read scores by node key so columns do not depend on dict ordering.
        ppr_mat.append([ppr[node] for node in nodes])
    return np.array(ppr_mat)

edges: 15938
nodes: 5407


In [22]:
# Build the PPR matrix with alpha = 0.85 and display its dimensions.
mat = item_ppr(0.85)
mat.shape

(4, 5407)

# Evaluation

In [4]:
# Position of every user inside entity_list (used to key the test dict).
user_idx = list(map(entity_list.index, user_list))


def topn_precision(ranking_mat, user_items_dict, n=10):
    """Average the top-n hit ratio over the users that have held-out items.

    Args:
        ranking_mat: Sequence of per-user rankings. Row ``i`` belongs to the
            user whose entity index is ``user_idx[i]``; its first ``n``
            entries are treated as that user's top-n recommendations.
        user_items_dict: Mapping from a user's entity index to the
            collection of that user's held-out item ids.
        n: Number of leading entries of each ranking that are scored.

    Returns:
        The mean, over all evaluated users, of
        ``hits / number of held-out items`` where ``hits`` is the number of
        held-out items found among the top-n entries. Users without
        held-out items are skipped. Returns ``0.0`` when no user could be
        evaluated.
    """
    evaluated = 0
    score_sum = 0

    for i, ranking in enumerate(ranking_mat):
        relevant = user_items_dict[user_idx[i]]
        if len(relevant) == 0:
            continue
        hits = len(set(ranking[:n]) & set(relevant))
        score_sum += hits / len(relevant)
        evaluated += 1

    # Divide by the users that were actually scored, not by len(user_idx):
    # the two differ when ranking_mat covers only part of the users, and the
    # count can be zero when every user was skipped.
    if evaluated == 0:
        return 0.0
    return score_sum / evaluated

# Optuna

In [5]:
def time_since(runtime):
    """Split a duration given in seconds into whole minutes and seconds.

    Args:
        runtime: Elapsed time in seconds.

    Returns:
        A ``(minutes, seconds)`` tuple of ints, both truncated toward zero.
    """
    minutes = int(runtime / 60)
    seconds = int(runtime - 60 * minutes)
    return minutes, seconds


def objective(trial):
    """Optuna objective: score one sampled ``alpha`` and report the run time.

    Args:
        trial: Optuna trial used to sample ``alpha`` uniformly between 0 and 1.

    Returns:
        The negated ``topn_precision`` score on ``user_items_test_dict``, so
        that a minimizing study favours higher scores.
    """
    # NOTE(review): get_ranking_mat is not defined in the visible part of
    # this file - confirm where it comes from.
    began = time.time()
    damping = trial.suggest_uniform('alpha', 0, 1)
    score = topn_precision(get_ranking_mat(damping), user_items_test_dict)
    elapsed = time_since(time.time() - began)
    print('{}m{}s'.format(*elapsed))
    return -score

In [None]:
# Create a study with Optuna's default settings and evaluate ten sampled
# values of alpha; objective() returns the negated score for each of them.
study = optuna.create_study()
study.optimize(objective, n_trials=10)

10m34s


[I 2020-07-04 17:12:08,074] Finished trial#0 with value: -0.1467979644613531 with parameters: {'alpha': 0.030872309473542248}. Best is trial#0 with value: -0.1467979644613531.


10m52s


[I 2020-07-04 17:23:01,115] Finished trial#1 with value: -0.14020407472100876 with parameters: {'alpha': 0.3456070199729967}. Best is trial#0 with value: -0.1467979644613531.


10m36s


[I 2020-07-04 17:33:38,157] Finished trial#2 with value: -0.14020407472100876 with parameters: {'alpha': 0.3643424017968895}. Best is trial#0 with value: -0.1467979644613531.


10m53s


[I 2020-07-04 17:44:32,121] Finished trial#3 with value: -0.12642436583658934 with parameters: {'alpha': 0.48552198208655606}. Best is trial#0 with value: -0.1467979644613531.


10m48s


[I 2020-07-04 17:55:20,920] Finished trial#4 with value: -0.10303468532177196 with parameters: {'alpha': 0.5413877085877248}. Best is trial#0 with value: -0.1467979644613531.


10m51s


[I 2020-07-04 18:06:12,288] Finished trial#5 with value: -0.06699829738453084 with parameters: {'alpha': 0.6644803143296887}. Best is trial#0 with value: -0.1467979644613531.


11m25s


[I 2020-07-04 18:17:38,120] Finished trial#6 with value: -0.018151265023270503 with parameters: {'alpha': 0.834960907105007}. Best is trial#0 with value: -0.1467979644613531.


10m47s


[I 2020-07-04 18:28:25,995] Finished trial#7 with value: -0.06699829738453084 with parameters: {'alpha': 0.6734461747311459}. Best is trial#0 with value: -0.1467979644613531.


10m43s


[I 2020-07-04 18:39:09,993] Finished trial#8 with value: -0.14020407472100876 with parameters: {'alpha': 0.12444306704299513}. Best is trial#0 with value: -0.1467979644613531.


In [30]:
# Entry point: score a single ranking matrix on the held-out test items and
# print the result.
# NOTE(review): get_ranking_mat is not defined in the visible part of this
# file, and it is called without arguments here but with alpha in
# objective() - confirm that it provides a default.
if __name__ == '__main__':
    ranking_mat = get_ranking_mat()
    score = topn_precision(ranking_mat, user_items_test_dict)
    print(score)

0.005537243893912877


In [None]:
# パラメータを保存しておく
# Save the trial parameters and results to CSV for later reference.
df = study.trials_dataframe() # as a pandas DataFrame
df.to_csv('hyparams_result.csv')