In [1]:
import networkx as nx
from networkx.exception import NetworkXError

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.sparse
import pickle

import torch

from dataloader import AmazonDataset
import models
from models import DistMulti, TransE
from training import TrainIterater
from evaluate import Evaluater

import optuna
import time 

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

import warnings
# Silences every warning for the rest of the session, which includes
# deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

# データロード

In [2]:
model_name = 'TransE'
# Reuse model_name so the dataset and the trainer are always configured alike.
dataset = AmazonDataset('./data', model_name=model_name)
# Each edge is the first two columns of a triplet row
# (presumably head and tail entity indices; verify against the dataloader).
edges = [[r[0], r[1]] for r in dataset.triplet_df.values]

# Hyperparameters
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('./best_param.pickle', 'rb') as f:
    best_params = pickle.load(f)

In [3]:
best_params

{'embedding_dim': 48.0,
 'batch_size': 384,
 'lr': 0.0007965458269452495,
 'weight_decay': 1.0276080015781148e-06,
 'warmup': 350,
 'lr_decay_every': 2,
 'lr_decay_rate': 0.6348448884535313}

# KG embedする

bestなハイパーパラメータを読み込んでepochを回す

In [4]:
def train_embed(params):
    """Build a TransE model and run the training loop with the given hyperparameters.

    Args:
        params: mapping with the keys 'embedding_dim', 'batch_size', 'lr',
            'weight_decay', 'warmup', 'lr_decay_every' and 'lr_decay_rate'.
            All hyperparameters are read from this argument (not from the
            global `best_params`).

    Returns:
        The TransE model after `TrainIterater.iterate_epoch` has been run on it.
    """
    # embedding_dim and batch_size may be stored as floats (e.g. 48.0), so cast.
    embedding_dim = int(params['embedding_dim'])
    batch_size = int(params['batch_size'])
    lr = params['lr']
    weight_decay = params['weight_decay']
    warmup = params['warmup']
    lr_decay_every = params['lr_decay_every']
    lr_decay_rate = params['lr_decay_rate']

    relation_size = len(set(list(dataset.triplet_df['relation'].values)))
    entity_size = len(dataset.entity_list)
    model = TransE(embedding_dim, relation_size, entity_size).to(device)
    iterater = TrainIterater(batch_size=batch_size, model_name=model_name)
    # eval_every is larger than the epoch count, so presumably no intermediate
    # evaluation is triggered during training (verify against TrainIterater).
    iterater.iterate_epoch(model, lr=lr, epoch=5000, weight_decay=weight_decay, warmup=warmup,
                           lr_decay_rate=lr_decay_rate, lr_decay_every=lr_decay_every, eval_every=1e+5)
    return model

# とりあえず初期化したモデルのembeddingを使って進める

In [5]:
# Untrained TransE model, sized from the dataset: one embedding per entity and
# one per distinct relation value.
embedding_dim = 16
entity_size = len(dataset.entity_list)
relation_size = len(set(dataset.triplet_df['relation'].values))
embed_model = TransE(int(embedding_dim), relation_size, entity_size)
embed_model = embed_model.to(device)

# PageRank

In [6]:
# Directed graph with one node per position in dataset.entity_list and one
# edge per triplet pair collected in `edges`.
G = nx.DiGraph()
G.add_nodes_from(range(len(dataset.entity_list)))
G.add_edges_from(edges)

# sparse sim_matを作る

In [16]:
# Re-create the dataset object (same path and model name as the initial load);
# this rebinds the global `dataset` that mk_sparse_sim_mat below reads.
dataset = AmazonDataset('./data', model_name='TransE')

def mk_sparse_sim_mat(model, gamma=(1.0, 1.0, 1.0)):
    """Build the sparse block-diagonal similarity matrix mixed into the PageRank walk.

    For each of the item, user and brand lists of the global `dataset`, the
    entity embeddings are looked up, all pairwise dot products are taken and the
    resulting block is scaled by the matching entry of `gamma`. The three blocks
    are placed on the diagonal in item, user, brand order. Every entry is then
    divided by the largest row sum and `1 - row_sum / max_row_sum` is added on
    the diagonal, so every row of the result sums to 1.

    NOTE(review): block positions only line up with entity indices if
    dataset.entity_list is laid out as items, then users, then brands -
    confirm against the dataloader.
    NOTE(review): a largest row sum of 0 (e.g. all gamma weights 0) makes the
    normalisation divide by zero; this case is not handled.

    Args:
        model: object exposing `entity_embed`, called with a LongTensor of
            entity indices.
        gamma: three weights for the item, user and brand blocks. Defaults to
            equal weights so the function can be called with the model only.

    Returns:
        scipy sparse matrix with one row/column per item, user and brand.
    """
    # Position of the first occurrence of each entity (same result as
    # list.index, without rescanning the list for every lookup).
    entity_pos = {}
    for pos, entity in enumerate(dataset.entity_list):
        entity_pos.setdefault(entity, pos)

    def _weighted_sim_block(members, weight):
        # Pairwise dot-product similarities of the members' embeddings, scaled by weight.
        idx = torch.tensor([entity_pos[m] for m in members],
                           dtype=torch.long, device=device)
        embed = model.entity_embed(idx)
        sim = torch.mm(embed, torch.t(embed))
        return weight * scipy.sparse.csr_matrix(sim.to('cpu').detach().numpy().copy())

    M = scipy.sparse.block_diag((
        _weighted_sim_block(dataset.item_list, gamma[0]),
        _weighted_sim_block(dataset.user_list, gamma[1]),
        _weighted_sim_block(dataset.brand_list, gamma[2]),
    ))

    # Row sums and their maximum are needed twice; compute them once.
    row_sum = M.sum(axis=1)
    max_row_sum = np.max(row_sum)
    # Column vector holding what each scaled row is missing to reach 1.
    M_ = np.array(1 - row_sum / max_row_sum)

    M = M / max_row_sum + scipy.sparse.diags(M_.transpose()[0])
    return M

In [8]:
def pagerank_scipy(G, sim_mat,  personal_vec=None, alpha=0.85, beta=0.01,
                   max_iter=500, tol=1.0e-6, weight='weight',
                   dangling=None):
    """Personalized PageRank over a blend of the graph walk and a similarity matrix.

    The row-normalised adjacency matrix of `G` is mixed with `sim_mat` as
    `beta * M + (1 - beta) * sim_mat`, and `power_iterate` is run once per
    column of `personal_vec`.

    NOTE(review): nx.to_scipy_sparse_matrix was removed in NetworkX 3.0, so this
    function needs an older NetworkX release.

    Args:
        G: NetworkX graph.
        sim_mat: sparse matrix with one row/column per node of `G`.
        personal_vec: array of shape (N, k) holding one personalization vector
            per column; a 1-D vector of length N is treated as a single column.
            When None, one uniform vector is used.
        alpha: damping factor passed to `power_iterate`.
        beta: weight of the graph transition matrix in the blend.
        max_iter: iteration cap passed to `power_iterate`.
        tol: convergence tolerance passed to `power_iterate`.
        weight: edge attribute used as edge weight.
        dangling: unused; kept for signature compatibility.

    Returns:
        ndarray of shape (k, N), one score vector per personalization column,
        or an empty dict when `G` has no nodes.
    """
    N = len(G)
    if N == 0:
        return {}

    nodelist = G.nodes()
    M = nx.to_scipy_sparse_matrix(G, nodelist=nodelist, weight=weight,
                                  dtype=float)
    # Row-normalise: S holds the reciprocal out-weight of every node, and stays
    # 0 for nodes without outgoing edges.
    S = np.array(M.sum(axis=1)).flatten()
    S[S != 0] = 1.0 / S[S != 0]
    Q = scipy.sparse.spdiags(S.T, 0, *M.shape, format='csr')
    M = Q * M

    # Blend the transition matrix with the similarity matrix.
    M = beta * M + (1 - beta) * sim_mat

    # Initial vector: uniform over all nodes.
    x = np.repeat(1.0 / N, N)

    # Personalization vectors, one per column.
    if personal_vec is None:
        p = np.full((N, 1), 1.0 / N)
    else:
        p = np.asarray(personal_vec, dtype=float)
        if p.ndim == 1:
            p = p[:, np.newaxis]

    # Mass sitting on dangling nodes is redistributed by the personalization vector.
    dangling_weights = p
    is_dangling = np.where(S == 0)[0]

    ppr_mat = [
        power_iterate(N, M, x, p[:, i], dangling_weights[:, i], is_dangling,
                      alpha, max_iter, tol)
        for i in range(p.shape[1])
    ]
    return np.array(ppr_mat)
    

def power_iterate(N, M, x, p, dangling_weights, is_dangling, alpha, max_iter=500, tol=1.0e-6):
    """Run the PageRank power iteration for one personalization vector.

    Each step computes
    `alpha * (x * M + sum(x[is_dangling]) * dangling_weights) + (1 - alpha) * p`,
    renormalises the result to sum to 1 and stops once the L1 change is below
    `N * tol`.

    Args:
        N: number of nodes, used to scale the tolerance.
        M: scipy sparse matrix; `x * M` must be a vector-matrix product.
        x: 1-D start vector.
        p: 1-D personalization vector.
        dangling_weights: 1-D vector that receives the mass of dangling nodes.
        is_dangling: indices of nodes without outgoing edges.
        alpha: damping factor.
        max_iter: maximum number of iterations.
        tol: per-node convergence tolerance.

    Returns:
        The last iterate. When the loop ends without converging, the vector sum,
        the last error and the threshold are printed and the current vector is
        still returned.
    """
    # Defined up front so the report below also works when max_iter <= 0.
    err = float('inf')
    # power iteration: make up to max_iter iterations
    for _ in range(max_iter):
        xlast = x
        x = alpha * (x * M + x[is_dangling].sum() * dangling_weights) + \
            (1 - alpha) * p
        x = x / x.sum()
        # check convergence, l1 norm
        err = np.abs(x - xlast).sum()
        if err < N * tol:
            return x
    # TODO: handle non-convergence properly; for now only report it.
    print(x.sum())
    print(err)
    print(N * tol)
    return x

In [None]:
# personal_vecを作る(eneity_size * user_size)
user_idx = [dataset.entity_list.index(u) for u in dataset.user_list]
personal_vec = []
for u in user_idx:
    val = np.zeros(len(G.nodes()))
    val[u] = 1
    personal_vec.append(val[np.newaxis, :])
personal_vec = np.concatenate(personal_vec, axis=0).transpose()
#sim_mat = mk_sparse_sim_mat(embed_model)
sim_mat = mk_sparse_sim_mat(model)
%time ppr = pagerank_scipy(G, sim_mat, personal_vec, alpha=0.5, beta=0.5)

In [None]:
# Keep only the columns of ppr that belong to items, in dataset.item_list order.
item_idx = list(map(dataset.entity_list.index, dataset.item_list))
pred = ppr[:, item_idx]

In [84]:
# Shape of the item score matrix: one row per row of ppr, one column per entry of item_idx.
pred.shape

(102, 1581)

In [67]:
# Indices of the ten highest scores in row 2 of ppr, best first.
np.argsort(ppr[2])[::-1][:10]

array([1583,  467,   35,  731,  473,  652,  898, 1369, 1498,  562])

In [68]:
# Shape of the PageRank result: one row per personalization column, one column per graph node.
ppr.shape

(5, 5407)

In [10]:
def pagerank_torch(G, sim_mat, personal_vec, alpha=0.85, beta=0.01,
                   max_iter=700, tol=1.0e-6, batch_size=512):
    """Batched personalized PageRank on `device`, iterating all columns at once.

    The row-normalised adjacency matrix of `G` is blended with `sim_mat` as
    `beta * M + (1 - beta) * sim_mat`. Every column of `personal_vec` is one
    personalization vector; all columns are updated together with a sparse
    matrix product.

    The update multiplies by the transpose of the blended matrix, i.e. the same
    vector-matrix product `x * M` that `pagerank_scipy` uses, so both functions
    propagate scores in the same edge direction. The stopping threshold is
    `N * tol` per column, again matching `pagerank_scipy`.

    NOTE(review): nx.to_scipy_sparse_matrix was removed in NetworkX 3.0, so this
    function needs an older NetworkX release.

    Args:
        G: NetworkX graph.
        sim_mat: scipy sparse matrix with one row/column per node of `G`.
        personal_vec: array of shape (N, k), one personalization vector per column.
        alpha: damping factor.
        beta: weight of the graph transition matrix in the blend.
        max_iter: maximum number of iterations.
        tol: per-node, per-column convergence tolerance.
        batch_size: unused; kept for signature compatibility.

    Returns:
        Tensor of shape (N, k), i.e. transposed relative to the result of
        `pagerank_scipy`, or an empty dict when `G` has no nodes.
    """
    N = len(G)
    if N == 0:
        return {}

    nodelist = G.nodes()
    M = nx.to_scipy_sparse_matrix(G, nodelist=nodelist, dtype=float)
    # Row-normalise; S stays 0 for nodes without outgoing edges.
    S = np.array(M.sum(axis=1)).flatten()
    S[S != 0] = 1.0 / S[S != 0]
    Q = scipy.sparse.spdiags(S.T, 0, *M.shape, format='csr')
    M = Q * M

    # Blend the transition matrix with the similarity matrix.
    M = (beta * M + (1 - beta) * sim_mat).tocoo()

    # Build the sparse tensor straight from the COO triplets (no dense N x N
    # intermediate). Row and column indices are swapped so the tensor holds
    # M^T and `M_t @ x` equals the row-vector product `x^T M` column by column.
    indices = torch.tensor(np.vstack((M.col, M.row)), dtype=torch.long, device=device)
    values = torch.tensor(M.data, dtype=torch.float, device=device)
    M_t = torch.sparse_coo_tensor(indices, values, (N, N)).coalesce()

    # Personalization vectors, one per column.
    p = torch.tensor(personal_vec, dtype=torch.float, device=device)
    n_vectors = p.shape[1]

    # Initial vectors: uniform over all nodes.
    x = torch.full((N, n_vectors), 1.0 / N, dtype=torch.float, device=device)

    # Dangling nodes hand their mass to the personalization vector.
    dangling_weights = p
    is_dangling = torch.tensor(np.where(S == 0)[0], dtype=torch.long, device=device)

    # err below is summed over all columns, so scale the threshold by their count.
    threshold = N * tol * n_vectors

    # power iteration: make up to max_iter iterations
    for _ in range(max_iter):
        xlast = x
        dangling_mass = x[is_dangling].sum(dim=0)
        x = alpha * (torch.sparse.mm(M_t, x) + dangling_mass * dangling_weights) + (1 - alpha) * p
        x = x / x.sum(dim=0)
        # check convergence, l1 norm
        err = torch.abs(x - xlast).sum()
        if err < threshold:
            return x

    return x

In [14]:
def item_ppr(sim_mat, alpha, beta):
    """Compute personalized PageRank scores of every item for every user.

    One one-hot personalization vector is built per entry of
    `dataset.user_list`, `pagerank_scipy` is run on the global graph `G`, and
    the columns that belong to `dataset.item_list` are kept.

    Args:
        sim_mat: similarity matrix passed through to `pagerank_scipy`.
        alpha: damping factor passed through to `pagerank_scipy`.
        beta: graph/similarity blend weight passed through to `pagerank_scipy`.

    Returns:
        Array with one row per user and one column per item. Its shape is
        printed before returning.
    """
    # Position of the first occurrence of each entity (same result as
    # list.index, without rescanning the list for every lookup).
    entity_pos = {}
    for pos, entity in enumerate(dataset.entity_list):
        entity_pos.setdefault(entity, pos)

    # Personalization matrix (entity_size x user_size): column j is one-hot at user j's node.
    user_idx = [entity_pos[u] for u in dataset.user_list]
    personal_vec = np.zeros((len(G.nodes()), len(user_idx)))
    personal_vec[user_idx, np.arange(len(user_idx))] = 1

    #ppr = pagerank_torch(G, sim_mat, personal_vec, alpha, beta)
    ppr = pagerank_scipy(G, sim_mat, personal_vec, alpha, beta)

    item_idx = [entity_pos[i] for i in dataset.item_list]
    pred = ppr[:, item_idx]
    print(pred.shape)
    return pred



def get_ranking_mat(model, gamma, alpha=0.85, beta=0.01):
    """Rank the items for every user by descending personalized PageRank score.

    Args:
        model: embedding model handed to `mk_sparse_sim_mat`.
        gamma: block weights handed to `mk_sparse_sim_mat`.
        alpha: damping factor handed to `item_ppr`.
        beta: graph/similarity blend weight handed to `item_ppr`.

    Returns:
        List with one array per entry of `dataset.user_list`; each array holds
        the column indices of that user's score row, highest score first.
    """
    scores = item_ppr(mk_sparse_sim_mat(model, gamma), alpha, beta)
    return [
        np.argsort(np.array(scores[row]))[::-1]
        for row in range(len(dataset.user_list))
    ]

# Evaluate

In [11]:
# Position in dataset.entity_list of every user, in dataset.user_list order;
# topn_precision reads this global.
user_idx = [dataset.entity_list.index(u) for u in dataset.user_list]
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('./data/user_items_test_dict.pickle', 'rb') as f:
    user_items_test_dict = pickle.load(f)

def topn_precision(ranking_mat, user_items_dict, n=10):
    """Average top-n hit ratio over the users that have held-out items.

    For the i-th ranking, the held-out items are looked up under the key
    `user_idx[i]` (global list). The per-user score is the number of held-out
    items found among the first `n` ranked entries divided by the number of
    held-out items. NOTE(review): dividing by the number of held-out items
    rather than by `n` makes this a recall-style ratio - confirm that this is
    the intended metric.

    Args:
        ranking_mat: sequence of rankings, one per user, best entry first.
        user_items_dict: mapping from user entity index to the held-out items.
        n: number of top-ranked entries to inspect.

    Returns:
        Mean per-user score over the rankings whose user has at least one
        held-out item; 0.0 when there is no such user.
    """
    skipped = 0
    score_sum = 0

    for i, sorted_idx in enumerate(ranking_mat):
        relevant = user_items_dict[user_idx[i]]
        if len(relevant) == 0:
            # Users without held-out items do not enter the average.
            skipped += 1
            continue
        hit = len(set(sorted_idx[:n]) & set(relevant))
        score_sum += hit / len(relevant)

    # Average over the rankings actually scored, so a ranking_mat that covers
    # only part of the users is not diluted by the missing ones.
    evaluated = len(ranking_mat) - skipped
    if evaluated == 0:
        return 0.0
    return score_sum / evaluated

# Optuna

In [12]:
# Load the embedding model from disk instead of retraining it.
# To retrain: model = train_embed(best_params)
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('model.pickle', 'rb') as f:
    model = pickle.load(f)

In [17]:
def time_since(runtime):
    """Split a duration in seconds into whole minutes and whole remaining seconds.

    Args:
        runtime: elapsed time in seconds.

    Returns:
        Tuple `(minutes, seconds)`, both truncated to int.
    """
    minutes = int(runtime / 60)
    seconds = int(runtime - minutes * 60)
    return minutes, seconds

def objective(trial):
    """Optuna objective: score one (alpha, beta, gamma) setting on the test split.

    Samples alpha in [0, 1], beta in [0, 0.5] and the three gamma block weights
    in [0, 1], builds the ranking with the global `model`, prints the first five
    rankings and the elapsed time, and returns the negated `topn_precision`.

    Args:
        trial: Optuna trial used to sample the parameters.

    Returns:
        Negative score, so that a minimising study maximises the score.
    """
    started_at = time.time()

    alpha = trial.suggest_uniform('alpha', 0, 1)
    beta = trial.suggest_uniform('beta', 0, 0.5)
    gamma = [trial.suggest_uniform(name, 0, 1)
             for name in ('gamma1', 'gamma2', 'gamma3')]

    ranking_mat = get_ranking_mat(model, gamma, alpha, beta)
    print(ranking_mat[0:5])
    score = topn_precision(ranking_mat, user_items_test_dict)

    mi, sec = time_since(time.time() - started_at)
    print('{}m{}sec'.format(mi, sec))

    return -score

In [None]:
# objective returns the negated score, so minimising it maximises the score.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30)

(3819, 1581)
[array([1479, 1205,    6, ...,  958,  298,  597]), array([  79,  766,  507, ..., 1369,  877,  527]), array([467, 731, 473, ..., 958, 298, 597]), array([ 298,  922,  597, ...,   35, 1498, 1369]), array([ 504,   93, 1164, ...,  198,  298,  597])]
6m4sec


[I 2020-07-07 11:53:04,193] Finished trial#0 with value: -0.345025342151754 with parameters: {'alpha': 0.05874091495623923, 'beta': 0.24495129436241087, 'gamma1': 0.7231491864979097, 'gamma2': 0.7755096347357538, 'gamma3': 0.4440974050775628}. Best is trial#0 with value: -0.345025342151754.


(3819, 1581)
[array([1479, 1205,    6, ...,  958,  298,  597]), array([ 79, 507, 766, ..., 958, 877, 527]), array([ 467,   35, 1369, ...,  958,  298,  597]), array([ 298,  597,  922, ...,   35, 1498, 1369]), array([ 504,   93, 1164, ...,  198,  298,  597])]
9m0sec


[I 2020-07-07 12:02:04,533] Finished trial#1 with value: -0.33974182343276693 with parameters: {'alpha': 0.17020148744273011, 'beta': 0.11901665872614742, 'gamma1': 0.0024022186130915335, 'gamma2': 0.06280590991187662, 'gamma3': 0.9054363346325803}. Best is trial#0 with value: -0.345025342151754.


In [None]:
# Export the trial history of the study as a pandas DataFrame and write it to CSV.
df = study.trials_dataframe()
df.to_csv('./hyparams_result_gamma.csv')

In [None]:
# Persist the best parameters found by the study as a pickle file.
with open('best_param_gamma.pickle', 'wb') as f:
    pickle.dump(study.best_params, f)

3819