In [12]:
import networkx as nx
from networkx.exception import NetworkXError

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.sparse
import pickle

import torch

from dataloader import AmazonDataset
import models
from models import DistMulti, TransE
from training import TrainIterater
from evaluate import Evaluater

import optuna
# Device on which the index tensors in mk_sparse_sim_mat are created; fixed to CPU.
device = 'cpu'

# データロード

In [51]:
# Load the dataset for the TransE setting.
model_name = 'TransE'
dataset = AmazonDataset('./data', model_name=model_name)

# Each graph edge is the first two columns of a triplet row
# (presumably head and tail entity indices — TODO confirm against AmazonDataset).
edges = [[triplet[0], triplet[1]] for triplet in dataset.triplet_df.values]

# KG embedする

bestなハイパーパラメータを読み込んでepochを回す

In [16]:
def train_embed(file_path):
    """Train a TransE embedding model on the module-level ``dataset``.

    Args:
        file_path: Intended location of the tuned hyperparameters. It is not
            read yet (see the TODO below).

    Returns:
        The value returned by ``TrainIterater.iterate_epoch``. It used to be
        computed and then discarded.
    """
    # TODO: load the hyperparameters from ``file_path``. Until that is
    # implemented, ``embedding_dim``, ``batch_size``, ``lr``, ``weight_decay``,
    # ``warmup``, ``lr_decay_rate`` and ``lr_decay_every`` must already exist as
    # globals; otherwise this function raises NameError.

    relation_size = len(set(list(dataset.triplet_df['relation'].values)))
    entity_size = len(dataset.entity_list)
    model = TransE(int(embedding_dim), relation_size, entity_size)
    iterater = TrainIterater(batch_size=int(batch_size), model_name=model_name)
    score = iterater.iterate_epoch(model, lr=lr, epoch=50, weight_decay=weight_decay, warmup=warmup,
                                   lr_decay_rate=lr_decay_rate, lr_decay_every=lr_decay_every, eval_every=10)
    return score

# 学習済みembedモデルを読み込む

# とりあえず初期化したモデルのembeddingを使って進める

In [45]:
# No trained embedding is loaded yet: build a freshly initialised TransE model
# sized from the dataset.
embedding_dim = 16
entity_size = len(dataset.entity_list)
relation_size = len(set(dataset.triplet_df['relation'].values))
embed_model = TransE(int(embedding_dim), relation_size, entity_size)

# PageRank

- sim_mat(item_len * item_len)を使って隣接行列を作る
- 隣接行列((item_len + user_len + brand_len) * (item_len + user_len + brand_len))
- nx.google_matrixを参考に隣接行列を作る  


danglingの効果はまだよくわかっていないので注意

In [52]:
# Directed graph with one node per entity index, plus the triplet edges.
G = nx.DiGraph()
G.add_nodes_from(range(len(dataset.entity_list)))
G.add_edges_from(edges)

In [8]:
def google_matrix(G, item_mat=None, alpha=0.85, beta=0.01, personalization=None,
                  weight='weight', dangling=None):
    """Build the dense Google matrix of ``G``, optionally mixed with item similarities.

    Args:
        G: NetworkX graph.
        item_mat: Optional item-item matrix handed to ``mk_sim_mat``. When
            given, the row-normalized transition matrix ``M`` is replaced by
            ``beta * M + (1 - beta) * mk_sim_mat(G, item_mat)``.
        alpha: Weight of the transition matrix in the returned matrix; the
            personalization vector gets ``1 - alpha``.
        beta: Weight of the graph transition matrix in the mix with the
            similarity matrix.
        personalization: Optional dict with a value for every node; it is
            normalized to sum to 1. Uniform when omitted.
        weight: Edge attribute used as the edge weight.
        dangling: Optional dict with a value for every node, used as the
            outgoing distribution of nodes without out-edges. Defaults to the
            personalization vector.

    Returns:
        ``alpha * M + (1 - alpha) * p``. For an empty graph, the empty
        adjacency matrix is returned unchanged.

    Raises:
        NetworkXError: if ``personalization`` or ``dangling`` lacks a node.
    """
    nodes = G.nodes()

    M = nx.to_numpy_matrix(G, nodelist=nodes, weight=weight)
    n_nodes = len(G)
    if n_nodes == 0:
        return M

    def _node_distribution(mapping, label):
        # Turn a {node: value} dict into an array in node order that sums to 1.
        absent = set(nodes) - set(mapping)
        if absent:
            raise NetworkXError((label + ' dictionary '
                                 'must have a value for every node. '
                                 'Missing nodes %s') % absent)
        dist = np.array([mapping[n] for n in nodes], dtype=float)
        dist /= dist.sum()
        return dist

    # Personalization vector
    if personalization is None:
        p = np.repeat(1.0 / n_nodes, n_nodes)
    else:
        p = _node_distribution(personalization, 'Personalization vector')

    # Distribution assigned to dangling nodes
    if dangling is None:
        dangling_weights = p
    else:
        dangling_weights = _node_distribution(dangling, 'Dangling node')

    # Rows whose weights sum to zero get the dangling distribution instead.
    for row in np.where(M.sum(axis=1) == 0)[0]:
        M[row] = dangling_weights

    # Normalize rows to sum to 1
    M /= M.sum(axis=1)

    if item_mat is not None:
        M = beta * M + (1 - beta) * mk_sim_mat(G, item_mat)

    return alpha * M + (1 - alpha) * p

In [24]:
def mk_sim_mat(G, item_mat):
    """Embed ``item_mat`` into a node-by-node matrix and normalize it.

    The result starts as an identity matrix with one row per node of ``G``;
    its top-left block is overwritten with ``item_mat``. It is then scaled by
    the largest row sum, and the mass each row is missing is put back on the
    diagonal (the definition from the RecWalk paper), so every row sums to 1.

    Args:
        G: Graph object; only ``len(G.nodes())`` is used.
        item_mat: Square 2-D array with at most as many rows as ``G`` has nodes.

    Returns:
        Dense numpy array of shape (number of nodes, number of nodes).
    """
    n_nodes = len(G.nodes())
    n_items = item_mat.shape[0]

    M = np.eye(n_nodes)
    M[:n_items, :n_items] = item_mat

    # Definition from the RecWalk paper
    row_sums = M.sum(axis=1)
    largest = np.max(row_sums)
    return M / largest + np.diag(1 - row_sums / largest)

# sparse sim_matを作る

In [44]:
dataset = AmazonDataset('./data', model_name='TransE')


def mk_sparse_sim_mat(model):
    """Build a sparse, RecWalk-normalized similarity matrix from entity embeddings.

    For the items, users and brands of the module-level ``dataset``, the
    matrix of embedding inner products is computed per group. The three
    matrices are stacked block-diagonally in that order. The result is scaled
    by its largest row sum and the missing row mass is added on the diagonal.

    NOTE(review): the block order only matches the graph's node order if
    ``dataset.entity_list`` lists all items, then all users, then all brands
    — confirm against AmazonDataset.

    Args:
        model: Object exposing ``entity_embed(index_tensor)`` that returns a
            2-D tensor with one embedding row per index.

    Returns:
        scipy sparse matrix with one row and column per item, user and brand.

    Raises:
        ValueError: if an item, user or brand is not in ``dataset.entity_list``.
    """
    # Position of the first occurrence of every entity (same result as
    # list.index), built once instead of scanning the list per lookup.
    # Assumes entities are hashable — TODO confirm.
    first_position = {}
    for pos, entity in enumerate(dataset.entity_list):
        first_position.setdefault(entity, pos)

    def _similarity_block(entities):
        # Inner-product similarity between the embeddings of ``entities``.
        try:
            rows = [first_position[e] for e in entities]
        except KeyError as missing:
            # Keep the exception type list.index raised for unknown entities.
            raise ValueError('%r is not in list' % (missing.args[0],)) from missing
        idx = torch.tensor(rows, dtype=torch.long, device=device)
        embed = model.entity_embed(idx)
        gram = torch.mm(embed, torch.t(embed))
        return scipy.sparse.csr_matrix(gram.to('cpu').detach().numpy().copy())

    blocks = tuple(_similarity_block(group)
                   for group in (dataset.item_list, dataset.user_list, dataset.brand_list))
    M = scipy.sparse.block_diag(blocks)

    # RecWalk normalization: scale by the largest row sum, then put the mass
    # each row is missing back on the diagonal.
    row_sums = M.sum(axis=1)
    max_row_sum = np.max(row_sums)
    residual = np.array(1 - row_sums / max_row_sum)

    M = M / max_row_sum + scipy.sparse.diags(residual.transpose()[0])
    return M

In [46]:
# Smoke test: build the similarity matrix from the freshly initialised embed_model.
mk_sparse_sim_mat(embed_model)

<5407x5407 sparse matrix of type '<class 'numpy.float64'>'
	with 17084371 stored elements in Compressed Sparse Row format>

In [10]:
def pagerank_numpy(G, item_mat, alpha=0.85, beta=0.01, personalization=None, weight='weight',
                   dangling=None):
    """PageRank via a dense eigendecomposition of ``google_matrix``.

    All arguments are forwarded to ``google_matrix``.

    Returns:
        Dict mapping each node of ``G`` to its score. The scores are the
        eigenvector of the largest eigenvalue of the transposed matrix,
        divided by their sum. An empty graph gives an empty dict.
    """
    if not len(G):
        return {}

    transition = google_matrix(G, item_mat, alpha, beta, personalization=personalization,
                               weight=weight, dangling=dangling)

    # numpy LAPACK solver; the left eigenvectors of the matrix are the
    # eigenvectors of its transpose.
    eigenvalues, eigenvectors = np.linalg.eig(transition.T)
    dominant = eigenvalues.argsort()[-1]
    stationary = np.array(eigenvectors[:, dominant]).flatten().real
    total = float(stationary.sum())
    return dict(zip(G, map(float, stationary / total)))


In [53]:
def pagerank_scipy(G, model, alpha=0.85, beta=0.01, personalization=None,
                   max_iter=100, tol=1.0e-6, weight='weight',
                   dangling=None):
    """Personalized PageRank by sparse power iteration on a mixed transition matrix.

    The row-normalized adjacency matrix of ``G`` is mixed with the similarity
    matrix from ``mk_sparse_sim_mat(model)`` as
    ``beta * M + (1 - beta) * sim_mat`` before the iteration starts.

    Args:
        G: NetworkX graph.
        model: Embedding model passed to ``mk_sparse_sim_mat``.
        alpha: Damping factor; the personalization vector gets ``1 - alpha``.
        beta: Weight of the graph transition matrix in the mix with ``sim_mat``.
        personalization: Optional dict with a value for every node; normalized
            to sum to 1. Uniform when omitted.
        max_iter: Maximum number of power-iteration steps.
        tol: Convergence tolerance; iteration stops once the L1 change of the
            score vector is below ``N * tol``.
        weight: Edge attribute used as the edge weight.
        dangling: Optional dict with a value for every node, used as the
            outgoing distribution of nodes without out-edges. Defaults to the
            personalization vector.

    Returns:
        Dict mapping each node to its score; empty dict for an empty graph.

    Raises:
        NetworkXError: if ``personalization`` or ``dangling`` lacks a node, or
            if the iteration does not converge within ``max_iter`` steps.
    """
    N = len(G)
    if N == 0:
        return {}

    nodelist = G.nodes()
    M = nx.to_scipy_sparse_matrix(G, nodelist=nodelist, weight=weight,
                                  dtype=float)
    # Row-normalize M; rows that sum to zero (dangling nodes) stay zero.
    # numpy is used directly: the scipy.array/repeat/where/absolute aliases
    # are deprecated.
    S = np.array(M.sum(axis=1)).flatten()
    S[S != 0] = 1.0 / S[S != 0]
    Q = scipy.sparse.spdiags(S.T, 0, *M.shape, format='csr')
    M = Q * M

    # initial vector
    x = np.repeat(1.0 / N, N)

    # Personalization vector
    if personalization is None:
        p = np.repeat(1.0 / N, N)
    else:
        missing = set(nodelist) - set(personalization)
        if missing:
            raise NetworkXError('Personalization vector dictionary '
                                'must have a value for every node. '
                                'Missing nodes %s' % missing)
        p = np.array([personalization[n] for n in nodelist],
                     dtype=float)
        p = p / p.sum()

    # Dangling nodes
    if dangling is None:
        dangling_weights = p
    else:
        missing = set(nodelist) - set(dangling)
        if missing:
            raise NetworkXError('Dangling node dictionary '
                                'must have a value for every node. '
                                'Missing nodes %s' % missing)
        # Convert the dangling dictionary into an array in nodelist order
        dangling_weights = np.array([dangling[n] for n in nodelist],
                                    dtype=float)
        dangling_weights /= dangling_weights.sum()
    is_dangling = np.where(S == 0)[0]

    # Merge the graph transition matrix with the similarity matrix.
    sim_mat = mk_sparse_sim_mat(model)
    M = beta * M + (1 - beta) * sim_mat

    # power iteration: make up to max_iter iterations
    for _ in range(max_iter):
        xlast = x
        # The dangling rows of the graph part are zero in M, so their mass is
        # redistributed here. It must carry the same ``beta`` weight as the
        # graph part; unscaled, every dangling node would pass on
        # (1 - beta) + 1 of its mass and x would stop summing to 1, unlike
        # the dense google_matrix construction.
        x = alpha * (x * M + beta * sum(x[is_dangling]) * dangling_weights) + \
            (1 - alpha) * p
        # check convergence, l1 norm
        err = np.absolute(x - xlast).sum()
        if err < N * tol:
            return dict(zip(nodelist, map(float, x)))
    raise NetworkXError('pagerank_scipy: power iteration failed to converge '
                        'in %d iterations.' % max_iter)

In [54]:
# Personalized PageRank for one user: all restart mass on entity user_idx[10].
# NOTE(review): `model` and `user_idx` are not defined by any earlier visible
# cell (the embedding model above is bound to `embed_model`, and `user_idx` is
# assigned in a later cell) — confirm this runs on a fresh kernel.
val = np.zeros(len(G.nodes()))
val[user_idx[10]] = 1
k = list(range(len(G.nodes())))
personal_vec = dict(zip(k, val))
pagerank_scipy(G, model, alpha=0.9, beta=0.01, personalization=personal_vec)

0

In [108]:
# Position of every user inside entity_list, in user_list order.
# NOTE(review): entity_list / user_list are not defined in the visible cells —
# presumably dataset.entity_list / dataset.user_list; confirm.
user_idx = [entity_list.index(u) for u in user_list]

def item_ppr(slim, user, alpha, beta):
    """Return one score per item of ``item_list`` for the given user.

    The personalized PageRank call is currently disabled: the scores come
    from a random vector normalized to sum to 1 (a placeholder to be removed
    later). ``slim``, ``alpha`` and ``beta`` are therefore not used yet.

    Args:
        slim: Unused for now; the disabled PageRank call read ``slim.sim_mat``.
        user: Node index of the user, used for the one-hot restart vector.
        alpha: Unused for now.
        beta: Unused for now.

    Returns:
        List of scores, one per entry of ``item_list``, in that order.
    """
    n_nodes = len(G.nodes())

    # One-hot restart vector for the user. It is built but not consumed until
    # the PageRank call below is restored.
    restart = np.zeros(n_nodes)
    restart[user] = 1
    personal_vec = dict(enumerate(restart))
    # TODO: pr = pagerank_numpy / pagerank_scipy(G, slim.sim_mat, alpha, beta,
    #                                            personalization=personal_vec)

    # Random placeholder — remove later.
    random_scores = np.random.rand(n_nodes)
    random_scores /= random_scores.sum()
    ppr = dict(enumerate(random_scores))

    # Keep only the scores of the item nodes, in item_list order.
    item_idx = [entity_list.index(item) for item in item_list]
    return [ppr[idx] for idx in item_idx]


def get_ranking_mat(slim, alpha=0.85, beta=0.01):
    """Rank the items for every user in the module-level ``user_idx``.

    Args:
        slim: Forwarded to ``item_ppr``.
        alpha: Forwarded to ``item_ppr``.
        beta: Forwarded to ``item_ppr``.

    Returns:
        List with one numpy array per user. Each array holds positions into
        the score list returned by ``item_ppr``, ordered from the highest
        score to the lowest.
    """
    # To debug on a subset, slice user_idx here instead of ranking every user.
    return [
        np.argsort(np.array(item_ppr(slim, user, alpha, beta)))[::-1]
        for user in user_idx
    ]

# Evaluate

In [13]:
# Position of every user inside entity_list, in user_list order.
# NOTE(review): entity_list / user_list are not defined in the visible cells —
# presumably dataset.entity_list / dataset.user_list; confirm.
user_idx = [entity_list.index(u) for u in user_list]

# Test-split dictionary, looked up by user index in topn_precision.
# The file is opened in a ``with`` block so the handle is closed after loading.
# pickle can execute arbitrary code: only load files you produced yourself.
with open('./data/user_items_test_dict.pickle', 'rb') as pickle_file:
    user_items_test_dict = pickle.load(pickle_file)

def topn_precision(ranking_mat, user_items_dict, n=10):
    """Average top-``n`` hit ratio over the users that have relevant items.

    Row ``i`` of ``ranking_mat`` belongs to user ``user_idx[i]`` (module-level
    list). Users whose entry in ``user_items_dict`` is empty are skipped.

    NOTE(review): the per-user value is hits divided by the number of the
    user's relevant items, not by ``n``, so it behaves like recall@n rather
    than precision@n. Left unchanged; confirm which metric is intended.

    Args:
        ranking_mat: Sequence of per-user rankings, best item first.
        user_items_dict: Mapping from user index to that user's relevant items.
        n: Number of top-ranked items considered per user.

    Returns:
        Mean per-user value over the evaluated users, or 0.0 when no user
        could be evaluated.
    """
    evaluated = 0
    precision_sum = 0

    for i, sorted_idx in enumerate(ranking_mat):
        relevant = user_items_dict[user_idx[i]]
        if len(relevant) == 0:
            continue
        hit = len(set(sorted_idx[:n]) & set(relevant))
        precision_sum += hit / len(relevant)
        evaluated += 1

    # Average over the users actually scored. Dividing by
    # len(user_idx) - skipped was wrong whenever ranking_mat covered only part
    # of the users, and raised ZeroDivisionError when every user was skipped.
    if evaluated == 0:
        return 0.0
    return precision_sum / evaluated

In [None]:
if __name__ == '__main__':
    # Single evaluation run with fixed hyperparameters.
    # SLIM settings
    gamma, lin_model = 1e-4, 'lasso'
    slim = train_SLIM(lin_model, gamma)

    # PageRank settings
    alpha, beta = 0.85, 0.01

    ranking_mat = get_ranking_mat(slim, alpha, beta)
    score = topn_precision(ranking_mat, user_items_test_dict)
    print(score)

# Optuna

In [25]:
def objective(trial):
    """Optuna objective: train SLIM, rank items for every user, return the score.

    The returned value comes from ``topn_precision``, so higher is better and
    the study running this objective has to maximize it.

    NOTE(review): the score is computed on ``user_items_test_dict``, i.e. the
    hyperparameters are tuned on the test split — confirm this is intended.

    Args:
        trial: Optuna trial used to sample ``gamma``, ``lin_model``, ``alpha``
            and ``beta``.

    Returns:
        The score of the sampled configuration.
    """
    # SLIM hyperparameters
    gamma = trial.suggest_loguniform('gamma', 1e-6, 1e-3)
    lin_model = trial.suggest_categorical('lin_model', ['lasso', 'elastic'])
    slim = train_SLIM(lin_model, gamma)

    # PageRank hyperparameters
    alpha = trial.suggest_uniform('alpha', 0, 1)
    beta = trial.suggest_uniform('beta', 0, 0.5)

    return topn_precision(get_ranking_mat(slim, alpha, beta), user_items_test_dict)

In [None]:
# The objective returns a score where higher is better. Optuna minimizes by
# default, so the direction has to be stated explicitly; otherwise the search
# converges towards the worst configuration.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=2)

0.005572710458254794
