In [1]:
import networkx as nx
from networkx.exception import NetworkXError

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.sparse
import pickle

from SLIM_model import SLIM
import optuna
import time

import warnings
warnings.filterwarnings('ignore')

# データロード

In [2]:
slim_train = pd.read_csv('./data/user_item_train_slim.csv')
triplet_df = pd.read_csv('./data/triplet.csv')
edges = [[r[0], r[1]] for r in triplet_df.values]

user_list = []
item_list = []
entity_list = []
with open('./data/user_list.txt', 'r') as f:
    for l in f:
        user_list.append(l.replace('\n', ''))
with open('./data/item_list.txt', 'r') as f:
    for l in f:
        item_list.append(l.replace('\n', ''))
with open('./data/entity_list.txt', 'r') as f:
    for l in f:
        entity_list.append(l.replace('\n', ''))

# SLIMのハイパラをロードする
best_params = pickle.load(open('best_param.pickle', 'rb'))

In [3]:
best_params

{'alpha': 0.27243969881987223, 'l1_ratio': 0.7327114765483055}

In [14]:
# ハイパラ
# gamma
def train_SLIM(hyparam):
    # ハイパラロードもっと上手く書く
    alpha = hyparam['alpha']
    l1_ratio = hyparam['l1_ratio']
    #lin_model = hyparam['lin_model']
    slim = SLIM(alpha, l1_ratio, len(user_list), len(item_list), lin_model='elastic')
    #slim.fit_multi(slim_train)
    slim.load_sim_mat('./sim_mat.txt', slim_train)
    #slim.save_sim_mat('./sim_mat.txt')
    return slim

# PageRank

- sim_mat(item_len * item_len)を使って隣接行列を作る
- 隣接行列((item_len + user_len + brand_len) * (item_len + user_len + brand_len))
- nx.google_matrixを参考に隣接行列を作る  


danglingあんまり効果がわからないので注意

In [5]:
G = nx.DiGraph()
G.add_nodes_from([i for i in range(len(entity_list))])
G.add_edges_from(edges)

In [6]:
def google_matrix(G, item_mat=None, alpha=0.85, beta=0.01, personalization=None,
                  weight='weight', dangling=None):

    nodelist = G.nodes()

    M = nx.to_numpy_matrix(G, nodelist=nodelist, weight=weight)
    N = len(G)
    if N == 0:
        return M

    # Personalization vector
    if personalization is None:
        p = np.repeat(1.0 / N, N)
    else:
        missing = set(nodelist) - set(personalization)
        if missing:
            raise NetworkXError('Personalization vector dictionary '
                                'must have a value for every node. '
                                'Missing nodes %s' % missing)
        p = np.array([personalization[n] for n in nodelist], dtype=float)
        p /= p.sum()

    #print(p)
    #print(p.shape)
        
    # Dangling nodes
    if dangling is None:
        dangling_weights = p
    else:
        missing = set(nodelist) - set(dangling)
        if missing:
            raise NetworkXError('Dangling node dictionary '
                                'must have a value for every node. '
                                'Missing nodes %s' % missing)
        # Convert the dangling dictionary into an array in nodelist order
        dangling_weights = np.array([dangling[n] for n in nodelist],
                                    dtype=float)
        dangling_weights /= dangling_weights.sum()
    dangling_nodes = np.where(M.sum(axis=1) == 0)[0]

    # Assign dangling_weights to any dangling nodes (nodes with no out links)
    for node in dangling_nodes:
        M[node] = dangling_weights

    
    M /= M.sum(axis=1)  # Normalize rows to sum to 1
    
    if item_mat is not None:
        sim_mat = mk_sim_mat(G, item_mat)

        M = beta * M + (1 - beta) * sim_mat
    
    return alpha * M + (1 - alpha) * p

In [7]:
def mk_sim_mat(G, item_mat):
    M = np.eye(len(G.nodes()))
    #M = np.eye(4)
    item_len = item_mat.shape[0]
    M[0:item_len, 0:item_len] = item_mat
    
    # RecWalk論文の定義
    M = M / np.max(M.sum(axis=1)) + np.diag(1 - M.sum(axis=1) / np.max(M.sum(axis=1)))
   
    return M


In [8]:
def mk_sparse_sim_mat(G, item_mat):
    item_mat = scipy.sparse.csr_matrix(item_mat)
    item_len = item_mat.shape[0]
    I = scipy.sparse.eye(len(G.nodes()) - item_len)
    
    M = scipy.sparse.block_diag((item_mat, I))
    #print(M)
    # RecWalk論文の定義
    M_ = np.array(1 - M.sum(axis=1) / np.max(M.sum(axis=1)))
                                    
    M = M / np.max(M.sum(axis=1)) + scipy.sparse.diags(M_.transpose()[0])
    #print(type(M))
    #print(M.shape)
    return M

In [9]:
def pagerank_numpy(G, item_mat, alpha=0.85, beta=0.01, personalization=None, weight='weight',
                   dangling=None):

    import numpy as np
    if len(G) == 0:
        return {}
    M = google_matrix(G, item_mat, alpha, beta, personalization=personalization,
                      weight=weight, dangling=dangling)
    #return 0
    # use numpy LAPACK solver
    eigenvalues, eigenvectors = np.linalg.eig(M.T)
    ind = eigenvalues.argsort()
    # eigenvector of largest eigenvalue at ind[-1], normalized
    largest = np.array(eigenvectors[:, ind[-1]]).flatten().real
    norm = float(largest.sum())
    return dict(zip(G, map(float, largest / norm)))


In [10]:
def pagerank_scipy(G, item_mat, alpha=0.85, beta=0.01, personalization=None,
                   max_iter=500, tol=1.0e-6, weight='weight',
                   dangling=None):
    
    import scipy.sparse

    N = len(G)
    if N == 0:
        return {}

    nodelist = G.nodes()
    M = nx.to_scipy_sparse_matrix(G, nodelist=nodelist, weight=weight,
                                  dtype=float)
    S = scipy.array(M.sum(axis=1)).flatten()
    S[S != 0] = 1.0 / S[S != 0]
    Q = scipy.sparse.spdiags(S.T, 0, *M.shape, format='csr')
    M = Q * M

    # initial vector
    x = scipy.repeat(1.0 / N, N)

    # Personalization vector
    if personalization is None:
        p = scipy.repeat(1.0 / N, N)
    else:
        missing = set(nodelist) - set(personalization)
        if missing:
            raise NetworkXError('Personalization vector dictionary '
                                'must have a value for every node. '
                                'Missing nodes %s' % missing)
        p = scipy.array([personalization[n] for n in nodelist],
                        dtype=float)
        p = p / p.sum()

    # Dangling nodes
    if dangling is None:
        dangling_weights = p
    else:
        missing = set(nodelist) - set(dangling)
        if missing:
            raise NetworkXError('Dangling node dictionary '
                                'must have a value for every node. '
                                'Missing nodes %s' % missing)
        # Convert the dangling dictionary into an array in nodelist order
        dangling_weights = scipy.array([dangling[n] for n in nodelist],
                                       dtype=float)
        dangling_weights /= dangling_weights.sum()
    is_dangling = scipy.where(S == 0)[0]

    
    # 遷移行列とsim_matを統合
    sim_mat = mk_sparse_sim_mat(G, item_mat)
    M = beta * M + (1 - beta) * sim_mat


    # power iteration: make up to max_iter iterations
    for _ in range(max_iter):
        xlast = x
        x = alpha * (x * M + sum(x[is_dangling]) * dangling_weights) + \
            (1 - alpha) * p
        # check convergence, l1 norm
        x = x / x.sum()
        err = scipy.absolute(x - xlast).sum()
        if err < N * tol:
            return dict(zip(nodelist, map(float, x)))
    # pagerankの収束ちゃんとやっとく
    print(x.sum())
    print(err)
    print(N * tol)
    
    #raise NetworkXError('pagerank_scipy: power iteration failed to converge '
                        #'in %d iterations.' % max_iter)
        
    return dict(zip(nodelist, map(float, x)))

In [None]:
val = np.zeros(len(G.nodes()))
val[user_idx[10]] = 1
k = [i for i in range(len(G.nodes()))]
personal_vec = dict(zip(k, val))
pagerank_scipy(G, slim.sim_mat, alpha = 0.9, beta=0.01, personalization=personal_vec)

In [11]:
user_idx = [entity_list.index(u) for u in user_list]

def item_ppr(slim, user, alpha, beta):
    val = np.zeros(len(G.nodes()))
    val[user] = 1
    k = [i for i in range(len(G.nodes()))]
    personal_vec = dict(zip(k, val))
    #print(personal_vec)
    #ppr = pagerank_numpy(G, slim.sim_mat, alpha, beta, personalization=personal_vec)
    ppr = pagerank_scipy(G, slim.sim_mat, alpha, beta, personalization=personal_vec)
    #return pr
    
    # random 後で消す
    # val = np.random.dirichlet([1 for i in range(len(G.nodes))], 1)[0]
    #val = np.random.rand(len(G.nodes()))
    #val /= val.sum()
    #k = [i for i in range(len(G.nodes))]
    #ppr = dict(zip(k, val))
    
    pred = []
    item_idx = [entity_list.index(i) for i in item_list]
    for i in item_idx:
        pred.append(ppr[i])
    
    return pred


def get_ranking_mat(slim, alpha=0.85, beta=0.01):
    ranking_mat = []
    count = 0
    for u in user_idx:
        pred = item_ppr(slim, u, alpha, beta)
        #print(pred)
        sorted_idx = np.argsort(np.array(pred))[::-1]
        ranking_mat.append(sorted_idx)
        
        #count += 1
        #if count > 100:
        #    break
            
    return ranking_mat

# Evaluate

In [12]:
user_idx = [entity_list.index(u) for u in user_list]
user_items_test_dict = pickle.load(open('./data/user_items_test_dict.pickle', 'rb'))

def topn_precision(ranking_mat, user_items_dict, n=10):
    not_count = 0
    precision_sum = 0
        
    for i in range(len(ranking_mat)):
        if len(user_items_dict[user_idx[i]]) == 0:
            not_count += 1
            continue
        sorted_idx = ranking_mat[i]
        topn_idx = sorted_idx[:n]  
        hit = len(set(topn_idx) & set(user_items_dict[user_idx[i]]))
        precision = hit / len(user_items_dict[user_idx[i]])
        precision_sum += precision
        
    return precision_sum / (len(user_idx) - not_count)

# Optuna

In [15]:
slim = train_SLIM(best_params)

In [16]:
def time_since(runtime):
    mi = int(runtime / 60)
    sec = int(runtime - mi * 60)
    return (mi, sec)

def objective(trial):
    start = time.time()
    # ハイパラ読み込み
    # gamma = trial.suggest_loguniform('gamma', 1e-6, 1e-3)
    # lin_model = trial.suggest_categorical('lin_model', ['lasso', 'elastic'])
    alpha = trial.suggest_uniform('alpha', 0, 1)
    beta = trial.suggest_uniform('beta', 0, 0.5)
    
    ranking_mat = get_ranking_mat(slim, alpha, beta)
    score = topn_precision(ranking_mat, user_items_test_dict)
    
    mi, sec = time_since(time.time() - start)
    print('{}m{}s'.format(mi, sec))
    
    return -1 * score

In [17]:
study = optuna.create_study()
study.optimize(objective, n_trials=20)

6m29s


[I 2020-07-05 18:51:16,996] Finished trial#0 with value: -0.36788709419779875 with parameters: {'alpha': 0.1686224152116137, 'beta': 0.19115335619954843}. Best is trial#0 with value: -0.36788709419779875.


6m28s


[I 2020-07-05 18:57:45,808] Finished trial#1 with value: -0.36962386174650635 with parameters: {'alpha': 0.1818487196534958, 'beta': 0.1505577632238277}. Best is trial#1 with value: -0.36962386174650635.


9.543986818995658e+25
1.0741760519098874e+25
0.0054069999999999995
9.877151991253e+25
1.112287467053712e+25
0.0054069999999999995
110.37565194907285
0.4360428064607512
0.0054069999999999995
9.42272070745384e+25
1.0603088949381968e+25
0.0054069999999999995
9.877227254019284e+25
1.1122960802896515e+25
0.0054069999999999995
9.877228032615093e+25
1.1122961694040566e+25
0.0054069999999999995
70749634875467.75
4272158695059.1006
0.0054069999999999995
7.277633323931568e+16
5373218623826882.0
0.0054069999999999995
1488807892281398.0
98775067073564.42
0.0054069999999999995
582118031183044.8
37588889913958.83
0.0054069999999999995
103848.64713386487
2054.748006959765
0.0054069999999999995
15060709892347.367
867359587675.5393
0.0054069999999999995
118686098962020.52
7327779720126.633
0.0054069999999999995
25.775411738441843
0.043810048755131906
0.0054069999999999995
1.0571041938317314e+16
742474089463849.8
0.0054069999999999995
9.235102681450199e+25
1.038860502059875e+25
0.0054069999999999995
1.7

KeyboardInterrupt: 

In [18]:
df = study.trials_dataframe() # pandasのDataFrame形式
df.to_csv('./hyparams_result.csv')

In [19]:
# save best params 
with open('best_param.pickle', 'wb') as f:
    pickle.dump(study.best_params, f)