In [77]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time
from importlib import reload


import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = 'cpu'

# Model

In [78]:
from bpr_model import BPR

# Preprocess

In [79]:
user_item_df = pd.read_csv('../user_item.csv')
user_item_df.head()

Unnamed: 0,reviewerID,asin
0,A2HOI48JK8838M,B00004U9V2
1,A1YIPEY7HX73S7,B00004U9V2
2,A2QCGHIJ2TCLVP,B00004U9V2
3,A2R4UNHFJBA6PY,B00004U9V2
4,A2QCGHIJ2TCLVP,B00004U9V2


In [80]:
item_list = list(set(list(user_item_df['asin'])))
user_list = list(set(list(user_item_df['reviewerID'])))
user_num = len(user_list)
item_num = len(item_list)
print(len(item_list))
print(len(user_list))

1581
3819


In [81]:
# user, itemをID化する
user_item_list = []
for row in user_item_df.values:
    user = user_list.index(row[0])
    item = item_list.index(row[1])
    user_item_list.append([user, item])

user_item_df = pd.DataFrame(np.array(user_item_list), 
                            columns = ['reviewerID', 'asin'])

In [82]:
user_item_df.head()

Unnamed: 0,reviewerID,asin
0,1558,790
1,444,790
2,2359,790
3,3484,790
4,2359,790


In [83]:
user_item_df = user_item_df.take(np.random.permutation(len(user_item_df)))
train_num = int(0.5 * len(user_item_df))
user_item_train_df = user_item_df[0:train_num]
user_item_test_df = user_item_df[train_num:]

print('train {}'.format(train_num))
print('test {}'.format(len(user_item_test)))

train 17139
test 17139


In [84]:
%%time
# negative sampling
implicit_feed = [list(r) for r in user_item_df.values]
user_item_train_nega = []


count = 0
while count < 1000:
#while count < train_num:
    #user = user_list[np.random.randint(user_num)]
    #item = item_list[np.random.randint(item_num)]
    user = np.random.randint(user_num)
    item = np.random.randint(item_num)
    if [user, item] in implicit_feed:
        continue
    if [user, item] in user_item_train_nega:
        continue
    
    user_item_train_nega.append([user, item])
    count += 1

user_item_train_nega_df = pd.DataFrame(user_item_train_nega, columns=['reviewerID', 'asin'])
user_item_train_nega_df.head()

CPU times: user 10.9 s, sys: 228 ms, total: 11.1 s
Wall time: 11.4 s


In [86]:
# データに含まれるuser-item1, item2, item3, ...を返す
# 辞書
def user_aggregate_item(df):
    user_items_dict = {}
    #for user in user_list:
    for i in range(len(user_list)):
        items_df = df[df['reviewerID'] == i]
        user_items_dict[i] = list(items_df['asin'])
    return user_items_dict

user_items_nega_dict = user_aggregate_item(user_item_train_nega_df)
user_items_test_dict = user_aggregate_item(user_item_test_df)

# Evaluation

In [217]:
def topn_precision(model, user_items_dict, n=10):
    # user-itemの組に対して予測
    #for user in user_list:
    #    for item in item_list:
    precision_sum = 0
    not_count = 0
    with torch.no_grad():
        
        batch_size = 512
        
        item_index = [i for i in range(len(item_list))]
        for i in range(len(user_list)):
            if len(user_items_dict[i]) == 0:
                not_count += 1
                continue
                
            pred = torch.tensor([])
            
            for j in range(int(len(item_list) / batch_size) + 1):
                # modelにuser,itemを入力
                # batchでやると速い
                user_tensor = torch.tensor([i for k in range(batch_size)], dtype=torch.long, device=device)
                item_tensor = torch.tensor(item_index[j*batch_size : (j+1)*batch_size], 
                                          dtype=torch.long, device=device)
            
                if len(user_tensor) > len(item_tensor):
                    user_tensor = torch.tensor([i for k in range(len(item_tensor))], 
                                           dtype=torch.long, device=device)
                
                pred = torch.cat([pred, model.predict(user_tensor, item_tensor)])
                
            # 予測をソート
            sorted_idx = np.argsort(np.array(pred))[::-1]
            
            # topnにtarget userの推薦アイテムがいくつ含まれているか
            topn_idx = sorted_idx[:n]         
            hit = len(set(topn_idx) & set(user_items_dict[i]))
            precision = hit / len(user_items_dict[i])
            precision_sum += precision
                
    return precision_sum / (len(user_list) - not_count)
    
    
def topn_recall(n=10):
    return 0

In [179]:
bpr = BPR(2, len(user_list), len(item_list))

In [219]:
%time topn_precision(bpr, user_items_test_dict)

CPU times: user 3.2 s, sys: 14.5 ms, total: 3.21 s
Wall time: 3.24 s


0.012993807854078127

In [220]:
import evaluate

In [235]:
reload(evaluate)
eval_model = evaluate.Evaluater()

In [236]:
%time eval_model.topn_precision(bpr)

CPU times: user 3.29 s, sys: 31.9 ms, total: 3.32 s
Wall time: 3.4 s


0.005343115124556097

# Training

In [249]:
def train(batch, y_train, loss_func, optim, model):
    optim.zero_grad()
    
    posi_batch, nega_batch = batch
    user_tensor = torch.tensor(posi_batch[:, 0], dtype=torch.long, device=device)
    item_tensor = torch.tensor(posi_batch[:, 1], dtype=torch.long, device=device)
    nega_item_tensor = torch.tensor(nega_batch[:, 1], dtype=torch.long, device=device)
    
    pred = model(user_tensor, item_tensor, nega_item_tensor)
    #print(pred)
    loss = loss_func(pred, y_train)
    loss.backward()
    #print(loss)
    optim.step()
    
    return loss

In [247]:
def iterate_train(model, lr, batch_size=2, print_every=2000, plot_every=1000):
    # define optim
    optimizer = optim.Adam(model.parameters(), lr=lr)
    
    # define loss_func
    # BPRの場合targetを全て1にする
    loss_func = nn.BCELoss()
    
    print_loss_total = 0
    plot_loss_list = []
    plot_loss_total = 0
    
    start_time = time.time()
    for i in range(int(train_num / batch_size) + 1):
        batch = get_batch(batch_size=batch_size)
        #batch = dataset.get_batch(batch_size=batch_size)
        
        # BPRなのでtargetは全部1
        y_train = torch.ones(batch_size, dtype=torch.float, device=device)
        
        loss = train(batch, y_train, loss_func, optimizer, model)
      
        print_loss_total += loss
        plot_loss_total += loss
        

        # print_everyごとに現在の平均のlossと、時間、dataset全体に対する進捗(%)を出力
        if i % print_every == 0:
            runtime = time.time() - start_time
            mi, sec = time_since(runtime)
            avg_loss = print_loss_total / print_every
            data_percent = int(i * batch_size / train_num * 100)
            print('train loss: {:e}    processed: {}({}%)    {}m{}sec'.format(
                avg_loss, i*batch_size, data_percent, mi, sec))
            print_loss_total = 0

        # plot_everyごとplot用のlossをリストに記録しておく
        if i % plot_every == 0:
            avg_loss = plot_loss_total / plot_every
            plot_loss_list.append(avg_loss)
            plot_loss_total = 0
            
def time_since(runtime):
    mi = int(runtime / 60)
    sec = int(runtime - mi * 60)
    return (mi, sec)

In [251]:
bpr = BPR(16, len(user_list), len(item_list))
score = topn_precision(bpr, user_items_test_dict)
print(score)
iterate_train(bpr, 0.001, batch_size=128)
print(topn_precision(bpr, user_items_test_dict))

0.005295880003578183
train loss: 1.126044e-03    processed: 0(0%)    0m0sec
0.005138043455325295


In [111]:
import training

In [116]:
reload(training)
iterater = training.TrainIterater(batch_size=3)

In [118]:
iterater.iterate_train(bpr, 0.001)

train loss: 4.300424e-04    processed: 0(0%)    0m0sec
train loss: 1.059903e+00    processed: 6000(35%)    0m2sec
train loss: 1.026572e+00    processed: 12000(70%)    0m4sec


# Dataload

In [73]:
#user_items_nega_dict = user_aggregate_item(user_item_train_nega)
def get_batch(batch_size=2):
    batch_idx = np.random.permutation(train_num)[:batch_size]
    batch = user_item_train.values[batch_idx]
    nega_batch = get_nega_batch(batch[:, 0])
    
    return batch, nega_batch

In [75]:
def get_nega_batch(users):
    nega_batch = []
    for user in users:
        nega_items = user_items_nega_dict[user]
        #print(nega_items)
        
        # ここ直す
        if len(nega_items) == 0:
            #nega_batch.append([user, item_list[np.random.randint(item_num)]])
            nega_batch.append([user, np.random.randint(item_num)])
            continue
        
        nega_item = nega_items[np.random.randint(len(nega_items))]
        nega_batch.append([user, nega_item])
    
    return np.array(nega_batch)

In [49]:
import dataloader

In [62]:
reload(dataloader)
dataset = dataloader.AmazonDataset('./data')

In [69]:
dataset.get_batch(batch_size=10)

(array([[2889,    6],
        [3704,  425],
        [ 724, 1532],
        [3632, 1525],
        [  49, 1256],
        [ 774,  471],
        [2932, 1083],
        [ 590,  425],
        [2867, 1409],
        [2975,  678]]), array([[2889,  483],
        [3704, 1026],
        [ 724,  261],
        [3632,  459],
        [  49, 1394],
        [ 774, 1458],
        [2932, 1256],
        [ 590, 1211],
        [2867,  680],
        [2975, 1206]]))