In [None]:
import pickle
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

device='cpu'

# Preprocess

In [None]:
# Load the raw interaction / metadata tables
(user_item_df,
 item_brand_df,
 item_buy_item_df,
 item_view_item_df) = [
    pd.read_csv(csv_path)
    for csv_path in ('../user_item.csv',
                     '../item_brand.csv',
                     '../item_buy_item.csv',
                     '../item_view_item.csv')
]

In [None]:
# Kinds of node in the graph.
entity_type = ['user', 'item', 'brand']
# Kinds of edge. A relation's integer id is its position in this list
# (later cells look ids up with relation_type.index(...)).
relation_type = ['u_buy_i', 'i_belong_b', 'i_also_buy_i', 'i_also_view_i']

In [None]:
# Build the list of unique names for each entity type.
# Series.unique() keeps first-appearance order, so the entity indices derived
# from these lists are reproducible across runs (the ordering of set() is not).
item_list = user_item_df['asin'].unique().tolist()
user_list = user_item_df['reviewerID'].unique().tolist()
# Drop missing brands explicitly. The previous `brand_list.pop(0)` assumed NaN
# was the first element of list(set(...)), which set ordering does not
# guarantee, and it removed a real brand whenever the column had no NaN.
brand_list = item_brand_df['brand'].dropna().unique().tolist()

print('item {}'.format(len(item_list)))
print('user {}'.format(len(user_list)))
print('brand {}'.format(len(brand_list)))

In [None]:
# Save each entity list to ./data, one name per line
for out_path, names in (('./data/user_list.txt', user_list),
                        ('./data/item_list.txt', item_list),
                        ('./data/brand_list.txt', brand_list)):
    with open(out_path, 'w') as f:
        f.writelines(name + '\n' for name in names)

In [None]:
# Concatenate the per-type lists into one global entity list.
# An entity's position in this list is the integer index used for it from here on
# (items come first, then users, then brands).
entity_list = [*item_list, *user_list, *brand_list]
print('entity size: {}'.format(len(entity_list)))

In [None]:
# Save the global entity list, one name per line
with open('./data/entity_list.txt', 'w') as f:
    f.writelines(name + '\n' for name in entity_list)

In [None]:
# Shuffle the user-item interactions and hold out the second half as test data
shuffled_order = np.random.permutation(len(user_item_df))
user_item_df = user_item_df.take(shuffled_order)
train_num = int(0.5 * len(user_item_df))
user_item_train_df = user_item_df.iloc[:train_num]
user_item_test_df = user_item_df.iloc[train_num:]

print('train {}'.format(train_num))
print('test {}'.format(len(user_item_test_df)))

In [None]:
# Convert the held-out user-item pairs to entity indices.
# A dict gives O(1) lookups; list.index() scanned entity_list once per value.
# setdefault keeps the FIRST position of a name, matching list.index().
entity_to_idx = {}
for idx, name in enumerate(entity_list):
    entity_to_idx.setdefault(name, idx)

buy_relation = relation_type.index('u_buy_i')
# Columns are addressed by name so the result does not depend on the column
# order of the CSV.
user_item_test = [
    [entity_to_idx[user], entity_to_idx[item], buy_relation]
    for user, item in zip(user_item_test_df['reviewerID'], user_item_test_df['asin'])
]

user_item_test_df = pd.DataFrame(user_item_test, columns = ['reviewerID', 'asin', 'relation'])

In [None]:
# Save the ID-converted held-out interactions.
# (This previously wrote user_item_train_df into user_item_test.csv, so the
# "test" file actually contained the raw training rows.)
user_item_test_df.to_csv('./data/user_item_test.csv', index=False)

In [None]:
# Build a single list of [head_entity, tail_entity, relation] rows.
# This becomes the positive training data.
# Name -> index dict for O(1) lookups; setdefault keeps the first position of
# each name, which is what list.index() returned.
entity_to_idx = {}
for idx, name in enumerate(entity_list):
    entity_to_idx.setdefault(name, idx)

buy_relation = relation_type.index('u_buy_i')
belong_relation = relation_type.index('i_belong_b')

triplet_df = []
# user -> item purchases; columns are read by name so CSV column order is irrelevant
for user, item in zip(user_item_train_df['reviewerID'], user_item_train_df['asin']):
    triplet_df.append([entity_to_idx[user], entity_to_idx[item], buy_relation])

# item -> brand membership; rows with an unknown item or brand (e.g. NaN) are skipped
for row in item_brand_df.values:
    if row[0] not in entity_to_idx or row[1] not in entity_to_idx:
        continue
    triplet_df.append([entity_to_idx[row[0]], entity_to_idx[row[1]], belong_relation])

In [None]:
def _item_item_triplets(df, relation_idx, entity_to_idx):
    """Return [item, related_item, relation_idx] rows parsed from ``df``.

    Column 0 of ``df`` holds an item name and column 1 a bracketed,
    comma-separated list of related item names stored as a string
    (presumably of the form "['A', 'B']" -- confirm against the CSV).
    Rows whose item is unknown or whose list is not a string (e.g. NaN) are
    skipped, as are related items that are not known entities.
    """
    triplets = []
    for row in df.values:
        if row[0] not in entity_to_idx:
            continue
        if not isinstance(row[1], str):
            continue
        item_id = entity_to_idx[row[0]]
        for token in row[1][1:-1].split(','):
            # Strip surrounding whitespace and quotes. Slicing with [1:-1]
            # mangled every element after the first when the separator was
            # ", " (leading space), so those items were silently dropped.
            name = token.strip().strip('\'"')
            if name in entity_to_idx:
                triplets.append([item_id, entity_to_idx[name], relation_idx])
    return triplets


# Name -> index dict for O(1) lookups (first position wins, like list.index()).
entity_to_idx = {}
for idx, name in enumerate(entity_list):
    entity_to_idx.setdefault(name, idx)

# item -> also-bought item, then item -> also-viewed item
triplet_df.extend(_item_item_triplets(
    item_buy_item_df, relation_type.index('i_also_buy_i'), entity_to_idx))
triplet_df.extend(_item_item_triplets(
    item_view_item_df, relation_type.index('i_also_view_i'), entity_to_idx))


In [None]:
# Turn the accumulated triplet list into a DataFrame.
# NOTE: this rebinds triplet_df from a list to a DataFrame, so re-running the
# triplet-building cells requires starting again from the cell that resets it to [].
triplet_df = pd.DataFrame(triplet_df, columns=['h_entity', 't_entity', 'relation'])

In [None]:
# Preview the first rows of the positive triplets
triplet_df.head()

In [None]:
# Save the positive triplets
triplet_df.to_csv('./data/triplet.csv', index=False)

In [None]:
# Negative sampling: draw as many random (head, tail, relation) triplets as
# there are positive ones, rejecting any draw that is a known positive or has
# already been drawn.
# Membership is tested against sets of tuples (O(1) each); the previous
# list-of-lists scans made the whole loop quadratic in the number of triplets.
# The random draws happen in the same order as before, so a fixed NumPy seed
# yields the same negatives.
pos_triplet = [list(row) for row in triplet_df.values]
pos_triplet_set = set(map(tuple, triplet_df.values.tolist()))
nega_triplet = []
nega_triplet_set = set()

n_entity = len(entity_list)
n_relation = len(relation_type)

while len(nega_triplet) < len(triplet_df):
    h_entity = np.random.randint(n_entity)
    t_entity = np.random.randint(n_entity)
    relation = np.random.randint(n_relation)
    candidate = (h_entity, t_entity, relation)
    if candidate in pos_triplet_set or candidate in nega_triplet_set:
        continue

    nega_triplet_set.add(candidate)
    nega_triplet.append([h_entity, t_entity, relation])

nega_triplet_df = pd.DataFrame(nega_triplet, columns = ['h_entity', 't_entity', 'relation'])

In [None]:
# Save the negative triplets
nega_triplet_df.to_csv('./data/nega_triplet.csv', index=False)

In [None]:
# Training targets: 1 for every positive triplet, followed by 0 for every negative one
y_train = np.array([1] * len(triplet_df) + [0] * len(nega_triplet_df))

In [None]:
# Save the training targets
np.savetxt('./data/y_train.txt', y_train)

In [None]:
# Returns, for every user, the items that user is paired with in the data:
# {user_index: [item_1, item_2, ...]}
def user_aggregate_item(df, user_indices=None):
    """Group the 'asin' values of ``df`` by their 'reviewerID'.

    Args:
        df: DataFrame with 'reviewerID' and 'asin' columns (entity indices).
        user_indices: iterable of user indices that must appear as keys.
            Defaults to the index range users occupy in the notebook's
            entity_list, i.e. len(item_list) .. len(item_list)+len(user_list)-1.

    Returns:
        dict mapping each requested user index to the list of its 'asin'
        values in row order (an empty list when the user has no rows).
    """
    if user_indices is None:
        user_indices = range(len(item_list), len(item_list) + len(user_list))

    # Single pass over the rows instead of one full DataFrame filter per user.
    items_by_user = {}
    for user, item in zip(df['reviewerID'].tolist(), df['asin'].tolist()):
        items_by_user.setdefault(user, []).append(item)

    return {i: items_by_user.get(i, []) for i in user_indices}

# Build the per-user item lists for the held-out (test) interactions
user_items_test_dict = user_aggregate_item(user_item_test_df)

In [None]:
# Save the per-user held-out item lists
with open('./data/user_items_test_dict.pickle', 'wb') as f:
    pickle.dump(user_items_test_dict, f)

# Dataloader

In [None]:
class AmazonDataset:
    """Loads the preprocessed files from ``data_dir`` and serves random batches.

    Files read: user_item_test.csv, triplet.csv, nega_triplet.csv,
    user_list.txt, item_list.txt, brand_list.txt, entity_list.txt,
    y_train.txt and user_items_test_dict.pickle.
    """

    def __init__(self, data_dir):
        """Load everything from ``data_dir`` (a trailing '/' is added if missing)."""
        self.data_dir = data_dir
        if not self.data_dir.endswith('/'):
            self.data_dir += '/'

        self.load_triplet()
        self.load_user_items_dict()


    def _read_lines(self, file_name):
        """Return the lines of ``data_dir + file_name`` with newlines removed."""
        with open(self.data_dir + file_name, 'r') as f:
            return [line.replace('\n', '') for line in f]


    def load_triplet(self):
        """Load the triplet tables, the entity name lists and the targets."""
        self.user_item_test_df = pd.read_csv(self.data_dir + 'user_item_test.csv')
        self.triplet_df = pd.read_csv(self.data_dir + 'triplet.csv')
        self.nega_triplet_df = pd.read_csv(self.data_dir + 'nega_triplet.csv')

        self.user_list = self._read_lines('user_list.txt')
        self.item_list = self._read_lines('item_list.txt')
        self.brand_list = self._read_lines('brand_list.txt')
        self.entity_list = self._read_lines('entity_list.txt')

        self.y_train = np.loadtxt(self.data_dir + 'y_train.txt')

        # Drop any cached batch source so get_batch() rebuilds it from the
        # freshly loaded frames.
        self._train_triplets = None


    def load_user_items_dict(self):
        """Load the {user_index: [item_index, ...]} dict of held-out items."""
        # NOTE(review): pickle.load can execute arbitrary code -- only load
        # files produced by this notebook.
        # The context manager closes the file; the bare open() left it to the GC.
        with open(self.data_dir + 'user_items_test_dict.pickle', 'rb') as f:
            self.user_items_test_dict = pickle.load(f)


    def get_batch(self, batch_size=2):
        """Return a random batch ``(triplets, targets)``.

        ``triplets`` is an array of up to ``batch_size`` rows drawn without
        replacement from the positive triplets followed by the negative ones;
        ``targets`` holds the matching entries of ``y_train``.
        """
        # Positive and negative triplets are concatenated once and cached:
        # rebuilding the array on every call cost O(dataset) per batch.
        # Call load_triplet() again if triplet_df / nega_triplet_df are replaced.
        triplets = getattr(self, '_train_triplets', None)
        if triplets is None:
            triplets = pd.concat([self.triplet_df, self.nega_triplet_df]).values
            self._train_triplets = triplets

        batch_idx = np.random.permutation(len(triplets))[:batch_size]

        return triplets[batch_idx], self.y_train[batch_idx]

In [None]:
# Load the preprocessed files from ./data
dataset = AmazonDataset('./data')

In [None]:
# Smoke test: draw a small random batch (triplets, targets) and display it
batch = dataset.get_batch(batch_size = 3)
batch

In [None]:
# Distinct relation ids present in the positive triplets
set(dataset.triplet_df['relation'].values)

# Model

In [None]:
class DistMulti(nn.Module):
    """Scores a (head, tail, relation) triple as the sigmoid of the sum of the
    element-wise product of the three embeddings."""

    def __init__(self, embedding_dim, relation_size, entity_size):
        """Create one embedding table for entities and one for relations."""
        super().__init__()
        self.embedding_dim = embedding_dim

        # Entities first, then relations: keeps seeded initialisation unchanged.
        self.entity_embed = nn.Embedding(entity_size, embedding_dim)
        self.relation_embed = nn.Embedding(relation_size, embedding_dim)

    def forward(self, head, tail, relation):
        """Return one score in (0, 1) per row of the index tensors."""
        h_vec = self.entity_embed(head)
        t_vec = self.entity_embed(tail)
        r_vec = self.relation_embed(relation)

        logits = (h_vec * t_vec * r_vec).sum(dim=1)
        return torch.sigmoid(logits)

    def predict(self, user_tensor, item_tensor):
        """Placeholder; currently always returns 0."""
        return 0

In [None]:
# Tiny model (embedding_dim=3, 2 relations, 3 entities) and hand-made index
# tensors for a quick check of the forward pass
model = DistMulti(3, 2, 3)

head = torch.tensor([0, 1], dtype=torch.long, device=device)
tail = torch.tensor([1, 2], dtype=torch.long, device=device)
relation = torch.tensor([0, 1], dtype=torch.long, device=device)

In [None]:
# Forward pass on the toy inputs; displays one score per (head, tail, relation) triple
model(head, tail, relation)

# Training

In [None]:
class TrainIterater():
    """Mini-batch training loop for a triplet-scoring model on AmazonDataset('./data')."""

    def __init__(self, batch_size):
        """Load the dataset from ./data and remember the batch size."""
        self.dataset = AmazonDataset('./data')
        self.batch_size = batch_size


    def train(self, batch, loss_func, optimizer, model):
        """Run one optimisation step on ``batch`` and return the loss tensor.

        ``batch`` is the ``(triplets, targets)`` pair returned by
        ``AmazonDataset.get_batch``; triplet columns are head, tail, relation.
        """
        optimizer.zero_grad()

        triplet, y_train = batch
        h_entity_tensor = torch.tensor(triplet[:, 0], dtype=torch.long, device=device)
        t_entity_tensor = torch.tensor(triplet[:, 1], dtype=torch.long, device=device)
        relation_tensor = torch.tensor(triplet[:, 2], dtype=torch.long, device=device)
        y_train = torch.tensor(y_train, dtype=torch.float, device=device)

        pred = model(h_entity_tensor, t_entity_tensor, relation_tensor)
        loss = loss_func(pred, y_train)
        loss.backward()
        optimizer.step()

        return loss


    def iterate_train(self, model, lr=0.001, weight_decay=0, print_every=2000, plot_every=50):
        """Train for roughly one pass over the data.

        Runs ``int(train_num / batch_size) + 1`` randomly drawn batches with a
        fresh Adam optimizer and BCE loss.

        Returns:
            list of float: the mean loss of each consecutive group of
            ``plot_every`` batches.
        """
        optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
        loss_func = nn.BCELoss()

        print_loss_total = 0.0
        plot_loss_list = []
        plot_loss_total = 0.0

        train_num = len(self.dataset.triplet_df) + len(self.dataset.nega_triplet_df)
        start_time = time.time()

        for i in range(int(train_num / self.batch_size) + 1):
            batch = self.dataset.get_batch(batch_size=self.batch_size)

            # .item() turns the loss into a plain float. Accumulating the
            # tensor itself kept autograd history alive and handed
            # grad-requiring tensors to the plotting code.
            loss = self.train(batch, loss_func, optimizer, model).item()

            print_loss_total += loss
            plot_loss_total += loss

            # Every print_every batches: report mean loss, elapsed time and progress (%)
            if (i+1) % print_every == 0:
                runtime = time.time() - start_time
                mi, sec = self.time_since(runtime)
                avg_loss = print_loss_total / print_every
                # i + 1 batches are done at this point (was reported as i);
                # batches are sampled independently, so cap the figure at 100%.
                processed = (i + 1) * self.batch_size
                data_percent = min(100, int(processed / train_num * 100))
                print('train loss: {:e}    processed: {}({}%)    {}m{}sec'.format(
                    avg_loss, processed, data_percent, mi, sec))
                print_loss_total = 0.0

            # Every plot_every batches: record the mean loss for plotting
            if (i+1) % plot_every == 0:
                avg_loss = plot_loss_total / plot_every
                plot_loss_list.append(avg_loss)
                plot_loss_total = 0.0

        return plot_loss_list


    def time_since(self, runtime):
        """Split a duration in seconds into whole ``(minutes, seconds)``."""
        mi = int(runtime / 60)
        sec = int(runtime - mi * 60)
        return (mi, sec)


    def iterate_epoch(self, model, lr, epoch, weight_decay=0, warmup=0, lr_decay_rate=1, lr_decay_every=10, eval_every=5):
        """Train for ``epoch`` passes, evaluating every ``eval_every`` epochs.

        After the warm-up epochs the learning rate is multiplied by
        ``lr_decay_rate`` every ``lr_decay_every`` epochs. The loss curve and
        the evaluation scores are plotted at the end.
        """
        eval_model = Evaluater()
        plot_loss_list = []
        plot_score_list = []

        for i in range(epoch):
            plot_loss_list.extend(self.iterate_train(model, lr=lr, weight_decay=weight_decay, print_every=1e+5))

            # Learning-rate schedule
            if i > warmup:
                if (i - warmup) % lr_decay_every == 0:
                    lr = lr * lr_decay_rate

            if (i+1) % eval_every == 0:
                score = eval_model.topn_precision(model)
                plot_score_list.append(score)
                print('epoch: {}  precision: {}'.format(i, score))

        self._plot(plot_loss_list)
        self._plot(plot_score_list)


    def _plot(self, loss_list):
        """Plot a list of values as a line chart."""
        # TODO: write this properly (title, axis labels)
        plt.plot(loss_list)
        plt.show()

         
        
    


In [None]:
# NOTE: iterate_epoch() instantiates Evaluater, which is defined in the
# "Evaluate" section further down -- run that cell before this one.
# Size the relation embedding from the largest relation id used by either the
# positive or the negative triplets. Counting the distinct ids in triplet_df
# alone is too small whenever an id is missing from the positives while the
# negatives (sampled over every relation id) still use it.
relation_size = int(max(dataset.triplet_df['relation'].max(),
                        dataset.nega_triplet_df['relation'].max())) + 1
entity_size = len(dataset.entity_list)
model = DistMulti(32, relation_size, entity_size)
iterater = TrainIterater(batch_size=128)
#iterater.iterate_train(model, print_every=10, plot_every=10)
iterater.iterate_epoch(model, lr=0.01, epoch=30)

# Evaluate

In [None]:
class Evaluater():
    """Ranks every item for each user with a model and scores the top-n lists
    against the held-out user-item interactions of AmazonDataset('./data')."""

    def __init__(self):
        """Load the dataset (including the held-out items per user) from ./data."""
        self.dataset = AmazonDataset('./data')


    def topn_precision(self, model, n=10, relation=0, batch_size=512):
        """Mean precision@n over the users that have at least one held-out item.

        Args:
            model: callable ``model(head, tail, relation)`` returning one score
                per row; scored with gradients disabled.
            n: length of the recommendation list.
            relation: relation index used for user -> item scoring. Defaults
                to 0, the position of 'u_buy_i' in the preprocessing cell's
                relation_type list (this was hard-coded to 1 before, i.e.
                'i_belong_b').
            batch_size: number of items scored per model call.

        Returns:
            float: mean of ``hits_in_top_n / n`` over the evaluated users, or
            0.0 when there is nothing to evaluate.

        Raises:
            ValueError: if ``n`` is not positive.
        """
        if n <= 0:
            raise ValueError('n must be a positive integer')

        # Use this evaluator's own dataset (the module-level `dataset` was
        # read before, which only exists if an earlier cell happened to run).
        ds = self.dataset

        # Name -> index dict instead of one list.index() scan per entity;
        # setdefault keeps the first position, like list.index().
        entity_to_idx = {}
        for idx, name in enumerate(ds.entity_list):
            entity_to_idx.setdefault(name, idx)
        item_index = [entity_to_idx[item] for item in ds.item_list]
        user_index = [entity_to_idx[user] for user in ds.user_list]
        if not item_index:
            return 0.0

        # Create the index tensors on the device the model lives on.
        first_param = next(model.parameters(), None)
        dev = first_param.device if first_param is not None else torch.device('cpu')
        # Loop-invariant: the candidate items are the same for every user.
        item_tensor = torch.tensor(item_index, dtype=torch.long, device=dev)

        precision_sum = 0.0
        evaluated = 0
        with torch.no_grad():
            for user in user_index:
                test_items = ds.user_items_test_dict[user]
                if len(test_items) == 0:
                    # Users without held-out items do not count towards the mean.
                    continue

                # Score all items for this user, batch_size items at a time.
                scores = []
                for start in range(0, len(item_tensor), batch_size):
                    items = item_tensor[start:start + batch_size]
                    users = torch.full_like(items, user)
                    relations = torch.full_like(items, relation)
                    scores.append(model(users, items, relations))
                pred = torch.cat(scores)

                # Map the best-scoring positions back to entity indices rather
                # than assuming position == entity index.
                top_pos = torch.topk(pred, min(n, pred.numel())).indices
                topn_items = item_tensor[top_pos].tolist()

                hit = len(set(topn_items) & set(test_items))
                # Precision divides by the list length n. Dividing by the
                # number of held-out items, as before, is recall.
                precision_sum += hit / n
                evaluated += 1

        return precision_sum / evaluated if evaluated else 0.0


    def topn_recall(self, model, n=10):
        """Placeholder; currently always returns 0."""
        return 0

In [None]:
for item in dataset.item_list[0]