# RecBole実装練習
- RecBole走らせる
- RecBole上のグラフモデルを走らせる
- RecSysデータをRecBoleに適用させる準備

## todo
- 予測時のバグ
- sequential dataへの対応（trainにtestのユーザーidがない場合）
- データ削減→モデル比較の高速化

# RecBoleのインストールと実行確認

In [None]:
!pip install recbole

# 設定

In [None]:
import pandas as pd
import numpy as np
import torch

# 前処理

## データ読み込み

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Root folder of the project on Google Drive. Other cells build file names with
# plain string concatenation (e.g. path + 'data/...'), so keep the trailing '/'.
path = '/content/drive/MyDrive/Colab Notebooks/RecSys/'

In [None]:
# Load the competition CSV files. Every file name is derived from `path`
# (set in an earlier cell) so that the data root is configured in one place.
data_dir = path + 'data/'

# Shared tables: recommendations are to be chosen from the candidate items.
candidate_items = pd.read_csv(data_dir + 'candidate_items.csv')
item_features = pd.read_csv(data_dir + 'item_features.csv')
# Sessions.
train_sessions = pd.read_csv(data_dir + 'train_sessions.csv')
# What was bought as the result of each session.
train_purchases = pd.read_csv(data_dir + 'train_purchases.csv')
# Leaderboard sessions.
test_leaderboard_sessions = pd.read_csv(data_dir + 'test_leaderboard_sessions.csv')
# The sessions to predict in the end (not loaded for now).
#test_final_sessions = pd.read_csv(data_dir + 'test_final_sessions.csv')

In [None]:
train_purchases.head()

Unnamed: 0,session_id,item_id,date
0,3,15085,2020-12-18 21:26:47.986
1,13,18626,2020-03-13 19:36:15.507
2,18,24911,2020-08-26 19:20:32.049
3,19,12534,2020-11-02 17:16:45.92
4,24,13226,2020-02-26 18:27:44.114


In [None]:
item_features.head()

Unnamed: 0,item_id,feature_category_id,feature_value_id
0,2,56,365
1,2,62,801
2,2,68,351
3,2,33,802
4,2,72,75


In [None]:
candidate_items.head()

Unnamed: 0,item_id
0,4
1,8
2,9
3,19
4,20


In [None]:
#test_final_sessions.head()

In [None]:
train_sessions.head()

Unnamed: 0,session_id,item_id,date
0,3,9655,2020-12-18 21:25:00.373
1,3,9655,2020-12-18 21:19:48.093
2,13,15654,2020-03-13 19:35:27.136
3,18,18316,2020-08-26 19:18:30.833
4,18,2507,2020-08-26 19:16:31.211


In [None]:
# Mark where each row comes from before stacking the two tables:
# flg = 0 for rows seen before the purchase, flg = 1 for the purchased item.
train_sessions['flg'] = 0
train_purchases['flg'] = 1

# Stack both tables, order the rows by session and then by date, and replace
# the index with a fresh 0..n-1 range.
stacked = pd.concat([train_sessions, train_purchases], axis=0)
master = stacked.sort_values(by=['session_id', 'date']).reset_index(drop=True)
#master = pd.merge(master, item_features, on = 'item_id', how='left')
# (item_id does not seem to map one-to-one?)

In [None]:
master

Unnamed: 0,session_id,item_id,date,flg
0,3,9655,2020-12-18 21:19:48.093,0
1,3,9655,2020-12-18 21:25:00.373,0
2,3,15085,2020-12-18 21:26:47.986,1
3,13,15654,2020-03-13 19:35:27.136,0
4,13,18626,2020-03-13 19:36:15.507,1
...,...,...,...,...
5743815,4440001,19539,2020-10-30 23:37:09.46,0
5743816,4440001,20409,2020-10-30 23:37:20.658,0
5743817,4440001,27852,2020-10-30 23:39:55.186,0
5743818,4440001,20449,2020-10-30 23:40:28.149,0


In [None]:
test_leaderboard_sessions

Unnamed: 0,session_id,item_id,date
0,26,19185,2021-06-16 09:53:54.158
1,200,17089,2021-06-25 12:23:40.811
2,200,17089,2021-06-25 12:24:36.631
3,200,8060,2021-06-25 12:24:41.677
4,200,4758,2021-06-25 12:24:50.692
...,...,...,...
229349,4439653,25955,2021-06-11 10:22:57.47
229350,4439653,12179,2021-06-11 10:23:00.663
229351,4439757,2078,2021-06-30 11:42:15.073
229352,4439757,2078,2021-06-30 11:43:13.725


# RecBole用のデータ作成
- https://techlife.cookpad.com/entry/2021/11/04/090000
- https://recbole.io/docs/get_started/quick_start.html#prepare-your-data
- https://qiita.com/fufufukakaka/items/77878c1e23338345d4fa#%E6%8E%A8%E8%96%A6%E3%83%A2%E3%83%87%E3%83%AB%E5%90%8C%E5%A3%AB%E3%81%8C%E3%81%A9%E3%81%AE%E7%A8%8B%E5%BA%A6%E4%BC%BC%E3%81%A6%E3%81%84%E3%82%8B%E3%81%AE%E3%81%8B%E3%82%92%E3%83%8D%E3%83%83%E3%83%88%E3%83%AF%E3%83%BC%E3%82%AF%E3%82%B0%E3%83%A9%E3%83%95%E3%81%A7%E5%8F%AF%E8%A6%96%E5%8C%96%E3%81%99%E3%82%8B


## メモ
- RecBoleで正解指定していない（クリックも購入も並列に扱われている）
- session_data用になっていない気がする
- ユーザー特徴量として組み込めばいいのか？

### Atomic Filesの作成

#### どのようなAtomic Filesを生成するか

|  Suffix | Content | Example Format |
| ---- | ---- | ---- |
| .inter | User-item interaction | user_id, item_id, rating, timestamp, review |
| .user | User feature | user_id, age, gender |
| .item | Item feature | item_id, category |
| .net | Social graph data | source, target |

- .inter
    - これが基本. ratingとreviewはないが, 複数回みているものや実際に買ったものに固有のratingをいれるのはアリかも
- .user
    - これはデフォルトではないが, 野中君がやってくれたようにユーザー特徴量を購買記録から抽出した場合に使える可能性があるので排除しない
- .item
    - これは使う. categoryが複数ある場合の入れ方などは要確認
- .net
    - これも使いたい. source:user, target:itemとすれば使えるはず

#### 各カラムの形式の確認

| feat_type | Explanations | Examples |
| ---- | ---- | ---- |
| token | single discrete feature | user_id, age |
| token_seq | discrete features sequence | review |
| float | single continuous feature | rating, timestamp |
| float_seq | continuous feature sequence | vector |


#### .interの作成

In [None]:
# Build the .inter table: session_id plays the role of the RecBole user id,
# and the date string becomes a float timestamp. The flg column is not exported.
inter_columns = {
    'session_id': 'user_id:token',
    'item_id': 'item_id:token',
    'date': 'timestamp:float',
}
master_inter = master.drop(columns=['flg'])  # new frame; `master` is unchanged
master_inter['date'] = pd.to_datetime(master_inter['date']).map(pd.Timestamp.timestamp)
master_inter = master_inter.rename(columns=inter_columns)
print(master_inter['user_id:token'].nunique())
master_inter.head()

1000000


Unnamed: 0,user_id:token,item_id:token,timestamp:float
0,3,9655,1608326000.0
1,3,9655,1608327000.0
2,3,15085,1608327000.0
3,13,15654,1584128000.0
4,13,18626,1584128000.0


In [None]:
import random

# Subsample the sessions so that comparing many models stays fast.
SAMPLE_SIZE = 10000
SAMPLE_SEED = 28  # fixed seed so that reruns keep the same sessions

user_list = list(master_inter['user_id:token'].unique())
# random.sample raises ValueError when asked for more items than exist, so the
# request is capped at the number of available sessions. A private Random
# instance is used so the global random state is not touched.
user_list_selected = random.Random(SAMPLE_SEED).sample(
    user_list, min(SAMPLE_SIZE, len(user_list))
)

master_inter = master_inter[master_inter['user_id:token'].isin(user_list_selected)]

In [None]:
# Put the leaderboard sessions into the same .inter layout as the training data.
leader_columns = {
    'session_id': 'user_id:token',
    'item_id': 'item_id:token',
    'date': 'timestamp:float',
}
leader_inter = test_leaderboard_sessions.copy()
leader_dates = pd.to_datetime(leader_inter['date'])
leader_inter['date'] = leader_dates.map(pd.Timestamp.timestamp)
leader_inter = leader_inter.rename(columns=leader_columns)
print(len(leader_inter), leader_inter['user_id:token'].nunique())
leader_inter.head()

229354 50000


Unnamed: 0,user_id:token,item_id:token,timestamp:float
0,26,19185,1623837000.0
1,200,17089,1624624000.0
2,200,17089,1624624000.0
3,200,8060,1624624000.0
4,200,4758,1624624000.0


In [None]:
# Append the leaderboard sessions to the training interactions, then order
# the rows by session and timestamp again.
combined_inter = pd.concat([master_inter, leader_inter], axis=0)
master_inter = combined_inter.sort_values(by=['user_id:token', 'timestamp:float'])

In [None]:
master_inter

Unnamed: 0,user_id:token,item_id:token,timestamp:float
0,26,19185,1.623837e+09
1,200,17089,1.624624e+09
2,200,17089,1.624624e+09
3,200,8060,1.624624e+09
4,200,4758,1.624624e+09
...,...,...,...
5743746,4439964,11397,1.585038e+09
5743747,4439964,20770,1.585038e+09
5743748,4439964,10093,1.585038e+09
5743749,4439964,12728,1.585038e+09


In [None]:
# Workaround for a bug at prediction time in RecBole: when a session contains a
# single item, treat it as if that item had been seen twice.
session_sizes = master_inter.groupby('user_id:token').size()
one_list = list(session_sizes[session_sizes == 1].index)
single_item_rows = master_inter[master_inter['user_id:token'].isin(one_list)]
master_inter = pd.concat([master_inter, single_item_rows], axis=0).sort_values(by=['user_id:token', 'timestamp:float'])

In [None]:
import os

# Create the dataset folder first: to_csv does not create missing directories.
os.makedirs(path + 'data/recsys', exist_ok=True)
# Write the interactions as the tab-separated atomic file recsys.inter.
master_inter.to_csv(path + 'data/recsys/recsys.inter', index=False, sep='\t')

In [None]:
#leader_inter.to_csv(path + 'data/recsys/recsys_leader.inter', index=False, sep='\t')

#### .userの作成
- 野中君のユーザー特徴量をうまく転用できたら追加

#### .itemの作成

In [None]:
# Build the .item table by giving every column its RecBole "name:type" header.
# NOTE(review): item_features holds several rows per item_id (one per feature
# category); confirm that RecBole accepts repeated item ids in a .item file.
item_columns = {
    'item_id': 'item_id:token',
    'feature_category_id': 'feature_category_id:token',
    'feature_value_id': 'feature_value_id:token',
}
master_item = item_features.rename(columns=item_columns)  # rename returns a new frame
master_item.head()

Unnamed: 0,item_id:token,feature_category_id:token,feature_value_id:token
0,2,56,365
1,2,62,801
2,2,68,351
3,2,33,802
4,2,72,75


In [None]:
# Write the item features as the tab-separated atomic file recsys.item
# (no index column).
master_item.to_csv(path + 'data/recsys/recsys.item', index=False, sep='\t')

#### .netの作成

In [None]:
# Build the .net edge list: one row per distinct (session, item) pair, with the
# session as the source and the item as the target.
edge_columns = {
    'user_id:token': 'source_id:token',
    'item_id:token': 'target_id:token',
}
unique_pairs = master_inter[['user_id:token', 'item_id:token']].drop_duplicates()
master_net = unique_pairs.rename(columns=edge_columns)
master_net.head()

Unnamed: 0,source_id:token,target_id:token
0,26,19185
1,200,17089
3,200,8060
4,200,4758
5,205,8194


In [None]:
# Write the session-item edge list as the tab-separated atomic file recsys.net
# (no index column).
master_net.to_csv(path + 'data/recsys/recsys.net', index=False, sep='\t')

# RecBole用のconfig設定

In [None]:
from logging import getLogger

from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.utils import init_seed, init_logger
from recbole.trainer import Trainer
from recbole.quick_start import run_recbole, load_data_and_model

from recbole.model.general_recommender import BPR, Pop, NGCF, ItemKNN
from recbole.model.sequential_recommender import GRU4Rec

In [None]:
# RecBole settings, handed to run_recbole as `config_dict`.
# NOTE: settings that are switched off are written as `#` comments on purpose.
# A triple-quoted string placed between two entries is NOT a comment: Python
# joins adjacent string literals, so such a string gets glued onto the key that
# follows it and that key (e.g. 'epochs', 'eval_args') silently stops existing.
config_dict = {
  # general
  'gpu_id': 0,
  'use_gpu': True,  # set to True when a GPU should be used
  'seed': 28,
  'state': 'INFO',
  'reproducibility': True,
  'data_path': path + 'data/',  # where the input data is stored
  'checkpoint_dir': path + 'saved/',  # where trained models are saved
  'show_progress': True,
  'save_dataset': False,  # True saves the data used for train, valid and test
  'save_dataloaders': False,
  # Atomic File Format
  'field_separator': "\t",
  # Strings containing this character are split on it. It can break feature
  # loading, so pre-process the data and pick a symbol that is guaranteed never
  # to appear (especially with Japanese text).
  'seq_separator': "@",

  # Common Features
  'USER_ID_FIELD': 'user_id',
  'ITEM_ID_FIELD': 'item_id',
  # For implicit feedback.
  # NOTE(review): the .inter file written by this notebook drops the flg
  # column, so this field may be absent from the loaded data - confirm.
  'RATING_FIELD': 'flg',
  'TIME_FIELD': 'timestamp',

  # Selectively Loading (disabled)
  # Loads only the columns that are actually used.
  # load_col:
  #   inter: [user_id, item_id, timestamp]
  #   user: [user_id, feature1, feature2]
  #   item: [item_id, item_name, item_category_id]
  #   net: []
  # unused_col:  # columns that are read but must not be used for training
  #   inter: [timestamp]

  # Training and evaluation config
  'epochs': 1,
  'stopping_step': 3,  # stop when valid_metric has not improved for this many steps
  'train_batch_size': 4096,
  'eval_batch_size': 4096,
  # neg_sampling (disabled here; runrun sets it per model):
  # with implicit feedback, methods that need both positive and negative labels
  # can obtain the negatives through negative sampling.
  #   uniform: 1
  'eval_args': {
    'group_by': 'user',  # aggregate items per user for evaluation; practically the only choice
    'order': 'TO',  # Temporal Order: split train, valid and test chronologically
    'split': {'RS': [0.8,0.1,0.1]},  # 80% / 10% / 10% split
    'mode': 'full'
  },
  'metrics': ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision'],
  'topk': 100,
  'valid_metric': 'MRR@100',  # the metric that is tracked
  'metric_decimal_place': 4,
}

# 大量モデルをまとめて実行

In [None]:
def runrun(model_name, dataset_name, config_dict):
    """Run one RecBole model through ``run_recbole``.

    The ``neg_sampling`` setting is chosen from the model name and written to a
    shallow copy of ``config_dict``, so the caller's dictionary is not modified
    and can be reused for the next model.

    Args:
        model_name: Name of the RecBole model to run (e.g. ``"BPR"``).
        dataset_name: Name of the dataset to run it on.
        config_dict: Base settings; ``neg_sampling`` is overridden per model.

    Returns:
        Whatever ``run_recbole`` returns for this run.
    """
    # These are non-sampling methods.
    # See e.g. https://recbole.io/docs/user_guide/model/general/macridvae.html
    non_sampling_models = {
        "MultiVAE",
        "MultiDAE",
        "MacridVAE",
        "RecVAE",
        "GRU4Rec",
        "NARM",
        "STAMP",
        "NextItNet",
        "TransRec",
        "SASRec",
        "BERT4Rec",
        "SRGNN",
        "GCSAN",
        "GRU4RecF",
        "FOSSIL",
        "SHAN",
        "RepeatNet",
        "HRM",
        "NPE",
    }

    run_config = dict(config_dict)  # shallow copy: only a top-level key is replaced
    if model_name in non_sampling_models:
        run_config["neg_sampling"] = None
    else:
        # A real dict rather than its string representation.
        run_config["neg_sampling"] = {"uniform": 1}

    return run_recbole(
        model=model_name,
        dataset=dataset_name,
        config_dict=run_config,
    )

In [None]:
# Run every model below on the "recsys" dataset, printing a banner with the
# model name before each run.
#model_list = ["MultiVAE","BPR"]
model_list = [
    "RecVAE", "MacridVAE", "NAIS", "NNCF", "RepeatNet",
    "NeuMF", "LINE", "BPR", "SHAN", "Item2vec",
    "DGCF", "FFM", "FPMC", "NARM", "LightGCN",
    "NGCF", "SASRec", "HRM", "EASE", "MultiVAE",
    "NPE", "MultiDAE", "SRGNN", "ENMF", "DCN",
    "FOSSIL", "ItemKNN", "DeepFM", "PNN", "FM",
    "BERT4Rec", "xDeepFM", "NFM", "AutoInt", "AFM",
    "FNN", "GRU4Rec", "SpectralCF", "WideDeep", "GCMC",
    "DMF", "FwFM", "STAMP", "DSSM", "SLIMElastic",
    "LR", "Pop", "CDAE",
]
banner = "========================================================="
for current_model in model_list:
    print(banner)
    print(current_model)
    runrun(model_name=current_model, dataset_name="recsys", config_dict=config_dict)

# 予測
- 今のモデルだとtrainに入っていないuserの予測はできない？
    - いったん学習データにleader_boardのsessionデータも追加して予測
    - 今後は[こちら](https://qiita.com/fufufukakaka/items/e03df3a7299b2b8f99cf)を参考に実装予定

In [None]:

# Reload a saved model together with its config, dataset and data splits.
# `path` already ends with '/', so the file name is appended without another
# separator (the same way 'checkpoint_dir' is built in config_dict).
config, model, dataset, train_data, valid_data, test_data = load_data_and_model(
    model_file=path + 'saved/MultiVAE-May-21-2022_01-14-19.pth',
)
#trainer.evaluate(test_data)


In [None]:
from recbole.utils.case_study import full_sort_topk

# Map every internal user id back to its original token, then drop the first
# entry: it is 'PAD' (RecBole's default), not a real session.
all_internal_user_ids = list(range(dataset.user_num))
all_user_tokens = dataset.id2token(dataset.uid_field, all_internal_user_ids)
external_user_ids = all_user_tokens[1:]

In [None]:
# For sessions that viewed only one item the torch tensor sizes seem to stop
# matching -> handled case by case when the atomic files are generated.
def _topk_item_tokens(internal_user_id):
    """Return the top-100 recommended item tokens for one internal user id."""
    _, topk_iid_list = full_sort_topk([internal_user_id], model, test_data, k=100, device=config['device'])
    last_topk_iid_list = topk_iid_list[-1]
    return dataset.id2token(dataset.iid_field, last_topk_iid_list.cpu()).tolist()


# Users are scored one at a time; internal id 0 is skipped.
topk_items = [_topk_item_tokens(uid) for uid in range(1, dataset.user_num)]

print(len(topk_items))

1000


In [None]:
# One row per session: its id and the recommended items joined into a single
# space-separated string.
external_item_str = list(map(' '.join, topk_items))
result = pd.DataFrame(external_user_ids, columns=['user_id']).assign(prediction=external_item_str)
result.head()

Unnamed: 0,user_id,prediction
0,1951,21215 773 26402 24344 19150 23935 12933 12251 ...
1,2882,21353 12612 3173 15453 23689 2174 7466 17622 1...
2,8949,13269 17428 26249 18657 16195 25522 25529 2482...
3,21168,12251 9522 14378 7727 10991 6392 1018 8060 279...
4,22901,8060 9522 1644 3697 15816 26726 2174 1018 3062...


# 整形

## leaderboard用のsession_idに絞る

In [None]:
# Keep only the sessions that appear in the leaderboard file. Both sides are
# converted to strings before comparing.
leaderboard_ids = list(test_leaderboard_sessions['session_id'].astype(str).unique())
is_leaderboard_session = result['user_id'].astype(str).isin(leaderboard_ids)
result = result[is_leaderboard_session]

## 出力用に整形する

In [None]:
# Reshape from one row per session to one row per (session, position, item):
# split the prediction string into columns, then stack those columns into rows.
prediction_columns = result['prediction'].str.split(' ', expand=True)
wide = pd.concat([result, prediction_columns], axis=1)
wide = wide.drop(columns=['prediction']).set_index('user_id')
res = wide.stack().reset_index()

In [None]:
# Name the columns for the submission, put them in the output order, and turn
# the 0-based position into a 1-based rank.
submission_names = {'user_id': 'session_id', 'level_1': 'rank', 0: 'item_id'}
res = (
    res.rename(columns=submission_names)[['session_id', 'item_id', 'rank']]
    .assign(rank=lambda frame: frame['rank'] + 1)
)

In [None]:
res

# 出力

In [None]:
# Write the submission table to sub.csv in the current working directory,
# without the index column.
res.to_csv('./sub.csv', index=False)