In [None]:
import sys
import os
import scrapbook as sb
import pandas as pd
import numpy as np
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages

from recommenders.utils.timer import Timer
from recommenders.models.deeprec.models.graphrec.lightgcn import LightGCN
from recommenders.models.deeprec.DataModel.ImplicitCF import ImplicitCF
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k , rmse , mae
from recommenders.utils.constants import SEED as DEFAULT_SEED
from recommenders.models.deeprec.deeprec_utils import prepare_hparams
from sklearn.model_selection import train_test_split
#from recommenders.datasets.amazon_reviews import data_preprocessing
#from sklearn.preprocessing import LabelEncoder

# Report the interpreter and key library versions so a run can be reproduced.
print(f"System version: {sys.version}")
print(f"Pandas version: {pd.__version__}")
print(f"Tensorflow version: {tf.__version__}")

In [None]:
# Number of items to recommend per user; also passed as `k` to the ranking metrics
# and as `top_k` to prepare_hparams.
TOP_K = 20
#user_vocab = r'C:\Users\Abdo\Desktop\lightgcn\user_vocab.pkl' # the map file of user to id
#item_vocab = r'C:\Users\Abdo\Desktop\lightgcn\item_vocab.pkl' # the map file of item to id

# Select MovieLens data size: 100k, 1m, 10m, or 20m
# (disabled: the notebook reads a local CSV instead of MovieLens)
#MOVIELENS_DATA_SIZE = '100k'

# Model parameters (forwarded to prepare_hparams)
EPOCHS = 10
BATCH_SIZE = 1024

# Seed passed to ImplicitCF and LightGCN.
SEED = DEFAULT_SEED  # Set None for non-deterministic results

# NOTE(review): the paths below are absolute and machine-specific (Windows user
# profile); the notebook cannot run elsewhere without editing them.
# yaml_file is the LightGCN hyperparameter file given to prepare_hparams;
# user_file / item_file are the output paths given to model.infer_embedding.
yaml_file = r'C:\Users\Abdo\Desktop\lightgcn.yaml'
user_file = r'C:\Users\Abdo\Desktop\lightgcn\user_embeddings.csv'
item_file = r'C:\Users\Abdo\Desktop\lightgcn\item_embeddings.csv'



In [None]:
# Load the interaction data from a local CSV (the MovieLens loader is left disabled).
# NOTE(review): absolute, machine-specific path.
#df = movielens.load_pandas_df(size=MOVIELENS_DATA_SIZE)
df = pd.read_csv(r"C:\Users\Abdo\Desktop\both3.csv")
# Preview the first rows.
df.head()

In [None]:
# Random, shuffled 80/20 train/test split (the stratified splitter is left disabled).
# NOTE(review): random_state is hard-coded to 42 rather than using SEED — confirm
# this is intended, since changing SEED will not change the split.
#train, test = python_stratified_split(df, ratio=0.8 , )
train , test = train_test_split(df , test_size=0.2 , random_state=42 , shuffle=True)

In [None]:

# Wrap the train/test frames in the ImplicitCF data object that LightGCN consumes.
data = ImplicitCF(train=train, test=test, seed=SEED)

In [None]:
# Build the LightGCN hyperparameters: start from the YAML file and pass the
# keyword values below alongside it (presumably as overrides — TODO confirm
# against prepare_hparams).
hparams = prepare_hparams(yaml_file ,
                          n_layers=3,
                          batch_size=BATCH_SIZE,
                          epochs=EPOCHS,
                          learning_rate=0.005,
                          eval_epoch=1,
                          top_k=TOP_K,
                          embed_size= 64,
                          # vocab files are disabled together with their path constants
                          #user_vocab=user_vocab,
                          #item_vocab=item_vocab
                         )

In [None]:
# Instantiate LightGCN from the hyperparameters and the ImplicitCF data object.
model = LightGCN(hparams, data, seed=SEED)

In [None]:
# Train the model, measuring the wall-clock time of the whole fit.
with Timer() as train_time:
    model.fit()

print(f"Took {train_time.interval} seconds for training.")

In [None]:
# Sorted top-K recommendations for the users in `test`.
# remove_seen=True presumably excludes items already seen in training — TODO confirm.
topk_scores = model.recommend_k_items(test, top_k=TOP_K, remove_seen=True ,sort_top_k=True )
# NOTE(review): this displays the whole frame; consider topk_scores.head().
topk_scores

In [None]:
# Ranking metrics at K, comparing the held-out test interactions with the top-K list.
eval_map = map_at_k(test, topk_scores, k=TOP_K)
eval_ndcg = ndcg_at_k(test, topk_scores, k=TOP_K)
eval_precision = precision_at_k(test, topk_scores, k=TOP_K)
eval_recall = recall_at_k(test, topk_scores, k=TOP_K )
# NOTE(review): rmse/mae are computed on the top-K ranking scores but never printed
# or recorded; rating metrics may not be meaningful for these scores — confirm.
eval_rmse = rmse(test ,topk_scores )
eval_mae = mae(test ,topk_scores )
# Only NDCG and Recall are printed; MAP and precision are still computed above
# because a later cell records them with sb.glue.
print(#"MAP:\t%f" % eval_map,
      "NDCG:\t%f" % eval_ndcg,
      #"Precision@K:\t%f" % eval_precision,
      "Recall@K:\t%f" % eval_recall,
     # "rmse:\t%f" % eval_rmse,
      #"mae:\t%f" % eval_mae,
      sep='\n')

In [None]:
# Record results with papermill for tests
# Record the four ranking metrics with scrapbook's glue so automated
# (papermill-driven) tests can read them back.
sb.glue("map", eval_map)
sb.glue("ndcg", eval_ndcg)
sb.glue("precision", eval_precision)
sb.glue("recall", eval_recall)

In [None]:
# Export the model's user and item embeddings to the two configured file paths.
model.infer_embedding(user_file, item_file)

In [None]:
# Recommendations for one example user, using only that user's test rows.
# Note top_k=15 here rather than TOP_K.
model.recommend_k_items(test[test['userID']=='A2WA0LFIGKO34K'],top_k=15 )

# BERT4Rec model

In [None]:
from recbole.model.sequential_recommender import BERT4Rec 
import pandas as pd
from recbole.evaluator.metrics import RMSE , MAE , Hit , NDCG , Recall , MRR
import logging
from logging import getLogger
from recbole.config import Config
from recbole.data import create_dataset , data_preparation
from recbole.trainer import Trainer
from recbole.utils import init_seed, init_logger
import numpy as np
import torch
from collections import defaultdict
from recbole.evaluator.metrics import Hit , NDCG , Recall
from recbole.quick_start import run_recbole , load_data_and_model

In [None]:
# Reload the interactions for the RecBole pipeline and sort them chronologically.
# NOTE(review): absolute, machine-specific path.
df = pd.read_csv(r"C:\Users\Abdo\Desktop\both3.csv")
df = df.sort_values(by=['timestamp'])

In [None]:
# Rename columns to the `name:type` header form used in RecBole atomic files.
# NOTE(review): 'rating' is typed as token rather than float — confirm intended
# (the column is listed under 'unused_col' in the config further down).
df1 = df.rename(columns={'userID': 'userID:token', 'itemID': 'itemID:token', 'rating': 'rating:token' ,'timestamp': 'timestamp:float'})

In [None]:
# Write the interactions as a tab-separated `.inter` file (no index column).
# NOTE(review): absolute, machine-specific path; the target folder must already exist.
df1.to_csv(r'C:\Users\Abdo\Desktop\recbox_data\recbox_data.inter', index=False, sep='\t')

In [None]:
# RecBole configuration for BERT4Rec on the `recbox_data` dataset.
parameter_dict = {
    # Presumably RecBole resolves the dataset folder as <data_path>\<dataset name>,
    # i.e. the `recbox_data` folder the .inter file was written to — verify.
    'data_path': r'C:\Users\Abdo\Desktop',
    'USER_ID_FIELD': 'userID',
    'ITEM_ID_FIELD': 'itemID',
    'TIME_FIELD': 'timestamp',
    # Interaction-count intervals for users / items (given as interval strings).
    'user_inter_num_interval': "[2,100000]",
    'item_inter_num_interval': "[1,100000]",
    # 'rating' is loaded but then declared unused.
    'load_col': {'inter': ['userID', 'itemID' , 'rating', 'timestamp']},
    'unused_col':{'inter': ['rating']},
    'metrics': ['Recall','NDCG','Hit'],
    'topk'   : [1,5,10,20],
    'train_neg_sample_args':None,
    'seed':42,
    'train_batch_size': 1024,
    'eval_step': 1 ,                    # (int) The number of training epochs before an evaluation on the valid dataset.
    'n_layers': 2 ,                     # (int) The number of transformer layers in transformer encoder.
    'n_heads': 2  ,                    # (int) The number of attention heads for multi-head attention layer.
    'hidden_size': 64 ,                 # (int) The number of features in the hidden state.
    'inner_size': 256  ,               # (int) The inner hidden size in feed-forward layer.
    'hidden_dropout_prob': 0.2 ,        # (float) The probability of an element to be zeroed.
    'attn_dropout_prob': 0.2 ,          # (float) The probability of an attention score to be zeroed.
    'hidden_act': 'gelu' ,             # (str) The activation function in feed-forward layer.
    'layer_norm_eps': 1e-12 ,           # (float) A value added to the denominator for numerical stability.
    'initializer_range': 0.02 ,         # (float) The standard deviation for normal initialization.
    'mask_ratio': 0.2 ,                 # (float) The probability for a item replaced by MASK token.
    'loss_type': 'CE' ,                # (str) The type of loss function.
    'ft_ratio': 0.5   ,               # (float) The probability of generating fine-tuning samples
    # NOTE(review): 'neg_sampling' and 'train_neg_sample_args' both disable negative
    # sampling; presumably only one is read by a given RecBole version — confirm.
    'neg_sampling': None,
    'epochs': 30,
    'eval_args': {
        'split': {'LS': 'test_only'},
        'group_by': 'user',
        'order': 'TO',
        'mode': 'uni100'
    }
}






# Build the RecBole config from the dict above.
config = Config(model= 'BERT4Rec', dataset='recbox_data', config_dict=parameter_dict)

# init random seed
init_seed(config['seed'], config['reproducibility'])

# Initialise RecBole logging, then attach an extra console handler at INFO level.
# NOTE(review): init_logger may already install a console handler; if so this one
# duplicates every message, and each re-run of the cell adds another — confirm.
init_logger(config)
logger = getLogger()
# Create handlers
c_handler = logging.StreamHandler()
c_handler.setLevel(logging.INFO)
logger.addHandler(c_handler)


In [None]:
# Build the RecBole dataset described by the config and log its summary.
dataset = create_dataset(config )
logger.info(dataset)

In [None]:
# Split the dataset into train / validation / test data objects
# (presumably following config['eval_args'] — verify).
train_data, valid_data, test_data = data_preparation(config, dataset)

In [None]:
# model loading and initialization
model = BERT4Rec(config, train_data.dataset).to(config['device'])
logger.info(model)

# trainer loading and initialization
trainer = Trainer(config, model)

# model training
best_valid_score, best_valid_result = trainer.fit(train_data)

In [None]:
# Switch the model to evaluation mode before inference.
model.eval()

In [None]:
# Evaluate the trained model on the test data and display the metric results.
test_result = trainer.evaluate(test_data)
test_result

In [None]:
# Alternative end-to-end run driven by a YAML config file instead of the in-notebook dict.
# The file list gets its own name so that the `parameter_dict` dict defined earlier is
# not overwritten with a list, which would break re-running the Config cell.
# NOTE(review): absolute, machine-specific path.
config_file_list = [r'C:\Users\Abdo\Desktop\bert4rec.yaml']
run_recbole(model= 'BERT4Rec' , dataset = 'recbox_data', config_file_list= config_file_list)

In [None]:
# Item IDs of the last 10 rows for one example user. `df` was sorted by timestamp
# earlier, so these are that user's 10 most recent interactions.
last_item_ids = df[df['userID'] == 'AVU1ILDDYW301'].tail(10).itemID.values
# Display those same 10 rows.
df[df['userID'] == 'AVU1ILDDYW301'].tail(10)

In [None]:
# Display the external item IDs collected for the example user.
last_item_ids

In [None]:
# Internal RecBole id corresponding to the example user's external token.
v = test_data.dataset.token2id(test_data.dataset.uid_field,'AVU1ILDDYW301')
v

In [None]:
# Internal item ids of the example user's last 10 items.
print(dataset.token2id(dataset.iid_field, last_item_ids))

In [None]:
# extract features and labels from testset
# itemID is the label
# itemID_list is the features
input_features = test_data.dataset[np.isin(test_data.dataset[test_data.dataset.uid_field].numpy(), [v])]
input_features

In [None]:
# Print the test labels next to the internal ids of the user's last 10 raw items
# so the two can be compared by eye.
print("test label: " + str(input_features['itemID']))
print("last 10 items from origin dataset: " + str(dataset.token2id(dataset.iid_field, last_item_ids)))

In [None]:
# Author's notes on the rows displayed below:
# - There are 37 labels to predict.
# - Each label is predicted from the 50 items that precede it; the length 50 is
#   fixed, and shorter histories are padded with zeros.
#
# For the 1st row, all items in training are used as input features.
# For the 2nd row, all items in training + the first label are used as input features.
# For the 3rd row, all items in training + the first and second labels are used.
# ...
# For the last row, all items except the last one are used as input features.


input_features['itemID_list']

# make dummy

In [None]:
#arr1 = model.ITEM_ID
#arr2 = np.array(model.item_embedding)

#dff3 = pd.DataFrame(arr, columns = ['itid:token','item_emb:float_seq'])

#dff3.to_csv(r'C:\Users\Abdo\Desktop\recbox_data\recbox_data.itememb', index=False, sep='\t')

#trainer = Trainer(config, dataset)
#trainer.fit(dataset)

In [None]:
# Inspect the model's `item_embedding` attribute.
model.item_embedding

In [None]:
# Reload config, model, dataset and data splits from a saved checkpoint, rebinding
# all six notebook names.
# NOTE(review): absolute path to one specific timestamped checkpoint file. Values
# computed before this cell (e.g. `v`, `input_features`) came from the previous
# dataset objects; presumably the id mappings match — verify.
config, model, dataset, train_data, valid_data, test_data = load_data_and_model(model_file=r'C:\Users\Abdo\Desktop\recbox_data\saved\BERT4Rec-Mar-01-2023_02-42-35.pth',)

In [None]:
# Shape of the example user's item-sequence tensor.
input_features['itemID_list'].shape

In [None]:
# Select every row of the full dataset (not just the test split) whose user id is `v`.
index = np.isin(dataset[dataset.uid_field].numpy(), [v])
input_interaction = dataset[index]
input_interaction

In [None]:
# from recbole.data.interaction import Interaction

# def add_last_item(old_interaction, last_item_id, max_len=50):
#     new_seq_items = old_interaction['itemID_list'][-1]
#     if old_interaction['item_length'][-1].item() < max_len:
#         new_seq_items[input_interaction['item_length'][-1].item()] = last_item_id
#     else:
#         new_seq_items = torch.roll(new_seq_items, -1)
#         new_seq_items[-1] = last_item_id
#     return new_seq_items.view(1, len(new_seq_items))

# test = {
#             'itemID_list': add_last_item(input_interaction, input_interaction['itemID'][-1].item(), model.max_seq_length),
#             'item_length': torch.tensor(
#                 [input_interaction['item_length'][-1].item() + 1
#                  if input_interaction['item_length'][-1].item() < model.max_seq_length else model.max_seq_length])
#         }
# new_inter = Interaction(test)
# new_inter

In [None]:
# new_inter = new_inter.to(config['device'])
# new_scores = model.full_sort_predict(new_inter)
# new_scores = new_scores.view(-1, test_data.dataset.item_num)
# new_scores[:, 0] = -np.inf  # set scores of [pad] to -inf

In [None]:
#input_interaction['itemID'][-1].item()

In [None]:
#new_seq_items[input_interaction['item_length'][-1].item()] 

In [None]:
#new_seq_items = input_interaction['itemID_list'][-1]
#new_seq_items

In [None]:
#new_seq_items = torch.roll(new_seq_items, -1)
#new_seq_items

In [None]:
import torch
from recbole.data.interaction import Interaction

def add_last_item(old_interaction, last_item_id, max_len=50):
    """Return the last row's item sequence with ``last_item_id`` appended.

    The final row of ``old_interaction['itemID_list']`` is taken together with
    its length from ``old_interaction['item_length']``. When the sequence is
    shorter than ``max_len`` the new item is written at index ``item_length``;
    otherwise the sequence is rotated left by one position and the new item
    overwrites the last slot.

    The sequence is cloned before being edited, so ``old_interaction`` is never
    modified. Indexing a tensor row returns a view, and writing into that view
    would otherwise change the caller's data.

    Args:
        old_interaction: Mapping-like object with tensor entries
            ``'itemID_list'`` (2-D) and ``'item_length'`` (1-D).
        last_item_id: Item id to append to the sequence.
        max_len: Sequence length at which the sequence counts as full.

    Returns:
        A tensor of shape ``(1, sequence_width)`` holding the updated sequence.
    """
    seq_len = old_interaction['item_length'][-1].item()
    # clone(): work on a private copy instead of a view into the caller's tensor.
    new_seq_items = old_interaction['itemID_list'][-1].clone()
    if seq_len < max_len:
        new_seq_items[seq_len] = last_item_id
    else:
        # Sequence is full: shift left (oldest item wraps to the end) and overwrite it.
        new_seq_items = torch.roll(new_seq_items, -1)
        new_seq_items[-1] = last_item_id
    return new_seq_items.view(1, len(new_seq_items))

def predict_for_all_item(external_user_id, dataset, model, top_k=10):
    """Score every item for one user and return the ``top_k`` highest.

    The user's rows are selected from ``dataset``. The last row's item sequence
    is extended with that row's own target item (via ``add_last_item``), and
    the resulting sequence is scored against all items with
    ``model.full_sort_predict``. Column 0 of the scores is set to ``-inf``
    (per the original comment it is the [pad] item) so it can never be
    recommended.

    Only the ``dataset`` and ``model`` arguments are used; the function does
    not read the notebook-level ``config`` or ``test_data`` variables.

    Args:
        external_user_id: External (token) id of the user.
        dataset: RecBole dataset providing ``token2id``, ``uid_field``,
            ``item_num`` and row indexing.
        model: Trained sequential model exposing ``max_seq_length`` and
            ``full_sort_predict``.
        top_k: Number of items to return (defaults to 10).

    Returns:
        The result of ``torch.topk`` on the score matrix: a ``(values,
        indices)`` pair, each with one row per scored sequence.

    Raises:
        ValueError: If ``dataset`` contains no rows for the user.
    """
    model.eval()
    # Use the device the model's weights live on instead of a global config.
    device = next(model.parameters()).device
    with torch.no_grad():
        uid_series = dataset.token2id(dataset.uid_field, [external_user_id])
        index = np.isin(dataset[dataset.uid_field].numpy(), uid_series)
        if not index.any():
            raise ValueError(
                "no interactions found for user {!r}".format(external_user_id))
        input_interaction = dataset[index]
        last_len = input_interaction['item_length'][-1].item()
        seq_inputs = {
            'itemID_list': add_last_item(input_interaction,
                                          input_interaction['itemID'][-1].item(),
                                          model.max_seq_length),
            # The length grows by one unless the sequence was already full.
            'item_length': torch.tensor(
                [last_len + 1 if last_len < model.max_seq_length
                 else model.max_seq_length])
        }
        new_inter = Interaction(seq_inputs).to(device)
        new_scores = model.full_sort_predict(new_inter)
        new_scores = new_scores.view(-1, dataset.item_num)
        new_scores[:, 0] = -np.inf  # set scores of [pad] to -inf
    return torch.topk(new_scores, top_k)

In [None]:
# Top-10 scores and internal item ids for the example user.
values , indices = predict_for_all_item('AVU1ILDDYW301', dataset, model) # we feed directly origin dataset, not train data or test data

In [None]:
# External (token) ids of all users. The first element of the array is 'PAD'
# (RecBole's default), so it is removed with [1:].
external_user_ids = dataset.id2token(
    dataset.uid_field, list(range(dataset.user_num)))[1:]

In [None]:
# Collect the recommended external item ids for every user.
topk_items = defaultdict(list)
for external_user_id in external_user_ids:
    # Only the indices returned by torch.topk are needed; the scores are discarded.
    _, topk_iid_list = predict_for_all_item(external_user_id, dataset, model)
    # The index tensor is 2-D (one row per scored sequence); take the last row.
    last_topk_iid_list = topk_iid_list[-1]
    # Move the ids to CPU and map them back to external item tokens.
    external_item_list = dataset.id2token(dataset.iid_field, last_topk_iid_list.cpu()).tolist()
    # NOTE(review): append() stores the list as one element, so each value is a
    # list containing a single list — confirm this nesting is intended.
    topk_items[external_user_id].append(external_item_list)
# Number of users that received recommendations.
print(len(topk_items))

In [None]:
# Look up the recommendations collected for one example user.
topk_items['A1IU7S4HCK1XK0']