In [3]:
%cd SELFRec/

/workspace/VHAC-track-ds/SELFRec


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


# Import Lib

In [4]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

from base.torch_interface import TorchGraphInterface
from base.graph_recommender import GraphRecommender
from data.loader import FileIO
from util.conf import ModelConf
from util.sampler import next_batch_pairwise
from util.loss_torch import bpr_loss, l2_reg_loss, InfoNCE

# Preprocess Data

In [6]:
df = pd.read_csv('../data/training_set.csv')
predict_df = pd.read_csv('../data/public_testset.csv', names=['user_id'] + [f'item_id_{i}' for i in range(1,1001)])

test_user_id = predict_df['user_id'].values

item_columns = predict_df.columns[1:]  # Lấy tất cả cột trừ cột uid
item_in_test_df = pd.unique(predict_df[item_columns].values.ravel())
#Lọc bỏ user_id bị lẫn trong tập test
user_list = df.UserId.unique()
item_in_test_df = list(set(item_in_test_df).difference(set(user_list)))

test_df = df[~df.ItemId.isin(item_in_test_df)].sample(frac=0.05, random_state=42)  # 90% for train
# train_df = df
train_df = df.drop(test_df.index).groupby('UserId').tail(20)  # Remaining 10% for test

train_df.shape, test_df.shape

((375372, 4), (14551, 4))

In [7]:
# Save train and test sets as .txt files without headers, separated by a space
train_df.to_csv("train.txt", index=False, header=False, sep=" ")
test_df.to_csv("test.txt", index=False, header=False, sep=" ")

In [8]:
df

Unnamed: 0,UserId,ItemId,Click,Purchase
0,tyviMi4b8Q,tyviMi4b8Q,1,0
1,tyviMi4b8Q,t3fB9Nq1VY,1,0
2,tyviMi4b8Q,X7ZdDwPBKM,1,0
3,tyviMi4b8Q,r4bieaZn66,1,0
4,tyviMi4b8Q,cRl2I3cVGB,1,0
...,...,...,...,...
389918,16jkTS9Vj2,5wWKMYcpiw,1,0
389919,16jkTS9Vj2,lHTwXgiMo7,1,0
389920,16jkTS9Vj2,ZBmAO23TI8,1,0
389921,16jkTS9Vj2,h9tLDUYcw6,1,0


In [15]:
user_map = {UserId: index for index, UserId in enumerate(user_list_denoise)}
user_map = pd.DataFrame(list(user_map.items()), columns=['UserId', 'index'])
user_map.head()

Unnamed: 0,UserId,index
0,G44YOVuyss,0
1,PMG5jGqKx0,1
2,TAOplc7QCE,2
3,RH44ej4IUR,3
4,N3Bg7PqmT4,4


In [16]:
item_list = df.ItemId.unique()
item_map = {ItemId:index for index, ItemId in enumerate(item_list)}
item_map = pd.DataFrame(list(item_map.items()), columns=['ItemId', 'index'])
item_map.head()

Unnamed: 0,ItemId,index
0,cRl2I3cVGB,0
1,0N6lG60HQq,1
2,OSxJGDgCe0,2
3,A5IrTeoxuI,3
4,NOxmzBDWzA,4


In [17]:
df['UserId'] = df.UserId.map(dict(user_map[['UserId', 'index']].values))
df['ItemId'] = df.ItemId.map(dict(item_map[['ItemId', 'index']].values))
df.head()

Unnamed: 0,UserId,ItemId,Click,Purchase
0,30504,0,1,0
1,30504,1,1,0
2,30504,2,1,0
3,30504,3,1,0
4,30504,4,1,0


In [18]:
df_by_user = df.groupby('UserId')['ItemId'].apply(list).reset_index()
df_by_user.head()

Unnamed: 0,UserId,ItemId
0,0,"[4754, 12283, 1744, 61315, 13834]"
1,1,"[2589, 367, 7845, 34501, 55607]"
2,2,"[1677, 468, 11939, 8500, 8781, 47834, 37349, 4..."
3,3,"[13030, 26968, 14389, 16387, 68546, 6199, 1544..."
4,4,"[6194, 5839, 6193, 5833, 3689]"


In [19]:
user_list_test = user_map.copy()
user_list_test
train_df = []
test_df = []
for _, row in df_by_user.iterrows():
    user_id = row['UserId']
    item_ids = row['ItemId']
    
    if user_id in user_list_test['index'].values:
        # Split last item into test_df and the rest into train_df
        train_df.append({'UserId': user_id, 'ItemId': item_ids[:-1]})
        test_df.append({'UserId': user_id, 'ItemId': [item_ids[-1]]})
    else:
        # All items go to train_df if user is not in user_list_test
        train_df.append({'UserId': user_id, 'ItemId': item_ids})

In [20]:
train_df = pd.DataFrame(train_df)
test_df = pd.DataFrame(test_df)

In [21]:

# train_df['ItemId'] = df_by_user['ItemId'][:-1] if df_by_user['UserId'] is in user_list_test
# train_df = df_by_user.sample(frac=0.95, random_state=42)  # 90% for train
# # train_df = df
# test_df = df_by_user.drop(train_df.index)

In [22]:
with open("train_sequential.txt", "w") as file:
    for _, row in train_df.iterrows():
        user_index = row['UserId']
        item_indices = ' '.join(map(str, row['ItemId']))
        file.write(f"{user_index}:{item_indices}\n")

In [23]:
with open("test_sequential.txt", "w") as file:
    for _, row in test_df.iterrows():
        user_index = row['UserId']
        item_indices = ' '.join(map(str, row['ItemId']))
        file.write(f"{user_index}:{item_indices}\n")

In [24]:
import yaml

# Define the data as a Python dictionary
data = {
    "training.set": "/kaggle/working/sample/train_sequential.txt",
    "test.set": "/kaggle/working/sample/test_sequential.txt",
    # "training.set": "./dataset/amazon-beauty/train.txt",
    # "test.set": "./dataset/amazon-beauty/test.txt",
    "model": {
        "name": "BERT4Rec",
        "type": "sequential"
    },
    "item.ranking.topN": [10, 20],
    "embedding.size": 128,
    "max.epoch": 50,
    "batch.size": 256,
    "learning.rate": 0.001,
    "reg.lambda": 0.0001,
    "max.len": 50,
    "SASRec": {
        "n_blocks":2,
        "drop_rate":0.2,
        "n_heads":1, 
        "mask_rate": 0.5
    },
    "output": "./results/"
}

# Save this dictionary as a YAML file
yaml_file_path = "config.yaml"
with open(yaml_file_path, "w") as file:
    yaml.dump(data, file, default_flow_style=False)

# Load the YAML file to verify content
with open(yaml_file_path, "r") as file:
    conf = yaml.safe_load(file)

print("Loaded data from YAML file:", conf)

Loaded data from YAML file: {'SASRec': {'drop_rate': 0.2, 'mask_rate': 0.5, 'n_blocks': 2, 'n_heads': 1}, 'batch.size': 256, 'embedding.size': 128, 'item.ranking.topN': [10, 20], 'learning.rate': 0.001, 'max.epoch': 50, 'max.len': 50, 'model': {'name': 'BERT4Rec', 'type': 'sequential'}, 'output': './results/', 'reg.lambda': 0.0001, 'test.set': '/kaggle/working/sample/test_sequential.txt', 'training.set': '/kaggle/working/sample/train_sequential.txt'}


In [25]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from base.seq_recommender import SequentialRecommender
from util.sampler import next_batch_sequence
from util.loss_torch import l2_reg_loss
from util.structure import PointWiseFeedForward
from math import floor
import random


# Paper: BERT4Rec: Sequential Recommendation with Bidirectional Encoder Representations from Transformer, CIKM'19

class BERT4Rec(SequentialRecommender):
    def __init__(self, conf, training_set, test_set):
        super(BERT4Rec, self).__init__(conf, training_set, test_set)
        args =self.config['BERT4Rec']
        block_num = int(args['n_blocks'])
        drop_rate = float(args['drop_rate'])
        head_num = int(args['n_heads'])
        self.aug_rate = float(args['mask_rate'])
        self.model = BERT_Encoder(self.data, self.emb_size, self.max_len, block_num,head_num,drop_rate)

    def train(self):
        model = self.model.cuda()
        optimizer = torch.optim.Adam(model.parameters(), lr=self.lRate)
        for epoch in range(self.maxEpoch):
            model.train()
            #self.fast_evaluation(epoch)
            for n, batch in enumerate(next_batch_sequence(self.data, self.batch_size,max_len=self.max_len)):
                seq, pos, y, neg_idx, seq_len = batch
                aug_seq, masked, labels = self.item_mask_for_bert(seq, seq_len, self.aug_rate, self.data.item_num+1)
                seq_emb = model.forward(aug_seq, pos)
                # item mask
                rec_loss = self.calculate_loss(seq_emb,masked,labels)
                batch_loss = rec_loss+ l2_reg_loss(self.reg, model.item_emb)
                # Backward and optimize
                optimizer.zero_grad()
                batch_loss.backward()
                optimizer.step()
                if n % 50==0:
                    print('training:', epoch + 1, 'batch', n, 'batch_loss:', batch_loss.item(), 'rec_loss:', rec_loss.item())
            model.eval()
            self.fast_evaluation(epoch)

    def item_mask_for_bert(self,seq,seq_len, mask_ratio, mask_idx):
        augmented_seq = seq.copy()
        masked = np.zeros_like(augmented_seq)
        labels = []
        for i, s in enumerate(seq):
            to_be_masked = random.sample(range(seq_len[i]), max(floor(seq_len[i]*mask_ratio),1))
            masked[i, to_be_masked] = 1
            labels += list(augmented_seq[i, to_be_masked])
            augmented_seq[i, to_be_masked] = mask_idx
        return augmented_seq, masked, np.array(labels)

    def calculate_loss(self, seq_emb, masked, labels):
        seq_emb = seq_emb[masked>0].view(-1, self.emb_size)
        logits = torch.mm(seq_emb, self.model.item_emb.t())
        loss = F.cross_entropy(logits, torch.tensor(labels).to(torch.int64).cuda())/labels.shape[0]
        return loss

    def predict(self,seq, pos,seq_len):
        with torch.no_grad():
            for i,length in enumerate(seq_len):
                if length == self.max_len:
                    seq[i,:length-1] = seq[i,1:]
                    pos[i,:length-1] = pos[i,1:]
                    pos[i, length-1] = length
                    seq[i, length-1] = self.data.item_num+1
                else:
                    pos[i, length] = length+1
                    seq[i,length] = self.data.item_num+1
            seq_emb = self.model.forward(seq,pos)
            last_item_embeddings = [seq_emb[i,last-1,:].view(-1,self.emb_size) for i,last in enumerate(seq_len)]
            score = torch.matmul(torch.cat(last_item_embeddings,0), self.model.item_emb.transpose(0, 1))
        return score.cpu().numpy()

class BERT_Encoder(nn.Module):
    def __init__(self, data, emb_size, max_len, n_blocks, n_heads, drop_rate):
        super(BERT_Encoder, self).__init__()
        self.data = data
        self.emb_size = emb_size
        self.block_num = n_blocks
        self.head_num = n_heads
        self.drop_rate = drop_rate
        self.max_len = max_len
        self._init_model()

    def _init_model(self):
        initializer = nn.init.xavier_uniform_
        self.item_emb = nn.Parameter(initializer(torch.empty(self.data.item_num+2, self.emb_size)))
        self.pos_emb = nn.Parameter(initializer(torch.empty(self.max_len+2, self.emb_size)))
        self.attention_layer_norms = torch.nn.ModuleList()
        self.attention_layers = torch.nn.ModuleList()
        self.forward_layer_norms = torch.nn.ModuleList()
        self.forward_layers = torch.nn.ModuleList()
        self.emb_dropout = torch.nn.Dropout(self.drop_rate)
        self.last_layer_norm = torch.nn.LayerNorm(self.emb_size, eps=1e-8)

        for n in range(self.block_num):
            self.attention_layer_norms.append(torch.nn.LayerNorm(self.emb_size, eps=1e-8))
            new_attn_layer =  torch.nn.MultiheadAttention(self.emb_size, self.head_num, self.drop_rate)
            self.attention_layers.append(new_attn_layer)
            self.forward_layer_norms.append(torch.nn.LayerNorm(self.emb_size, eps=1e-8))
            new_fwd_layer = PointWiseFeedForward(self.emb_size, self.drop_rate,'gelu')
            self.forward_layers.append(new_fwd_layer)

    def forward(self, seq, pos):
        seq_emb = self.item_emb[seq]
        seq_emb *= self.emb_size ** 0.5
        pos_emb = self.pos_emb[pos]
        seq_emb += pos_emb
        seq_emb = self.emb_dropout(seq_emb)
        timeline_mask = torch.BoolTensor(seq == 0).cuda()
        seq_emb *= ~timeline_mask.unsqueeze(-1)
        # tl = seq_emb.shape[1]
        # attention_mask = ~torch.tril(torch.ones((tl, tl), dtype=torch.bool).cuda())
        for i in range(len(self.attention_layers)):
            seq_emb = torch.transpose(seq_emb, 0, 1)
            normalized_emb = self.attention_layer_norms[i](seq_emb)
            mha_outputs, _ = self.attention_layers[i](normalized_emb, seq_emb, seq_emb, attn_mask=None)
            seq_emb = normalized_emb + mha_outputs
            seq_emb = torch.transpose(seq_emb, 0, 1)
            seq_emb = self.forward_layer_norms[i](seq_emb)
            seq_emb = self.forward_layers[i](seq_emb)
            seq_emb *= ~timeline_mask.unsqueeze(-1)
        seq_emb = self.last_layer_norm(seq_emb)
        return seq_emb

In [26]:
from data.loader import FileIO


class SELFRec(object):
    def __init__(self, config):
        self.social_data = []
        self.feature_data = []
        self.config = config
        self.training_data = FileIO.load_data_set(config['training.set'], config['model']['type'])
        self.test_data = FileIO.load_data_set(config['test.set'], config['model']['type'])

        self.kwargs = {}
#         if config.contain('social.data'):
#             social_data = FileIO.load_social_data(self.config['social.data'])
#             self.kwargs['social.data'] = social_data
        # if config.contains('feature.data'):
        #     self.social_data = FileIO.loadFeature(config,self.config['feature.data'])
        print('Reading data and preprocessing...')

    def execute(self):
        # import the model module
#         import_str = f"from model.{self.config['model']['type']}.{self.config['model']['name']} import {self.config['model']['name']}"
#         exec(import_str)
        recommender = f"{self.config['model']['name']}(self.config,self.training_data,self.test_data,**self.kwargs)"
        return eval(recommender)

In [27]:
model = 'BERT4Rec'
conf = ModelConf('/kaggle/working/sample/config.yaml')
BERT4Rec = SELFRec(conf).execute()

Config file is not found!


OSError: 

In [None]:
BERT4Rec.execute()

In [None]:
import yaml

# Define the data as a Python dictionary
data = {
    "training.set": "/kaggle/working/sample/train_sequential.txt",
    "test.set": "/kaggle/working/sample/test_sequential.txt",
    # "training.set": "./dataset/amazon-beauty/train.txt",
    # "test.set": "./dataset/amazon-beauty/test.txt",
    "model": {
        "name": "SASRec",
        "type": "sequential"
    },
    "item.ranking.topN": [10, 20],
    "embedding.size": 64,
    "max.epoch": 50,
    "batch.size": 256,
    "learning.rate": 0.001,
    "reg.lambda": 0.0001,
    "max.len": 50,
    "SASRec": {
        "n_blocks":2,
        "drop_rate":0.2,
        "n_heads":1
    },
    "output": "./results/"
}

# Save this dictionary as a YAML file
yaml_file_path = "config.yaml"
with open(yaml_file_path, "w") as file:
    yaml.dump(data, file, default_flow_style=False)

# Load the YAML file to verify content
with open(yaml_file_path, "r") as file:
    conf = yaml.safe_load(file)

print("Loaded data from YAML file:", conf)

In [None]:
import torch
import torch.nn as nn
import numpy as np
from base.seq_recommender import SequentialRecommender
from util.sampler import next_batch_sequence
from util.structure import PointWiseFeedForward
from util.loss_torch import l2_reg_loss


# Paper: Self-Attentive Sequential Recommendation
# Code Referred: https://github.com/pmixer/SASRec.pytorch/


class SASRec(SequentialRecommender):
    def __init__(self, conf, training_set, test_set):
        super(SASRec, self).__init__(conf, training_set, test_set)
        args = self.config['SASRec']
        block_num = int(args['n_blocks'])
        drop_rate = float(args['drop_rate'])
        head_num = int(args['n_heads'])
        self.model = SASRec_Model(self.data, self.emb_size, self.max_len, block_num,head_num,drop_rate)
        self.rec_loss = torch.nn.BCEWithLogitsLoss()

    def train(self):
        model = self.model.cuda()
        optimizer = torch.optim.Adam(model.parameters(), lr=self.lRate)
        for epoch in range(self.maxEpoch):
            model.train()
            #self.fast_evaluation(epoch)
            for n, batch in enumerate(next_batch_sequence(self.data, self.batch_size,max_len=self.max_len)):
                seq, pos, y, neg_idx, _ = batch
                seq_emb = model.forward(seq, pos)
                rec_loss = self.calculate_loss(seq_emb, y, neg_idx, pos)
                batch_loss = rec_loss+ l2_reg_loss(self.reg, model.item_emb)
                # Backward and optimize
                optimizer.zero_grad()
                batch_loss.backward()
                optimizer.step()
                if n % 50==0:
                    print('training:', epoch + 1, 'batch', n, 'rec_loss:', batch_loss.item())
            model.eval()
            self.fast_evaluation(epoch)

    def calculate_loss(self, seq_emb, y, neg,pos):
        y_emb = self.model.item_emb[y]
        neg_emb = self.model.item_emb[neg]
        pos_logits = (seq_emb * y_emb).sum(dim=-1)
        neg_logits = (seq_emb * neg_emb).sum(dim=-1)
        pos_labels, neg_labels = torch.ones(pos_logits.shape).cuda(), torch.zeros(neg_logits.shape).cuda()
        indices = np.where(pos != 0)
        loss = self.rec_loss(pos_logits[indices], pos_labels[indices])
        loss += self.rec_loss(neg_logits[indices], neg_labels[indices])
        return loss

    def predict(self,seq, pos,seq_len):
        with torch.no_grad():
            seq_emb = self.model.forward(seq,pos)
            last_item_embeddings = [seq_emb[i,last-1,:].view(-1,self.emb_size) for i,last in enumerate(seq_len)]
            score = torch.matmul(torch.cat(last_item_embeddings,0), self.model.item_emb.transpose(0, 1))
        return score.cpu().numpy()


class SASRec_Model(nn.Module):
    def __init__(self, data, emb_size, max_len, n_blocks, n_heads, drop_rate):
        super(SASRec_Model, self).__init__()
        self.data = data
        self.emb_size = emb_size
        self.block_num = n_blocks
        self.head_num = n_heads
        self.drop_rate = drop_rate
        self.max_len = max_len
        self._init_model()

    def _init_model(self):
        initializer = nn.init.xavier_uniform_
        self.item_emb = nn.Parameter(initializer(torch.empty(self.data.item_num+1, self.emb_size)))
        self.pos_emb = nn.Parameter(initializer(torch.empty(self.max_len+1, self.emb_size)))
        self.attention_layer_norms = torch.nn.ModuleList()
        self.attention_layers = torch.nn.ModuleList()
        self.forward_layer_norms = torch.nn.ModuleList()
        self.forward_layers = torch.nn.ModuleList()
        self.emb_dropout = torch.nn.Dropout(self.drop_rate)
        self.last_layer_norm = torch.nn.LayerNorm(self.emb_size, eps=1e-8)

        for n in range(self.block_num):
            self.attention_layer_norms.append(torch.nn.LayerNorm(self.emb_size, eps=1e-8))
            new_attn_layer =  torch.nn.MultiheadAttention(self.emb_size, self.head_num, self.drop_rate)
            self.attention_layers.append(new_attn_layer)
            self.forward_layer_norms.append(torch.nn.LayerNorm(self.emb_size, eps=1e-8))
            new_fwd_layer = PointWiseFeedForward(self.emb_size, self.drop_rate)
            self.forward_layers.append(new_fwd_layer)

    def forward(self, seq, pos):
        seq_emb = self.item_emb[seq]
        seq_emb *= self.emb_size ** 0.5
        pos_emb = self.pos_emb[pos]
        seq_emb += pos_emb
        seq_emb = self.emb_dropout(seq_emb)
        timeline_mask = torch.BoolTensor(seq == 0).cuda()
        seq_emb *= ~timeline_mask.unsqueeze(-1)
        tl = seq_emb.shape[1]
        attention_mask = ~torch.tril(torch.ones((tl, tl), dtype=torch.bool).cuda())
        for i in range(len(self.attention_layers)):
            seq_emb = torch.transpose(seq_emb, 0, 1)
            #attention_input = seq_emb
            normalized_emb = self.attention_layer_norms[i](seq_emb)
            mha_outputs, _ = self.attention_layers[i](normalized_emb, seq_emb, seq_emb, attn_mask=attention_mask)
            seq_emb = normalized_emb + mha_outputs
            seq_emb = torch.transpose(seq_emb, 0, 1)
            seq_emb = self.forward_layer_norms[i](seq_emb)
            seq_emb = self.forward_layers[i](seq_emb)
            seq_emb *=  ~timeline_mask.unsqueeze(-1)
        seq_emb = self.last_layer_norm(seq_emb)
        return seq_emb

In [None]:
from data.loader import FileIO


class SELFRec(object):
    def __init__(self, config):
        self.social_data = []
        self.feature_data = []
        self.config = config
        self.training_data = FileIO.load_data_set(config['training.set'], config['model']['type'])
        self.test_data = FileIO.load_data_set(config['test.set'], config['model']['type'])

        self.kwargs = {}
#         if config.contain('social.data'):
#             social_data = FileIO.load_social_data(self.config['social.data'])
#             self.kwargs['social.data'] = social_data
        # if config.contains('feature.data'):
        #     self.social_data = FileIO.loadFeature(config,self.config['feature.data'])
        print('Reading data and preprocessing...')

    def execute(self):
        # import the model module
#         import_str = f"from model.{self.config['model']['type']}.{self.config['model']['name']} import {self.config['model']['name']}"
#         exec(import_str)
        recommender = f"{self.config['model']['name']}(self.config,self.training_data,self.test_data,**self.kwargs)"
        return eval(recommender)

In [None]:
model = 'SASRec'
conf = ModelConf('/kaggle/working/sample/config.yaml')
SASRec = SELFRec(conf).execute()

In [None]:
SASRec.execute()

In [None]:
predict_df = pd.read_csv('/kaggle/input/vhac-recsys/public_testset.csv', names=['user_id'] + [f'item_id_{i}' for i in range(1,1001)])
predict_df.head()


In [None]:
# # rec = XSimGCL
# from util.algorithm import find_k_largest
# def test(self):
#     def process_bar(num, total):
#         rate = float(num) / total
#         ratenum = int(50 * rate)
#         print(f'\rProgress: [{"+" * ratenum}{" " * (50 - ratenum)}]{ratenum * 2}%', end='', flush=True)

#     rec_list = {}
#     data_train = pd.DataFrame(self.data.training_data, columns= ['uid', 'iid', 'rating'])
#     self.data.train_set = data_train[data_train['uid'].isin(test_user_id)].values.tolist()
#     user_count = len(self.data.train_set)
    
#     for i, user in enumerate(self.data.train_set):
#         user = user[0]
#         candidates = self.predict(user)
#         rated_list, _ = self.data.user_rated(user)
#         for item in rated_list:
#             candidates[self.data.item[item]] = -10e8
#         # ids, scores = find_k_largest(1000, candidates)
#         item_names = predict_df[predict_df.user_id == user].values[0][1:]
#         scores = []
#         for item in item_names:
#             try:
#                 id_tmp = self.data.item[item]
#                 scores.append(candidates[id_tmp])
#             except:
#                 # Cần sửa khuyến nghị cold start
#                 scores.append(0)
        
#         sorted_list = sorted(list(zip(item_names, scores)), key=lambda x: x[1], reverse=True)
#         rec_list[user] = sorted_list
#         if i % 1000 == 0:
#             process_bar(i, user_count)
#     process_bar(user_count, user_count)
#     print('')
#     return rec_list
    
# rec_list = test(XSimGCL)

In [None]:
from base.recommender import Recommender
from data.sequence import Sequence
from util.algorithm import find_k_largest
from util.evaluation import ranking_evaluation
from util.sampler import next_batch_sequence_for_test

def test(self):
    def process_bar(num, total):
        rate = float(num) / total
        ratenum = int(50 * rate)
        print(f'\rProgress: [{"+" * ratenum}{" " * (50 - ratenum)}]{ratenum * 2}%', end='', flush=True)

    rec_list = {}
    for n, batch in enumerate(next_batch_sequence_for_test(self.data, self.batch_size, max_len=self.max_len)):
        seq, pos, seq_len = batch
        seq_start = n * self.batch_size
        seq_end = (n + 1) * self.batch_size
        seq_names = [seq_full[0] for seq_full in self.data.original_seq[seq_start:seq_end]]
        candidates = self.predict(seq, pos, seq_len)
        for name, res in zip(seq_names, candidates):
            ids, scores = find_k_largest(1000, res)
            
            item_names = [self.data.id2item[iid] for iid in ids if iid != 0 and iid <= self.data.item_num]
            rec_list[name] = list(zip(item_names, scores))
        if n % 100 == 0:
            process_bar(n, self.data.raw_seq_num / self.batch_size)
    process_bar(self.data.raw_seq_num, self.data.raw_seq_num)
    print('')
    return rec_list
rec_list = test(SASRec)

In [None]:
user_id = 'NcVPv81tzp'
str(user_map[user_map['UserId']==user_id]['index'].values[0])

In [None]:
items = ['9307', '791', '17778', '3542', '5114', '22446', '10432', '2426', '987', '195', '266', '3311', '14310', '903', '2538', '31311', '3384', '33927', '7016', '11233']
item_map[item_map['index']==int(items[0])]['ItemId'].values[0]

In [None]:
user_id = test_user_id[0]
user_index = str(user_map[user_map['UserId']==user_id]['index'].values[0])
x = rec_list[user_index]
index = str(item_map[item_map['ItemId']=='IQYqanXAvK']['index'].values[0])
y = x[x[0] == index][1]
y

In [None]:
data = []
for user_id in test_user_id:
    user_index = str(user_map[user_map['UserId']==user_id]['index'].values[0])
    item_names = predict_df[predict_df.user_id == user_id].values[0][1:]
    candidates = rec_list[user_index]
    candidate_index = [candidate[0] for candidate in candidates]
    # items = []
    scores = []
    # items = [item[0] for item in rec_list[user_index]]
    # items = [item_map[item_map['index']==int(item)]['ItemId'].values[0] for item in items]
    
    for item in item_names:
        try: 
            item_index = str(item_map[item_map['ItemId'] == item]['index'].values[0])
            if item_index in candidate_index:
                scores.append(candidates[candidates[0] == item_index][1])
            else:
                scores.append(1e-8)
        except:
            scores.append(1e-8)
        
    # for item, score in candidates:
    #     item = item_map[item_map['index']==int(item)]['ItemId'].values[0]
    #     if item not in item_names:
    #         score = 1e-8    
    #     items.append(item)
    #     scores.append(score)
        
    sorted_list = sorted(list(zip(item_names, scores)), key=lambda x: x[1], reverse=True)
#         rec_list[user] = sorted_list
    # print(sorted_list)
    # data.append([user_id] + sorted_list)
    data.append([user_id] + [i[0] for i in sorted_list])
    # print(data)
    # break

In [None]:
import numpy as np
from tqdm import tqdm
# Precompute mappings for faster lookups
user_map_dict = user_map.set_index('UserId')['index'].astype(str).to_dict()
item_map_dict = item_map.set_index('ItemId')['index'].astype(str).to_dict()

data = []  # Result storage

for user_id in tqdm(test_user_id):
    # Look up user index using the precomputed dictionary
    user_index = user_map_dict.get(user_id)
    if not user_index:
        continue  # Skip if user index is not found

    # Retrieve item names and candidate pairs for this user
    item_names = predict_df[predict_df.user_id == user_id].values[0][1:]
    candidates = rec_list[user_index]
    candidate_index = {str(candidate[0]): candidate[1] for candidate in candidates}  # Map item index to score

    # Create score list for item_names by checking candidate_index
    scores = [
        candidate_index.get(item_map_dict.get(item), 1e-8) if item in item_map_dict else 1e-8
        for item in item_names
    ]
    
    # Sort items based on score in descending order
    sorted_list = sorted(zip(item_names, scores), key=lambda x: x[1], reverse=True)
    
    # Append user_id and sorted items to data
    data.append([user_id] + [i[0] for i in sorted_list])

#     print(data)
#     break

In [None]:
pd.DataFrame(data).to_csv('predict_SAS.csv', index = False, header=False)

In [None]:
pd.DataFrame(data).head()

In [None]:
from IPython.display import FileLink 
FileLink(r'/kaggle/working/sample/predict_SAS.csv')

In [19]:
predict_df = pd.read_csv('../data/public_testset.csv', names=['user_id'] + [f'item_id_{i}' for i in range(1,1001)])
predict_df.head()

Unnamed: 0,user_id,item_id_1,item_id_2,item_id_3,item_id_4,item_id_5,item_id_6,item_id_7,item_id_8,item_id_9,...,item_id_991,item_id_992,item_id_993,item_id_994,item_id_995,item_id_996,item_id_997,item_id_998,item_id_999,item_id_1000
0,NcVPv81tzp,NYRp1o1YKD,nyDJiBSznn,tMMpf5S62m,pYCyajy1Am,IQYqanXAvK,K0nTlcclVU,aTcR4pfQrM,md0Mi2Y3PN,BGpHuuj0eD,...,Xi7bQ3D5Nd,DnHOgmVXrS,6Uns6g05f7,20kEvp8BVk,vVjXZVhRDe,Z2h9Oni93D,uHuaWLAHdv,RYpgK2XiNS,cFfT0AEdOb,WCm27PLRAp
1,C5ihgxLcrb,1bWoiCKvUG,uBZeAeJAp1,wz3fiRi1XX,kcLQeDE9Y9,XoyZaOJ2hQ,cfkOAeEV6p,lywvWPArBx,41uJKXzbXd,1kVkVTBnhQ,...,j387AnkQH6,HkvDRz6oXT,fSv7hmSvI4,LHh3iij8he,mf2eKsaafi,rKwmw637oi,nXcPqalddS,CiGuUIbCEO,5QBxhJ93BT,bo1CjxPHpP
2,ljMBWT9UXW,1M0GtQfkUA,iytV2FTVjK,G2Sufv4xlY,AS24sgQN69,U1uDxeAwr4,yniLMM6GB7,ziFGlTdzPp,BddfNX8HRd,S9tgPjbv1a,...,VDBauWNjJF,NSMr3KjVnR,aJUzJBXEfY,tqC65yziRM,4zDIIH126P,M1tDsFuWBH,PTvaMJlQyK,HfvagfNvGp,XUWoQVdsmN,EdwIvwMRoZ
3,WZXCQtOWJt,2J7erZG9Af,S0Q4XoexaS,hCZTh4mNx8,dUXbCsX8et,mA7s8zgFMR,YFdqrb8OGd,C9FNEwKKd8,mtsRoBIDbs,1b49EFCcJG,...,dwVDUM6ngC,IFwPdofoEP,sFGjZgyZnH,F10yUBudDF,XK550LREfU,3uZFqvw0zz,Dto8H87MIx,tiQGe5Ye7v,CzHVYocfQY,wc7rJoeZRQ
4,xlB2rvpQKu,fZIHEceEkK,FurfEc4uKO,4FoN0bcH4x,cslhbAk2FI,Wi8OmeoDhH,uHdZ7W1MOy,xqyiojw3KO,Hj1UDRyfAc,vSeyk7gadz,...,2IpMrt6ieN,odYnPAZSJv,5FV9Qd8ABf,se19TPTKGt,tXTRyaQqsA,8KqODVTepQ,1iqudkzZxj,H7ZyQBRUQq,0NDBaPifhZ,u0altqehZS


In [20]:
# rec = XSimGCL
from util.algorithm import find_k_largest
def test(self):
    def process_bar(num, total):
        rate = float(num) / total
        ratenum = int(50 * rate)
        print(f'\rProgress: [{"+" * ratenum}{" " * (50 - ratenum)}]{ratenum * 2}%', end='', flush=True)

    rec_list = {}
    data_train = pd.DataFrame(self.data.training_data, columns= ['uid', 'iid', 'rating'])
    self.data.train_set = data_train[data_train['uid'].isin(test_user_id)].values.tolist()
    user_count = len(self.data.train_set)
    
    for i, user in enumerate(self.data.train_set):
        user = user[0]
        candidates = self.predict(user)
        rated_list, _ = self.data.user_rated(user)
        for item in rated_list:
            candidates[self.data.item[item]] = -10e8
        # ids, scores = find_k_largest(1000, candidates)
        item_names = predict_df[predict_df.user_id == user].values[0][1:]
        scores = []
        for item in item_names:
            try:
                id_tmp = self.data.item[item]
                scores.append(candidates[id_tmp])
            except:
                # Cần sửa khuyến nghị cold start
                scores.append(0)
        
        sorted_list = sorted(list(zip(item_names, scores)), key=lambda x: x[1], reverse=True)
        rec_list[user] = sorted_list
        if i % 1000 == 0:
            process_bar(i, user_count)
    process_bar(user_count, user_count)
    print('')
    return rec_list
    
rec_list = test(rec)

Progress: [++++++++++++++++++++++++++++++++++++++++++++++++++]100%


In [23]:
data = []
for user_id in test_user_id:
    data.append([user_id] + [i[0] for i in rec_list[user_id]])

In [24]:
pd.DataFrame(data).to_csv('../runs/simgcl_final_data/predict.csv', index = False, header=False)

In [19]:
pd.DataFrame(data).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,991,992,993,994,995,996,997,998,999,1000
0,NcVPv81tzp,mwOc6vrVPi,oNn8dJXhM8,av2irHMO5f,NZ6ZRExaU5,wvulby9Vbm,IzLVUKs7uQ,lS7aepxu9O,95vIbbmBfu,1qeuDBCyxi,...,dLwFHGHtdv,dc6cO5BULj,RHb5SNK7zK,Pnr2iO61A7,7E8otbRXlX,E10wOdLIAT,dH19PTV6Jl,eLPGZAbPBo,CzFLPHqKk0,Zn8Fq0tFmC
1,C5ihgxLcrb,6jy1DwPo7L,pUUgxbyvTC,f1dROAFg11,WjZUWFn3s9,kk3smPWhZf,u0KbwAhFgw,O5nMvWF9dA,41uJKXzbXd,3wBtYUn27K,...,SCGRmTW1Wj,UJIWN83FpE,5HVET1lIrc,HlFW2LD25j,5Y0zIMvK8G,bE2kBREPHh,azEzhk6eHL,PjHKY7afPo,OcsxmB3gEE,PfQbz0wOPX
2,ljMBWT9UXW,gKUj4wrdaF,Hf6NKeCrqS,qwL2TeeksN,pRQEBDA4WJ,eHslZ2QF3q,U9Leg1KZce,qfMR0pa5jB,I575KzrWvJ,ABKeMXJAqU,...,DC0latQPNG,I93CDmdKET,agHKmDMeVB,H2fl09Q0Qu,Zf2Cq8y3ZT,UMZmNCdpOh,xQMipz6UAT,oURQhu3On4,MtDmLkYlDi,XmDINMQQg1
3,WZXCQtOWJt,V6S3fAvyI9,dQoIYlX2wJ,04zWH3pkfR,rGAG9phqUn,xbfyEc4bwt,3LWXR1uoiB,QUYCjkVSVR,Pm6sxfSbjW,SzPkpQ35GK,...,TkQR3N0QUS,MRslj3nrZb,YHTUOJAXav,vOo5dADm09,bDHaQjazq6,MLyaPYLicR,x1ykeOR3DW,pOXFMQbFqv,kk3smPWhZf,cFMFQCkWWJ
4,xlB2rvpQKu,TNCS6O3iSq,6s5PvCvsdC,HUTuM2kdCi,49BAzOjiJz,2Mqq3N13HR,asNrzYAAKy,hsIVBBv8ch,TBJg2ARZDr,r5ERTKY2E1,...,AHG0qYYdTB,s7GrD6m2D6,Cr2ep1s4YE,7kg8LNWZYA,3QoCBriOGk,sTdWl5NgBY,ocv4aRZDB1,7gZEpavqfO,fgONK18lzp,GuqfIY4OQL


In [25]:
!cd ../runs/simgcl_final_data/ && zip predict_simgcl_final_data.zip predict.csv

  adding: predict.csv (deflated 27%)


# XSimGCL

In [None]:
import yaml

# Define the data as a Python dictionary
data = {
    "training.set": "./train.txt",
    "test.set": "./test.txt",
    "model": {
        "name": "XSimGCL",
        "type": "graph"
    },
    "item.ranking.topN": [10, 20],
    "embedding.size": 1024,
    "max.epoch": 30,
    "batch.size": 2048,
    "learning.rate": 0.001,
    "reg.lambda": 0.0001,
    "XSimGCL": {
        "n_layer": 2,
        "l_star": 1,
        "lambda": 0.2,
        "eps": 0.2,
        "tau": 0.15
    },
    "output": "./results/"
}

# Save this dictionary as a YAML file
yaml_file_path = "config.yaml"
with open(yaml_file_path, "w") as file:
    yaml.dump(data, file, default_flow_style=False)

# Load the YAML file to verify content
with open(yaml_file_path, "r") as file:
    conf = yaml.safe_load(file)

print("Loaded data from YAML file:", conf)

In [None]:
model = 'XSimGCL'
conf = ModelConf('/kaggle/working/sample/config.yaml')
rec = SELFRec(conf).execute()

In [None]:
rec.execute()

# LightGCN

In [9]:
import yaml

# Define the data as a Python dictionary
data = {
    "training.set": "./train.txt",
    "test.set": "./test.txt",
    "model": {
        "name": "LightGCN",
        "type": "graph"
    },
    "item.ranking.topN": [10, 20],
    "embedding.size": 1024,
    "max.epoch": 100,
    "batch.size": 2048,
    "learning.rate": 0.001,
    "reg.lambda": 0.0001,
    "LightGCN": {
        "n_layer": 2
    },
    "output": "./results/"
}

# Save this dictionary as a YAML file
yaml_file_path = "config.yaml"
with open(yaml_file_path, "w") as file:
    yaml.dump(data, file, default_flow_style=False)

# Load the YAML file to verify content
with open(yaml_file_path, "r") as file:
    conf = yaml.safe_load(file)

print("Loaded data from YAML file:", conf)

Loaded data from YAML file: {'LightGCN': {'n_layer': 2}, 'batch.size': 2048, 'embedding.size': 1024, 'item.ranking.topN': [10, 20], 'learning.rate': 0.001, 'max.epoch': 100, 'model': {'name': 'LightGCN', 'type': 'graph'}, 'output': './results/', 'reg.lambda': 0.0001, 'test.set': './test.txt', 'training.set': './train.txt'}


In [10]:
from data.loader import FileIO


class SELFRec(object):
    def __init__(self, config):
        self.social_data = []
        self.feature_data = []
        self.config = config
        self.training_data = FileIO.load_data_set(config['training.set'], config['model']['type'])
        self.test_data = FileIO.load_data_set(config['test.set'], config['model']['type'])

        self.kwargs = {}
#         if config.contain('social.data'):
#             social_data = FileIO.load_social_data(self.config['social.data'])
#             self.kwargs['social.data'] = social_data
        # if config.contains('feature.data'):
        #     self.social_data = FileIO.loadFeature(config,self.config['feature.data'])
        print('Reading data and preprocessing...')

    def execute(self):
        # import the model module
#         import_str = f"from model.{self.config['model']['type']}.{self.config['model']['name']} import {self.config['model']['name']}"
#         exec(import_str)
        recommender = f"{self.config['model']['name']}(self.config,self.training_data,self.test_data,**self.kwargs)"
        return eval(recommender)

In [11]:
import torch
import torch.nn as nn
from base.graph_recommender import GraphRecommender
from util.sampler import next_batch_pairwise
from base.torch_interface import TorchGraphInterface
from util.loss_torch import bpr_loss,l2_reg_loss
# paper: LightGCN: Simplifying and Powering Graph Convolution Network for Recommendation. SIGIR'20


class LightGCN(GraphRecommender):
    def __init__(self, conf, training_set, test_set):
        super(LightGCN, self).__init__(conf, training_set, test_set)
        args = self.config['LightGCN']
        self.n_layers = int(args['n_layer'])
        self.model = LGCN_Encoder(self.data, self.emb_size, self.n_layers)

    def train(self):
        model = self.model.cuda()
        optimizer = torch.optim.Adam(model.parameters(), lr=self.lRate)
        for epoch in range(self.maxEpoch):
            for n, batch in enumerate(next_batch_pairwise(self.data, self.batch_size)):
                user_idx, pos_idx, neg_idx = batch
                rec_user_emb, rec_item_emb = model()
                user_emb, pos_item_emb, neg_item_emb = rec_user_emb[user_idx], rec_item_emb[pos_idx], rec_item_emb[neg_idx]
                batch_loss = bpr_loss(user_emb, pos_item_emb, neg_item_emb) + l2_reg_loss(self.reg, model.embedding_dict['user_emb'][user_idx],model.embedding_dict['item_emb'][pos_idx],model.embedding_dict['item_emb'][neg_idx])/self.batch_size
                # Backward and optimize
                optimizer.zero_grad()
                batch_loss.backward()
                optimizer.step()
                if n % 100==0 and n>0:
                    print('training:', epoch + 1, 'batch', n, 'batch_loss:', batch_loss.item())
            with torch.no_grad():
                self.user_emb, self.item_emb = model()
            if epoch % 5 == 0:
                self.fast_evaluation(epoch)
        self.user_emb, self.item_emb = self.best_user_emb, self.best_item_emb



    def save(self):
        with torch.no_grad():
            self.best_user_emb, self.best_item_emb = self.model.forward()

    def predict(self, u):
        u = self.data.get_user_id(u)
        score = torch.matmul(self.user_emb[u], self.item_emb.transpose(0, 1))
        return score.cpu().numpy()


class LGCN_Encoder(nn.Module):
    def __init__(self, data, emb_size, n_layers):
        super(LGCN_Encoder, self).__init__()
        self.data = data
        self.latent_size = emb_size
        self.layers = n_layers
        self.norm_adj = data.norm_adj
        self.embedding_dict = self._init_model()
        self.sparse_norm_adj = TorchGraphInterface.convert_sparse_mat_to_tensor(self.norm_adj).cuda()

    def _init_model(self):
        initializer = nn.init.xavier_uniform_
        embedding_dict = nn.ParameterDict({
            'user_emb': nn.Parameter(initializer(torch.empty(self.data.user_num, self.latent_size))),
            'item_emb': nn.Parameter(initializer(torch.empty(self.data.item_num, self.latent_size))),
        })
        return embedding_dict

    def forward(self):
        ego_embeddings = torch.cat([self.embedding_dict['user_emb'], self.embedding_dict['item_emb']], 0)
        all_embeddings = [ego_embeddings]
        for k in range(self.layers):
            ego_embeddings = torch.sparse.mm(self.sparse_norm_adj, ego_embeddings)
            all_embeddings += [ego_embeddings]
        all_embeddings = torch.stack(all_embeddings, dim=1)
        all_embeddings = torch.mean(all_embeddings, dim=1)
        user_all_embeddings = all_embeddings[:self.data.user_num]
        item_all_embeddings = all_embeddings[self.data.user_num:]
        return user_all_embeddings, item_all_embeddings



In [12]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [13]:
model = 'LightGCN'
conf = ModelConf('./config.yaml')
rec = SELFRec(conf).execute()

Reading data and preprocessing...


  return torch.sparse.FloatTensor(i, v, coo.shape)


In [14]:
rec.execute()

Model: LightGCN
Training Set: /workspace/viettel-ai-challenge-track-ds/SELFRec/train.txt
Test Set: /workspace/viettel-ai-challenge-track-ds/SELFRec/test.txt
Embedding Dimension: 1024
Maximum Epoch: 100
Learning Rate: 0.001
Batch Size: 2048
Regularization Parameter: 0.0001
Specific parameters: n_layer:2
Training Set Size: (user number: 36751, item number: 82673, interaction number: 375372)
Test Set Size: (user number: 9370, item number: 9256, interaction number: 14551)
Initializing and building model...
Training Model...
training: 1 batch 100 batch_loss: 0.38305675983428955
Evaluating the model...
Progress: [++++++++++++++++++++++++++++++++++++++++++++++++++]100%
--------------------------------------------------------------------------------
Real-Time Ranking Performance (Top-20 Item Recommendation)
*Current Performance*
Epoch: 1, Hit Ratio: 0.08671, Precision: 0.00653, Recall: 0.10335, NDCG: 0.04769
*Best Performance*
Epoch: 1, Hit Ratio: 0.08671, Precision: 0.00653, Recall: 0.10335, 

In [15]:
rec.save()

In [16]:
import os
os.makedirs("../runs/lightgcn_private_data")

In [17]:
import pickle
with open(f"../runs/lightgcn_private_data/model.pkl", "wb") as f:
    pickle.dump(rec, f)

In [18]:
import pickle
emb_user = dict([(k, v) for k, v in zip(rec.data.user.keys(), rec.best_user_emb.cpu().numpy())])
with open(f"../runs/lightgcn_private_data/user_embedding.pkl", "wb") as f:
    pickle.dump(emb_user, f)

In [19]:
import pickle
emb_item = dict([(k, v) for k, v in zip(rec.data.item.keys(), rec.best_item_emb.cpu().numpy())])
with open(f"../runs/lightgcn_private_data/item_embedding.pkl", "wb") as f:
    pickle.dump(emb_item, f)

# SSL4Rec


In [None]:
import yaml

# Define the data as a Python dictionary
data = {
    "training.set": "/kaggle/working/sample/train.txt",
    "test.set": "/kaggle/working/sample/test.txt",
    "model":{
      "name": "SSL4Rec",
      "type": "graph"
    },

    "item.ranking.topN": [20],

    "embedding.size": 256,
    "max.epoch": 100,
    "batch.size": 2048,
    "learning.rate": 0.001,
    "reg.lambda": 0.0001,

    "SSL4Rec":{
      "tau": 0.07,
      "alpha": 0.1,
      "drop": 0.1
    },

    "output": "./results/"
}

# Save this dictionary as a YAML file
yaml_file_path = "config.yaml"
with open(yaml_file_path, "w") as file:
    yaml.dump(data, file, default_flow_style=False)

# Load the YAML file to verify content
with open(yaml_file_path, "r") as file:
    conf = yaml.safe_load(file)

print("Loaded data from YAML file:", conf)

In [None]:
from data.loader import FileIO


class SELFRec(object):
    def __init__(self, config):
        self.social_data = []
        self.feature_data = []
        self.config = config
        self.training_data = FileIO.load_data_set(config['training.set'], config['model']['type'])
        self.test_data = FileIO.load_data_set(config['test.set'], config['model']['type'])

        self.kwargs = {}
#         if config.contain('social.data'):
#             social_data = FileIO.load_social_data(self.config['social.data'])
#             self.kwargs['social.data'] = social_data
        # if config.contains('feature.data'):
        #     self.social_data = FileIO.loadFeature(config,self.config['feature.data'])
        print('Reading data and preprocessing...')

    def execute(self):
        # import the model module
#         import_str = f"from model.{self.config['model']['type']}.{self.config['model']['name']} import {self.config['model']['name']}"
#         exec(import_str)
        recommender = f"{self.config['model']['name']}(self.config,self.training_data,self.test_data,**self.kwargs)"
        return eval(recommender)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from base.graph_recommender import GraphRecommender
from util.sampler import next_batch_pairwise
from util.loss_torch import l2_reg_loss, InfoNCE, batch_softmax_loss

# Paper: Self-supervised Learning for Large-scale Item Recommendations. CIKM'21

""" 
Note: This version of code conducts feature dropout on the item embeddings 
because items features are not always available in many academic datasets.
"""


class SSL4Rec(GraphRecommender):
    def __init__(self, conf, training_set, test_set):
        super(SSL4Rec, self).__init__(conf, training_set, test_set)
        args = self.config['SSL4Rec']
        self.cl_rate = float(args['alpha'])
        self.tau = float(args['tau'])
        self.drop_rate = float(args['drop'])
        self.model = DNN_Encoder(self.data, self.emb_size, self.drop_rate, self.tau)

    def train(self):
        model = self.model.cuda()
        optimizer = torch.optim.Adam(model.parameters(), lr=self.lRate)
        for epoch in range(self.maxEpoch):
            for n, batch in enumerate(next_batch_pairwise(self.data, self.batch_size)):
                query_idx, item_idx, _neg = batch
                model.train()
                query_emb, item_emb = model(query_idx, item_idx)
                rec_loss = batch_softmax_loss(query_emb, item_emb, self.tau)
                cl_loss = self.cl_rate * model.cal_cl_loss(item_idx)
                batch_loss = rec_loss + l2_reg_loss(self.reg, query_emb, item_emb) + cl_loss
                # Backward and optimize
                optimizer.zero_grad()
                batch_loss.backward()
                optimizer.step()
                if n % 100 == 0:
                    print('training:', epoch + 1, 'batch', n, 'rec_loss:', rec_loss.item(), 'cl_loss', cl_loss.item())
            model.eval()
            with torch.no_grad():
                self.query_emb, self.item_emb = self.model(list(range(self.data.user_num)),list(range(self.data.item_num)))
            self.fast_evaluation(epoch)
        self.query_emb, self.item_emb = self.best_query_emb, self.best_item_emb

    def save(self):
        with torch.no_grad():
            self.best_query_emb, self.best_item_emb = self.model.forward(list(range(self.data.user_num)),list(range(self.data.item_num)))

    def predict(self, u):
        u = self.data.get_user_id(u)
        score = torch.matmul(self.query_emb[u], self.item_emb.transpose(0, 1))
        return score.cpu().numpy()


class DNN_Encoder(nn.Module):
    def __init__(self, data, emb_size, drop_rate, temperature):
        super(DNN_Encoder, self).__init__()
        self.data = data
        self.emb_size = emb_size
        self.tau = temperature
        self.user_tower = nn.Sequential(
            nn.Linear(self.emb_size, 1024),
            nn.ReLU(True),
            nn.Linear(1024, 128),
            nn.Tanh()
        )
        self.item_tower = nn.Sequential(
            nn.Linear(self.emb_size, 1024),
            nn.ReLU(True),
            nn.Linear(1024, 128),
            nn.Tanh()
        )
        self.dropout = nn.Dropout(drop_rate)
        initializer = nn.init.xavier_uniform_
        self.initial_user_emb = nn.Parameter(initializer(torch.empty(self.data.user_num, self.emb_size)))
        self.initial_item_emb = nn.Parameter(initializer(torch.empty(self.data.item_num, self.emb_size)))

    def forward(self, q, x):
        q_emb = self.initial_user_emb[q]
        i_emb = self.initial_item_emb[x]

        q_emb = self.user_tower(q_emb)
        i_emb = self.item_tower(i_emb)

        return q_emb, i_emb

    def item_encoding(self, x):
        i_emb = self.initial_item_emb[x]
        i1_emb = self.dropout(i_emb)
        i2_emb = self.dropout(i_emb)

        i1_emb = self.item_tower(i1_emb)
        i2_emb = self.item_tower(i2_emb)

        return i1_emb, i2_emb

    def cal_cl_loss(self, idx):
        item_view1, item_view_2 = self.item_encoding(idx)       
        cl_loss = InfoNCE(item_view1, item_view_2, self.tau)
        return cl_loss

In [None]:
model = 'SSL4Rec'
conf = ModelConf('/kaggle/working/sample/config.yaml')
rec = SELFRec(conf).execute()

In [None]:
rec.execute()

# SimGCL

In [9]:
import yaml

# Define the data as a Python dictionary
data = {
    "training.set": "./train.txt",
    "test.set": "./test.txt",
    "model":{
      "name": "SimGCL",
      "type": "graph"
    },

    "item.ranking.topN": [10, 20],

    "embedding.size": 1024,
    "max.epoch": 50,
    "batch.size": 2048,
    "learning.rate": 0.001,
    "reg.lambda": 0.0001,

    "SimGCL": {
      "n_layer": 3,
      "lambda": 0.5,
      "eps": 0.1
    },

    "output": "./results/"
}

# Save this dictionary as a YAML file
yaml_file_path = "config.yaml"
with open(yaml_file_path, "w") as file:
    yaml.dump(data, file, default_flow_style=False)

# Load the YAML file to verify content
with open(yaml_file_path, "r") as file:
    conf = yaml.safe_load(file)

print("Loaded data from YAML file:", conf)

Loaded data from YAML file: {'SimGCL': {'eps': 0.1, 'lambda': 0.5, 'n_layer': 3}, 'batch.size': 2048, 'embedding.size': 1024, 'item.ranking.topN': [10, 20], 'learning.rate': 0.001, 'max.epoch': 50, 'model': {'name': 'SimGCL', 'type': 'graph'}, 'output': './results/', 'reg.lambda': 0.0001, 'test.set': './test.txt', 'training.set': './train.txt'}


In [10]:
from data.loader import FileIO


class SELFRec(object):
    def __init__(self, config):
        self.social_data = []
        self.feature_data = []
        self.config = config
        self.training_data = FileIO.load_data_set(config['training.set'], config['model']['type'])
        self.test_data = FileIO.load_data_set(config['test.set'], config['model']['type'])

        self.kwargs = {}
#         if config.contain('social.data'):
#             social_data = FileIO.load_social_data(self.config['social.data'])
#             self.kwargs['social.data'] = social_data
        # if config.contains('feature.data'):
        #     self.social_data = FileIO.loadFeature(config,self.config['feature.data'])
        print('Reading data and preprocessing...')

    def execute(self):
        # import the model module
#         import_str = f"from model.{self.config['model']['type']}.{self.config['model']['name']} import {self.config['model']['name']}"
#         exec(import_str)
        recommender = f"{self.config['model']['name']}(self.config,self.training_data,self.test_data,**self.kwargs)"
        return eval(recommender)

In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from base.graph_recommender import GraphRecommender
from util.sampler import next_batch_pairwise
from base.torch_interface import TorchGraphInterface
from util.loss_torch import bpr_loss, l2_reg_loss, InfoNCE

# Paper: Are graph augmentations necessary? simple graph contrastive learning for recommendation. SIGIR'22


class SimGCL(GraphRecommender):
    def __init__(self, conf, training_set, test_set):
        super(SimGCL, self).__init__(conf, training_set, test_set)
        args = self.config['SimGCL']
        self.cl_rate = float(args['lambda'])
        self.eps = float(args['eps'])
        self.n_layers = int(args['n_layer'])
        self.model = SimGCL_Encoder(self.data, self.emb_size, self.eps, self.n_layers)

    def train(self):
        model = self.model.cuda()
        optimizer = torch.optim.Adam(model.parameters(), lr=self.lRate)
        for epoch in range(self.maxEpoch):
            for n, batch in enumerate(next_batch_pairwise(self.data, self.batch_size)):
                user_idx, pos_idx, neg_idx = batch
                rec_user_emb, rec_item_emb = model()
                user_emb, pos_item_emb, neg_item_emb = rec_user_emb[user_idx], rec_item_emb[pos_idx], rec_item_emb[neg_idx]
                rec_loss = bpr_loss(user_emb, pos_item_emb, neg_item_emb)
                cl_loss = self.cl_rate * self.cal_cl_loss([user_idx,pos_idx])
                batch_loss =  rec_loss + l2_reg_loss(self.reg, user_emb, pos_item_emb) + cl_loss
                # Backward and optimize
                optimizer.zero_grad()
                batch_loss.backward()
                optimizer.step()
                if n % 100==0 and n>0:
                    print('training:', epoch + 1, 'batch', n, 'rec_loss:', rec_loss.item(), 'cl_loss', cl_loss.item())
            with torch.no_grad():
                self.user_emb, self.item_emb = self.model()
            self.fast_evaluation(epoch)
        self.user_emb, self.item_emb = self.best_user_emb, self.best_item_emb

    def cal_cl_loss(self, idx):
        u_idx = torch.unique(torch.Tensor(idx[0]).type(torch.long)).cuda()
        i_idx = torch.unique(torch.Tensor(idx[1]).type(torch.long)).cuda()
        user_view_1, item_view_1 = self.model(perturbed=True)
        user_view_2, item_view_2 = self.model(perturbed=True)
        user_cl_loss = InfoNCE(user_view_1[u_idx], user_view_2[u_idx], 0.2)
        item_cl_loss = InfoNCE(item_view_1[i_idx], item_view_2[i_idx], 0.2)
        return user_cl_loss + item_cl_loss

    def save(self):
        with torch.no_grad():
            self.best_user_emb, self.best_item_emb = self.model.forward()

    def predict(self, u):
        u = self.data.get_user_id(u)
        score = torch.matmul(self.user_emb[u], self.item_emb.transpose(0, 1))
        return score.cpu().numpy()


class SimGCL_Encoder(nn.Module):
    def __init__(self, data, emb_size, eps, n_layers):
        super(SimGCL_Encoder, self).__init__()
        self.data = data
        self.eps = eps
        self.emb_size = emb_size
        self.n_layers = n_layers
        self.norm_adj = data.norm_adj
        self.embedding_dict = self._init_model()
        self.sparse_norm_adj = TorchGraphInterface.convert_sparse_mat_to_tensor(self.norm_adj).cuda()

    def _init_model(self):
        initializer = nn.init.xavier_uniform_
        embedding_dict = nn.ParameterDict({
            'user_emb': nn.Parameter(initializer(torch.empty(self.data.user_num, self.emb_size))),
            'item_emb': nn.Parameter(initializer(torch.empty(self.data.item_num, self.emb_size))),
        })
        return embedding_dict

    def forward(self, perturbed=False):
        ego_embeddings = torch.cat([self.embedding_dict['user_emb'], self.embedding_dict['item_emb']], 0)
        all_embeddings = []
        for k in range(self.n_layers):
            ego_embeddings = torch.sparse.mm(self.sparse_norm_adj, ego_embeddings)
            if perturbed:
                random_noise = torch.rand_like(ego_embeddings).cuda()
                ego_embeddings += torch.sign(ego_embeddings) * F.normalize(random_noise, dim=-1) * self.eps
            all_embeddings.append(ego_embeddings)
        all_embeddings = torch.stack(all_embeddings, dim=1)
        all_embeddings = torch.mean(all_embeddings, dim=1)
        user_all_embeddings, item_all_embeddings = torch.split(all_embeddings, [self.data.user_num, self.data.item_num])
        return user_all_embeddings, item_all_embeddings

In [12]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]= "0"

In [13]:
model = 'SimGCL'
conf = ModelConf('./config.yaml')
rec = SELFRec(conf).execute()

Reading data and preprocessing...


  return torch.sparse.FloatTensor(i, v, coo.shape)


In [14]:
rec.execute()

Model: SimGCL
Training Set: /workspace/viettel-ai-challenge-track-ds/SELFRec/train.txt
Test Set: /workspace/viettel-ai-challenge-track-ds/SELFRec/test.txt
Embedding Dimension: 1024
Maximum Epoch: 50
Learning Rate: 0.001
Batch Size: 2048
Regularization Parameter: 0.0001
Specific parameters: eps:0.1  lambda:0.5  n_layer:3
Training Set Size: (user number: 36751, item number: 82673, interaction number: 375372)
Test Set Size: (user number: 9370, item number: 9256, interaction number: 14551)
Initializing and building model...
Training Model...
training: 1 batch 100 rec_loss: 0.6677468419075012 cl_loss 2.6896183490753174
Evaluating the model...
Progress: [++++++++++++++++++++++++++++++++++++++++++++++++++]100%
--------------------------------------------------------------------------------
Real-Time Ranking Performance (Top-20 Item Recommendation)
*Current Performance*
Epoch: 1, Hit Ratio: 0.26118, Precision: 0.01966, Recall: 0.32661, NDCG: 0.16001
*Best Performance*
Epoch: 1, Hit Ratio: 0.26

KeyboardInterrupt: 

In [15]:
rec.save()

In [27]:
import pickle
with open(f"../runs/simgcl_final_data/simgcl.pkl", "wb") as f:
    pickle.dump(rec, f)

In [17]:
import pickle
emb_user = dict([(k, v) for k, v in zip(rec.data.user.keys(), rec.best_user_emb.cpu().numpy())])
with open(f"../runs/simgcl_final_data/user_embedding.pkl", "wb") as f:
    pickle.dump(emb_user, f)

In [18]:
import pickle
emb_item = dict([(k, v) for k, v in zip(rec.data.item.keys(), rec.best_item_emb.cpu().numpy())])
with open(f"../runs/simgcl_final_data/item_embedding.pkl", "wb") as f:
    pickle.dump(emb_item, f)

# DirectAU

In [14]:
import yaml

# Define the data as a Python dictionary
data = {
    "training.set": "./train.txt",
    "test.set": "./test.txt",
    "model":{
      "name": "DirectAU",
      "type": "graph"
    },

    "item.ranking.topN": [10, 20],

    "embedding.size": 1024,
    "max.epoch": 50,
    "batch.size": 1024,
    "learning.rate": 0.001,
    "reg.lambda": 0.0001,

    "DirectAU": {
      "n_layers": 2,
      "gamma": 2,
    },

    "output": "./results/"
}

# Save this dictionary as a YAML file
yaml_file_path = "config.yaml"
with open(yaml_file_path, "w") as file:
    yaml.dump(data, file, default_flow_style=False)

# Load the YAML file to verify content
with open(yaml_file_path, "r") as file:
    conf = yaml.safe_load(file)

print("Loaded data from YAML file:", conf)

Loaded data from YAML file: {'DirectAU': {'gamma': 2, 'n_layers': 2}, 'batch.size': 1024, 'embedding.size': 1024, 'item.ranking.topN': [10, 20], 'learning.rate': 0.001, 'max.epoch': 50, 'model': {'name': 'DirectAU', 'type': 'graph'}, 'output': './results/', 'reg.lambda': 0.0001, 'test.set': './test.txt', 'training.set': './train.txt'}


In [15]:
from data.loader import FileIO


class SELFRec(object):
    def __init__(self, config):
        self.social_data = []
        self.feature_data = []
        self.config = config
        self.training_data = FileIO.load_data_set(config['training.set'], config['model']['type'])
        self.test_data = FileIO.load_data_set(config['test.set'], config['model']['type'])

        self.kwargs = {}
#         if config.contain('social.data'):
#             social_data = FileIO.load_social_data(self.config['social.data'])
#             self.kwargs['social.data'] = social_data
        # if config.contains('feature.data'):
        #     self.social_data = FileIO.loadFeature(config,self.config['feature.data'])
        print('Reading data and preprocessing...')

    def execute(self):
        # import the model module
#         import_str = f"from model.{self.config['model']['type']}.{self.config['model']['name']} import {self.config['model']['name']}"
#         exec(import_str)
        recommender = f"{self.config['model']['name']}(self.config,self.training_data,self.test_data,**self.kwargs)"
        return eval(recommender)

In [16]:
import torch
import torch.nn.functional as F
from base.graph_recommender import GraphRecommender
from util.sampler import next_batch_pairwise
from util.loss_torch import bpr_loss,l2_reg_loss
from model.graph.MF import Matrix_Factorization
from model.graph.LightGCN import LGCN_Encoder

class DirectAU(GraphRecommender):
    def __init__(self, conf, training_set, test_set):
        super(DirectAU, self).__init__(conf, training_set, test_set)
        args = self.config['DirectAU']
        self.gamma = float(args['gamma'])
        self.n_layers= int(args['n_layers'])
        self.model = LGCN_Encoder(self.data, self.emb_size,self.n_layers)

    def train(self):
        model = self.model.cuda()
        optimizer = torch.optim.Adam(model.parameters(), lr=self.lRate)
        for epoch in range(self.maxEpoch):
            for n, batch in enumerate(next_batch_pairwise(self.data, self.batch_size)):
                user_idx, pos_idx, neg_idx = batch
                rec_user_emb, rec_item_emb = model()
                user_emb, pos_item_emb = rec_user_emb[user_idx], rec_item_emb[pos_idx]
                batch_loss = self.calculate_loss(user_emb, pos_item_emb)+ l2_reg_loss(self.reg, user_emb,pos_item_emb)/self.batch_size
                # Backward and optimize
                optimizer.zero_grad()
                batch_loss.backward()
                optimizer.step()
                if n % 100==0 and n>0:
                    print('training:', epoch + 1, 'batch', n, 'batch_loss:', batch_loss.item())
            with torch.no_grad():
                self.user_emb, self.item_emb = self.model()
            self.fast_evaluation(epoch)
        self.user_emb, self.item_emb = self.best_user_emb, self.best_item_emb

    def alignment(self,x, y):
        x, y = F.normalize(x, dim=-1), F.normalize(y, dim=-1)
        return (x - y).norm(p=2, dim=1).pow(2).mean()

    def uniformity(self,x, t=2):
        x = F.normalize(x, dim=-1)
        return torch.pdist(x, p=2).pow(2).mul(-t).exp().mean().log()

    def calculate_loss(self,user_emb,item_emb):
        align = self.alignment(user_emb, item_emb)
        uniform = self.gamma * (self.uniformity(user_emb) + self.uniformity(item_emb)) / 2
        return align + uniform

    def save(self):
        with torch.no_grad():
            self.best_user_emb, self.best_item_emb = self.model.forward()

    def predict(self, u):
        with torch.no_grad():
            u = self.data.get_user_id(u)
            score = torch.matmul(self.user_emb[u], self.item_emb.transpose(0, 1))
            return score.cpu().numpy()

In [17]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]= "1"

In [18]:
model = 'DirectAU'
conf = ModelConf('./config.yaml')
rec = SELFRec(conf).execute()

Reading data and preprocessing...


In [None]:
rec.execute()

Model: DirectAU
Training Set: /workspace/viettel-ai-challenge-track-ds/SELFRec/train.txt
Test Set: /workspace/viettel-ai-challenge-track-ds/SELFRec/test.txt
Embedding Dimension: 1024
Maximum Epoch: 50
Learning Rate: 0.001
Batch Size: 1024
Regularization Parameter: 0.0001
Specific parameters: gamma:2  n_layers:2
Training Set Size: (user number: 36751, item number: 82673, interaction number: 375372)
Test Set Size: (user number: 9370, item number: 9256, interaction number: 14551)
Initializing and building model...
Training Model...
training: 1 batch 100 batch_loss: -7.018003940582275
training: 1 batch 200 batch_loss: -7.028926849365234
training: 1 batch 300 batch_loss: -7.029053688049316
Evaluating the model...
Progress: [++++++++++++++++++++++++++++++++++++++++++++++++++]100%
--------------------------------------------------------------------------------
Real-Time Ranking Performance (Top-20 Item Recommendation)
*Current Performance*
Epoch: 1, Hit Ratio: 0.25721, Precision: 0.01936, Rec

In [20]:
rec.save()

In [22]:
import pickle
with open(f"../runs/directau_private/model.pkl", "wb") as f:
    pickle.dump(rec, f)

In [23]:
import pickle
emb_user = dict([(k, v) for k, v in zip(rec.data.user.keys(), rec.best_user_emb.cpu().numpy())])
with open(f"../runs/directau_private/user_embedding.pkl", "wb") as f:
    pickle.dump(emb_user, f)

In [24]:
import pickle
emb_item = dict([(k, v) for k, v in zip(rec.data.item.keys(), rec.best_item_emb.cpu().numpy())])
with open(f"../runs/directau_private/item_embedding.pkl", "wb") as f:
    pickle.dump(emb_item, f)