In [None]:
# Enter the SELFRec checkout. Presumably needed so that the `base`, `data`,
# `util` and `model` packages imported in the next cell resolve — TODO confirm.
# The path is relative, so this cell is meant to run once per kernel session.
%cd SELFRec/

# Import libraries

In [None]:
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

from base.torch_interface import TorchGraphInterface
from base.graph_recommender import GraphRecommender
from data.loader import FileIO
from util.conf import ModelConf
from util.sampler import next_batch_pairwise
from util.loss_torch import bpr_loss, l2_reg_loss, InfoNCE

from model.graph.LightGCN import *
from model.graph.XSimGCL import *
from model.graph.DirectAU import *
from model.graph.SimGCL import *
from SELFRec import SELFRec

In [None]:
def save_config(out_path, config=None):
    """Write a configuration dictionary to ``<out_path>/config.yaml`` and echo it back.

    Args:
        out_path: Directory that receives ``config.yaml``. It is created
            (including missing parents) when it does not exist yet.
        config: Dictionary to serialise. When omitted, the notebook-level
            ``data`` dictionary is used, so existing ``save_config(path)``
            calls keep working unchanged.

    Returns:
        None. The content parsed back from the written file is printed.
    """
    # Imported locally so the helper does not depend on a later cell having
    # executed ``import yaml`` before the first call.
    import yaml

    if config is None:
        # Looked up at call time: each model section rebinds ``data`` before
        # calling this helper.
        config = data

    # Opening a file inside a missing directory raises FileNotFoundError.
    if out_path:
        os.makedirs(out_path, exist_ok=True)

    yaml_file_path = f"{out_path}/config.yaml"
    with open(yaml_file_path, "w") as file:
        yaml.dump(config, file, default_flow_style=False)

    # Read the file back to verify what was actually written.
    with open(yaml_file_path, "r") as file:
        conf = yaml.safe_load(file)

    print("Loaded data from YAML file:", conf)

In [None]:
# Root directory below which each model section derives its own `model_path`.
checkpoint_path = "../runs/"

# Preprocess Data

In [None]:
# Interaction log, plus the public test file. The test file is read with
# explicit column names: a user id followed by 1000 item-id columns.
df = pd.read_csv('../data/training_set.csv')
predict_df = pd.read_csv(
    '../data/public_testset.csv',
    names=['user_id'] + [f'item_id_{i}' for i in range(1,1001)],
)

test_user_id = predict_df['user_id'].values

# Every column except the leading user-id column holds item ids.
item_columns = predict_df.columns[1:]
item_in_test_df = pd.unique(predict_df[item_columns].values.ravel())

# Remove values that are actually user ids mixed into the test-set item columns.
user_list = df['UserId'].unique()
item_in_test_df = list(set(item_in_test_df) - set(user_list))

# Local hold-out set: a reproducible 5% sample of the interactions whose item
# does NOT occur among the public test-set items.
test_df = df.loc[~df['ItemId'].isin(item_in_test_df)].sample(frac=0.05, random_state=42)
# train_df = df
# Training set: every row not sampled above, truncated to the last 20 rows of
# each user (in the frame's existing row order).
train_df = df.drop(index=test_df.index).groupby('UserId').tail(20)

train_df.shape, test_df.shape

In [None]:
# Save train and test sets as .txt files without headers, separated by a space
train_df.to_csv("../data/train.txt", index=False, header=False, sep=" ")
test_df.to_csv("../data/test.txt", index=False, header=False, sep=" ")

In [None]:
# Display the interaction table loaded from ../data/training_set.csv.
df

# XSimGCL

In [None]:
# Per-model directory below the shared checkpoint root: this section reads
# config.yaml from it and writes model.pkl and the embedding pickles into it.
model = "XSimGCL"
model_path=f"{checkpoint_path}/{model}"
# Create the per-model directory itself; makedirs also creates the checkpoint
# root as a parent, so nothing that relied on the root existing is lost.
os.makedirs(model_path, exist_ok=True)

In [None]:
import yaml

# Configuration for the XSimGCL run. The data paths point at the split files
# written by the preprocessing section of this notebook.
data = {
    "training.set": "../data/train.txt",
    "test.set": "../data/test.txt",
    "model": {
        "name": "XSimGCL",
        "type": "graph"
    },
    "item.ranking.topN": [10, 20],
    "embedding.size": 1024,
    "max.epoch": 1,
    "batch.size": 2048,
    "learning.rate": 0.001,
    "reg.lambda": 0.0001,
    "XSimGCL": {
        "n_layer": 2,
        "l_star": 1,
        "lambda": 0.2,
        "eps": 0.2,
        "tau": 0.15
    },
    "output": model_path
}

# The run cell loads f'{model_path}/config.yaml', so the file must be written
# into the per-model directory rather than the shared checkpoint root.
os.makedirs(model_path, exist_ok=True)
save_config(model_path)

In [None]:
import pickle

# Load this model's config, build the SELFRec runner, execute it and call save().
conf = ModelConf(f'{model_path}/config.yaml')
rec = SELFRec(conf)
rec.execute()
rec.save()

# Pickle the whole runner object.
with open(f"{model_path}/model.pkl", "wb") as f:
    pickle.dump(rec, f)

# Pair each key of rec.data.user with one row of the best user embedding matrix.
# Assumes key order matches embedding row order — TODO confirm.
emb_user = dict(zip(rec.data.user.keys(), rec.best_user_emb.cpu().numpy()))
with open(f"{model_path}/user_embedding.pkl", "wb") as f:
    pickle.dump(emb_user, f)

# Same pairing for items.
emb_item = dict(zip(rec.data.item.keys(), rec.best_item_emb.cpu().numpy()))
with open(f"{model_path}/item_embedding.pkl", "wb") as f:
    pickle.dump(emb_item, f)

# LightGCN

In [None]:
# Per-model directory below the shared checkpoint root: this section reads
# config.yaml from it and writes model.pkl and the embedding pickles into it.
model = "LightGCN"
model_path=f"{checkpoint_path}/{model}"
# Create the per-model directory itself; makedirs also creates the checkpoint
# root as a parent.
os.makedirs(model_path, exist_ok=True)

In [None]:
import yaml

# Configuration for the LightGCN run. The data paths point at the split files
# written by the preprocessing section of this notebook.
data = {
    "training.set": "../data/train.txt",
    "test.set": "../data/test.txt",
    "model": {
        "name": "LightGCN",
        "type": "graph"
    },
    "item.ranking.topN": [10, 20],
    "embedding.size": 1024,
    "max.epoch": 100,
    "batch.size": 2048,
    "learning.rate": 0.001,
    "reg.lambda": 0.0001,
    "LightGCN": {
        "n_layer": 2
    },
    # Per-model output directory, consistent with the other model sections.
    "output": model_path
}

# The run cell loads f'{model_path}/config.yaml', so the file must be written
# into the per-model directory rather than the shared checkpoint root.
os.makedirs(model_path, exist_ok=True)
save_config(model_path)

In [None]:
import pickle

# Load this model's config, build the SELFRec runner, execute it and call save().
conf = ModelConf(f'{model_path}/config.yaml')
rec = SELFRec(conf)
rec.execute()
rec.save()

# Pickle the whole runner object.
with open(f"{model_path}/model.pkl", "wb") as f:
    pickle.dump(rec, f)

# Pair each key of rec.data.user with one row of the best user embedding matrix.
# Assumes key order matches embedding row order — TODO confirm.
emb_user = dict(zip(rec.data.user.keys(), rec.best_user_emb.cpu().numpy()))
with open(f"{model_path}/user_embedding.pkl", "wb") as f:
    pickle.dump(emb_user, f)

# Same pairing for items.
emb_item = dict(zip(rec.data.item.keys(), rec.best_item_emb.cpu().numpy()))
with open(f"{model_path}/item_embedding.pkl", "wb") as f:
    pickle.dump(emb_item, f)

In [None]:
import yaml

# Configuration for the SSL4Rec run. The data paths point at the split files
# this notebook writes in its preprocessing section, instead of absolute paths
# from another environment.
data = {
    "training.set": "../data/train.txt",
    "test.set": "../data/test.txt",
    "model":{
      "name": "SSL4Rec",
      "type": "graph"
    },

    "item.ranking.topN": [20],

    "embedding.size": 256,
    "max.epoch": 100,
    "batch.size": 2048,
    "learning.rate": 0.001,
    "reg.lambda": 0.0001,

    "SSL4Rec":{
      "tau": 0.07,
      "alpha": 0.1,
      "drop": 0.1
    },

    "output": "./results/"
}

# Save this dictionary as a YAML file in the current working directory.
yaml_file_path = "config.yaml"
with open(yaml_file_path, "w") as file:
    yaml.dump(data, file, default_flow_style=False)

# Load the YAML file again to verify its content.
with open(yaml_file_path, "r") as file:
    conf = yaml.safe_load(file)

print("Loaded data from YAML file:", conf)

In [None]:
from data.loader import FileIO


class SELFRec(object):
    """Notebook-local runner: loads the data sets and instantiates the configured model.

    NOTE(review): this definition rebinds the name ``SELFRec`` that the import
    cell brings in via ``from SELFRec import SELFRec``. Every cell executed
    after this one gets this class. It defines no ``save()``, and ``execute()``
    only *returns* the recommender instead of running it — confirm that cells
    calling ``rec.save()`` on a ``SELFRec(conf)`` object are meant to use the
    imported class.
    """

    def __init__(self, config):
        """Load the training and test sets named in ``config``.

        Args:
            config: Configuration object supporting item access; the keys
                ``'training.set'``, ``'test.set'`` and ``['model']['type']``
                are read here.
        """
        self.social_data = []
        self.feature_data = []
        self.config = config
        model_type = config['model']['type']
        self.training_data = FileIO.load_data_set(config['training.set'], model_type)
        self.test_data = FileIO.load_data_set(config['test.set'], model_type)

        # Extra keyword arguments forwarded to the model constructor. Loading
        # of optional 'social.data' / 'feature.data' is disabled in this
        # notebook version, so the dictionary stays empty.
        self.kwargs = {}
        print('Reading data and preprocessing...')

    def execute(self):
        """Instantiate the recommender class named by ``config['model']['name']``.

        The class is looked up by name in this module's global namespace
        instead of evaluating a string built from the configuration, so a
        config value can only select an existing global, never run code.

        Returns:
            The constructed recommender object. It is not run here; the caller
            invokes its own ``execute()``.

        Raises:
            NameError: If no global with the configured model name exists.
        """
        model_name = self.config['model']['name']
        try:
            model_cls = globals()[model_name]
        except KeyError:
            raise NameError(
                f"name '{model_name}' is not defined; import or define the model class before calling execute()"
            ) from None
        return model_cls(self.config, self.training_data, self.test_data, **self.kwargs)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from base.graph_recommender import GraphRecommender
from util.sampler import next_batch_pairwise
from util.loss_torch import l2_reg_loss, InfoNCE, batch_softmax_loss

# Paper: Self-supervised Learning for Large-scale Item Recommendations. CIKM'21

""" 
Note: This version of code conducts feature dropout on the item embeddings 
because items features are not always available in many academic datasets.
"""


class SSL4Rec(GraphRecommender):
    """Two-tower recommender trained with a batch-softmax loss plus an item-side contrastive loss.

    Reads ``alpha`` (weight of the contrastive loss), ``tau`` (temperature) and
    ``drop`` (dropout rate) from the ``'SSL4Rec'`` section of the configuration.
    """

    def __init__(self, conf, training_set, test_set):
        """Parse the SSL4Rec hyper-parameters and build the encoder.

        Args:
            conf: Configuration passed through to ``GraphRecommender``.
            training_set: Training data passed through to ``GraphRecommender``.
            test_set: Test data passed through to ``GraphRecommender``.
        """
        super(SSL4Rec, self).__init__(conf, training_set, test_set)
        args = self.config['SSL4Rec']
        self.cl_rate = float(args['alpha'])
        self.tau = float(args['tau'])
        self.drop_rate = float(args['drop'])
        self.model = DNN_Encoder(self.data, self.emb_size, self.drop_rate, self.tau)

    def _encode_all(self):
        """Encode every user and every item without tracking gradients."""
        with torch.no_grad():
            return self.model(list(range(self.data.user_num)), list(range(self.data.item_num)))

    def train(self):
        """Optimise the encoder for ``self.maxEpoch`` epochs, evaluating after each one."""
        # Use the GPU when one is present; fall back to the CPU instead of
        # failing on machines without CUDA.
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model = self.model.to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=self.lRate)
        for epoch in range(self.maxEpoch):
            # eval() is switched on at the end of every epoch, so training
            # mode has to be restored once per epoch.
            model.train()
            for n, batch in enumerate(next_batch_pairwise(self.data, self.batch_size)):
                # The sampled negatives are unused by this model's losses.
                query_idx, item_idx, _neg = batch
                query_emb, item_emb = model(query_idx, item_idx)
                rec_loss = batch_softmax_loss(query_emb, item_emb, self.tau)
                cl_loss = self.cl_rate * model.cal_cl_loss(item_idx)
                batch_loss = rec_loss + l2_reg_loss(self.reg, query_emb, item_emb) + cl_loss
                # Backward and optimize
                optimizer.zero_grad()
                batch_loss.backward()
                optimizer.step()
                if n % 100 == 0:
                    print('training:', epoch + 1, 'batch', n, 'rec_loss:', rec_loss.item(), 'cl_loss', cl_loss.item())
            model.eval()
            self.query_emb, self.item_emb = self._encode_all()
            self.fast_evaluation(epoch)
        # Assumes fast_evaluation() has triggered save() at least once so the
        # best_* attributes exist — TODO confirm against GraphRecommender.
        self.query_emb, self.item_emb = self.best_query_emb, self.best_item_emb

    def save(self):
        """Snapshot the current full user/item encodings as the best embeddings."""
        self.best_query_emb, self.best_item_emb = self._encode_all()

    def predict(self, u):
        """Return the scores of all items for user ``u`` as a NumPy array.

        Args:
            u: External user identifier; mapped to an internal index through
                ``self.data.get_user_id``.
        """
        u = self.data.get_user_id(u)
        score = torch.matmul(self.query_emb[u], self.item_emb.transpose(0, 1))
        return score.cpu().numpy()


class DNN_Encoder(nn.Module):
    """Two-tower encoder over learnable user and item embedding tables.

    Each tower maps an ``emb_size``-dimensional embedding through
    Linear(emb_size, 1024) -> ReLU -> Linear(1024, 128) -> Tanh.
    """

    def __init__(self, data, emb_size, drop_rate, temperature):
        """Create both towers, the dropout layer and the embedding tables.

        Args:
            data: Object exposing ``user_num`` and ``item_num``.
            emb_size: Width of the initial user/item embeddings.
            drop_rate: Dropout probability used to create the two item views.
            temperature: Temperature handed to ``InfoNCE`` in ``cal_cl_loss``.
        """
        super().__init__()
        self.data = data
        self.emb_size = emb_size
        self.tau = temperature
        # Construction order (user tower, item tower, user table, item table)
        # is kept so that seeded initialisation stays reproducible.
        self.user_tower = self._make_tower(emb_size)
        self.item_tower = self._make_tower(emb_size)
        self.dropout = nn.Dropout(drop_rate)
        self.initial_user_emb = nn.Parameter(
            nn.init.xavier_uniform_(torch.empty(data.user_num, emb_size))
        )
        self.initial_item_emb = nn.Parameter(
            nn.init.xavier_uniform_(torch.empty(data.item_num, emb_size))
        )

    @staticmethod
    def _make_tower(in_features):
        """Build one tower: Linear -> in-place ReLU -> Linear -> Tanh."""
        return nn.Sequential(
            nn.Linear(in_features, 1024),
            nn.ReLU(inplace=True),
            nn.Linear(1024, 128),
            nn.Tanh(),
        )

    def forward(self, q, x):
        """Encode the users indexed by ``q`` and the items indexed by ``x``.

        Returns:
            Tuple ``(user_encodings, item_encodings)``.
        """
        user_vec = self.user_tower(self.initial_user_emb[q])
        item_vec = self.item_tower(self.initial_item_emb[x])
        return user_vec, item_vec

    def item_encoding(self, x):
        """Return two item-tower encodings of items ``x``, each from its own dropout pass."""
        raw = self.initial_item_emb[x]
        # Both dropout passes run before either tower is applied.
        first_input, second_input = self.dropout(raw), self.dropout(raw)
        return self.item_tower(first_input), self.item_tower(second_input)

    def cal_cl_loss(self, idx):
        """InfoNCE loss between the two dropout views of items ``idx``."""
        first_view, second_view = self.item_encoding(idx)
        return InfoNCE(first_view, second_view, self.tau)

In [None]:
model = 'SSL4Rec'
# Load the file the SSL4Rec config cell writes: "config.yaml" in the current
# working directory (not an absolute path from another environment).
conf = ModelConf('config.yaml')
# With the notebook-defined SELFRec class, execute() returns the recommender
# instance instead of running it.
rec = SELFRec(conf).execute()

In [None]:
# Run the recommender instance built in the previous cell. Its execute() is not
# defined in this notebook — presumably it trains and evaluates; TODO confirm.
rec.execute()

# SimGCL

In [None]:
# Per-model directory below the shared checkpoint root: this section reads
# config.yaml from it and writes model.pkl and the embedding pickles into it.
model = "SimGCL"
model_path=f"{checkpoint_path}/{model}"
# Create the per-model directory itself; makedirs also creates the checkpoint
# root as a parent.
os.makedirs(model_path, exist_ok=True)

In [None]:
import yaml

# Configuration for the SimGCL run. The data paths point at the split files
# this notebook writes to ../data/ in its preprocessing section.
data = {
    "training.set": "../data/train.txt",
    "test.set": "../data/test.txt",
    "model":{
      "name": "SimGCL",
      "type": "graph"
    },

    "item.ranking.topN": [10, 20],

    "embedding.size": 1024,
    "max.epoch": 50,
    "batch.size": 2048,
    "learning.rate": 0.001,
    "reg.lambda": 0.0001,

    "SimGCL": {
      "n_layer": 3,
      "lambda": 0.5,
      "eps": 0.1
    },

    "output": model_path
}

# The run cell loads f'{model_path}/config.yaml', so the file must be written
# into the per-model directory rather than the shared checkpoint root.
os.makedirs(model_path, exist_ok=True)
save_config(model_path)

In [None]:
import pickle

# Load this model's config, build the SELFRec runner, execute it and call save().
# NOTE(review): if the cell defining a notebook-local `class SELFRec` ran
# earlier, `SELFRec` here is that class, which defines no save() — confirm
# which definition is intended.
conf = ModelConf(f'{model_path}/config.yaml')
rec = SELFRec(conf)
rec.execute()
rec.save()

# Pickle the whole runner object.
with open(f"{model_path}/model.pkl", "wb") as f:
    pickle.dump(rec, f)

# Pair each key of rec.data.user with one row of the best user embedding matrix.
# Assumes key order matches embedding row order — TODO confirm.
emb_user = dict(zip(rec.data.user.keys(), rec.best_user_emb.cpu().numpy()))
with open(f"{model_path}/user_embedding.pkl", "wb") as f:
    pickle.dump(emb_user, f)

# Same pairing for items.
emb_item = dict(zip(rec.data.item.keys(), rec.best_item_emb.cpu().numpy()))
with open(f"{model_path}/item_embedding.pkl", "wb") as f:
    pickle.dump(emb_item, f)

# DirectAU

In [None]:
# Per-model directory below the shared checkpoint root: this section reads
# config.yaml from it and writes model.pkl and the embedding pickles into it.
model = "DirectAU"
model_path=f"{checkpoint_path}/{model}"
# Create the per-model directory itself; makedirs also creates the checkpoint
# root as a parent.
os.makedirs(model_path, exist_ok=True)

In [None]:
import yaml

# Configuration for the DirectAU run. The data paths point at the split files
# this notebook writes to ../data/ in its preprocessing section.
data = {
    "training.set": "../data/train.txt",
    "test.set": "../data/test.txt",
    "model":{
      "name": "DirectAU",
      "type": "graph"
    },

    "item.ranking.topN": [10, 20],

    "embedding.size": 1024,
    "max.epoch": 50,
    "batch.size": 1024,
    "learning.rate": 0.001,
    "reg.lambda": 0.0001,

    "DirectAU": {
      "n_layers": 2,
      "gamma": 2,
    },

    "output": model_path
}

# The run cell loads f'{model_path}/config.yaml', so the file must be written
# into the per-model directory rather than the shared checkpoint root.
os.makedirs(model_path, exist_ok=True)
save_config(model_path)

In [None]:
import pickle

# Load this model's config, build the SELFRec runner, execute it and call save().
# NOTE(review): if the cell defining a notebook-local `class SELFRec` ran
# earlier, `SELFRec` here is that class, which defines no save() — confirm
# which definition is intended.
conf = ModelConf(f'{model_path}/config.yaml')
rec = SELFRec(conf)
rec.execute()
rec.save()

# Pickle the whole runner object.
with open(f"{model_path}/model.pkl", "wb") as f:
    pickle.dump(rec, f)

# Pair each key of rec.data.user with one row of the best user embedding matrix.
# Assumes key order matches embedding row order — TODO confirm.
emb_user = dict(zip(rec.data.user.keys(), rec.best_user_emb.cpu().numpy()))
with open(f"{model_path}/user_embedding.pkl", "wb") as f:
    pickle.dump(emb_user, f)

# Same pairing for items.
emb_item = dict(zip(rec.data.item.keys(), rec.best_item_emb.cpu().numpy()))
with open(f"{model_path}/item_embedding.pkl", "wb") as f:
    pickle.dump(emb_item, f)