In [1]:
# changing core directory
import os, sys
dir2 = os.path.abspath('')
dir1 = os.path.dirname(dir2)
if not dir1 in sys.path:
    sys.path.append(dir1)
os.chdir('..')

In [2]:
import zipfile
import os
from collections import defaultdict
import random

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import ndcg_score
import lightgbm as lgb
import requests
from tqdm import tqdm

from src.data import load_data, download_movielens1m, ValSASRecDataset, TrainSASRecDataset

# CONFIG

In [3]:
seed = 0
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [4]:
def fix_seed(seed: int) -> None:
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    
fix_seed(seed)


# Подгрузка данных

In [5]:
# # Скачивание и распаковка датасета
# def download_movielens1m():
#     url = "https://files.grouplens.org/datasets/movielens/ml-1m.zip"
#     if not os.path.exists("ml-1m"):
#         print("Downloading MovieLens 1M dataset...")
#         response = requests.get(url, stream=True)
#         with open("ml-1m.zip", "wb") as f:
#             for chunk in response.iter_content(chunk_size=1024):
#                 if chunk:
#                     f.write(chunk)

#         with zipfile.ZipFile("ml-1m.zip", 'r') as zip_ref:
#             zip_ref.extractall(".")
#         print("Dataset downloaded and extracted.")
#     else:
#         print("Dataset already exists.")

# # Загрузка данных
# def load_data():
#     ratings = pd.read_csv('ml-1m/ratings.dat', sep='::', engine='python',
#                          names=['user_id', 'item_id', 'rating', 'timestamp'])
#     ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')
#     return ratings

In [6]:
# Скачиваем и загружаем данные
download_movielens1m()
ratings = load_data()

ratings = ratings[ratings["rating"] > 3.5]

# Подготовка данных
num_users = ratings['user_id'].nunique()
num_items = ratings['item_id'].nunique()

user2id = {val:i for i, val in enumerate(ratings['user_id'].unique())}
item2id = {val:i+1 for i, val in enumerate(ratings['item_id'].unique())}

ratings['user_id'] = ratings['user_id'].map(user2id)
ratings['item_id'] = ratings['item_id'].map(item2id)

Dataset already exists.


In [None]:
# Создаем последовательности
sequences = []
times = []
for user_id, group in ratings.groupby('user_id'):
    group = group.sort_values('timestamp')
    user_seq = group['item_id'].tolist()
    user_times = group['timestamp'].astype(int).tolist()
    sequences.append(user_seq)
    times.append(user_times)

# Разделяем на train/val
split_idx = int(0.8 * len(sequences))
train_sequences = sequences[:split_idx]
val_sequences = sequences[split_idx:]

train_times = times[:split_idx]
val_times = times[split_idx:]

all_users = ratings['user_id'].unique()
train_users, val_users = all_users[:split_idx], all_users[split_idx:]
print(len(train_users), len(val_users))



4830 1208


# Обучение нейронки

In [8]:
# Создание датасета для SASRec

# class TrainSASRecDataset(Dataset):
#     def __init__(self, sequences, times, max_len=50):
#         self.sequences = sequences
#         self.times = times
#         self.max_len = max_len

#     def __len__(self):
#         return len(self.sequences)

#     def __getitem__(self, idx):
#         seq = self.sequences[idx]
#         time = self.times[idx]
#         if len(seq) > self.max_len:
#             seq = seq[-self.max_len:]
#             time = time[-self.max_len:]
#         else:
#             seq = [0] * (self.max_len - len(seq)) + seq
#             time = [0] * (self.max_len - len(time)) + time
#         input_seq = torch.tensor(seq[:-1], dtype=torch.long)
#         input_times = torch.tensor(time[:-1], dtype=torch.long)
#         target = torch.tensor(seq[1:], dtype=torch.long)
#         return (input_seq, input_times), target


# class ValSASRecDataset(Dataset):
#     def __init__(self, sequences, times, max_len=50):
#         self.sequences = sequences
#         self.times = times
#         self.max_len = max_len

#     def __len__(self):
#         return len(self.sequences)

#     def __getitem__(self, idx):
#         seq = self.sequences[idx]
#         time = self.times[idx]
#         if len(seq) > self.max_len:
#             seq = seq[-self.max_len:]
#             time = time[-self.max_len:]
#         else:
#             seq = [0] * (self.max_len - len(seq)) + seq
#             time = [0] * (self.max_len - len(time)) + time
#         input_seq = torch.tensor(seq[:-1], dtype=torch.long)
#         input_times = torch.tensor(time[:-1], dtype=torch.long)
#         target = torch.tensor(seq[-1], dtype=torch.long)

#         return (input_seq, input_times), target


max_len = 200
train_dataset = TrainSASRecDataset(train_sequences, train_times, max_len)
val_dataset = ValSASRecDataset(val_sequences, val_times, max_len)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False)

In [9]:
# Модель SASRec на PyTorch
class SASRec(nn.Module):
    def __init__(self,
        num_items,
        hidden_units=64,
        num_heads=2,
        num_blocks=2,
        dropout_rate=0.2,
        max_len=200,
        ext_flag=False
    ):
        super(SASRec, self).__init__()
        self.num_items = num_items
        self.hidden_units = hidden_units
        self.max_len = max_len
        self.ext_flag = ext_flag

        self.item_emb = nn.Embedding(num_items + 1, hidden_units, padding_idx=0)
        self.pos_emb = nn.Embedding(max_len, hidden_units)
        self.dropout = nn.Dropout(dropout_rate)

        self.encoder_layers = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=hidden_units,
                nhead=num_heads,
                dim_feedforward=hidden_units,
                dropout=dropout_rate,
                batch_first=True
            ) for _ in range(num_blocks)
        ])

        self.layer_norm = nn.LayerNorm(hidden_units)
        self.output_layer = nn.Linear(hidden_units, num_items + 1)

    def forward(self, input_seqs, timestamps=None):
        batch_size, seq_len = input_seqs.size()

        # Position encoding
        positions = torch.arange(seq_len, dtype=torch.long, device=input_seqs.device)
        positions = positions.unsqueeze(0).expand(batch_size, seq_len)

        # Item and position embedding
        item_emb = self.item_emb(input_seqs)
        pos_emb = self.pos_emb(positions)
        x = item_emb + pos_emb
        x = self.dropout(x)

        # Transformer encoder
        mask = self.generate_square_subsequent_mask(seq_len).to(input_seqs.device)
        for layer in self.encoder_layers:
            x = layer(x, mask)

        x = self.layer_norm(x)

        if self.ext_flag:
            ext_context = self.get_external_features(timestamps).to(x.device)
            extended_data = torch.cat([x, ext_context], dim=2)
            output = self.ext_head(extended_data)
        else:
            output = self.output_layer(x)

        return output

    def generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def get_embeddings(self, input_seqs):
        batch_size, seq_len = input_seqs.size()

        positions = torch.arange(seq_len, dtype=torch.long, device=input_seqs.device)
        positions = positions.unsqueeze(0).expand(batch_size, seq_len)

        item_emb = self.item_emb(input_seqs)
        pos_emb = self.pos_emb(positions)
        x = item_emb + pos_emb

        mask = self.generate_square_subsequent_mask(seq_len).to(input_seqs.device)
        for layer in self.encoder_layers:
            x = layer(x, mask)

        x = self.layer_norm(x)
        return x

    def freeze(self):
        for param in self.parameters():
            param.requires_grad = False
        print("Все слои сети заморожены.")

    def add_external_features(self, ext_features):
        self.ext_flag = True
        self.freeze()
        self.ext_head = nn.Linear(self.hidden_units*2, num_items + 1)
        self.time_list = ext_features[0]
        self.ext_embeddings = ext_features[1]

    def get_external_features(self, timestamps):
        bs, seq_len = timestamps.shape
        timestamps = timestamps.reshape(-1).cpu().detach().numpy()

        ext_ids = np.searchsorted(self.time_list, timestamps, side='right') - 1
        ext_context = self.ext_embeddings[ext_ids]
        ext_context = torch.tensor(ext_context, dtype=torch.float32).reshape(bs, seq_len, -1)
        return ext_context

    # ----------------------------------------
    # def add_external_features(self, ext_features):
    #     self.ext_flag = True
    #     self.freeze()
    #     self.ext_head = nn.Linear(self.hidden_units*2, num_items + 1)
    #     self.ext_features = ext_features
    #     self.ext_features['timestamp'] = self.ext_features['timestamp'].astype(int)

    # def get_external_features(self, timestamps):
    #     bs, seq_len = timestamps.shape
    #     timestamps = timestamps.reshape(-1)
    #     ext_context = []
    #     for time in timestamps:
    #         ext_context.append(self.get_one_ext_represent(time))

    #     ext_context = torch.tensor(ext_context, dtype=torch.float32)
    #     ext_context = ext_context.reshape(bs, seq_len, -1)
    #     return ext_context

    # def get_one_ext_represent(self, time):
    #     context_info = self.ext_features[self.ext_features['timestamp'] <= time.cpu().detach().numpy()]
    #     if len(context_info) > 0:
    #         context_info = context_info.iloc[-1]
    #         mean_context = context_info['mean_embedding']
    #         return mean_context
    #     else:
    #         return np.zeros(self.hidden_units)


In [10]:
def calculate_ndcg(model, dataloader, device, k=10):
    model.eval()

    # Вместо накопления всех предсказаний, будем вычислять NDCG по батчам
    ndcg_scores = []

    with torch.no_grad():
        for (input_seqs, input_times), targets in dataloader:
            input_seqs, input_times, targets = input_seqs.to(device), input_times.to(device), targets.to(device)
            outputs = model(input_seqs, input_times)
            last_outputs = outputs[:, -1, :]

            # Преобразуем в numpy
            predictions = last_outputs.cpu().numpy()
            targets_np = targets.cpu().numpy()

            # Создаем матрицу релевантности только для текущего батча
            n_items = predictions.shape[1]
            relevance = np.zeros((len(targets_np), n_items))
            relevance[np.arange(len(targets_np)), targets_np] = 1

            # Вычисляем NDCG для текущего батча
            try:
                batch_ndcg = ndcg_score(relevance, predictions, k=k)
                ndcg_scores.append(batch_ndcg)
            except MemoryError:
                # Если батч слишком большой, разбиваем его на подбатчи
                sub_batch_size = 100
                for i in range(0, len(targets_np), sub_batch_size):
                    end_idx = min(i + sub_batch_size, len(targets_np))
                    sub_relevance = relevance[i:end_idx]
                    sub_predictions = predictions[i:end_idx]
                    sub_ndcg = ndcg_score(sub_relevance, sub_predictions, k=k)
                    ndcg_scores.append(sub_ndcg)

    # Возвращаем среднее значение NDCG по всем батчам
    return np.mean(ndcg_scores)

# Обучение модели SASRec
def train_sasrec(model, train_loader, val_loader, optimizer, criterion, device, epochs=10):
    model.to(device)
    best_ndcg = 0

    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for batch_idx, ((input_seqs, input_times), targets) in enumerate(tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")):
            input_seqs, input_times, targets = input_seqs.to(device), input_times.to(device), targets.to(device)

            optimizer.zero_grad()
            outputs = model(input_seqs, input_times)

            # Получаем предсказания для последнего элемента в последовательности
            # last_outputs = outputs[:, -1, :]
            #loss = criterion(last_outputs, targets)
            outputs, targets = outputs.reshape(-1, num_items+1), targets.reshape(-1)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # Валидация
        val_ndcg = calculate_ndcg(model, val_loader, device)
        print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}, Val NDCG@{10}: {val_ndcg:.4f}")

        # Сохраняем лучшую модель
        if val_ndcg > best_ndcg:
            best_ndcg = val_ndcg
            torch.save(model.state_dict(), "best_sasrec_model.pth")

    return model

In [11]:
# Инициализируем модель
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SASRec(num_items, max_len=max_len).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# # Обучаем модель
model = train_sasrec(model, train_loader, val_loader, optimizer, criterion, device, epochs=70)

Epoch 1/70: 100%|██████████| 38/38 [00:10<00:00,  3.64it/s]


Epoch 1, Loss: 5.3168, Val NDCG@10: 0.0066


Epoch 2/70: 100%|██████████| 38/38 [00:10<00:00,  3.50it/s]


Epoch 2, Loss: 3.6244, Val NDCG@10: 0.0076


Epoch 3/70: 100%|██████████| 38/38 [00:13<00:00,  2.76it/s]


Epoch 3, Loss: 3.4732, Val NDCG@10: 0.0070


Epoch 4/70: 100%|██████████| 38/38 [00:11<00:00,  3.45it/s]


Epoch 4, Loss: 3.1314, Val NDCG@10: 0.0095


Epoch 5/70: 100%|██████████| 38/38 [00:13<00:00,  2.71it/s]


Epoch 5, Loss: 3.0233, Val NDCG@10: 0.0120


Epoch 6/70: 100%|██████████| 38/38 [00:16<00:00,  2.24it/s]


Epoch 6, Loss: 2.9959, Val NDCG@10: 0.0131


Epoch 7/70: 100%|██████████| 38/38 [00:09<00:00,  4.19it/s]


Epoch 7, Loss: 2.9791, Val NDCG@10: 0.0124


Epoch 8/70: 100%|██████████| 38/38 [00:14<00:00,  2.68it/s]


Epoch 8, Loss: 2.9688, Val NDCG@10: 0.0105


Epoch 9/70: 100%|██████████| 38/38 [00:13<00:00,  2.76it/s]


Epoch 9, Loss: 2.9562, Val NDCG@10: 0.0110


Epoch 10/70: 100%|██████████| 38/38 [00:12<00:00,  3.06it/s]


Epoch 10, Loss: 2.9502, Val NDCG@10: 0.0112


Epoch 11/70: 100%|██████████| 38/38 [00:12<00:00,  3.09it/s]


Epoch 11, Loss: 2.9421, Val NDCG@10: 0.0105


Epoch 12/70: 100%|██████████| 38/38 [00:06<00:00,  5.76it/s]


Epoch 12, Loss: 2.9353, Val NDCG@10: 0.0130


Epoch 13/70: 100%|██████████| 38/38 [00:11<00:00,  3.26it/s]


Epoch 13, Loss: 2.9302, Val NDCG@10: 0.0131


Epoch 14/70: 100%|██████████| 38/38 [00:14<00:00,  2.54it/s]


Epoch 14, Loss: 2.9271, Val NDCG@10: 0.0121


Epoch 15/70: 100%|██████████| 38/38 [00:13<00:00,  2.90it/s]


Epoch 15, Loss: 2.9223, Val NDCG@10: 0.0141


Epoch 16/70: 100%|██████████| 38/38 [00:12<00:00,  3.12it/s]


Epoch 16, Loss: 2.9182, Val NDCG@10: 0.0128


Epoch 17/70: 100%|██████████| 38/38 [00:12<00:00,  3.00it/s]


Epoch 17, Loss: 2.9158, Val NDCG@10: 0.0132


Epoch 18/70: 100%|██████████| 38/38 [00:16<00:00,  2.29it/s]


Epoch 18, Loss: 2.9088, Val NDCG@10: 0.0146


Epoch 19/70: 100%|██████████| 38/38 [00:14<00:00,  2.71it/s]


Epoch 19, Loss: 2.9034, Val NDCG@10: 0.0163


Epoch 20/70: 100%|██████████| 38/38 [00:13<00:00,  2.84it/s]


Epoch 20, Loss: 2.8967, Val NDCG@10: 0.0156


Epoch 21/70: 100%|██████████| 38/38 [00:07<00:00,  4.82it/s]


Epoch 21, Loss: 2.8831, Val NDCG@10: 0.0180


Epoch 22/70: 100%|██████████| 38/38 [00:16<00:00,  2.29it/s]


Epoch 22, Loss: 2.8715, Val NDCG@10: 0.0219


Epoch 23/70: 100%|██████████| 38/38 [00:10<00:00,  3.70it/s]


Epoch 23, Loss: 2.8528, Val NDCG@10: 0.0249


Epoch 24/70: 100%|██████████| 38/38 [00:11<00:00,  3.34it/s]


Epoch 24, Loss: 2.8341, Val NDCG@10: 0.0250


Epoch 25/70: 100%|██████████| 38/38 [00:13<00:00,  2.82it/s]


Epoch 25, Loss: 2.8173, Val NDCG@10: 0.0258


Epoch 26/70: 100%|██████████| 38/38 [00:08<00:00,  4.54it/s]


Epoch 26, Loss: 2.7908, Val NDCG@10: 0.0277


Epoch 27/70: 100%|██████████| 38/38 [00:13<00:00,  2.92it/s]


Epoch 27, Loss: 2.7704, Val NDCG@10: 0.0317


Epoch 28/70: 100%|██████████| 38/38 [00:15<00:00,  2.40it/s]


Epoch 28, Loss: 2.7538, Val NDCG@10: 0.0323


Epoch 29/70: 100%|██████████| 38/38 [00:09<00:00,  4.11it/s]


Epoch 29, Loss: 2.7327, Val NDCG@10: 0.0336


Epoch 30/70: 100%|██████████| 38/38 [00:17<00:00,  2.23it/s]


Epoch 30, Loss: 2.7196, Val NDCG@10: 0.0353


Epoch 31/70: 100%|██████████| 38/38 [00:15<00:00,  2.41it/s]


Epoch 31, Loss: 2.7061, Val NDCG@10: 0.0380


Epoch 32/70: 100%|██████████| 38/38 [00:15<00:00,  2.43it/s]


Epoch 32, Loss: 2.6877, Val NDCG@10: 0.0409


Epoch 33/70: 100%|██████████| 38/38 [00:13<00:00,  2.77it/s]


Epoch 33, Loss: 2.6747, Val NDCG@10: 0.0396


Epoch 34/70: 100%|██████████| 38/38 [00:11<00:00,  3.42it/s]


Epoch 34, Loss: 2.6574, Val NDCG@10: 0.0419


Epoch 35/70: 100%|██████████| 38/38 [00:11<00:00,  3.42it/s]


Epoch 35, Loss: 2.6473, Val NDCG@10: 0.0404


Epoch 36/70: 100%|██████████| 38/38 [00:19<00:00,  1.97it/s]


Epoch 36, Loss: 2.6302, Val NDCG@10: 0.0423


Epoch 37/70: 100%|██████████| 38/38 [00:14<00:00,  2.66it/s]


Epoch 37, Loss: 2.6192, Val NDCG@10: 0.0436


Epoch 38/70: 100%|██████████| 38/38 [00:13<00:00,  2.74it/s]


Epoch 38, Loss: 2.6050, Val NDCG@10: 0.0439


Epoch 39/70: 100%|██████████| 38/38 [00:19<00:00,  1.97it/s]


Epoch 39, Loss: 2.5920, Val NDCG@10: 0.0461


Epoch 40/70: 100%|██████████| 38/38 [00:09<00:00,  3.85it/s]


Epoch 40, Loss: 2.5782, Val NDCG@10: 0.0460


Epoch 41/70: 100%|██████████| 38/38 [00:16<00:00,  2.30it/s]


Epoch 41, Loss: 2.5678, Val NDCG@10: 0.0487


Epoch 42/70: 100%|██████████| 38/38 [00:18<00:00,  2.06it/s]


Epoch 42, Loss: 2.5580, Val NDCG@10: 0.0495


Epoch 43/70: 100%|██████████| 38/38 [00:16<00:00,  2.26it/s]


Epoch 43, Loss: 2.5451, Val NDCG@10: 0.0513


Epoch 44/70: 100%|██████████| 38/38 [00:07<00:00,  5.13it/s]


Epoch 44, Loss: 2.5347, Val NDCG@10: 0.0522


Epoch 45/70: 100%|██████████| 38/38 [00:12<00:00,  3.09it/s]


Epoch 45, Loss: 2.5252, Val NDCG@10: 0.0544


Epoch 46/70: 100%|██████████| 38/38 [00:14<00:00,  2.57it/s]


Epoch 46, Loss: 2.5153, Val NDCG@10: 0.0551


Epoch 47/70: 100%|██████████| 38/38 [00:08<00:00,  4.51it/s]


Epoch 47, Loss: 2.5049, Val NDCG@10: 0.0544


Epoch 48/70: 100%|██████████| 38/38 [00:16<00:00,  2.25it/s]


Epoch 48, Loss: 2.4983, Val NDCG@10: 0.0561


Epoch 49/70: 100%|██████████| 38/38 [00:19<00:00,  1.93it/s]


Epoch 49, Loss: 2.4902, Val NDCG@10: 0.0567


Epoch 50/70: 100%|██████████| 38/38 [00:16<00:00,  2.35it/s]


Epoch 50, Loss: 2.4813, Val NDCG@10: 0.0551


Epoch 51/70: 100%|██████████| 38/38 [00:15<00:00,  2.38it/s]


Epoch 51, Loss: 2.4742, Val NDCG@10: 0.0573


Epoch 52/70: 100%|██████████| 38/38 [00:10<00:00,  3.49it/s]


Epoch 52, Loss: 2.4689, Val NDCG@10: 0.0595


Epoch 53/70: 100%|██████████| 38/38 [00:14<00:00,  2.66it/s]


Epoch 53, Loss: 2.4615, Val NDCG@10: 0.0610


Epoch 54/70: 100%|██████████| 38/38 [00:11<00:00,  3.24it/s]


Epoch 54, Loss: 2.4574, Val NDCG@10: 0.0607


Epoch 55/70: 100%|██████████| 38/38 [00:12<00:00,  2.95it/s]


Epoch 55, Loss: 2.4512, Val NDCG@10: 0.0620


Epoch 56/70: 100%|██████████| 38/38 [00:12<00:00,  3.15it/s]


Epoch 56, Loss: 2.4449, Val NDCG@10: 0.0614


Epoch 57/70: 100%|██████████| 38/38 [00:15<00:00,  2.44it/s]


Epoch 57, Loss: 2.4374, Val NDCG@10: 0.0627


Epoch 58/70: 100%|██████████| 38/38 [00:18<00:00,  2.05it/s]


Epoch 58, Loss: 2.4340, Val NDCG@10: 0.0642


Epoch 59/70: 100%|██████████| 38/38 [00:01<00:00, 31.12it/s]


Epoch 59, Loss: 2.4297, Val NDCG@10: 0.0624


Epoch 60/70: 100%|██████████| 38/38 [00:14<00:00,  2.59it/s]


Epoch 60, Loss: 2.4210, Val NDCG@10: 0.0634


Epoch 61/70: 100%|██████████| 38/38 [00:11<00:00,  3.39it/s]


Epoch 61, Loss: 2.4179, Val NDCG@10: 0.0620


Epoch 62/70: 100%|██████████| 38/38 [00:15<00:00,  2.41it/s]


Epoch 62, Loss: 2.4151, Val NDCG@10: 0.0626


Epoch 63/70: 100%|██████████| 38/38 [00:12<00:00,  2.99it/s]


Epoch 63, Loss: 2.4092, Val NDCG@10: 0.0649


Epoch 64/70: 100%|██████████| 38/38 [00:12<00:00,  3.09it/s]


Epoch 64, Loss: 2.4029, Val NDCG@10: 0.0656


Epoch 65/70: 100%|██████████| 38/38 [00:08<00:00,  4.70it/s]


Epoch 65, Loss: 2.4014, Val NDCG@10: 0.0648


Epoch 66/70: 100%|██████████| 38/38 [00:07<00:00,  5.14it/s]


Epoch 66, Loss: 2.3941, Val NDCG@10: 0.0626


Epoch 67/70: 100%|██████████| 38/38 [00:14<00:00,  2.60it/s]


Epoch 67, Loss: 2.3910, Val NDCG@10: 0.0643


Epoch 68/70: 100%|██████████| 38/38 [00:15<00:00,  2.44it/s]


Epoch 68, Loss: 2.3892, Val NDCG@10: 0.0653


Epoch 69/70: 100%|██████████| 38/38 [00:19<00:00,  1.95it/s]


Epoch 69, Loss: 2.3853, Val NDCG@10: 0.0661


Epoch 70/70: 100%|██████████| 38/38 [00:19<00:00,  1.93it/s]


Epoch 70, Loss: 2.3791, Val NDCG@10: 0.0676


In [12]:
# Загружаем лучшую модель
model.load_state_dict(torch.load("best_sasrec_model.pth"))

<All keys matched successfully>

# Аггрегации

In [13]:
# Загружаем лучшую модель
model.load_state_dict(torch.load("best_sasrec_model.pth"))

<All keys matched successfully>

In [14]:
# n_ext_users = 100 # Выбираем 100 случайных пользователей

# user_subset = np.random.choice(train_users, n_ext_users, replace=False)
# subset_data = ratings[ratings['user_id'].isin(user_subset)].sort_values(['user_id', 'timestamp'])

# # Создаем словарь для хранения эмбеддингов пользователей по времени
# user_embeddings = defaultdict(list)

# # Получаем эмбеддинги для каждого пользователя в каждый момент времени
# model.eval()
# with torch.no_grad():
#     for user_id in tqdm(user_subset, desc="Processing users"):
#         user_data = subset_data[subset_data['user_id'] == user_id]
#         user_items = user_data['item_id'].tolist()

#         for i in range(1, len(user_items) + 1):
#             seq = user_items[:i]
#             if len(seq) > model.max_len:
#                 seq = seq[-model.max_len:]
#             else:
#                 seq = [0] * (model.max_len - len(seq)) + seq

#             input_seq = torch.tensor([seq], dtype=torch.long).to(device)
#             embeddings = model.get_embeddings(input_seq)

#             # Берем последний эмбеддинг (для последнего элемента в последовательности)
#             last_embedding = embeddings[0, -1, :].cpu().numpy()
#             timestamp = user_data.iloc[i-1]['timestamp']

#             user_embeddings[user_id].append((timestamp, last_embedding))

In [15]:
n_ext_users = 500 # Выбираем 100 случайных пользователей

user_subset = np.random.choice(train_users, n_ext_users, replace=False)
subset_data = ratings[ratings['user_id'].isin(user_subset)].sort_values(['user_id', 'timestamp'])

# Создаем словарь для хранения эмбеддингов пользователей по времени
user_embeddings = defaultdict(list)

# Получаем эмбеддинги для каждого пользователя в каждый момент времени
model.eval()
with torch.no_grad():
    for user_id in tqdm(user_subset, desc="Processing users"):
        user_data = subset_data[subset_data['user_id'] == user_id]
        user_items = user_data['item_id'].tolist()
        timestamps = user_data['timestamp'].astype(int).tolist()

        if len(user_items) <= model.max_len:
            seq = [0] * (model.max_len - len(user_items)) + user_items
            input_seq = torch.tensor([seq], dtype=torch.long).to(device)
            embeddings = model.get_embeddings(input_seq).cpu().numpy()[0, :, :]

        else:
            for i in range(0, len(user_items) + 1, model.max_len):
                seq = user_items[i:model.max_len + i]
                input_seq = torch.tensor([seq], dtype=torch.long).to(device)
                batch_emb = model.get_embeddings(input_seq).cpu().numpy()[0, :, :]
                if i > 0:
                    embeddings = np.concatenate([embeddings, batch_emb], axis=0)
                else:
                    embeddings = batch_emb

        for time, embed in zip(timestamps, embeddings):
                user_embeddings[user_id].append((time, embed))


Processing users:   0%|          | 0/500 [00:00<?, ?it/s]

Processing users: 100%|██████████| 500/500 [02:05<00:00,  3.97it/s]


In [16]:
# Агрегируем эмбеддинги по времени
def aggregate_embeddings(embeddings_list, method='mean'):
    if not embeddings_list:
        return np.zeros(model.hidden_units)

    embeddings = np.array([emb[1] for emb in embeddings_list])
    if method == 'mean':
        return np.mean(embeddings, axis=0)
    elif method == 'max':
        return np.max(embeddings, axis=0)
    else:
        raise ValueError(f"Unknown aggregation method: {method}")

# Создаем таблицу с агрегированными эмбеддингами по времени
all_timestamps = sorted(set([ts for user_embs in user_embeddings.values() for ts, _ in user_embs]))


aggregated_embeddings = []

for timestamp in tqdm(all_timestamps, desc="Aggregating embeddings"):
    # Собираем последние эмбеддинги для каждого пользователя на данный момент времени
    current_embeddings = []
    for user_id, embeddings_list in user_embeddings.items():
        # Находим последний эмбеддинг пользователя до данного времени
        user_embs_before = [(ts, emb) for ts, emb in embeddings_list if ts <= timestamp]
        if user_embs_before:
            # Берем последний эмбеддинг
            last_emb = user_embs_before[-1][1]
            current_embeddings.append(last_emb)

    # Агрегируем эмбеддинги всех пользователей
    if current_embeddings:
        mean_agg = aggregate_embeddings([(timestamp, emb) for emb in current_embeddings], 'mean')
        max_agg = aggregate_embeddings([(timestamp, emb) for emb in current_embeddings], 'max')

        aggregated_embeddings.append({
            'timestamp': timestamp,
            'mean_embedding': mean_agg,
            'max_embedding': max_agg
        })

# Создаем DataFrame с агрегированными эмбеддингами
agg_emb_df = pd.DataFrame(aggregated_embeddings)

Aggregating embeddings: 100%|██████████| 27283/27283 [01:17<00:00, 352.68it/s]


In [17]:
agg_emb_df.head()

Unnamed: 0,timestamp,mean_embedding,max_embedding
0,962905362000000000,"[-1.1905193, -1.9180659, -1.3795913, -1.266162...","[-1.1905193, -1.9180659, -1.3795913, -1.266162..."
1,962905391000000000,"[-1.133453, -2.39427, -1.4291996, -0.7620805, ...","[-1.133453, -2.39427, -1.4291996, -0.7620805, ..."
2,962905432000000000,"[-1.5036978, -2.384849, -1.2538649, -0.8103050...","[-1.5036978, -2.384849, -1.2538649, -0.8103050..."
3,962905482000000000,"[-1.3773258, -2.294885, -1.415181, -1.0748416,...","[-1.3773258, -2.294885, -1.415181, -1.0748416,..."
4,962905586000000000,"[-1.4487029, -2.2257712, -1.1753342, -1.394096...","[-1.4487029, -2.2257712, -1.1753342, -1.394096..."


In [18]:
agg_emb_df = agg_emb_df.sort_values('timestamp')
time_list = agg_emb_df['timestamp'].values
ext_embeddings = agg_emb_df['mean_embedding'].values
ext_embeddings = np.stack(ext_embeddings)

external_features = time_list, ext_embeddings

In [19]:
model.add_external_features(external_features)

Все слои сети заморожены.


In [20]:
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

model = train_sasrec(model, train_loader, val_loader, optimizer, criterion, device, epochs=20)

Epoch 1/20: 100%|██████████| 38/38 [00:11<00:00,  3.27it/s]


Epoch 1, Loss: 5.1625, Val NDCG@10: 0.0139


Epoch 2/20: 100%|██████████| 38/38 [00:10<00:00,  3.73it/s]


Epoch 2, Loss: 2.9256, Val NDCG@10: 0.0315


Epoch 3/20: 100%|██████████| 38/38 [00:07<00:00,  5.37it/s]


Epoch 3, Loss: 2.7627, Val NDCG@10: 0.0439


Epoch 4/20: 100%|██████████| 38/38 [00:13<00:00,  2.74it/s]


Epoch 4, Loss: 2.6704, Val NDCG@10: 0.0512


Epoch 5/20: 100%|██████████| 38/38 [00:12<00:00,  2.95it/s]


Epoch 5, Loss: 2.6123, Val NDCG@10: 0.0556


Epoch 6/20: 100%|██████████| 38/38 [00:21<00:00,  1.80it/s]


Epoch 6, Loss: 2.5680, Val NDCG@10: 0.0610


Epoch 7/20: 100%|██████████| 38/38 [00:18<00:00,  2.06it/s]


Epoch 7, Loss: 2.5357, Val NDCG@10: 0.0641


Epoch 8/20: 100%|██████████| 38/38 [00:15<00:00,  2.39it/s]


Epoch 8, Loss: 2.5169, Val NDCG@10: 0.0642


Epoch 9/20: 100%|██████████| 38/38 [00:08<00:00,  4.27it/s]


Epoch 9, Loss: 2.4989, Val NDCG@10: 0.0662


Epoch 10/20: 100%|██████████| 38/38 [00:16<00:00,  2.31it/s]


Epoch 10, Loss: 2.4822, Val NDCG@10: 0.0654


Epoch 11/20: 100%|██████████| 38/38 [00:15<00:00,  2.50it/s]


Epoch 11, Loss: 2.4702, Val NDCG@10: 0.0669


Epoch 12/20: 100%|██████████| 38/38 [00:12<00:00,  3.00it/s]


Epoch 12, Loss: 2.4628, Val NDCG@10: 0.0656


Epoch 13/20: 100%|██████████| 38/38 [00:10<00:00,  3.74it/s]


Epoch 13, Loss: 2.4538, Val NDCG@10: 0.0654


Epoch 14/20: 100%|██████████| 38/38 [00:11<00:00,  3.17it/s]


Epoch 14, Loss: 2.4450, Val NDCG@10: 0.0673


Epoch 15/20: 100%|██████████| 38/38 [00:11<00:00,  3.23it/s]


Epoch 15, Loss: 2.4376, Val NDCG@10: 0.0646


Epoch 16/20: 100%|██████████| 38/38 [00:06<00:00,  6.24it/s]


Epoch 16, Loss: 2.4300, Val NDCG@10: 0.0661


Epoch 17/20: 100%|██████████| 38/38 [00:11<00:00,  3.44it/s]


Epoch 17, Loss: 2.4302, Val NDCG@10: 0.0674


Epoch 18/20: 100%|██████████| 38/38 [00:13<00:00,  2.82it/s]


Epoch 18, Loss: 2.4238, Val NDCG@10: 0.0640


Epoch 19/20: 100%|██████████| 38/38 [00:20<00:00,  1.82it/s]


Epoch 19, Loss: 2.4204, Val NDCG@10: 0.0659


Epoch 20/20: 100%|██████████| 38/38 [00:16<00:00,  2.24it/s]


Epoch 20, Loss: 2.4162, Val NDCG@10: 0.0660


In [21]:
def check_unfrozen_params(model):
    unfrozen_params = []
    for name, param in model.named_parameters():
        if param.requires_grad:
            unfrozen_params.append(name)
    return unfrozen_params

print("Размороженные параметры:", check_unfrozen_params(model))

Размороженные параметры: ['ext_head.weight', 'ext_head.bias']


# Boosting

In [22]:
# Добавляем расчет NDCG для бустинга
def calculate_ndcg_for_boosting(model, X, y, user_ids, k=10):
    """
    Вычисляет NDCG@k для модели бустинга, группируя предсказания по пользователям
    """
    # Получаем предсказания модели
    predictions = model.predict(X)

    # Создаем DataFrame с user_id, предсказаниями и истинными значениями
    results_df = pd.DataFrame({
        'user_id': user_ids,
        'prediction': predictions,
        'true_rating': y
    })

    # Группируем по пользователям
    ndcg_scores = []
    for user_id, group in results_df.groupby('user_id'):
        if len(group) < 2:
            continue  # Пропускаем пользователей с менее чем 2 взаимодействиями

        # Сортируем предсказания и истинные значения
        pred_sorted = group.sort_values('prediction', ascending=False)['true_rating'].values
        true_sorted = group.sort_values('true_rating', ascending=False)['true_rating'].values

        # Вычисляем NDCG
        ndcg = ndcg_score([true_sorted], [pred_sorted], k=k)
        ndcg_scores.append(ndcg)

    return np.mean(ndcg_scores) if ndcg_scores else 0

In [23]:
boost_data = []
boost_labels = []
boost_user_ids = []  # Для хранения ID пользователей
boost_timestamps = []  # Для хранения временных меток

add_external = False

with torch.no_grad():
    for user_id in tqdm(ratings["user_id"].unique(), desc="Preparing boosting data"):
        user_data = subset_data[subset_data['user_id'] == user_id]
        user_items = user_data['item_id'].tolist()
        user_ratings = user_data['rating'].tolist()
        user_timestamps = user_data['timestamp'].tolist()

        for i in range(1, len(user_items)):
            # Получаем эмбеддинг пользователя
            seq = user_items[:i]
            if len(seq) > model.max_len:
                seq = seq[-model.max_len:]
            else:
                seq = [0] * (model.max_len - len(seq)) + seq

            input_seq = torch.tensor([seq], dtype=torch.long).to(device)
            user_embedding = model.get_embeddings(input_seq)[0, -1, :].cpu().numpy()

            # Получаем эмбеддинг айтема
            item_id = user_items[i]
            item_embedding = model.item_emb(torch.tensor([item_id], dtype=torch.long).to(device))[0].cpu().numpy()

            # Получаем агрегированный эмбеддинг контекста
            timestamp = user_data.iloc[i-1]['timestamp']
            meta_features = np.array([user_id, item_id, timestamp])
            n_meta_features = len(meta_features)

            if add_external:
                context_info = agg_emb_df[agg_emb_df['timestamp'] <= timestamp].iloc[-1]
                mean_context = context_info['mean_embedding']
                max_context = context_info['max_embedding']

                # Объединяем все эмбеддинги
                combined_features = np.concatenate([meta_features, user_embedding, mean_context, max_context, item_embedding])
            else:
                combined_features = np.concatenate([meta_features, user_embedding, item_embedding]) #mean_context, max_context, ])

            boost_data.append(combined_features)
            boost_labels.append(user_ratings[i])
            boost_user_ids.append(user_id)
            boost_timestamps.append(user_timestamps[i])

# Преобразуем в numpy массивы
X = np.array(boost_data)
y = np.array(boost_labels)
user_ids = np.array(boost_user_ids)
train_users_boost = np.array([data[0] in train_users for data in X])

# Разделяем на train/test
# split_idx = int(0.8 * len(X))
X_train, X_test = X[train_users_boost, n_meta_features:], X[~train_users_boost, n_meta_features:]
y_train, y_test = y[train_users_boost], y[~train_users_boost]
user_ids_train, user_ids_test = user_ids[train_users_boost], user_ids[~train_users_boost]

# Обучаем модель бустинга
print("Training LightGBM model...")
lgb_model = lgb.LGBMRegressor(n_estimators=100, learning_rate=0.05)
lgb_model.fit(X_train, y_train)

# Оцениваем модель с помощью RMSE
train_pred = lgb_model.predict(X_train)
test_pred = lgb_model.predict(X_test)

train_rmse = np.sqrt(np.mean((train_pred - y_train) ** 2))
test_rmse = np.sqrt(np.mean((test_pred - y_test) ** 2))

print(f"LightGBM Train RMSE: {train_rmse:.4f}")
print(f"LightGBM Test RMSE: {test_rmse:.4f}")

# Оцениваем модель с помощью NDCG
train_ndcg = calculate_ndcg_for_boosting(lgb_model, X_train, y_train, user_ids_train, k=10)
test_ndcg = calculate_ndcg_for_boosting(lgb_model, X_test, y_test, user_ids_test, k=10)

print(f"LightGBM Train NDCG@10: {train_ndcg:.4f}")
print(f"LightGBM Test NDCG@10: {test_ndcg:.4f}")

Preparing boosting data:   0%|          | 0/6038 [00:00<?, ?it/s]

Preparing boosting data: 100%|██████████| 6038/6038 [1:52:45<00:00,  1.12s/it]  


Training LightGBM model...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006820 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 32621
[LightGBM] [Info] Number of data points in the train set: 47430, number of used features: 128
[LightGBM] [Info] Start training from score 4.391777




ValueError: Found array with 0 sample(s) (shape=(0, 128)) while a minimum of 1 is required by LGBMRegressor.