In [3]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import MessagePassing
from torch.nn import Embedding
import json
import os

In [4]:
# Directory that holds the raw CSV exports.
DATA_PATH = "data"

# Read the user, item and review tables; the file names map positionally
# onto users_df, items_df and reviews_df.
users_df, items_df, reviews_df = (
    pd.read_csv(os.path.join(DATA_PATH, csv_name))
    for csv_name in ("users.csv", "items.csv", "reviews.csv")
)

# Quick sanity check of the table sizes.
print(
    f"users_df.shape = {users_df.shape}",
    f"items_df.shape = {items_df.shape}",
    f"reviews_df.shape = {reviews_df.shape}",
    sep="\n",
)

users_df.shape = (3527, 17)
items_df.shape = (17184, 75)
reviews_df.shape = (63053, 4)


In [5]:
# Map every distinct user profile URL to a contiguous index 0..num_users-1.
unique_users = users_df["profile_url"].unique()
user2idx = dict(zip(unique_users, range(len(unique_users))))
num_users = len(user2idx)

# Map every distinct place id to a contiguous index 0..num_items-1.
unique_items = items_df["detail_id"].unique()
item2idx = dict(zip(unique_items, range(len(unique_items))))
num_items = len(item2idx)

print(f"num_users = {num_users}", f"num_items = {num_items}", sep="\n")

# Endpoint lists of the user->item edges; they are filled from reviews_df
# when the graph is built.
edges_user, edges_item = [], []

num_users = 3527
num_items = 17184


In [6]:
# Device shared by the graph and the model.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Start from empty lists every time, so that re-running this cell does not
# append the same review edges a second time.
edges_user = []
edges_item = []

# Iterating the two columns directly avoids DataFrame.iterrows(), which
# builds a Series object for every row.
for user_url, item_id in zip(reviews_df["profile_url"], reviews_df["detail_id"]):
    u_local = user2idx.get(user_url)
    i_local = item2idx.get(item_id)
    # Skip reviews whose user or item is missing from the lookup tables.
    if u_local is None or i_local is None:
        continue
    edges_user.append(u_local)              # user node id in [0, num_users)
    edges_item.append(num_users + i_local)  # item node id in [num_users, num_users + num_items)

# user -> item direction
user_tensor = torch.tensor(edges_user, dtype=torch.long)
item_tensor = torch.tensor(edges_item, dtype=torch.long)

# Add the reversed (item -> user) copy so the graph is undirected.
edge_user_to_item = torch.stack([user_tensor, item_tensor], dim=0)
edge_item_to_user = torch.stack([item_tensor, user_tensor], dim=0)

# Shape: (2, 2 * E), where E is the number of kept reviews.
edge_index = torch.cat([edge_user_to_item, edge_item_to_user], dim=1)

data = Data(edge_index=edge_index)

# Total number of nodes = users followed by items.
data.num_nodes = num_users + num_items

# Keep the graph on the same device as the model; otherwise message passing
# fails with a device mismatch when CUDA is available.
data = data.to(device)

print("edge_index.shape =", edge_index.shape)
print("data.num_nodes =", data.num_nodes)

edge_index.shape = torch.Size([2, 125516])
data.num_nodes = 20711


In [7]:
class LightGCNLayer(MessagePassing):
    """Parameter-free propagation step: every node receives the mean of the
    embeddings sent by its neighbours (``aggr='mean'``).

    No weights, biases or non-linearities are applied.
    """

    def __init__(self):
        super().__init__(aggr="mean")

    def forward(self, x, edge_index):
        """Propagate node embeddings once along the edges.

        Args:
            x: node embeddings, expected shape (num_nodes, embed_dim).
            edge_index: edge list, expected shape (2, E).

        Returns:
            The aggregated embeddings produced by ``propagate``.
        """
        propagated = self.propagate(edge_index, x=x)
        return propagated

    def message(self, x_j):
        """Forward the neighbours' embeddings ``x_j`` unchanged."""
        return x_j

class LightGCN(torch.nn.Module):
    """LightGCN-style model over a joint user/item node set.

    Users and items share one embedding table: rows ``[0, num_users)`` are
    users and rows ``[num_users, num_users + num_items)`` are items.
    """

    def __init__(self, num_users, num_items, embed_dim=64, n_layers=3):
        """Create the embedding table and ``n_layers`` propagation layers.

        Args:
            num_users: number of user nodes.
            num_items: number of item nodes.
            embed_dim: embedding dimensionality.
            n_layers: number of ``LightGCNLayer`` propagation steps.
        """
        super().__init__()
        self.num_users, self.num_items = num_users, num_items
        self.embed_dim, self.n_layers = embed_dim, n_layers

        # One trainable embedding per node (users first, then items),
        # re-initialized from N(0, 0.1^2).
        total_nodes = num_users + num_items
        self.embedding = Embedding(total_nodes, embed_dim)
        torch.nn.init.normal_(self.embedding.weight, std=0.1)

        # Each layer only aggregates neighbour embeddings.
        self.gcn_layers = torch.nn.ModuleList()
        for _ in range(n_layers):
            self.gcn_layers.append(LightGCNLayer())

    def forward(self, edge_index):
        """Return the final node embeddings, shape (num_nodes, embed_dim).

        The result is the element-wise mean of the initial embeddings E^(0)
        and the output of every propagation layer E^(1) .. E^(n_layers).
        """
        layer_outputs = [self.embedding.weight]  # E^(0)
        for layer in self.gcn_layers:
            layer_outputs.append(layer(layer_outputs[-1], edge_index))

        # Average (not sum) over the n_layers + 1 representations.
        return torch.stack(layer_outputs, dim=0).mean(dim=0)

    def predict_interaction(self, user_emb, item_emb):
        """Score user/item pairs with a row-wise dot product."""
        return torch.sum(user_emb * item_emb, dim=1)

In [8]:
import random

def sample_pos_neg_batch(edges_user, edges_item, num_users, num_items, batch_size=1024, device=None):
    """Draw a random mini-batch of (user, positive item, negative item) triples.

    Edges are sampled uniformly with replacement. For each sampled edge one
    negative item is drawn uniformly from all items except that edge's
    positive item. Other items the same user interacted with are NOT
    excluded, so a "negative" may occasionally be another positive.

    Args:
        edges_user: user node id of every user->item edge.
        edges_item: global item node id (offset by ``num_users``) of every edge.
        num_users: number of user nodes; subtracted to obtain local item ids.
        num_items: number of items; negatives are drawn from [0, num_items).
        batch_size: number of triples to return.
        device: target device for the returned tensors. When omitted, the
            notebook-level ``device`` variable is used if it exists,
            otherwise torch's default device.

    Returns:
        Three ``torch.long`` tensors of shape (batch_size,): user ids, local
        positive item ids and local negative item ids.

    Raises:
        ValueError: if there are no edges, or fewer than two items (no
            negative different from the positive exists).
    """
    edges_user = np.asarray(edges_user)
    edges_item = np.asarray(edges_item)
    if len(edges_user) == 0:
        raise ValueError("cannot sample from an empty edge list")
    if num_items < 2:
        # The previous rejection loop would spin forever for num_items == 1.
        raise ValueError("need at least two items to sample a negative")
    if device is None:
        # Backward compatibility: callers used to rely on the global `device`.
        device = globals().get("device")

    picked = np.random.randint(low=0, high=len(edges_user), size=batch_size)
    users = edges_user[picked]
    pos = edges_item[picked] - num_users  # local index in [0, num_items)

    # Draw from the num_items - 1 slots that remain once `pos` is removed,
    # then shift every draw at or above `pos` by one. This is uniform over
    # all items != pos and needs no retry loop.
    neg = np.random.randint(low=0, high=num_items - 1, size=batch_size)
    neg = neg + (neg >= pos)

    return (
        torch.tensor(users, dtype=torch.long, device=device),
        torch.tensor(pos, dtype=torch.long, device=device),
        torch.tensor(neg, dtype=torch.long, device=device),
    )

# Seed every RNG used by the notebook (Python, NumPy, PyTorch) so that the
# embedding initialization and the batch sampling are reproducible.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

model = LightGCN(num_users, num_items, embed_dim=64, n_layers=3).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Number of user->item edges before the reversed copies were appended.
E = edge_index.shape[1] // 2
# NumPy views of the edge endpoints, used for fast random indexing when sampling.
edges_user_np = np.array(edges_user)
edges_item_np = np.array(edges_item)

def bpr_loss(user_emb, pos_emb, neg_emb):
    """Bayesian Personalized Ranking loss: mean of -log(sigmoid(s_pos - s_neg)).

    Args:
        user_emb: user embeddings, one row per sample.
        pos_emb: embeddings of the positive items, same shape as ``user_emb``.
        neg_emb: embeddings of the negative items, same shape as ``user_emb``.

    Returns:
        Scalar tensor with the mean loss over the batch.
    """
    pos_score = torch.sum(user_emb * pos_emb, dim=1)  # (batch,)
    neg_score = torch.sum(user_emb * neg_emb, dim=1)  # (batch,)
    diff = pos_score - neg_score
    # logsigmoid is the numerically stable form of log(sigmoid(x)): the naive
    # version underflows to log(0) = -inf when diff is strongly negative.
    return -F.logsigmoid(diff).mean()

In [10]:
num_epochs = 5
batch_size = 1024
samples_per_epoch = 500  # number of mini-batches per epoch

# The graph has to live on the same device as the model's embeddings;
# move it once here instead of on every step (no-op if already there).
train_edge_index = data.edge_index.to(device)

model.train()
for epoch in range(num_epochs):
    total_loss = 0.0
    for _ in range(samples_per_epoch):
        # 1) Sample (user, positive item, negative item) triples.
        u_batch, pos_batch, neg_batch = sample_pos_neg_batch(
            edges_user_np, edges_item_np,
            num_users, num_items,
            batch_size
        )
        optimizer.zero_grad()

        # 2) Propagate over the whole graph, then pick the rows we need.
        #    Item rows are offset by num_users in the joint embedding table.
        all_embs = model(train_edge_index)  # (num_nodes, embed_dim)
        user_emb = all_embs[u_batch]  # (batch_size, embed_dim)
        pos_emb = all_embs[num_users + pos_batch]  # (batch_size, embed_dim)
        neg_emb = all_embs[num_users + neg_batch]

        # 3) BPR loss and parameter update.
        loss_val = bpr_loss(user_emb, pos_emb, neg_emb)
        loss_val.backward()
        optimizer.step()

        total_loss += loss_val.item()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss={total_loss/samples_per_epoch:.4f}")

# Final embeddings for all nodes, computed once without gradient tracking.
model.eval()
with torch.no_grad():
    # Move the graph to the model's device (no-op if it is already there).
    final_embs = model(data.edge_index.to(device))  # (num_users+num_items, embed_dim)

test_profile_file = os.path.join(DATA_PATH, "test_profile_urls.txt")
with open(test_profile_file, "r", encoding="utf-8") as f:
    # Skip blank lines so an empty string never becomes a profile key.
    test_profiles = [line.strip() for line in f if line.strip()]

# Reverse lookup: local item index -> detail_id.
idx2item = {v: k for k, v in item2idx.items()}

recommendations_dict = {}
top_k = 5

# Item rows occupy [num_users, num_users + num_items) in the joint table.
# They are identical for every user, so slice them once outside the loop.
items_emb = final_embs[num_users:num_users + num_items]  # (num_items, embed_dim)

# NOTE(review): items a user has already reviewed are not excluded from the
# ranking below — confirm whether that is intended.
for p_url in test_profiles:
    if p_url not in user2idx:
        # Profile is absent from the user lookup: no embedding to score with.
        recommendations_dict[p_url] = []
        continue

    user_emb = final_embs[user2idx[p_url]].unsqueeze(0)  # (1, embed_dim)

    # Dot product of the user with every item => (num_items,)
    scores = torch.sum(user_emb * items_emb, dim=1)

    # Highest scores first; positions are local item indices into items_emb.
    top_indices = torch.argsort(scores, descending=True)[:top_k]

    recommendations_dict[p_url] = [
        int(idx2item[local_idx]) for local_idx in top_indices.tolist()
    ]

Epoch 1/5, Loss=0.0376
Epoch 2/5, Loss=0.0307
Epoch 3/5, Loss=0.0252
Epoch 4/5, Loss=0.0204
Epoch 5/5, Loss=0.0172


In [11]:
# Serialize the recommendations as pretty-printed JSON (non-ASCII characters
# kept as-is) and write them to disk.
serialized = json.dumps(recommendations_dict, ensure_ascii=False, indent=2)
with open("recommendations_lightgcn_1.json", "w", encoding="utf-8") as out_file:
    out_file.write(serialized)

print("Готово! Результат записан в recommendations_lightgcn_1.json.")

Готово! Результат записан в recommendations_lightgcn_1.json.
