# Исследование моделей рекомендаций

## Подготовка данных для обучения и тестов (global time split)

In [1]:
import pandas as pd


class Constants:
    USER_ID = "user_id"
    ITEM_ID = "item_id"
    TIMESTAMP = "time"

    TRANSACTIONS_PATH = "/Users/alfa/Documents/diplom/graphnn-recommendation-system/data/transactions_train.csv"
    CUSTOMERS_PATH = "/Users/alfa/Documents/diplom/graphnn-recommendation-system/data/customers.csv"
    ARTICLES_PATH = "/Users/alfa/Documents/diplom/graphnn-recommendation-system/data/articles.csv"

    RESULT_TRANSACTIONS_PATH = "/Users/alfa/Documents/diplom/graphnn-recommendation-system/data/processed_transactions_train.csv"
    RESULT_CUSTOMERS_PATH = "/Users/alfa/Documents/diplom/graphnn-recommendation-system/data/processed_customers_train.csv"
    RESULT_ARTICLES_PATH = "/Users/alfa/Documents/diplom/graphnn-recommendation-system/data/processed_articles_train.csv"


In [2]:
def prepare_balanced_dataset(
    transactions: pd.DataFrame,
    articles: pd.DataFrame,
    num_articles_per_type: int = 50,
    num_customers: int = 500,
    min_articles_per_user: int = 5,
    min_users_per_article: int = 5,
) -> tuple:
    """
    Формирует сбалансированный датасет:
    - одинаковое количество артикулов на каждый product_type_name (если возможно)
    - одинаковое количество взаимодействий с каждым product_type_name
    - отбирает пользователей с минимум min_articles_per_user покупками
    """

    # Считаем популярность товаров
    article_popularity = transactions['item_id'].value_counts()
    popular_articles = article_popularity[article_popularity >= min_users_per_article].index

    # Фильтруем articles по популярным
    filtered_articles = articles[articles['item_id'].isin(popular_articles)]

    # Оставляем только те product_type_name, где достаточно товаров
    type_counts = filtered_articles['product_type_name'].value_counts()
    eligible_types = type_counts[type_counts >= num_articles_per_type].index

    print(f"Всего типов с достаточным числом товаров: {len(eligible_types)}")

    # Формируем финальный список артикулов по каждому типу
    selected_items = []
    for pt in eligible_types:
        items_in_type = filtered_articles[filtered_articles['product_type_name'] == pt]['item_id'].sample(
            num_articles_per_type, random_state=42
        ).tolist()
        selected_items.extend(items_in_type)

    # Фильтруем транзакции по этим артиклам
    filtered_data = transactions[transactions['item_id'].isin(selected_items)]

    # Считаем покупки по пользователям
    user_purchase_counts = filtered_data['user_id'].value_counts()

    # Отбираем пользователей с минимум min_articles_per_user покупками
    eligible_users = user_purchase_counts[user_purchase_counts >= min_articles_per_user].index

    # Берём случайные num_customers из eligible_users
    selected_users = pd.Series(eligible_users).sample(
        min(num_customers, len(eligible_users)), random_state=42
    ).tolist()

    # Финальный датасет
    final_data = filtered_data[filtered_data['user_id'].isin(selected_users)].reset_index(drop=True)

    print(f"Всего отобрано product_type_name: {len(eligible_types)}")
    print(f"Отобрано товаров: {len(selected_items)}")
    print(f"Отобрано пользователей: {len(selected_users)}")
    print(f"Финальный размер данных: {len(final_data)} транзакций")

    return final_data, selected_users, selected_items, eligible_types.tolist()


In [3]:
transactions = pd.read_csv(Constants.TRANSACTIONS_PATH)
transactions = transactions.rename(
    columns={
        "t_dat": Constants.TIMESTAMP,
        "customer_id": Constants.USER_ID,
        "article_id": Constants.ITEM_ID,
    }
)
transactions[Constants.USER_ID] = transactions[Constants.USER_ID].astype(str)
transactions[Constants.ITEM_ID] = transactions[Constants.ITEM_ID].astype(str)

customers = pd.read_csv(Constants.CUSTOMERS_PATH)
customers = (
    customers.rename(
        columns={
            "customer_id": Constants.USER_ID,
            "Active": "is_active",
            "age": "age",
        }
    )
    .drop(columns=["FN", "postal_code"])
)
customers[Constants.USER_ID] = customers[Constants.USER_ID].astype(str)

articles = pd.read_csv(Constants.ARTICLES_PATH)
articles = (
    articles.rename(
        columns={"article_id": Constants.ITEM_ID}
    )
    [[
        Constants.ITEM_ID,
        "prod_name",
        "product_type_name",
        "product_group_name",
        "colour_group_name",
        "detail_desc",
    ]]
)
articles[Constants.ITEM_ID] = articles[Constants.ITEM_ID].astype(str)

In [4]:
filtered_transactions, selected_users, selected_items, eligible_types = prepare_balanced_dataset(
    transactions=transactions,
    articles=articles,
    num_articles_per_type=10,  # Увеличиваем для большего разнообразия
    num_customers=500,       # Целевое количество пользователей
    min_articles_per_user=3,   # Минимум 5 покупок на пользователя
    min_users_per_article=10   # Минимум 10 пользователей на товар
)

filtered_customers = customers[customers[Constants.USER_ID].isin(selected_users)]
filtered_customers = filtered_customers.reset_index(drop=True)
filtered_articles = articles[articles[Constants.ITEM_ID].isin(selected_items)]
filtered_articles = filtered_articles.reset_index(drop=True)

Всего типов с достаточным числом товаров: 90
Всего отобрано product_type_name: 90
Отобрано товаров: 900
Отобрано пользователей: 500
Финальный размер данных: 1897 транзакций


In [5]:
filtered_articles.head()

Unnamed: 0,item_id,prod_name,product_type_name,product_group_name,colour_group_name,detail_desc
0,130035001,Black Umbrella,Umbrella,Items,Black,Umbrella with a telescopic handle and matching...
1,156224002,Box 4p Socks,Unknown,Unknown,Black,Semi-matte socks with a short shaft. 20 denier.
2,174057038,FLEECE PYJAMA,Pyjama jumpsuit/playsuit,Nightwear,Light Pink,"All-in-one pyjamas in soft, patterned fleece t..."
3,187949028,Padded pyjama,Pyjama jumpsuit/playsuit,Nightwear,Dark Blue,Lightly padded all-in-one pyjamas in soft cott...
4,188183015,Spanx alot Swimsuit,Swimsuit,Swimwear,Dark Green,Fully lined shaping swimsuit that has a sculpt...


In [6]:
filtered_transactions.head()

Unnamed: 0,time,user_id,item_id,price,sales_channel_id
0,2018-09-21,c492583641b998d7bb4362933c02064d619eeceff3b8a4...,665095002,0.025407,1
1,2018-09-23,c8c7e97d83e298c32742a9f584a90f270561566c52d849...,640807002,0.06778,2
2,2018-09-24,1f56cb4d9cd7468412c6dac613ba84da67adbaa981247e...,293433001,0.016932,2
3,2018-09-24,1f56cb4d9cd7468412c6dac613ba84da67adbaa981247e...,293433001,0.016932,2
4,2018-09-24,1f56cb4d9cd7468412c6dac613ba84da67adbaa981247e...,293433001,0.016932,2


In [7]:
def enhance_dataset(transactions: pd.DataFrame, articles: pd.DataFrame) -> pd.DataFrame:
    """Дополнительная обработка датасета для улучшения рекомендаций"""
    
    # Добавляем информацию о товарах
    enriched_data = transactions.merge(
        articles[[Constants.ITEM_ID, 'product_type_name', 'product_group_name']],
        on=Constants.ITEM_ID,
        how='left'
    )
    
    # Преобразуем временные метки
    enriched_data[Constants.TIMESTAMP] = pd.to_datetime(enriched_data[Constants.TIMESTAMP])
    enriched_data['days_since_epoch'] = (enriched_data[Constants.TIMESTAMP] - pd.Timestamp('2018-01-01')).dt.days
    
    # Добавляем сезонность
    enriched_data['month'] = enriched_data[Constants.TIMESTAMP].dt.month
    enriched_data['season'] = enriched_data['month'] % 12 // 3 + 1
    
    # Удаляем дубликаты (одинаковые покупки одного товара одним пользователем в один день)
    enriched_data = enriched_data.drop_duplicates(
        subset=[Constants.USER_ID, Constants.ITEM_ID, Constants.TIMESTAMP]
    )
    
    return enriched_data

enhanced_transactions = enhance_dataset(filtered_transactions, filtered_articles)

In [8]:
def time_based_split(data: pd.DataFrame, 
                    user_col: str = Constants.USER_ID,
                    time_col: str = Constants.TIMESTAMP,
                    test_ratio: float = 0.2) -> tuple:
    """Глобальное временное разделение с сохранением последовательности для каждого пользователя"""
    
    # Сортируем по пользователю и времени
    data = data.sort_values([user_col, time_col])
    
    # Вычисляем точку разделения для каждого пользователя
    def split_user_group(df):
        split_idx = int(len(df) * (1 - test_ratio))
        train = df.iloc[:split_idx]
        test = df.iloc[split_idx:]
        return train, test
    
    # Группируем и разделяем
    grouped = data.groupby(user_col, group_keys=False)
    train_data = grouped.apply(lambda x: split_user_group(x)[0])
    test_data = grouped.apply(lambda x: split_user_group(x)[1])
    
    return train_data, test_data

train_data, test_data = time_based_split(enhanced_transactions)

In [9]:
train_data.shape, test_data.shape

((1018, 10), (518, 10))

In [10]:
import numpy as np
import pandas as pd
from sklearn.metrics import ndcg_score

class RecommendationMetricsDf:
    def __init__(self, recommendations_df, history_df, USER_ID, ITEM_ID):
        """
        recommendations_df: pd.DataFrame с колонками [USER_ID, ITEM_ID, 'score']
        history_df: pd.DataFrame с колонками [USER_ID, ITEM_ID]
        """
        self.recommendations_df = recommendations_df
        self.history_df = history_df
        self.USER_ID = USER_ID
        self.ITEM_ID = ITEM_ID

    def precision_at_k(self, k=10):
        precisions = []

        for user_id in self.recommendations_df[self.USER_ID].unique():
            recs = self.recommendations_df[self.recommendations_df[self.USER_ID] == user_id].sort_values('score', ascending=False).head(k)
            actual = set(self.history_df[self.history_df[self.USER_ID] == user_id][self.ITEM_ID])
            recommended = set(recs[self.ITEM_ID])

            if len(recommended) > 0:
                precision = len(actual & recommended) / len(recommended)
            else:
                precision = 0.0

            precisions.append(precision)

        return np.mean(precisions)

    def recall_at_k(self, k=10):
        recalls = []

        for user_id in self.recommendations_df[self.USER_ID].unique():
            recs = self.recommendations_df[self.recommendations_df[self.USER_ID] == user_id].sort_values('score', ascending=False).head(k)
            actual = set(self.history_df[self.history_df[self.USER_ID] == user_id][self.ITEM_ID])
            recommended = set(recs[self.ITEM_ID])

            if len(actual) > 0:
                recall = len(actual & recommended) / len(actual)
            else:
                recall = 0.0

            recalls.append(recall)

        return np.mean(recalls)

    def map_at_k(self, k=10):
        aps = []

        for user_id in self.recommendations_df[self.USER_ID].unique():
            recs = self.recommendations_df[self.recommendations_df[self.USER_ID] == user_id].sort_values('score', ascending=False).head(k)
            actual = set(self.history_df[self.history_df[self.USER_ID] == user_id][self.ITEM_ID])
            recommended = list(recs[self.ITEM_ID])

            if not actual:
                aps.append(0.0)
                continue

            precision_sum = 0.0
            num_hits = 0

            for i, item in enumerate(recommended, 1):
                if item in actual:
                    num_hits += 1
                    precision_sum += num_hits / i

            aps.append(precision_sum / min(len(actual), k))

        return np.mean(aps)

    def ndcg_at_k(self, k=10):
        ndcgs = []

        for user_id in self.recommendations_df[self.USER_ID].unique():
            recs = self.recommendations_df[self.recommendations_df[self.USER_ID] == user_id].sort_values('score', ascending=False).head(k)
            actual_items = set(self.history_df[self.history_df[self.USER_ID] == user_id][self.ITEM_ID])
            recommended_items = list(recs[self.ITEM_ID])

            if not actual_items:
                ndcgs.append(0.0)
                continue

            true_relevance = np.array([1 if item in actual_items else 0 for item in recommended_items])
            pred_scores = np.array(recs['score'])

            if true_relevance.sum() == 0:
                ndcgs.append(0.0)
            else:
                ndcgs.append(ndcg_score([true_relevance], [pred_scores]))

        return np.mean(ndcgs)

    def coverage(self, total_items):
        recommended_items = set(self.recommendations_df[self.ITEM_ID])
        return len(recommended_items) / total_items


# ALS

In [11]:
USER_ID = "user_id"
ITEM_ID = "item_id"

In [12]:
import pandas as pd
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight
from sklearn.preprocessing import LabelEncoder
import scipy.sparse as sparse
import numpy as np

class ALSRecommender:
    def __init__(self, factors=100, iterations=15, regularization=0.01, alpha=40):
        self.factors = factors
        self.iterations = iterations
        self.regularization = regularization
        self.alpha = alpha  # Коэффициент доверия для неявных отзывов
        self.model = None
        self.user_encoder = LabelEncoder()
        self.item_encoder = LabelEncoder()
    
    def prepare_data(self, transactions, USER_ID, ITEM_ID):
        """Подготовка данных в формат user-item матрицы"""
        user_ids = self.user_encoder.fit_transform(transactions[USER_ID])
        item_ids = self.item_encoder.fit_transform(transactions[ITEM_ID])
        
        # Создаем CSR матрицу сразу
        user_item_matrix = sparse.csr_matrix(
            (np.ones(len(transactions)), (user_ids, item_ids)),
            shape=(len(self.user_encoder.classes_), len(self.item_encoder.classes_))
        )
        
        # Взвешивание BM25 (вернет CSR матрицу)
        weighted_matrix = bm25_weight(user_item_matrix, K1=100, B=0.8)
        
        # Убедимся, что матрица в CSR формате
        return weighted_matrix.tocsr() if not sparse.isspmatrix_csr(weighted_matrix) else weighted_matrix

    def train(self, train_data, test_data, USER_ID, ITEM_ID):
        """Обучение модели на предварительно разделенных данных"""
        # Обучаем кодировщики на полном наборе данных
        all_data = pd.concat([train_data, test_data])
        self.user_encoder.fit(all_data[USER_ID])
        self.item_encoder.fit(all_data[ITEM_ID])
        
        # Готовим матрицы
        train_matrix = self._prepare_matrix(train_data, USER_ID, ITEM_ID)
        test_matrix = self._prepare_matrix(test_data, USER_ID, ITEM_ID)

        # Обучаем модель
        self.model = AlternatingLeastSquares(
            factors=self.factors,
            iterations=self.iterations,
            regularization=self.regularization,
            random_state=42
        )

        self.model.fit(self.alpha * train_matrix, show_progress=True)

        return train_matrix, test_matrix
    
    def _prepare_matrix(self, data, USER_ID, ITEM_ID):
        """Внутренний метод для подготовки матрицы из данных"""
        user_ids = self.user_encoder.transform(data[USER_ID])
        item_ids = self.item_encoder.transform(data[ITEM_ID])
        
        matrix = sparse.csr_matrix(
            (np.ones(len(data)), (user_ids, item_ids)),
            shape=(len(self.user_encoder.classes_), len(self.item_encoder.classes_))
        )
        
        weighted_matrix = bm25_weight(matrix, K1=100, B=0.8)
        return weighted_matrix.tocsr() if not sparse.isspmatrix_csr(weighted_matrix) else weighted_matrix
    
    def recommend(self, user_ids, user_items=None, N=10, filter_already_liked_items=True):
        """Генерация рекомендаций для списка пользователей"""
        if isinstance(user_ids, str):
            user_ids = [user_ids]
        
        # Кодируем пользователей
        user_codes = self.user_encoder.transform(user_ids)
        
        # Получаем рекомендации
        recommendations = []
        for user_code in user_codes:
            recs = self.model.recommend(
                userid=user_code, 
                user_items=user_items,
                N=N, 
                filter_already_liked_items=filter_already_liked_items,
            )
            # Декодируем ID товаров
            item_ids = self.item_encoder.inverse_transform([r[0] for r in recs])
            scores = [r[1] for r in recs]
            recommendations.append(list(zip(item_ids, scores)))
        
        return recommendations if len(recommendations) > 1 else recommendations[0]


def train_model(train_data, test_data, USER_ID, ITEM_ID, factors=128, iterations=15, alpha=40):
    """Полный процесс обучения ALS модели на предварительно разделенных данных"""
    als = ALSRecommender(factors=factors, iterations=iterations, alpha=alpha)
    train_matrix, test_matrix = als.train(train_data, test_data, USER_ID, ITEM_ID)
    return als, train_matrix, test_matrix


def evaluate_model(als, train_matrix, test_matrix, articles_df, USER_ID, ITEM_ID, N=30):
    from scipy import sparse
    from tqdm import tqdm
    import pandas as pd

    print("\nГенерация рекомендаций...")
    recommendations = []

    train_matrix_csr = train_matrix.tocsr() if not sparse.isspmatrix_csr(train_matrix) else train_matrix

    for user_idx in tqdm(range(train_matrix_csr.shape[0])):
        user_vector = train_matrix_csr[user_idx]

        recs = als.model.recommend(
            userid=user_idx,
            user_items=user_vector,
            N=N,
            filter_already_liked_items=False
        )

        item_ids = als.item_encoder.inverse_transform(recs[0])
        scores = recs[1]
        user_id = als.user_encoder.inverse_transform([user_idx])[0]

        recs_df = pd.DataFrame({
            USER_ID: [user_id] * len(item_ids),
            ITEM_ID: item_ids,
            'score': scores
        })

        recommendations.append(recs_df)

    recommendations_df = pd.concat(recommendations, ignore_index=True)

    # Конвертируем test_matrix в DataFrame
    test_transactions_df = matrix_to_transactions_df(test_matrix, als.user_encoder, als.item_encoder, USER_ID, ITEM_ID)

    print("\nОценка качества рекомендаций:")
    metrics = RecommendationMetricsDf(
        recommendations_df,
        test_transactions_df,
        USER_ID,
        ITEM_ID
    )

    print(f"Precision@{N}: {metrics.precision_at_k(k=N):.4f}")
    print(f"Recall@{N}: {metrics.recall_at_k(k=N):.4f}")
    print(f"MAP@{N}: {metrics.map_at_k(k=N):.4f}")
    print(f"NDCG@{N}: {metrics.ndcg_at_k(k=N):.4f}")
    print(f"Coverage: {metrics.coverage(len(articles_df)):.2%}")

    return recommendations_df


def matrix_to_transactions_df(matrix, user_encoder, item_encoder, USER_ID, ITEM_ID):
    """Конвертация разреженной матрицы взаимодействий в DataFrame с user_id и item_id"""
    rows, cols = matrix.nonzero()

    user_ids = user_encoder.inverse_transform(rows)
    item_ids = item_encoder.inverse_transform(cols)

    transactions_df = pd.DataFrame({
        USER_ID: user_ids,
        ITEM_ID: item_ids
    })

    return transactions_df


In [13]:
# Определяем константы для колонок
USER_ID = "user_id"
ITEM_ID = "item_id"

# 1. Сначала нужно разделить данные на train и test
train_data, test_data = time_based_split(filtered_transactions)

# 2. Затем обучить модель
als, train_matrix, test_matrix = train_model(
    train_data=train_data,
    test_data=test_data,
    USER_ID=USER_ID,
    ITEM_ID=ITEM_ID,
    factors=32,
    iterations=30,
    alpha=0.01
)

# 3. Оценить модель
all_recommendations = evaluate_model(
    als, 
    train_matrix, 
    test_matrix, 
    filtered_articles, 
    USER_ID, 
    ITEM_ID, 
    N=10,
)

  check_blas_config()


  0%|          | 0/30 [00:00<?, ?it/s]


Генерация рекомендаций...


100%|██████████| 500/500 [00:00<00:00, 960.40it/s] 



Оценка качества рекомендаций:
Precision@10: 0.0306
Recall@10: 0.2930
MAP@10: 0.1876
NDCG@10: 0.2200
Coverage: 18.56%


## Bert4Rec

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from datetime import datetime
import math
from tqdm import tqdm

class TransactionDataset(Dataset):
    def __init__(self, df, max_len=10):
        self.df = df.sort_values(['user_id', 'time']).copy()
        self.max_len = max_len
        
        # Инициализация энкодеров
        self.user_encoder = LabelEncoder()
        self.item_encoder = LabelEncoder()
        
        self.user_encoder.fit(self.df['user_id'].unique())
        self.item_encoder.fit(self.df['item_id'].unique())
        
        # Преобразуем ID в числовые индексы (+1 для PAD)
        self.df['user_idx'] = self.user_encoder.transform(self.df['user_id'])
        self.df['item_idx'] = self.item_encoder.transform(self.df['item_id']) + 1
        
        # Группируем транзакции по пользователям
        self.user_sequences = self.df.groupby('user_idx')['item_idx'].apply(list).to_dict()
        
        # Специальные токены
        self.PAD = 0
        self.MASK = len(self.item_encoder.classes_) + 1
        self.CLS = len(self.item_encoder.classes_) + 2
        self.SEP = len(self.item_encoder.classes_) + 3
        
        self.vocab_size = len(self.item_encoder.classes_) + 4
        
    def __len__(self):
        return len(self.user_sequences)
    
    def __getitem__(self, idx):
        seq = self.user_sequences.get(idx, [])
        
        # Обрезаем последовательность если слишком длинная
        if len(seq) > self.max_len - 2:
            seq = seq[-(self.max_len - 2):]
        
        # Добавляем специальные токены
        seq = [self.CLS] + seq + [self.SEP]
        padding_len = self.max_len - len(seq)
        seq = seq + [self.PAD] * padding_len
        
        # Создаем маскированные данные
        masked_seq = seq.copy()
        labels = [self.PAD] * len(seq)
        
        for i in range(1, len(seq)-1):
            if seq[i] == self.PAD:
                continue
            if np.random.random() < 0.15:
                labels[i] = seq[i]
                p = np.random.random()
                if p < 0.8:
                    masked_seq[i] = self.MASK
                elif p < 0.9:
                    masked_seq[i] = np.random.randint(1, len(self.item_encoder.classes_)+1)
        
        return {
            'input_ids': torch.tensor(masked_seq, dtype=torch.long),
            'labels': torch.tensor(labels, dtype=torch.long),
            'attention_mask': torch.tensor([1 if x != self.PAD else 0 for x in seq], dtype=torch.long)
        }

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=10):
        super().__init__()
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(1, max_len, d_model)
        pe[0, :, 0::2] = torch.sin(position * div_term)
        pe[0, :, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:, :x.size(1), :]

class BERT4Rec(nn.Module):
    def __init__(self, vocab_size, d_model=64, nhead=2, num_layers=2, max_len=10):
        super().__init__()
        self.vocab_size = vocab_size
        self.d_model = d_model
        
        self.item_emb = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, max_len)
        
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, 
            nhead=nhead,
            dim_feedforward=d_model*4,
            dropout=0.1,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)
        
        self.output = nn.Linear(d_model, vocab_size)
        
    def forward(self, src, src_key_padding_mask=None):
        src = self.item_emb(src)
        src = self.pos_encoder(src)
        
        output = self.transformer(
            src, 
            src_key_padding_mask=src_key_padding_mask
        )
        
        return self.output(output)

def train_epoch(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0
    total_samples = 0
    
    for batch in tqdm(dataloader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        
        optimizer.zero_grad()
        
        logits = model(input_ids, src_key_padding_mask=(attention_mask == 0))
        
        loss = F.cross_entropy(
            logits.view(-1, model.vocab_size),
            labels.view(-1),
            ignore_index=0
        )
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item() * len(input_ids)
        total_samples += len(input_ids)
    
    return total_loss / total_samples if total_samples > 0 else 0

def predict_next_item(model, dataset, user_id, device, top_k=5):
    model.eval()
    
    user_idx = dataset.user_encoder.transform([user_id])[0]
    seq = dataset.user_sequences[user_idx]
    
    if len(seq) > dataset.max_len - 2:
        seq = seq[-(dataset.max_len - 2):]
    
    # Create the input sequence with CLS and SEP tokens
    input_seq = [dataset.CLS] + seq + [dataset.SEP]
    padding_len = dataset.max_len - len(input_seq)
    input_seq = input_seq + [dataset.PAD] * padding_len
    
    # Create attention mask
    attention_mask = torch.tensor([[1 if x != dataset.PAD else 0 for x in input_seq]], dtype=torch.long).to(device)
    
    # Replace the last non-padding item with MASK
    masked_seq = input_seq.copy()
    if len(seq) > 0:  # Only if there are items to predict
        last_item_pos = len([dataset.CLS] + seq)  # Position of the item before SEP
        masked_seq[last_item_pos] = dataset.MASK
    
    input_ids = torch.tensor([masked_seq], dtype=torch.long).to(device)
    
    with torch.no_grad():
        logits = model(input_ids, src_key_padding_mask=(attention_mask == 0))
        
        # Get the prediction for the masked position
        if len(seq) > 0:
            pred_pos = last_item_pos
        else:
            pred_pos = 1  # If sequence is empty, predict after CLS
        
        probs = F.softmax(logits[0, pred_pos], dim=-1)  # Prediction for the masked position
        
        # Exclude special tokens (PAD=0, MASK, CLS, SEP)
        valid_items = torch.arange(1, len(dataset.item_encoder.classes_)+1).to(device)
        valid_probs = probs[valid_items]
        
        # Ensure we don't request more items than available
        actual_top_k = min(top_k, len(valid_items))
        if actual_top_k <= 0:
            return np.array([]), np.array([])
            
        top_probs, top_indices = torch.topk(valid_probs, actual_top_k)
        top_items = valid_items[top_indices].cpu().numpy()
        
        return top_items, top_probs.cpu().numpy()


# Создаем датасет
max_len = 128
bert_dataset = TransactionDataset(filtered_transactions, max_len=max_len)
bert_train_loader = DataLoader(bert_dataset, batch_size=64, shuffle=False)

# Инициализируем модель
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bert4rec_model = BERT4Rec(
    vocab_size=bert_dataset.vocab_size,
    d_model=128,
    nhead=2,
    num_layers=2,
    max_len=max_len,
).to(device)
bert_optimizer = torch.optim.Adam(bert4rec_model.parameters(), lr=0.001)

# Обучение
for epoch in range(1, 15):
    loss = train_epoch(bert4rec_model, bert_train_loader, bert_optimizer, device)
    print(f'Epoch {epoch}, Loss: {loss:.4f}')

Training: 100%|██████████| 8/8 [00:01<00:00,  7.01it/s]


Epoch 1, Loss: 5.3699


Training: 100%|██████████| 8/8 [00:01<00:00,  6.99it/s]


Epoch 2, Loss: 5.2604


Training: 100%|██████████| 8/8 [00:01<00:00,  6.48it/s]


Epoch 3, Loss: 5.0185


Training: 100%|██████████| 8/8 [00:01<00:00,  7.26it/s]


Epoch 4, Loss: 4.8731


Training: 100%|██████████| 8/8 [00:01<00:00,  7.44it/s]


Epoch 5, Loss: 4.8463


Training: 100%|██████████| 8/8 [00:01<00:00,  7.54it/s]


Epoch 6, Loss: 4.5531


Training: 100%|██████████| 8/8 [00:01<00:00,  7.24it/s]


Epoch 7, Loss: 4.6413


Training: 100%|██████████| 8/8 [00:01<00:00,  7.24it/s]


Epoch 8, Loss: 4.4465


Training: 100%|██████████| 8/8 [00:01<00:00,  7.37it/s]


Epoch 9, Loss: 4.2934


Training: 100%|██████████| 8/8 [00:01<00:00,  7.22it/s]


Epoch 10, Loss: 4.3296


Training: 100%|██████████| 8/8 [00:01<00:00,  7.02it/s]


Epoch 11, Loss: 4.1568


Training: 100%|██████████| 8/8 [00:01<00:00,  7.62it/s]


Epoch 12, Loss: 4.3108


Training: 100%|██████████| 8/8 [00:01<00:00,  6.96it/s]


Epoch 13, Loss: 4.0444


Training: 100%|██████████| 8/8 [00:01<00:00,  7.65it/s]

Epoch 14, Loss: 3.9847





In [15]:
def get_all_recommendations(model, dataset, device, top_k=10):
    """Генерирует рекомендации для всех пользователей"""
    model.eval()
    
    user_ids = dataset.user_encoder.classes_
    item_decoder = {idx+1: item_id for idx, item_id in enumerate(dataset.item_encoder.classes_)}  # item_idx -> item_id

    recommendations = []

    for user_id in tqdm(user_ids, desc="Generating recommendations"):
        top_items, top_probs = predict_next_item(model, dataset, user_id, device, top_k=top_k)

        for item_idx, score in zip(top_items, top_probs):
            if item_idx in item_decoder:
                item_id = item_decoder[item_idx]
                recommendations.append({
                    'user_id': user_id,
                    'item_id': item_id,
                    'score': score
                })

    rec_df = pd.DataFrame(recommendations)
    return rec_df


bert4rec_all_recommendations = get_all_recommendations(
    model=bert4rec_model,
    dataset=bert_dataset,
    device=device,
    top_k=10,
)

  output = torch._nested_tensor_from_mask(
Generating recommendations: 100%|██████████| 500/500 [00:00<00:00, 1006.64it/s]


In [16]:
# Расчет метрик
N = 10
test_transactions_df = matrix_to_transactions_df(test_matrix, als.user_encoder, als.item_encoder, USER_ID, ITEM_ID)
print("\nОценка качества рекомендаций:")
metrics = RecommendationMetricsDf(
    bert4rec_all_recommendations,
    test_transactions_df,
    USER_ID,
    ITEM_ID
)

print(f"Precision@{N}: {metrics.precision_at_k(k=N):.4f}")
print(f"Recall@{N}: {metrics.recall_at_k(k=N):.4f}")
print(f"MAP@{N}: {metrics.map_at_k(k=N):.4f}")
print(f"NDCG@{N}: {metrics.ndcg_at_k(k=N):.4f}")
print(f"Coverage: {metrics.coverage(len(filtered_articles)):.2%}")


Оценка качества рекомендаций:
Precision@10: 0.0330
Recall@10: 0.3130
MAP@10: 0.1804
NDCG@10: 0.2210
Coverage: 6.56%


## Graph Convolutional Network

In [17]:
import torch
import numpy as np
import pandas as pd
from torch_geometric.nn import GCNConv
import torch.nn as nn
import torch.nn.functional as F
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from tqdm import tqdm


class GCNRecommender(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim=64, hidden_dim=128):
        super().__init__()
        self.num_users = num_users
        self.num_items = num_items
        
        self.user_embed = nn.Embedding(num_users, embedding_dim)
        self.item_embed = nn.Embedding(num_items, embedding_dim)
        
        self.conv1 = GCNConv(embedding_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        
    def forward(self, edge_index):
        user_emb = self.user_embed(torch.arange(self.num_users, device=edge_index.device))
        item_emb = self.item_embed(torch.arange(self.num_items, device=edge_index.device))
        x = torch.cat([user_emb, item_emb], dim=0)
        
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)
        
        user_final = x[:self.num_users]
        item_final = x[self.num_users:]
        
        return torch.sigmoid(torch.mm(user_final, item_final.t()))

def prepare_data(data: pd.DataFrame, 
                user_col: str = 'user_id',
                time_col: str = 'time',
                test_ratio: float = 0.2) -> tuple:
    """Подготовка данных с временным разделением для графовых сетей
    
    Args:
        data: DataFrame с колонками user_id, item_id, time
        user_col: название колонки с идентификаторами пользователей
        time_col: название колонки с временными метками
        test_ratio: доля тестовых данных
    
    Returns:
        Кортеж (train_data, test_data, user_encoder, item_encoder)
    """
    # Сортируем по пользователю и времени
    data = data.sort_values([user_col, time_col])
    
    # Функция для разделения данных пользователя
    def split_user_group(df):
        split_idx = int(len(df) * (1 - test_ratio))
        train = df.iloc[:split_idx]
        test = df.iloc[split_idx:]
        return train, test
    
    # Разделяем данные для каждого пользователя
    grouped = data.groupby(user_col, group_keys=False)
    train_data = grouped.apply(lambda x: split_user_group(x)[0])
    test_data = grouped.apply(lambda x: split_user_group(x)[1])
    
    # Кодируем пользователей и товары (только на train)
    user_encoder = LabelEncoder()
    item_encoder = LabelEncoder()
    
    # Фитируем кодировщики на train данных
    train_data['user_idx'] = user_encoder.fit_transform(train_data[user_col])
    train_data['item_idx'] = item_encoder.fit_transform(train_data['item_id'])
    
    # Фильтруем тестовые данные (только известные пользователи и товары)
    test_data = test_data[test_data[user_col].isin(user_encoder.classes_)]
    test_data = test_data[test_data['item_id'].isin(item_encoder.classes_)]
    
    # Кодируем тестовые данные
    test_data['user_idx'] = user_encoder.transform(test_data[user_col])
    test_data['item_idx'] = item_encoder.transform(test_data['item_id'])
    
    # Проверяем, что в тестовых данных есть хотя бы одно взаимодействие
    if len(test_data) == 0:
        raise ValueError("Test data is empty after filtering. Check your data split.")
    
    return train_data, test_data, user_encoder, item_encoder

def train_model(train_data, num_users, num_items, epochs=50):
    # Создание графа
    edge_index = torch.tensor([
        train_data['user_idx'].values,
        train_data['item_idx'].values + num_users
    ], dtype=torch.long)
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = GCNRecommender(num_users, num_items).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.BCELoss()
    edge_index = edge_index.to(device)
    
    # Негативное сэмплирование
    def create_batch(data, batch_size=1024, neg_ratio=1):
        users = data['user_idx'].values
        items = data['item_idx'].values
        
        neg_users = np.repeat(users, neg_ratio)
        neg_items = np.random.choice(num_items, len(users)*neg_ratio)
        
        all_users = np.concatenate([users, neg_users])
        all_items = np.concatenate([items, neg_items])
        labels = np.concatenate([
            np.ones(len(users)),
            np.zeros(len(users)*neg_ratio)
        ])
        
        indices = np.random.permutation(len(all_users))
        for i in range(0, len(indices), batch_size):
            batch_idx = indices[i:i+batch_size]
            yield (
                torch.LongTensor(all_users[batch_idx]).to(device),
                torch.LongTensor(all_items[batch_idx]).to(device),
                torch.FloatTensor(labels[batch_idx]).to(device)
            )
    
    # Обучение
    for epoch in tqdm(range(epochs)):
        model.train()
        total_loss = 0
        
        for users, items, labels in create_batch(train_data):
            optimizer.zero_grad()
            preds = model(edge_index)[users, items]
            loss = criterion(preds, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        print(f'Epoch {epoch}: Loss = {total_loss:.4f}')
    
    return model

def generate_recommendations(model, user_encoder, item_encoder, k=10):
    model.eval()
    num_users = len(user_encoder.classes_)
    num_items = len(item_encoder.classes_)
    
    # Используем оригинальный edge_index из обучения
    device = next(model.parameters()).device
    with torch.no_grad():
        preds = model(model.edge_index).cpu().numpy()
    
    recommendations = []
    for user_idx in range(num_users):
        user_id = user_encoder.inverse_transform([user_idx])[0]
        scores = preds[user_idx]
        top_items = np.argsort(-scores)[:k]
        
        for item_idx in top_items:
            item_id = item_encoder.inverse_transform([item_idx])[0]
            recommendations.append({
                'user_id': user_id,
                'item_id': item_id,
                'score': scores[item_idx]
            })
    
    return pd.DataFrame(recommendations)

# Полный пайплайн
if __name__ == "__main__":
    train_data, test_data, user_encoder, item_encoder = prepare_data(filtered_transactions)
    
    num_users = len(user_encoder.classes_)
    num_items = len(item_encoder.classes_)
    
    model = train_model(train_data, num_users, num_items, epochs=30)
    model.edge_index = torch.tensor([
        train_data['user_idx'].values,
        train_data['item_idx'].values + num_users
    ], dtype=torch.long).to(next(model.parameters()).device)
    
    recommendations = generate_recommendations(model, user_encoder, item_encoder)
    
    # Расчет метрик
    metrics = RecommendationMetricsDf(recommendations, test_data, USER_ID, ITEM_ID)
    print(f"Precision@{N}: {metrics.precision_at_k(k=N):.4f}")
    print(f"Recall@{N}: {metrics.recall_at_k(k=N):.4f}")
    print(f"MAP@{N}: {metrics.map_at_k(k=N):.4f}")
    print(f"NDCG@{N}: {metrics.ndcg_at_k(k=N):.4f}")
    print(f"Coverage: {metrics.coverage(len(filtered_articles)):.2%}")


  edge_index = torch.tensor([
 40%|████      | 12/30 [00:00<00:00, 116.83it/s]

Epoch 0: Loss = 80.1835
Epoch 1: Loss = 40.7910
Epoch 2: Loss = 21.2131
Epoch 3: Loss = 16.3981
Epoch 4: Loss = 12.7614
Epoch 5: Loss = 14.5612
Epoch 6: Loss = 12.8535
Epoch 7: Loss = 11.5523
Epoch 8: Loss = 11.0745
Epoch 9: Loss = 13.7444
Epoch 10: Loss = 10.2415
Epoch 11: Loss = 8.9759
Epoch 12: Loss = 10.5023
Epoch 13: Loss = 8.3523
Epoch 14: Loss = 8.8585
Epoch 15: Loss = 8.0248
Epoch 16: Loss = 8.8913
Epoch 17: Loss = 6.4208
Epoch 18: Loss = 6.9595
Epoch 19: Loss = 6.9556
Epoch 20: Loss = 6.5580
Epoch 21: Loss = 7.2913
Epoch 22: Loss = 5.4166


100%|██████████| 30/30 [00:00<00:00, 107.69it/s]


Epoch 23: Loss = 6.0731
Epoch 24: Loss = 6.0455
Epoch 25: Loss = 5.2909
Epoch 26: Loss = 6.1634
Epoch 27: Loss = 4.9371
Epoch 28: Loss = 4.4870
Epoch 29: Loss = 5.2452
Precision@10: 0.0360
Recall@10: 0.3480
MAP@10: 0.2394
NDCG@10: 0.2705
Coverage: 39.11%


## GraphSAGE

In [18]:
import torch
import numpy as np
import pandas as pd
from torch_geometric.nn import SAGEConv
import torch.nn as nn
import torch.nn.functional as F
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from tqdm import tqdm

class GraphSAGERecommender(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim=64, hidden_dim=128):
        super().__init__()
        self.num_users = num_users
        self.num_items = num_items
        
        self.user_embed = nn.Embedding(num_users, embedding_dim)
        self.item_embed = nn.Embedding(num_items, embedding_dim)
        
        # Заменяем GCNConv на SAGEConv
        self.conv1 = SAGEConv(embedding_dim, hidden_dim)
        self.conv2 = SAGEConv(hidden_dim, hidden_dim)
        
    def forward(self, edge_index):
        user_emb = self.user_embed(torch.arange(self.num_users, device=edge_index.device))
        item_emb = self.item_embed(torch.arange(self.num_items, device=edge_index.device))
        x = torch.cat([user_emb, item_emb], dim=0)
        
        # Графовые свертки SAGE
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)
        
        user_final = x[:self.num_users]
        item_final = x[self.num_users:]
        
        return torch.sigmoid(torch.mm(user_final, item_final.t()))

def prepare_data(data: pd.DataFrame, 
                user_col: str = 'user_id',
                time_col: str = 'time',
                test_ratio: float = 0.2) -> tuple:
    """Подготовка данных с временным разделением для графовых сетей
    
    Args:
        data: DataFrame с колонками user_id, item_id, time
        user_col: название колонки с идентификаторами пользователей
        time_col: название колонки с временными метками
        test_ratio: доля тестовых данных
    
    Returns:
        Кортеж (train_data, test_data, user_encoder, item_encoder)
    """
    # Сортируем по пользователю и времени
    data = data.sort_values([user_col, time_col])
    
    # Функция для разделения данных пользователя
    def split_user_group(df):
        split_idx = int(len(df) * (1 - test_ratio))
        train = df.iloc[:split_idx]
        test = df.iloc[split_idx:]
        return train, test
    
    # Разделяем данные для каждого пользователя
    grouped = data.groupby(user_col, group_keys=False)
    train_data = grouped.apply(lambda x: split_user_group(x)[0])
    test_data = grouped.apply(lambda x: split_user_group(x)[1])
    
    # Кодируем пользователей и товары (только на train)
    user_encoder = LabelEncoder()
    item_encoder = LabelEncoder()
    
    # Фитируем кодировщики на train данных
    train_data['user_idx'] = user_encoder.fit_transform(train_data[user_col])
    train_data['item_idx'] = item_encoder.fit_transform(train_data['item_id'])
    
    # Фильтруем тестовые данные (только известные пользователи и товары)
    test_data = test_data[test_data[user_col].isin(user_encoder.classes_)]
    test_data = test_data[test_data['item_id'].isin(item_encoder.classes_)]
    
    # Кодируем тестовые данные
    test_data['user_idx'] = user_encoder.transform(test_data[user_col])
    test_data['item_idx'] = item_encoder.transform(test_data['item_id'])
    
    # Проверяем, что в тестовых данных есть хотя бы одно взаимодействие
    if len(test_data) == 0:
        raise ValueError("Test data is empty after filtering. Check your data split.")
    
    return train_data, test_data, user_encoder, item_encoder

def train_model(train_data, num_users, num_items, epochs=50):
    """Модифицируем для GraphSAGE"""
    edge_index = torch.tensor([
        train_data['user_idx'].values,
        train_data['item_idx'].values + num_users
    ], dtype=torch.long)
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = GraphSAGERecommender(num_users, num_items).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)  # Добавил регуляризацию
    criterion = nn.BCELoss()
    edge_index = edge_index.to(device)
    
    # Негативное сэмплирование (без изменений)
    def create_batch(data, batch_size=1024, neg_ratio=1):
        users = data['user_idx'].values
        items = data['item_idx'].values
        
        neg_users = np.repeat(users, neg_ratio)
        neg_items = np.random.choice(num_items, len(users)*neg_ratio)
        
        all_users = np.concatenate([users, neg_users])
        all_items = np.concatenate([items, neg_items])
        labels = np.concatenate([
            np.ones(len(users)),
            np.zeros(len(users)*neg_ratio)
        ])
        
        indices = np.random.permutation(len(all_users))
        for i in range(0, len(indices), batch_size):
            batch_idx = indices[i:i+batch_size]
            yield (
                torch.LongTensor(all_users[batch_idx]).to(device),
                torch.LongTensor(all_items[batch_idx]).to(device),
                torch.FloatTensor(labels[batch_idx]).to(device)
            )
    
    # Обучение с ранней остановкой
    best_loss = float('inf')
    patience = 5
    no_improve = 0
    
    for epoch in tqdm(range(epochs)):
        model.train()
        total_loss = 0
        
        for users, items, labels in create_batch(train_data):
            optimizer.zero_grad()
            preds = model(edge_index)[users, items]
            loss = criterion(preds, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        avg_loss = total_loss / len(train_data)
        print(f'Epoch {epoch}: Loss = {avg_loss:.4f}')
        
        # Ранняя остановка
        if avg_loss < best_loss:
            best_loss = avg_loss
            no_improve = 0
            torch.save(model.state_dict(), 'best_graphsage_model.pth')
        else:
            no_improve += 1
            if no_improve >= patience:
                print("Early stopping triggered")
                break
    
    model.load_state_dict(torch.load('best_graphsage_model.pth'))
    return model

def generate_recommendations(model, user_encoder, item_encoder, k=10):
    """Аналогично вашему коду, без изменений"""
    model.eval()
    num_users = len(user_encoder.classes_)
    num_items = len(item_encoder.classes_)
    
    device = next(model.parameters()).device
    with torch.no_grad():
        preds = model(model.edge_index).cpu().numpy()
    
    recommendations = []
    for user_idx in range(num_users):
        user_id = user_encoder.inverse_transform([user_idx])[0]
        scores = preds[user_idx]
        top_items = np.argsort(-scores)[:k]
        
        for item_idx in top_items:
            item_id = item_encoder.inverse_transform([item_idx])[0]
            recommendations.append({
                'user_id': user_id,
                'item_id': item_id,
                'score': scores[item_idx]
            })
    
    return pd.DataFrame(recommendations)

# Полный пайплайн
if __name__ == "__main__":
    # Подготовка данных
    train_data, test_data, user_encoder, item_encoder = prepare_data(filtered_transactions)
    
    num_users = len(user_encoder.classes_)
    num_items = len(item_encoder.classes_)
    
    # Обучение GraphSAGE
    model = train_model(train_data, num_users, num_items, epochs=30)
    model.edge_index = torch.tensor([
        train_data['user_idx'].values,
        train_data['item_idx'].values + num_users
    ], dtype=torch.long).to(next(model.parameters()).device)
    
    # Генерация рекомендаций
    recommendations = generate_recommendations(model, user_encoder, item_encoder)
    
    # Расчет метрик
    metrics = RecommendationMetricsDf(recommendations, test_data, USER_ID, ITEM_ID)
    print(f"Precision@{N}: {metrics.precision_at_k(k=N):.4f}")
    print(f"Recall@{N}: {metrics.recall_at_k(k=N):.4f}")
    print(f"MAP@{N}: {metrics.map_at_k(k=N):.4f}")
    print(f"NDCG@{N}: {metrics.ndcg_at_k(k=N):.4f}")
    print(f"Coverage: {metrics.coverage(len(filtered_articles)):.2%}")


  0%|          | 0/30 [00:00<?, ?it/s]

Epoch 0: Loss = 0.0034
Epoch 1: Loss = 0.0023
Epoch 2: Loss = 0.0023
Epoch 3: Loss = 0.0021
Epoch 4: Loss = 0.0020


 73%|███████▎  | 22/30 [00:00<00:00, 108.64it/s]

Epoch 5: Loss = 0.0019
Epoch 6: Loss = 0.0018
Epoch 7: Loss = 0.0018
Epoch 8: Loss = 0.0017
Epoch 9: Loss = 0.0016
Epoch 10: Loss = 0.0015
Epoch 11: Loss = 0.0015
Epoch 12: Loss = 0.0015
Epoch 13: Loss = 0.0014
Epoch 14: Loss = 0.0014
Epoch 15: Loss = 0.0013
Epoch 16: Loss = 0.0013
Epoch 17: Loss = 0.0013
Epoch 18: Loss = 0.0012
Epoch 19: Loss = 0.0012
Epoch 20: Loss = 0.0013
Epoch 21: Loss = 0.0012
Epoch 22: Loss = 0.0012
Epoch 23: Loss = 0.0012
Epoch 24: Loss = 0.0011
Epoch 25: Loss = 0.0012
Epoch 26: Loss = 0.0011
Epoch 27: Loss = 0.0011


100%|██████████| 30/30 [00:00<00:00, 102.60it/s]


Epoch 28: Loss = 0.0011
Epoch 29: Loss = 0.0011
Precision@10: 0.0246
Recall@10: 0.2400
MAP@10: 0.1455
NDCG@10: 0.1721
Coverage: 37.67%


## LightGCN

In [19]:
import torch
import numpy as np
import pandas as pd
from torch_geometric.nn import MessagePassing
import torch.nn as nn
import torch.nn.functional as F
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
from typing import List, Optional

class LightGCNConv(MessagePassing):
    def __init__(self):
        super().__init__(aggr='add')

    def forward(self, x, edge_index):
        return self.propagate(edge_index, x=x)

    def message_and_aggregate(self, x_j):
        return x_j

class LightGCN(nn.Module):
    def __init__(self, num_users: int, num_items: int, embedding_dim: int = 64, 
                 n_layers: int = 3, keep_prob: float = 0.6):
        super().__init__()
        self.num_users = num_users
        self.num_items = num_items
        self.embedding_dim = embedding_dim
        self.n_layers = n_layers
        self.keep_prob = keep_prob
        
        # Инициализация эмбеддингов
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)
        
        # Слои LightGCN
        self.convs = nn.ModuleList([LightGCNConv() for _ in range(n_layers)])
        
        # Инициализация весов
        self.reset_parameters()

    def reset_parameters(self):
        nn.init.normal_(self.user_embedding.weight, std=0.1)
        nn.init.normal_(self.item_embedding.weight, std=0.1)

    def forward(self, edge_index: torch.Tensor):
        # Получаем эмбеддинги пользователей и товаров
        user_emb = self.user_embedding.weight
        item_emb = self.item_embedding.weight
        x = torch.cat([user_emb, item_emb], dim=0)
        
        # Создаем список эмбеддингов на каждом слое
        embeddings = [x]
        
        # Пропагация через слои LightGCN
        for i in range(self.n_layers):
            # Dropout для ребер
            if self.training and self.keep_prob < 1:
                edge_index = self.random_drop_edges(edge_index)
            
            x = self.convs[i](x, edge_index)
            embeddings.append(x)
        
        # Комбинируем эмбеддинги со всех слоев
        final_embeddings = torch.mean(torch.stack(embeddings, dim=0), dim=0)
        
        # Разделяем пользователей и товары
        user_final, item_final = torch.split(
            final_embeddings, [self.num_users, self.num_items]
        )
        
        # Вычисляем предсказания
        return torch.sigmoid(torch.mm(user_final, item_final.t()))

    def random_drop_edges(self, edge_index: torch.Tensor) -> torch.Tensor:
        if self.keep_prob >= 1:
            return edge_index
            
        num_edges = edge_index.size(1)
        mask = torch.rand(num_edges, device=edge_index.device) < self.keep_prob
        return edge_index[:, mask]

def prepare_data(data: pd.DataFrame, 
                user_col: str = 'user_id',
                time_col: str = 'time',
                test_ratio: float = 0.2) -> tuple:
    """Подготовка данных с временным разделением для графовых сетей
    
    Args:
        data: DataFrame с колонками user_id, item_id, time
        user_col: название колонки с идентификаторами пользователей
        time_col: название колонки с временными метками
        test_ratio: доля тестовых данных
    
    Returns:
        Кортеж (train_data, test_data, user_encoder, item_encoder)
    """
    # Сортируем по пользователю и времени
    data = data.sort_values([user_col, time_col])
    
    # Функция для разделения данных пользователя
    def split_user_group(df):
        split_idx = int(len(df) * (1 - test_ratio))
        train = df.iloc[:split_idx]
        test = df.iloc[split_idx:]
        return train, test
    
    # Разделяем данные для каждого пользователя
    grouped = data.groupby(user_col, group_keys=False)
    train_data = grouped.apply(lambda x: split_user_group(x)[0])
    test_data = grouped.apply(lambda x: split_user_group(x)[1])
    
    # Кодируем пользователей и товары (только на train)
    user_encoder = LabelEncoder()
    item_encoder = LabelEncoder()
    
    # Фитируем кодировщики на train данных
    train_data['user_idx'] = user_encoder.fit_transform(train_data[user_col])
    train_data['item_idx'] = item_encoder.fit_transform(train_data['item_id'])
    
    # Фильтруем тестовые данные (только известные пользователи и товары)
    test_data = test_data[test_data[user_col].isin(user_encoder.classes_)]
    test_data = test_data[test_data['item_id'].isin(item_encoder.classes_)]
    
    # Кодируем тестовые данные
    test_data['user_idx'] = user_encoder.transform(test_data[user_col])
    test_data['item_idx'] = item_encoder.transform(test_data['item_id'])
    
    # Проверяем, что в тестовых данных есть хотя бы одно взаимодействие
    if len(test_data) == 0:
        raise ValueError("Test data is empty after filtering. Check your data split.")
    
    return train_data, test_data, user_encoder, item_encoder

def train_lightgcn(train_data: pd.DataFrame, num_users: int, num_items: int, 
                 epochs: int = 100, batch_size: int = 1024):
    # Создаем граф
    edge_index = torch.tensor([
        train_data['user_idx'].values,
        train_data['item_idx'].values + num_users
    ], dtype=torch.long)
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = LightGCN(num_users, num_items, embedding_dim=64, n_layers=2, keep_prob=0.8).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.BCELoss()
    edge_index = edge_index.to(device)
    
    # Негативное сэмплирование
    def create_batch(data, batch_size=batch_size, neg_ratio=1):
        users = data['user_idx'].values
        items = data['item_idx'].values
        
        neg_users = np.repeat(users, neg_ratio)
        neg_items = np.random.choice(num_items, len(users)*neg_ratio)
        
        all_users = np.concatenate([users, neg_users])
        all_items = np.concatenate([items, neg_items])
        labels = np.concatenate([
            np.ones(len(users)),
            np.zeros(len(users)*neg_ratio)
        ])
        
        indices = np.random.permutation(len(all_users))
        for i in range(0, len(indices), batch_size):
            batch_idx = indices[i:i+batch_size]
            yield (
                torch.LongTensor(all_users[batch_idx]).to(device),
                torch.LongTensor(all_items[batch_idx]).to(device),
                torch.FloatTensor(labels[batch_idx]).to(device)
            )
    
    # Обучение с ранней остановкой
    best_auc = 0
    patience = 5
    no_improve = 0
    
    for epoch in tqdm(range(epochs)):
        model.train()
        total_loss = 0
        
        for users, items, labels in create_batch(train_data):
            optimizer.zero_grad()
            preds = model(edge_index)[users, items]
            loss = criterion(preds, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        # Оценка
        if epoch % 5 == 0:
            model.eval()
            with torch.no_grad():
                preds = model(edge_index).cpu()
                
                test_users = test_data['user_idx'].unique()
                if len(test_users) == 0:
                    continue
                    
                auc_scores = []
                for user in test_users:
                    pos_items = test_data[test_data['user_idx'] == user]['item_idx'].values
                    if len(pos_items) == 0:
                        continue
                        
                    neg_items = np.random.choice(num_items, min(99, num_items), replace=False)
                    all_items = np.concatenate([pos_items, neg_items])
                    labels = np.concatenate([np.ones(len(pos_items)), np.zeros(len(neg_items))])
                    
                    user_preds = preds[user, all_items].numpy()
                    auc_scores.append(roc_auc_score(labels, user_preds))
                
                current_auc = np.mean(auc_scores) if auc_scores else 0.0
                if current_auc > best_auc:
                    best_auc = current_auc
                    no_improve = 0
                    # Сохраняем только параметры модели (без edge_index)
                    torch.save({
                        'state_dict': model.state_dict(),
                        'num_users': num_users,
                        'num_items': num_items
                    }, 'best_lightgcn_model.pth')
                else:
                    no_improve += 1
                    if no_improve >= patience:
                        print("Early stopping triggered")
                        break
                
                print(f'Epoch {epoch}: Loss = {total_loss:.4f}, Test AUC = {current_auc:.4f}')
    
    # Загружаем лучшую модель
    checkpoint = torch.load('best_lightgcn_model.pth')
    model = LightGCN(
        checkpoint['num_users'], 
        checkpoint['num_items'],
        embedding_dim=64, 
        n_layers=2, 
        keep_prob=0.8
    ).to(device)
    model.load_state_dict(checkpoint['state_dict'])
    model.edge_index = edge_index  # Сохраняем edge_index в модели
    
    return model

def generate_recommendations(model, user_encoder, item_encoder, k=10):
    """Генерация рекомендаций (без изменений)"""
    model.eval()
    num_users = len(user_encoder.classes_)
    num_items = len(item_encoder.classes_)
    
    device = next(model.parameters()).device
    with torch.no_grad():
        preds = model(model.edge_index).cpu().numpy()
    
    recommendations = []
    for user_idx in range(num_users):
        user_id = user_encoder.inverse_transform([user_idx])[0]
        scores = preds[user_idx]
        top_items = np.argsort(-scores)[:k]
        
        for item_idx in top_items:
            item_id = item_encoder.inverse_transform([item_idx])[0]
            recommendations.append({
                'user_id': user_id,
                'item_id': item_id,
                'score': scores[item_idx]
            })
    
    return pd.DataFrame(recommendations)

# Полный пайплайн LightGCN
if __name__ == "__main__":
    # Подготовка данных
    train_data, test_data, user_encoder, item_encoder = prepare_data(filtered_transactions)
    
    num_users = len(user_encoder.classes_)
    num_items = len(item_encoder.classes_)
    
    # Обучение LightGCN
    model = train_lightgcn(train_data, num_users, num_items, epochs=100)
    
    # Генерация рекомендаций
    recommendations = generate_recommendations(model, user_encoder, item_encoder)
    
    # Расчет метрик
    metrics = RecommendationMetricsDf(recommendations, test_data, USER_ID, ITEM_ID)
    print(f"Precision@{N}: {metrics.precision_at_k(k=N):.4f}")
    print(f"Recall@{N}: {metrics.recall_at_k(k=N):.4f}")
    print(f"MAP@{N}: {metrics.map_at_k(k=N):.4f}")
    print(f"NDCG@{N}: {metrics.ndcg_at_k(k=N):.4f}")
    print(f"Coverage: {metrics.coverage(len(filtered_articles)):.2%}")

  1%|          | 1/100 [00:00<00:22,  4.37it/s]

Epoch 0: Loss = 2.0117, Test AUC = 0.7409


  6%|▌         | 6/100 [00:00<00:06, 13.80it/s]

Epoch 5: Loss = 1.9883, Test AUC = 0.7485


 11%|█         | 11/100 [00:00<00:05, 17.01it/s]

Epoch 10: Loss = 1.9587, Test AUC = 0.7561


 16%|█▌        | 16/100 [00:00<00:04, 18.45it/s]

Epoch 15: Loss = 1.9223, Test AUC = 0.7618


 21%|██        | 21/100 [00:01<00:04, 19.35it/s]

Epoch 20: Loss = 1.8849, Test AUC = 0.7645


 26%|██▌       | 26/100 [00:01<00:03, 19.81it/s]

Epoch 25: Loss = 1.8475, Test AUC = 0.7639


 31%|███       | 31/100 [00:01<00:03, 20.22it/s]

Epoch 30: Loss = 1.8017, Test AUC = 0.7641


 36%|███▌      | 36/100 [00:01<00:03, 20.24it/s]

Epoch 35: Loss = 1.7606, Test AUC = 0.7663


 41%|████      | 41/100 [00:02<00:02, 20.26it/s]

Epoch 40: Loss = 1.7261, Test AUC = 0.7641


 46%|████▌     | 46/100 [00:02<00:02, 20.53it/s]

Epoch 45: Loss = 1.6953, Test AUC = 0.7650


 51%|█████     | 51/100 [00:02<00:02, 20.68it/s]

Epoch 50: Loss = 1.6469, Test AUC = 0.7634


 56%|█████▌    | 56/100 [00:02<00:02, 20.72it/s]

Epoch 55: Loss = 1.6142, Test AUC = 0.7619


 60%|██████    | 60/100 [00:03<00:02, 19.21it/s]

Early stopping triggered





Precision@10: 0.0444
Recall@10: 0.4290
MAP@10: 0.2613
NDCG@10: 0.3087
Coverage: 33.78%


## NGCF

In [20]:
import torch
import numpy as np
import pandas as pd
from torch_geometric.nn import MessagePassing
import torch.nn as nn
import torch.nn.functional as F
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
from typing import List, Optional, Tuple

class NGCFLayer(MessagePassing):
    def __init__(self, in_dim: int, out_dim: int):
        super().__init__(aggr='add')
        self.in_dim = in_dim
        self.out_dim = out_dim
        
        # Weight matrices for message passing
        self.W1 = nn.Linear(in_dim, out_dim)
        self.W2 = nn.Linear(in_dim, out_dim)
        
        # LeakyReLU for non-linearity
        self.leaky_relu = nn.LeakyReLU(negative_slope=0.2)
        
        # Dropout layer
        self.dropout = nn.Dropout(0.1)
        
        # Initialize weights
        self.reset_parameters()
    
    def reset_parameters(self):
        nn.init.xavier_uniform_(self.W1.weight)
        nn.init.xavier_uniform_(self.W2.weight)
    
    def forward(self, x: torch.Tensor, edge_index: torch.Tensor) -> torch.Tensor:
        return self.propagate(edge_index, x=x)
    
    def message(self, x_j: torch.Tensor) -> torch.Tensor:
        # Message computation with dropout
        return self.dropout(x_j)
    
    def update(self, aggr_out: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
        # NGCF combination rule
        out = self.leaky_relu(self.W1(aggr_out)) + self.leaky_relu(self.W2(x))
        return out

class NGCF(nn.Module):
    def __init__(self, num_users: int, num_items: int, 
                 embedding_dim: int = 64, 
                 layer_dims: List[int] = [64, 64, 64],
                 dropout: float = 0.1,
                 node_dropout: float = 0.1):
        super().__init__()
        self.num_users = num_users
        self.num_items = num_items
        self.embedding_dim = embedding_dim
        self.layer_dims = layer_dims
        self.n_layers = len(layer_dims)
        self.dropout = dropout
        self.node_dropout = node_dropout
        
        # User and item embeddings
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)
        
        # NGCF layers
        self.convs = nn.ModuleList()
        input_dim = embedding_dim
        for output_dim in layer_dims:
            self.convs.append(NGCFLayer(input_dim, output_dim))
            input_dim = output_dim
        
        # Prediction layer - удаляем, так как будем использовать dot product
        # self.predict_layer = nn.Linear(layer_dims[-1], 1)
        
        # Initialize weights
        self.reset_parameters()
    
    def reset_parameters(self):
        nn.init.xavier_uniform_(self.user_embedding.weight)
        nn.init.xavier_uniform_(self.item_embedding.weight)
        # nn.init.xavier_uniform_(self.predict_layer.weight)
    
    def forward(self, edge_index: torch.Tensor) -> torch.Tensor:
        # Node dropout for regularization
        if self.training and self.node_dropout > 0:
            mask = torch.rand(edge_index.size(1)) >= self.node_dropout
            edge_index = edge_index[:, mask]
        
        # Initial embeddings
        user_emb = self.user_embedding.weight
        item_emb = self.item_embedding.weight
        x = torch.cat([user_emb, item_emb], dim=0)
        
        # List to store embeddings at each layer
        embeddings = [x]
        
        # Message passing through all layers
        for conv in self.convs:
            x = conv(x, edge_index)
            x = F.dropout(x, p=self.dropout, training=self.training)
            embeddings.append(x)
        
        # Combine embeddings from all layers
        # Изменяем способ комбинирования эмбеддингов
        final_embeddings = torch.mean(torch.stack(embeddings, dim=0), dim=0)
        
        # Split into users and items
        user_final, item_final = torch.split(
            final_embeddings, [self.num_users, self.num_items]
        )
        
        # Calculate predictions using dot product
        preds = torch.sigmoid(torch.mm(user_final, item_final.t()))
        return preds

def prepare_data(data: pd.DataFrame, 
                user_col: str = 'user_id',
                time_col: str = 'time',
                test_ratio: float = 0.2) -> tuple:
    """Подготовка данных с временным разделением для графовых сетей
    
    Args:
        data: DataFrame с колонками user_id, item_id, time
        user_col: название колонки с идентификаторами пользователей
        time_col: название колонки с временными метками
        test_ratio: доля тестовых данных
    
    Returns:
        Кортеж (train_data, test_data, user_encoder, item_encoder)
    """
    # Сортируем по пользователю и времени
    data = data.sort_values([user_col, time_col])
    
    # Функция для разделения данных пользователя
    def split_user_group(df):
        split_idx = int(len(df) * (1 - test_ratio))
        train = df.iloc[:split_idx]
        test = df.iloc[split_idx:]
        return train, test
    
    # Разделяем данные для каждого пользователя
    grouped = data.groupby(user_col, group_keys=False)
    train_data = grouped.apply(lambda x: split_user_group(x)[0])
    test_data = grouped.apply(lambda x: split_user_group(x)[1])
    
    # Кодируем пользователей и товары (только на train)
    user_encoder = LabelEncoder()
    item_encoder = LabelEncoder()
    
    # Фитируем кодировщики на train данных
    train_data['user_idx'] = user_encoder.fit_transform(train_data[user_col])
    train_data['item_idx'] = item_encoder.fit_transform(train_data['item_id'])
    
    # Фильтруем тестовые данные (только известные пользователи и товары)
    test_data = test_data[test_data[user_col].isin(user_encoder.classes_)]
    test_data = test_data[test_data['item_id'].isin(item_encoder.classes_)]
    
    # Кодируем тестовые данные
    test_data['user_idx'] = user_encoder.transform(test_data[user_col])
    test_data['item_idx'] = item_encoder.transform(test_data['item_id'])
    
    # Проверяем, что в тестовых данных есть хотя бы одно взаимодействие
    if len(test_data) == 0:
        raise ValueError("Test data is empty after filtering. Check your data split.")
    
    return train_data, test_data, user_encoder, item_encoder

def train_ngcf(train_data: pd.DataFrame, num_users: int, num_items: int, 
              epochs: int = 100, batch_size: int = 1024) -> nn.Module:
    # Создаем граф
    edge_index = torch.tensor([
        train_data['user_idx'].values,
        train_data['item_idx'].values + num_users
    ], dtype=torch.long)
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = NGCF(num_users, num_items).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
    criterion = nn.BCELoss()
    edge_index = edge_index.to(device)
    
    # Негативное сэмплирование
    def create_batch(data, batch_size=batch_size, neg_ratio=1):
        users = data['user_idx'].values
        items = data['item_idx'].values
        
        neg_users = np.repeat(users, neg_ratio)
        neg_items = np.random.choice(num_items, len(users)*neg_ratio)
        
        all_users = np.concatenate([users, neg_users])
        all_items = np.concatenate([items, neg_items])
        labels = np.concatenate([
            np.ones(len(users)),
            np.zeros(len(users)*neg_ratio)
        ])
        
        indices = np.random.permutation(len(all_users))
        for i in range(0, len(indices), batch_size):
            batch_idx = indices[i:i+batch_size]
            yield (
                torch.LongTensor(all_users[batch_idx]).to(device),
                torch.LongTensor(all_items[batch_idx]).to(device),
                torch.FloatTensor(labels[batch_idx]).to(device)
            )
    
    # Обучение с ранней остановкой
    best_auc = 0
    patience = 5
    no_improve = 0
    
    for epoch in tqdm(range(epochs)):
        model.train()
        total_loss = 0
        
        for users, items, labels in create_batch(train_data):
            optimizer.zero_grad()
            preds = model(edge_index)[users, items]
            loss = criterion(preds, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        # Оценка
        if epoch % 5 == 0:
            model.eval()
            with torch.no_grad():
                preds = model(edge_index).cpu()
                
                test_users = test_data['user_idx'].unique()
                if len(test_users) == 0:
                    continue
                    
                auc_scores = []
                for user in test_users:
                    pos_items = test_data[test_data['user_idx'] == user]['item_idx'].values
                    if len(pos_items) == 0:
                        continue
                        
                    neg_items = np.random.choice(num_items, min(99, num_items), replace=False)
                    all_items = np.concatenate([pos_items, neg_items])
                    labels = np.concatenate([np.ones(len(pos_items)), np.zeros(len(neg_items))])
                    
                    user_preds = preds[user, all_items].numpy()
                    auc_scores.append(roc_auc_score(labels, user_preds))
                
                current_auc = np.mean(auc_scores) if auc_scores else 0.0
                if current_auc > best_auc:
                    best_auc = current_auc
                    no_improve = 0
                    torch.save(model.state_dict(), 'best_ngcf_model.pth')
                else:
                    no_improve += 1
                    if no_improve >= patience:
                        print("Early stopping triggered")
                        break
                
                print(f'Epoch {epoch}: Loss = {total_loss:.4f}, Test AUC = {current_auc:.4f}')
    
    model.load_state_dict(torch.load('best_ngcf_model.pth'))
    model.edge_index = edge_index  # Сохраняем edge_index в модели
    return model

def generate_recommendations(model: nn.Module, 
                           user_encoder: LabelEncoder, 
                           item_encoder: LabelEncoder, 
                           k: int = 10) -> pd.DataFrame:
    """Генерация рекомендаций (без изменений)"""
    model.eval()
    num_users = len(user_encoder.classes_)
    num_items = len(item_encoder.classes_)
    
    device = next(model.parameters()).device
    with torch.no_grad():
        preds = model(model.edge_index).cpu().numpy()
    
    recommendations = []
    for user_idx in range(num_users):
        user_id = user_encoder.inverse_transform([user_idx])[0]
        scores = preds[user_idx]
        top_items = np.argsort(-scores)[:k]
        
        for item_idx in top_items:
            item_id = item_encoder.inverse_transform([item_idx])[0]
            recommendations.append({
                'user_id': user_id,
                'item_id': item_id,
                'score': scores[item_idx]
            })
    
    return pd.DataFrame(recommendations)

# Полный пайплайн NGCF
if __name__ == "__main__":
    # Подготовка данных
    train_data, test_data, user_encoder, item_encoder = prepare_data(filtered_transactions)
    
    num_users = len(user_encoder.classes_)
    num_items = len(item_encoder.classes_)
    
    # Обучение NGCF
    model = train_ngcf(train_data, num_users, num_items, epochs=30)
    
    # Генерация рекомендаций
    recommendations = generate_recommendations(model, user_encoder, item_encoder)
    
    # Расчет метрик
    metrics = RecommendationMetricsDf(recommendations, test_data, USER_ID, ITEM_ID)
    print(f"Precision@{N}: {metrics.precision_at_k(k=N):.4f}")
    print(f"Recall@{N}: {metrics.recall_at_k(k=N):.4f}")
    print(f"MAP@{N}: {metrics.map_at_k(k=N):.4f}")
    print(f"NDCG@{N}: {metrics.ndcg_at_k(k=N):.4f}")
    print(f"Coverage: {metrics.coverage(len(filtered_articles)):.2%}")

  3%|▎         | 1/30 [00:00<00:07,  4.12it/s]

Epoch 0: Loss = 2.0135, Test AUC = 0.7124


 20%|██        | 6/30 [00:00<00:02, 11.95it/s]

Epoch 5: Loss = 1.9792, Test AUC = 0.7178


 37%|███▋      | 11/30 [00:00<00:01, 14.70it/s]

Epoch 10: Loss = 1.9491, Test AUC = 0.7238


 53%|█████▎    | 16/30 [00:01<00:00, 15.65it/s]

Epoch 15: Loss = 1.9239, Test AUC = 0.7244


 70%|███████   | 21/30 [00:01<00:00, 16.42it/s]

Epoch 20: Loss = 1.9182, Test AUC = 0.7346


100%|██████████| 30/30 [00:01<00:00, 17.25it/s]

Epoch 25: Loss = 1.8919, Test AUC = 0.7461





Precision@10: 0.0176
Recall@10: 0.1710
MAP@10: 0.0673
NDCG@10: 0.0938
Coverage: 14.22%


## GAT (Graph Attention Network)

In [21]:
import torch
import numpy as np
import pandas as pd
from torch_geometric.nn import GATConv
import torch.nn as nn
import torch.nn.functional as F
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
from typing import List, Optional, Tuple

class GATRecommender(nn.Module):
    def __init__(self, num_users: int, num_items: int, 
                 embedding_dim: int = 64, 
                 hidden_dim: int = 128,
                 heads: int = 4,
                 dropout: float = 0.2):
        super().__init__()
        self.num_users = num_users
        self.num_items = num_items
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.heads = heads
        self.dropout = dropout
        
        # User and item embeddings
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)
        
        # GAT layers
        self.conv1 = GATConv(
            embedding_dim, 
            hidden_dim // heads, 
            heads=heads,
            dropout=dropout
        )
        self.conv2 = GATConv(
            hidden_dim, 
            hidden_dim, 
            heads=1,
            dropout=dropout
        )
        
        # Dropout layer
        self.dropout = nn.Dropout(dropout)
        
        # Initialize weights
        self.reset_parameters()
    
    def reset_parameters(self):
        nn.init.xavier_uniform_(self.user_embedding.weight)
        nn.init.xavier_uniform_(self.item_embedding.weight)
    
    def forward(self, edge_index: torch.Tensor) -> torch.Tensor:
        # Get initial embeddings
        user_emb = self.user_embedding(torch.arange(self.num_users, device=edge_index.device))
        item_emb = self.item_embedding(torch.arange(self.num_items, device=edge_index.device))
        x = torch.cat([user_emb, item_emb], dim=0)
        
        # First GAT layer
        x = F.elu(self.conv1(x, edge_index))
        x = self.dropout(x)
        
        # Second GAT layer
        x = self.conv2(x, edge_index)
        
        # Split into users and items
        user_final = x[:self.num_users]
        item_final = x[self.num_users:]
        
        # Calculate predictions using dot product + sigmoid
        return torch.sigmoid(torch.mm(user_final, item_final.t()))

def prepare_data(data: pd.DataFrame, 
                user_col: str = 'user_id',
                time_col: str = 'time',
                test_ratio: float = 0.2) -> tuple:
    """Подготовка данных с временным разделением для графовых сетей
    
    Args:
        data: DataFrame с колонками user_id, item_id, time
        user_col: название колонки с идентификаторами пользователей
        time_col: название колонки с временными метками
        test_ratio: доля тестовых данных
    
    Returns:
        Кортеж (train_data, test_data, user_encoder, item_encoder)
    """
    # Сортируем по пользователю и времени
    data = data.sort_values([user_col, time_col])
    
    # Функция для разделения данных пользователя
    def split_user_group(df):
        split_idx = int(len(df) * (1 - test_ratio))
        train = df.iloc[:split_idx]
        test = df.iloc[split_idx:]
        return train, test
    
    # Разделяем данные для каждого пользователя
    grouped = data.groupby(user_col, group_keys=False)
    train_data = grouped.apply(lambda x: split_user_group(x)[0])
    test_data = grouped.apply(lambda x: split_user_group(x)[1])
    
    # Кодируем пользователей и товары (только на train)
    user_encoder = LabelEncoder()
    item_encoder = LabelEncoder()
    
    # Фитируем кодировщики на train данных
    train_data['user_idx'] = user_encoder.fit_transform(train_data[user_col])
    train_data['item_idx'] = item_encoder.fit_transform(train_data['item_id'])
    
    # Фильтруем тестовые данные (только известные пользователи и товары)
    test_data = test_data[test_data[user_col].isin(user_encoder.classes_)]
    test_data = test_data[test_data['item_id'].isin(item_encoder.classes_)]
    
    # Кодируем тестовые данные
    test_data['user_idx'] = user_encoder.transform(test_data[user_col])
    test_data['item_idx'] = item_encoder.transform(test_data['item_id'])
    
    # Проверяем, что в тестовых данных есть хотя бы одно взаимодействие
    if len(test_data) == 0:
        raise ValueError("Test data is empty after filtering. Check your data split.")
    
    return train_data, test_data, user_encoder, item_encoder

def train_gat(train_data: pd.DataFrame, num_users: int, num_items: int, 
             epochs: int = 100, batch_size: int = 1024) -> nn.Module:
    # Создаем граф
    edge_index = torch.tensor([
        train_data['user_idx'].values,
        train_data['item_idx'].values + num_users
    ], dtype=torch.long)
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = GATRecommender(
        num_users, 
        num_items,
        embedding_dim=64,
        hidden_dim=128,
        heads=4,
        dropout=0.2
    ).to(device)
    
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
    criterion = nn.BCELoss()
    edge_index = edge_index.to(device)
    
    # Негативное сэмплирование
    def create_batch(data, batch_size=batch_size, neg_ratio=2):
        users = data['user_idx'].values
        items = data['item_idx'].values
        
        neg_users = np.repeat(users, neg_ratio)
        neg_items = np.random.choice(num_items, len(users)*neg_ratio)
        
        all_users = np.concatenate([users, neg_users])
        all_items = np.concatenate([items, neg_items])
        labels = np.concatenate([
            np.ones(len(users)),
            np.zeros(len(users)*neg_ratio)
        ])
        
        indices = np.random.permutation(len(all_users))
        for i in range(0, len(indices), batch_size):
            batch_idx = indices[i:i+batch_size]
            yield (
                torch.LongTensor(all_users[batch_idx]).to(device),
                torch.LongTensor(all_items[batch_idx]).to(device),
                torch.FloatTensor(labels[batch_idx]).to(device)
            )
    
    # Обучение с ранней остановкой
    best_auc = 0
    patience = 5
    no_improve = 0
    
    for epoch in tqdm(range(epochs)):
        model.train()
        total_loss = 0
        
        for users, items, labels in create_batch(train_data):
            optimizer.zero_grad()
            preds = model(edge_index)[users, items]
            loss = criterion(preds, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        # Оценка
        if epoch % 5 == 0:
            model.eval()
            with torch.no_grad():
                preds = model(edge_index).cpu()
                
                test_users = test_data['user_idx'].unique()
                if len(test_users) == 0:
                    continue
                    
                auc_scores = []
                for user in test_users:
                    pos_items = test_data[test_data['user_idx'] == user]['item_idx'].values
                    if len(pos_items) == 0:
                        continue
                        
                    neg_items = np.random.choice(num_items, min(99, num_items), replace=False)
                    all_items = np.concatenate([pos_items, neg_items])
                    labels = np.concatenate([np.ones(len(pos_items)), np.zeros(len(neg_items))])
                    
                    user_preds = preds[user, all_items].numpy()
                    auc_scores.append(roc_auc_score(labels, user_preds))
                
                current_auc = np.mean(auc_scores) if auc_scores else 0.0
                if current_auc > best_auc:
                    best_auc = current_auc
                    no_improve = 0
                    torch.save(model.state_dict(), 'best_gat_model.pth')
                else:
                    no_improve += 1
                    if no_improve >= patience:
                        print("Early stopping triggered")
                        break
                
                print(f'Epoch {epoch}: Loss = {total_loss:.4f}, Test AUC = {current_auc:.4f}')
    
    model.load_state_dict(torch.load('best_gat_model.pth'))
    model.edge_index = edge_index
    return model

def generate_recommendations(model: nn.Module, 
                           user_encoder: LabelEncoder, 
                           item_encoder: LabelEncoder, 
                           k: int = 10) -> pd.DataFrame:
    """Генерация рекомендаций (аналогично предыдущим реализациям)"""
    model.eval()
    num_users = len(user_encoder.classes_)
    num_items = len(item_encoder.classes_)
    
    device = next(model.parameters()).device
    with torch.no_grad():
        preds = model(model.edge_index).cpu().numpy()
    
    recommendations = []
    for user_idx in range(num_users):
        user_id = user_encoder.inverse_transform([user_idx])[0]
        scores = preds[user_idx]
        top_items = np.argsort(-scores)[:k]
        
        for item_idx in top_items:
            item_id = item_encoder.inverse_transform([item_idx])[0]
            recommendations.append({
                'user_id': user_id,
                'item_id': item_id,
                'score': scores[item_idx]
            })
    
    return pd.DataFrame(recommendations)

# Полный пайплайн GAT
if __name__ == "__main__":
    # Подготовка данных
    train_data, test_data, user_encoder, item_encoder = prepare_data(filtered_transactions)
    
    num_users = len(user_encoder.classes_)
    num_items = len(item_encoder.classes_)
    
    # Обучение GAT
    model = train_gat(train_data, num_users, num_items, epochs=30)
    
    # Генерация рекомендаций
    recommendations = generate_recommendations(model, user_encoder, item_encoder)
    
    # Расчет метрик
    metrics = RecommendationMetricsDf(recommendations, test_data, USER_ID, ITEM_ID)
    print(f"Precision@{N}: {metrics.precision_at_k(k=N):.4f}")
    print(f"Recall@{N}: {metrics.recall_at_k(k=N):.4f}")
    print(f"MAP@{N}: {metrics.map_at_k(k=N):.4f}")
    print(f"NDCG@{N}: {metrics.ndcg_at_k(k=N):.4f}")
    print(f"Coverage: {metrics.coverage(len(filtered_articles)):.2%}")

  3%|▎         | 1/30 [00:00<00:07,  4.10it/s]

Epoch 0: Loss = 2.6904, Test AUC = 0.6695


 20%|██        | 6/30 [00:00<00:02, 11.60it/s]

Epoch 5: Loss = 2.6025, Test AUC = 0.6757


 50%|█████     | 15/30 [00:00<00:00, 18.19it/s]

Epoch 10: Loss = 2.4825, Test AUC = 0.6854


 60%|██████    | 18/30 [00:01<00:00, 15.16it/s]

Epoch 15: Loss = 2.4573, Test AUC = 0.7044


 70%|███████   | 21/30 [00:01<00:00, 13.63it/s]

Epoch 20: Loss = 2.3598, Test AUC = 0.7115


100%|██████████| 30/30 [00:01<00:00, 15.61it/s]

Epoch 25: Loss = 2.3776, Test AUC = 0.7163





Precision@10: 0.0360
Recall@10: 0.3490
MAP@10: 0.2712
NDCG@10: 0.2976
Coverage: 38.89%
