## Бейслайны для сравнения

In [1]:
import pandas as pd
import numpy as np
import implicit
import torch
from tqdm import tqdm
from scipy import sparse
from sklearn.metrics import ndcg_score
from torchmetrics import RetrievalHitRate, RetrievalNormalizedDCG, RetrievalMAP, RetrievalMRR

In [2]:
df = pd.read_csv('data/filtered_data.csv', index_col='Unnamed: 0')
df.drop('count_', axis=1, inplace=True)

  mask |= (ar1 == a)


In [3]:
df = df.sort_values(['user_id', 'order_ts'])

valid_users = df['user_id'].value_counts().loc[lambda x: x >= 3].index

test_df = df[df['user_id'].isin(valid_users)].groupby('user_id').tail(1)

train_df = df[~df.index.isin(test_df.index)]

print('Train set size:', len(train_df))
print('Test set size:', len(test_df))

Train set size: 17908704
Test set size: 853125


## BERT

In [4]:
data = {'method': ['BERT'],
    'hr@10': [0.309],
    'ndcg@10': [0.186],
    'mrr@10': [0.169]}

results = pd.DataFrame(data)

## POP

In [5]:
def pop_baseline(df: pd.DataFrame, k=10):
    return df.groupby('item_id').size().sort_values(ascending=False).head(k).index.to_list()

In [6]:
recommended_items = pop_baseline(train_df)
num_users = 100000
# num_users= len(test_df['user_id'].unique())
hr_sum = 0
ndcg_sum = 0
mrr_sum = 0

pbar = tqdm(test_df['user_id'].unique()[:num_users])
for user_id in pbar:
    last_bought_item = test_df.loc[test_df['user_id'] == user_id, 'item_id'].values[0]

    if last_bought_item in recommended_items:
        hr_sum += 1

    gain = [1 if item == last_bought_item else 0 for item in recommended_items]
    ndcg = ndcg_score([gain], [gain], k=10)
    ndcg_sum += ndcg

    rank = np.where(np.array(recommended_items) == last_bought_item)[0]
    if len(rank) > 0:
        mrr_sum += 1 / (rank[0] + 1)

    pbar.set_description(f"HR: {hr_sum / (user_id + 1):.4f} | NDCG: {ndcg_sum / (user_id + 1):.4f} | MRR: {mrr_sum / (user_id + 1):.4f}")

hr_at_10 = hr_sum / num_users
ndcg_at_10 = ndcg_sum / num_users
mrr_at_10 = mrr_sum / num_users

HR: 0.0988 | NDCG: 0.0988 | MRR: 0.0350: 100%|██████████| 100000/100000 [02:19<00:00, 717.43it/s]


In [7]:
results = results.append({
    'method': 'POP',
    'hr@10': hr_at_10,
    'ndcg@10': ndcg_at_10,
    'mrr@10': mrr_at_10
}, ignore_index=True)

## IALS

In [8]:
train_matrix = sparse.coo_matrix((train_df['item_id'].astype(float), (train_df['user_id'], train_df['item_id'])))
test_matrix = sparse.coo_matrix((test_df['item_id'].astype(float), (test_df['user_id'], test_df['item_id'])))

In [9]:
num_factors = 250
regularization = 0.01
iterations = 20

model = implicit.als.AlternatingLeastSquares(factors=num_factors)

model.fit(train_matrix)



  0%|          | 0/15 [00:00<?, ?it/s]

In [10]:
num_users = 100000
hit_count = 0
ndcg = 0.0
mrr = 0.0

unique_user_ids = np.unique(test_matrix.row)[:num_users]

pbar = tqdm(unique_user_ids[:num_users], desc="ALS метрики")
for user_id in pbar:
    ground_truth_items = test_matrix.col[test_matrix.row == user_id]
    user_recommendations, _ = model.recommend(user_id, train_matrix, N=10, 
                                              filter_already_liked_items=False)
    
    # HR
    if len(set(user_recommendations) & set(ground_truth_items)) > 0:
        hit_count += 1
    
    # NDCG
    dcg = 0.0
    idcg = np.sum(1 / np.log2(np.arange(2, len(ground_truth_items) + 2)))
    
    for rank, item in enumerate(user_recommendations):
        if item in ground_truth_items:
            relevance = 1
            dcg += 1 / np.log2(rank + 2)
    if idcg > 0.0:
        ndcg += dcg / idcg
    
    # MRR
    for rank, item in enumerate(user_recommendations):
        if item in ground_truth_items:
            mrr += 1.0 / (rank + 1)
            break
    pbar.set_description(f"HR: {hit_count / (user_id + 1):.4f} | NDCG: {ndcg / (user_id + 1):.4f} | MRR: {mrr / (user_id + 1):.4f}")

num_users = len(unique_user_ids)
hit_rate = hit_count / float(num_users)
ndcg /= float(num_users)
mrr /= float(num_users)

# Print the results
print(f"HR@10: {hit_rate}")
print(f"NDCG@10: {ndcg}")
print(f"MRR@10: {mrr}")


HR: 0.1560 | NDCG: 0.0871 | MRR: 0.0659: 100%|██████████| 100000/100000 [02:18<00:00, 721.00it/s]

HR@10: 0.19364
NDCG@10: 0.108086046366437
MRR@10: 0.08186239682539552





In [11]:
results = results.append({
    'method': 'ALS',
    'hr@10': hit_rate,
    'ndcg@10': ndcg,
    'mrr@10': mrr
}, ignore_index=True)

In [12]:
results

Unnamed: 0,method,hr@10,ndcg@10,mrr@10
0,BERT,0.309,0.186,0.169
1,POP,0.12271,0.12271,0.043414
2,ALS,0.19364,0.108086,0.081862
