# L3

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('data/filtered_data', index_col='Unnamed: 0')
df.drop('count_', axis=1, inplace=True)

df.order_ts = pd.to_datetime(df['order_ts'])
df.sort_values(['user_id', 'order_ts'], inplace=True)

  mask |= (ar1 == a)


In [3]:
def train_val_test_split(df: pd.DataFrame):
    """
    Сплит данных на train/val/test выборки, из юзеров имеющих
    25 и более покупок отбираем 10 последних покупок у каждого для
    тестовой выборки и 10 предпоследних для валидационной

    Args:
        df::pd.DataFrame

    Returns:
        train_df::pd.DataFrame
            Обучающая выборка
        val_df::pd.DataFrame
            Валидационная выборка
        test_df::pd.DataFrame
            Тестовая выборка
    """
    # Отфильтруем юзеров с менее чем 25 покупками
    user_counts = df.groupby('user_id').count()['item_id']
    # Создадим список из их id
    valid_user_ids = user_counts[user_counts >= 25].index.to_list()

    val_rows = []
    test_rows = []
    counter = 0
    
    # Проходим по каждому юзеру из отобранного списка 
    for uid in valid_user_ids:
        user_data = df[df['user_id'] == uid]
        test_rows.append(user_data.tail(10))
        val_rows.append(user_data.iloc[-21:-11])
        counter += 1
        if counter % 10000 == 0:
            print(f'{len(valid_user_ids)-counter} users remaining')

    val_df = pd.concat(val_rows)
    test_df = pd.concat(test_rows)
    
    train_df = df.drop(val_df.index)
    train_df = train_df.drop(test_df.index)

    return train_df, val_df, test_df

In [4]:
try:
    train_df = pd.read_csv('data/train_raw.csv', index_col='Unnamed: 0')
    val_df = pd.read_csv('data/val_raw.csv', index_col='Unnamed: 0')
    test_df = pd.read_csv('data/test_raw.csv', index_col='Unnamed: 0')
except:
    pass

In [7]:
train_df, val_df, test_df = train_val_test_split(df)

val_df.to_csv('data/val_raw.csv')
test_df.to_csv('data/test_raw.csv')
train_df.to_csv('data/train_raw.csv')

243620 users remaining
233620 users remaining
223620 users remaining
213620 users remaining
203620 users remaining
193620 users remaining
183620 users remaining
173620 users remaining
163620 users remaining
153620 users remaining
143620 users remaining
133620 users remaining
123620 users remaining
113620 users remaining
103620 users remaining
93620 users remaining
83620 users remaining
73620 users remaining
63620 users remaining
53620 users remaining
43620 users remaining
33620 users remaining
23620 users remaining
13620 users remaining
3620 users remaining


In [5]:
train_df.shape, val_df.shape, test_df.shape

((14185408, 3), (2536200, 3), (2536200, 3))

In [6]:
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from sklearn.metrics import ndcg_score

In [7]:
def calculate_metrics(predictions: dict, 
                      ground_truth: dict, 
                      k=10):
    """
    Считаем метрики
    
    Args:
        predicitions::dict
            Предсказанное ранжирование для каждого юзера
        predicitions::dict
            Действительное ранжирование взятое из истории покупок
        k::int
            Число k для метрик
    Returns:
        metrics::dict
            Словарь со всеми метриками
    """
    precision_at_k = []
    recall_at_k = []
    ndcg_at_k = []
    hit_at_k = []

    for user_id, pred_items in predictions.items():
        true_items = ground_truth.get(user_id, [])
        if len(true_items) < k:
            print(f'Invalid ranking for user {user_id}')
            continue

        # Precision@k 
        precision = precision_score([1 if item in true_items else 0 for item in pred_items[:k]], 
                                    [1] * k,
                                    average='binary')
        precision_at_k.append(precision)

        # Recall@k
        relevant_items = set(true_items)
        retrieved_items = set(pred_items[:k])
        recall = len(relevant_items.intersection(retrieved_items)) / len(relevant_items)
        recall_at_k.append(recall)
        
        # NDCG@k
        ndcg = ndcg_score([true_items], [pred_items[:k]])
        ndcg_at_k.append(ndcg)

        # Hit@k
        hit = 1 if len(set(pred_items[:k]).intersection(set(true_items))) > 0 else 0
        hit_at_k.append(hit)

    metrics = {
        'precision@10': np.mean(precision_at_k),
        'recall@10': np.mean(recall_at_k),
        'ndcg@10': np.mean(ndcg_at_k),
        'hit@10': np.mean(hit_at_k)
    }

    return metrics

In [8]:
# 2 заменены местами, 1 заменян на неверный
gt = {23: [14, 5, 84, 32, 6, 21, 27, 12, 34, 52]}
preds = {23: [14, 5, 32, 84, 6, 21, 27, 12, 34, 100]}

print(calculate_metrics(preds, gt))

# Все мимо
preds = {23: [0, 1, 2, 3, 4, 41, 42, 43, 44, 45]}

print(calculate_metrics(preds, gt))

# Полное совпадение
preds = {23: [14, 5, 84, 32, 6, 21, 27, 12, 34, 52]}

print(calculate_metrics(preds, gt))

# 2 соседние пары id заменеы между собой
preds = {23: [5, 14, 32, 84, 6, 21, 27, 12, 34, 52]}

print(calculate_metrics(preds, gt))

{'precision@10': 0.9, 'recall@10': 0.9, 'ndcg@10': 0.874599232477886, 'hit@10': 1.0}
{'precision@10': 0.0, 'recall@10': 0.0, 'ndcg@10': 0.8128852131649043, 'hit@10': 0.0}
{'precision@10': 1.0, 'recall@10': 1.0, 'ndcg@10': 1.0000000000000002, 'hit@10': 1.0}
{'precision@10': 1.0, 'recall@10': 1.0, 'ndcg@10': 0.8306997855261401, 'hit@10': 1.0}
