# Локальная валидация метрики

In [104]:
# Пример использования подхода из бейзлайна для тестирования модели и 
# расчета метрики через деление hist_data на трейн и валидацию

import gc
import numpy as np
import pandas as pd
from collections import Counter
import json

# Tabular inputs for local validation: the training part plus the validation
# orders split into a "shown" part and a "hidden" part.
train = pd.read_csv('train.csv')
val_shown = pd.read_csv('val_shown.csv')
val_hidden = pd.read_csv('val_hidden.csv')

# Buyer-level table; it is merged on 'buyer_id' and supplies 'group_id'.
user_class_df_train = pd.read_csv('user_class_df_all_data.csv')

# Per-group recommendation history parsed from JSON. It is looked up as
# group_most_freq_dict[str(group_id)][str(item_id)] by group_rec_by_item.
with open('class_rec_hist.json') as f:
    group_most_freq_dict = json.load(f)


In [105]:
def get_unique_recs(recs: list, basket: list, top_n: int) -> list:
    """Return up to ``top_n`` distinct recommended ids that are not in the basket.

    Args:
        recs: Sequence of ``(item_id, score)`` pairs. It is scanned in the
            given order, so the caller decides the ranking.
        basket: Item ids already in the basket; they are never returned.
        top_n: Maximum number of ids to return. A non-positive value yields
            an empty list.

    Returns:
        Item ids in order of first appearance in ``recs``.
    """
    # The original only stopped when the running count hit top_n exactly, so
    # top_n <= 0 returned either nothing or everything depending on the data.
    if top_n <= 0:
        return []

    excluded = set(basket)  # built once so each membership test is O(1)
    seen = set()
    unique_ids = []
    for item_id, _score in recs:
        if item_id in seen or item_id in excluded:
            continue
        seen.add(item_id)
        unique_ids.append(item_id)
        if len(unique_ids) == top_n:
            break
    return unique_ids

def rec_by_item(item_id: int, most_freq_dict: dict) -> list:
    """Look up the recommendations stored for ``item_id``.

    Returns the value stored under ``item_id`` in ``most_freq_dict``, or
    ``None`` when the key is absent.
    """
    return most_freq_dict.get(item_id)

def group_rec_by_item(item_id: int, group_most_freq_dict: dict, group_id: str) -> list:
    """Look up the group-specific recommendations for ``item_id``.

    Both ``group_id`` and ``item_id`` are converted with ``str()`` before the
    lookup, so the nested dictionary is expected to use string keys.

    Returns:
        The value stored at ``group_most_freq_dict[str(group_id)][str(item_id)]``,
        or ``None`` when either level is missing.
    """
    per_group = group_most_freq_dict.get(str(group_id))
    if per_group is None:
        return None
    return per_group.get(str(item_id))

# For every item in the basket, collect its most frequently co-purchased
# item_ids, sort the pooled candidates by score and keep the unique ones.
def rec_by_basket(basket: list, group_id: str, most_freq_dict: dict,
                  group_most_freq_dict: dict, top_n: int = 20) -> list:
    """Recommend up to ``top_n`` items for a basket.

    Args:
        basket: Item ids currently in the basket.
        group_id: Group key passed on to ``group_rec_by_item``.
        most_freq_dict: Global mapping consulted through ``rec_by_item``.
        group_most_freq_dict: Per-group mapping consulted through
            ``group_rec_by_item``.
        top_n: Maximum number of recommendations to return.

    Returns:
        Item ids chosen by ``get_unique_recs`` from the candidates sorted by
        descending score.
    """
    candidates = []
    for basket_item in basket:
        global_recs = rec_by_item(basket_item, most_freq_dict)
        segment_recs = group_rec_by_item(basket_item, group_most_freq_dict, group_id)
        if segment_recs is not None:
            # Group-specific scores are multiplied by 1.2 before pooling.
            boosted = [(rec[0], float(rec[1]) * 1.2) for rec in segment_recs]
            candidates.extend(boosted)
        if global_recs is not None:
            candidates.extend(global_recs)

    candidates.sort(key=lambda pair: pair[1], reverse=True)
    return get_unique_recs(candidates, basket, top_n)


# Metrics are computed on a relevance vector. Example:
# item_ids the buyer actually purchased: [1, 4, 5, 69]
# item_ids recommended by the algorithm: [4, 6, 7, 8, 1, 2, 67, 90]
# the relevance vector is then: [1, 0, 0, 0, 1, 0, 0, 0]
# and NDCG is computed from that vector.
def dcg(
    y_relevance: np.ndarray
) -> float:
    """Discounted cumulative gain of a relevance vector.

    Each position contributes ``(2**relevance - 1) / log2(rank + 1)``, with
    ranks counted from 1.
    """
    relevance = np.asarray(y_relevance)
    ranks = np.arange(1, len(relevance) + 1)
    gains = 2**relevance - 1
    discounts = np.log2(ranks + 1)
    return np.sum(gains / discounts)

def ndcg(
    y_relevance: np.ndarray,
    k: int
) -> float:
    """Normalised DCG at cutoff ``k`` for one relevance vector.

    The ideal ordering is obtained by sorting ``y_relevance`` itself in
    descending order, so normalisation is relative to the best arrangement of
    the same vector. Returns ``0.0`` when the vector sums to zero.
    """
    if y_relevance.sum() == 0:
        return 0.0
    ideal_order = -np.sort(-y_relevance)  # descending sort via double negation
    actual_gain = dcg(y_relevance[:k])
    ideal_gain = dcg(ideal_order[:k])
    return actual_gain / ideal_gain

def apply_relevance(x):
    """Build the binary relevance vector for one row.

    ``x`` is indexed by ``'preds'`` and ``'hidden_basket'``; the result has
    one entry per predicted item: 1 if it is in the hidden basket, else 0.
    """
    predicted_items = x['preds']
    purchased = x['hidden_basket']
    relevance = []
    for predicted_item in predicted_items:
        relevance.append(1 if predicted_item in purchased else 0)
    return relevance

def create_relevance(pred):
    """Compute a relevance vector for every row of ``pred``.

    Works on a copy of ``pred`` in which ``'hidden_basket'`` is converted to
    sets, then applies ``apply_relevance`` row by row. The input frame is not
    modified.
    """
    hidden_sets = pred['hidden_basket'].apply(set)
    frame = pred.assign(hidden_basket=hidden_sets)
    return frame.apply(apply_relevance, axis=1)

def ndcg_full_dataset(d):
    """Mean NDCG over a collection of relevance vectors.

    The vectors are stacked into a rectangular matrix, shorter ones being
    padded with zeros, and the cutoff ``k`` is the matrix width.
    """
    relevance_matrix = pd.DataFrame(d.to_list()).fillna(0).to_numpy()
    cutoff = relevance_matrix.shape[1]
    per_row_scores = [ndcg(row, cutoff) for row in relevance_matrix]
    return np.mean(per_row_scores)

def compute_ndcg_score(pred):
    """Return the mean NDCG for a prediction frame.

    ``pred`` is passed to ``create_relevance`` and the resulting relevance
    vectors are scored by ``ndcg_full_dataset``.
    """
    return ndcg_full_dataset(create_relevance(pred))

def make_coocurs_dict(train_data):
    """Build a co-occurrence dictionary from order data.

    Args:
        train_data: DataFrame with ``'item_id'`` and ``'pav_order_id'``
            columns, one row per item in an order.

    Returns:
        Dict mapping each item_id to a list of up to 10
        ``(other_item_id, score)`` tuples, most frequent first, where
        ``score = (co_occurrence_count + 0.01) / (10 + rows_with_item)``.
        Items that never share an order with another item are absent.

    NOTE(review): the test-inference code in this file uses ``+ 0.1`` in the
    same formula; confirm which smoothing constant is intended.
    """
    order_items = train_data[['item_id', 'pav_order_id']]
    # Self-join on the order id to enumerate every (item, other item) pair
    # that appears in the same order.
    pairs = (
        order_items
        .sort_values(['item_id', 'pav_order_id'])
        .merge(order_items, how='left', on=['pav_order_id'], suffixes=('', '_left'))
    )
    pairs = pairs[pairs['item_id'] != pairs['item_id_left']].copy()
    top_coocurs = pairs.groupby(['item_id'])['item_id_left'].agg(lambda x: Counter(x).most_common(10))

    # Number of rows per item, used in the score denominator.
    item_counts = train_data.groupby(['item_id'])['pav_order_id'].count().reset_index()
    base = dict(zip(item_counts.item_id, item_counts.pav_order_id))

    # Series.items() replaces Series.iteritems(), which was removed in
    # pandas 2.0 and raises AttributeError there.
    most_freq_dict = {
        k: [(x[0], (x[1] + 0.01) / (10 + base[k])) for x in v]
        for (k, v) in top_coocurs.items()
    }

    # Drop the large intermediates before returning.
    del top_coocurs, pairs
    gc.collect()
    return most_freq_dict

def create_basket(test_data):
    """Collect the items of every order into a list.

    Returns a DataFrame indexed by ``'pav_order_id'`` with a single column
    ``'basket'`` holding the list of ``'item_id'`` values of that order.
    """
    items_by_order = test_data.groupby(['pav_order_id'])['item_id']
    return items_by_order.agg([('basket', list)])

def create_basket_with_hidden(test_data_shown, test_data_hidden, user_class_df_train):
    """Build one row per order with its shown basket, group and hidden basket.

    Args:
        test_data_shown: Rows of the visible part of each order; needs
            ``'buyer_id'``, ``'pav_order_id'`` and ``'item_id'``.
        test_data_hidden: Rows of the held-out part of each order; needs
            ``'pav_order_id'`` and ``'item_id'``.
        user_class_df_train: Buyer table with ``'buyer_id'`` and
            ``'group_id'``.

    Returns:
        DataFrame with columns ``'pav_order_id'``, ``'basket'``,
        ``'group_id'`` (``-1`` for buyers without a group) and
        ``'hidden_basket'`` (empty list for orders without hidden items).

    Fixes relative to the previous version:
      * the ``test_data_shown`` argument is used instead of the global
        ``val_shown``;
      * the order -> buyer mapping is de-duplicated, so an order is no longer
        repeated once per shown item;
      * hidden baskets are attached by ``'pav_order_id'`` rather than by
        positional index;
      * ``fillna`` is applied by assignment instead of a chained in-place
        call on a column selection.
    """
    order_buyers = test_data_shown[['buyer_id', 'pav_order_id']].drop_duplicates()

    basket = (
        test_data_shown.groupby(['pav_order_id'])['item_id']
        .agg([('basket', list)])
        .reset_index()
        .merge(order_buyers, on='pav_order_id', how='left')
        .merge(user_class_df_train, on='buyer_id', how='left')
        [['pav_order_id', 'basket', 'group_id']]
    )
    hidden = (
        test_data_hidden.groupby(['pav_order_id'])['item_id']
        .agg([('hidden_basket', list)])
        .reset_index()
    )

    basket = basket.merge(hidden, on='pav_order_id', how='left')
    basket['group_id'] = basket['group_id'].fillna(-1)
    # Orders with no hidden rows come out of the left merge as NaN; use an
    # empty list so downstream set() conversion works.
    basket['hidden_basket'] = basket['hidden_basket'].apply(
        lambda items: items if isinstance(items, list) else []
    )
    return basket

def make_predictions(test_data_shown, test_data_hidden, most_freq_dict, group_most_freq_dict, user_class_df_train):
    """Produce recommendations for every validation order.

    Builds the per-order frame with ``create_basket_with_hidden`` and adds a
    ``'preds'`` column holding the output of ``rec_by_basket`` for each row.
    ``'group_id'`` is converted to ``int`` before being passed on.
    """
    pred = create_basket_with_hidden(test_data_shown, test_data_hidden, user_class_df_train)
    pred['preds'] = [
        rec_by_basket(
            shown_basket,
            int(group_id),
            most_freq_dict=most_freq_dict,
            group_most_freq_dict=group_most_freq_dict,
        )
        for shown_basket, group_id in zip(pred['basket'], pred['group_id'])
    ]
    return pred

# Расчет для теста

In [106]:
hist_data = pd.read_csv('hist_data.csv')

# Build the co-occurrence dictionary: for every item_id, which item_ids were
# bought most often in the same order.
tmp = (
    hist_data[['item_id', 'pav_order_id']]
    .sort_values(['item_id', 'pav_order_id'])
    .merge(hist_data[['item_id', 'pav_order_id']], how='left', on=['pav_order_id'], suffixes=('', '_left'))
)
tmp = tmp[tmp['item_id'] != tmp['item_id_left']].copy()
tmp1 = tmp.groupby(['item_id'])['item_id_left'].agg(lambda x: Counter(x).most_common(10))
# Number of rows per item, used in the score denominator.
tmp2 = hist_data.groupby(['item_id'])['pav_order_id'].count().reset_index()
base = dict(zip(tmp2.item_id, tmp2.pav_order_id))

# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
most_freq_dict = {k: [(x[0], (x[1]+0.1)/(10+base[k])) for x in v] for (k, v) in tmp1.items()}

# Free the large intermediates before loading the test set.
del tmp1, tmp
gc.collect()

test = pd.read_csv('test.csv')
# One (buyer_id, pav_order_id) row per order, so the merge below does not
# duplicate orders.
test1 = test[['buyer_id','pav_order_id']]
test1 = test1.drop_duplicates()
pred = test.groupby(['pav_order_id'])['item_id'].agg([('basket', list)])\
                .merge(test1, on='pav_order_id', how='left').merge(user_class_df_train, on='buyer_id', how='left')\
                 [['pav_order_id', 'basket', 'group_id']]


# Buyers without a group get -1. Assign the result instead of calling
# fillna(inplace=True) on a column selection, which may act on a temporary
# copy and leave the frame unchanged.
pred['group_id'] = pred['group_id'].fillna(-1)

preds_arr = []
for index, row in pred.iterrows():
    preds_arr.append(rec_by_basket(row['basket'], int(row['group_id']), most_freq_dict=most_freq_dict, group_most_freq_dict=group_most_freq_dict))

pred['preds'] = preds_arr

                

# pred['preds'].to_csv('pred.csv')

In [107]:
# Index the predictions by order id.
pred = pred.set_index('pav_order_id')

In [108]:
# Write the 'preds' column (with its index) to CSV.
pred['preds'].to_csv('groups_testv-0-cats.csv')

In [109]:
# Bare expression: shows the prediction frame as the notebook cell output.
pred

Unnamed: 0_level_0,basket,group_id,preds
pav_order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4620121489,"[203164283, 204043498, 204146308, 204119602, 2...",-1.0,"[203566491, 202820148, 202872237, 203566490, 2..."
4620121505,"[202819114, 204074914, 202822471, 202880254, 2...",-1.0,"[202820148, 202872237, 203068900, 202908280, 2..."
4620121594,"[202818687, 203430473, 204016498, 203017711, 2...",-1.0,"[202820148, 202872237, 203059303, 203430569, 2..."
4620121684,"[203338264, 203436378, 203433668, 202812161, 2...",-1.0,"[202812162, 202820148, 202872237, 203041368, 2..."
4620121902,"[205768202, 202811971, 203429467, 204393593, 2...",-1.0,"[202820148, 203422957, 202872237, 203431923, 2..."
...,...,...,...
98521278256,"[203428006, 202856200, 203406593, 203390294, 2...",6.0,"[202820148, 202807418, 203422957, 203566125, 2..."
98521278378,"[202966002, 204090577, 203403030, 204113387, 2...",3.0,"[203108045, 203429666, 203446975, 204118167, 2..."
98521278699,"[202808263, 203473696, 203476300, 203497376, 2...",2.0,"[202820148, 202809628, 203529288, 202807060, 2..."
98521278890,"[203499550, 202820398, 202991954, 203499548, 2...",6.0,"[204002808, 202991950, 202991955, 202872237, 2..."
