In [2]:
from collections import defaultdict
from typing import List, Dict, Tuple

def dcg_at_k(r, k, method=0):
    """Score is discounted cumulative gain (dcg)
    Relevance is positive real values.  Can use binary
    as the previous methods.
    There is a typographical error on the formula referenced in the original definition of this function:
    http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
    log2(i) should be log2(i+1)

    The formulas here are derived from
    https://en.wikipedia.org/wiki/Discounted_cumulative_gain#Discounted_Cumulative_Gain

    The formulas return the same results when r contains only binary values

    >>> r = [3, 2, 3, 0, 1, 2]
    >>> dcg_at_k(r, 1)
    3.0
    >>> dcg_at_k(r, 1, method=1)
    7.0
    >>> print(round(dcg_at_k(r, 2), 16))
    4.2618595071429155
    >>> print(round(dcg_at_k(r, 2, method=1), 16))
    8.892789260714373
    >>> print(round(dcg_at_k(r, 6), 16))
    6.861126688593502
    >>> print(round(dcg_at_k(r, 6, method=1), 16))
    13.848263629272981

    Args:
        r: Relevance scores (list or numpy array) in rank order
            (first element is the most relevant item)
        k: Number of results to consider
        method: If 0 then sum rel_i / log2(i + 1) [not log2(i)]
                If 1 then sum (2^rel_i - 1) / log2(i + 1)

    Returns:
        Discounted cumulative gain
    """
    r = np.asfarray(r)[:k]
    if r.size:
        if method == 0:
            return np.sum(r / np.log2(np.arange(2, r.size + 2)))
        elif method == 1:
            return np.sum(np.subtract(np.power(2, r), 1) / np.log2(np.arange(2, r.size + 2)))
        else:
            raise ValueError('method must be 0 or 1.')
    return 0.


def ndcg_at_k(r, k, method=0):
    """Score is normalized discounted cumulative gain (ndcg)
    Relevance is positive real values.  Can use binary
    as the previous methods.
    >>> r = [3, 2, 3, 0, 1, 2]
    >>> ndcg_at_k(r, 1)
    1.0
    >>> ndcg_at_k(r, 1, method=1)
    1.0
    >>> print(round(ndcg_at_k(r, 2), 16))
    0.8710490642551529
    >>> print(round(ndcg_at_k(r, 2, method=1), 16))
    0.7789412530088334
    >>> print(round(ndcg_at_k(r, 6), 16))
    0.9608081943360616
    >>> print(round(ndcg_at_k(r, 6, method=1), 16))
    0.9488107485678984

    Args:
        r: Relevance scores (list or numpy array) in rank order
            (first element is the most relevant item)
        k: Number of results to consider
        method: If 0 then sum rel_i / log2(i + 1) [not log2(i)]
                If 1 then sum (2^rel_i - 1) / log2(i + 1)

    Returns:
        Normalized discounted cumulative gain
    """
    dcg_max = dcg_at_k(sorted(r, reverse=True), k, method)
    if not dcg_max:
        return 0.
    return dcg_at_k(r, k, method) / dcg_max

def precision_at_k(r, k):
    """Score is precision @ k
    Relevance is binary (nonzero is relevant).
    >>> r = [0, 0, 1]
    >>> precision_at_k(r, 1)
    0.0
    >>> precision_at_k(r, 2)
    0.0
    >>> print(round(precision_at_k(r, 3), 16))
    0.3333333333333333
    >>> precision_at_k(r, 4)
    Traceback (most recent call last):
        File "<stdin>", line 1, in ?
    ValueError: Relevance score length < k

    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
    Returns:
        Precision @ k
    Raises:
        ValueError: len(r) must be >= k
    """
    assert k >= 1
    r = np.asarray(r)[:k] != 0
    if r.size != k:
        raise ValueError('Relevance score length < k')
    return np.mean(r)

def average_precision(r):
    """Score is average precision (area under PR curve)
    Relevance is binary (nonzero is relevant).
    >>> r = [1, 1, 0, 1, 0, 1, 0, 0, 0, 1]
    >>> delta_r = 1. / len(r)
    >>> print(round(sum([sum(r[:x + 1]) / (x + 1.) * delta_r for x, y in enumerate(r) if y]), 16))
    0.3916666666666667
    >>> print(round(average_precision(r), 16))
    0.3916666666666666

    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
    Returns:
        Average precision
    """
    r = np.asarray(r) != 0
    out = [precision_at_k(r, k + 1) for k in range(r.size) if r[k]]
    if not out:
        return 0.
    return np.sum(out) / len(r)


def mean_average_precision(rs):
    """Score is mean average precision
    Relevance is binary (nonzero is relevant).
    >>> rs = [[1, 1, 0, 1, 0, 1, 0, 0, 0, 1]]
    >>> print(round(mean_average_precision(rs), 16))
    0.3916666666666666
    >>> rs = [[1, 1, 0, 1, 0, 1, 0, 0, 0, 1], [0]]
    >>> print(round(mean_average_precision(rs), 16))
    0.1958333333333333

    Args:
        rs: Iterator of relevance scores (list or numpy) in rank order
            (first element is the first item)
    Returns:
        Mean average precision
    """
    return np.mean([average_precision(r) for r in rs])

def mean_recall(rs):
    return np.mean([(np.array(r) == 1).mean() for r in rs])


def compute_relevance_vector(actual: List[float], predicted: List[float], k: int):
    """Высчитывает вектор релевантностей длины K
    """
    relevance = []

    if (len(actual) >= len(predicted) >= k) or (
            (len(actual) >= k and len(predicted) >= k)):
        # Нормальный случай, когда K <= actual <= predicted
        # Нормальный случай, когда K <= actual and K <= predicted
        end_metric_range = k
    elif (len(actual) < len(predicted) and len(actual) < k) and len(predicted) >= k:
        # Когда просмотренный вектор меньше, чем предсказанный и @K
        end_metric_range = len(actual)
    else:
        raise ValueError('Непредвиденный случай комбинации входных данных:\n actual:{} , predicted:{} '.format(
            len(actual),
            len(predicted)
        ))

    start_metric_range = 0
    for i in range(start_metric_range, end_metric_range):
        r = 0

        p = predicted[i]

        if p in actual:
            r = 1
        relevance.append(r)

    return relevance


class Evaluator:
    def __init__(self, predict_dict: dict, actual_dict: dict):
        self.predict_dict = predict_dict
        self.actual_dict = actual_dict

    def full_relevance_dict(self, val_user_uids: list, k: int, nonpredicted: bool) -> Dict[int, List[int]]:
        relevance_dict = {}

        a_a = 0
        p_a = 0
        p_p = 0

        for user_uid in tqdm(list(val_user_uids)):

            predicted = self.predict_dict.get(user_uid)
            actual = self.actual_dict.get(user_uid)
            # print(predicted, actual)
            # print()

            # relevance = compute_relevance_vector(actual, predicted, k)
            # print(user_uid)
            # print(len(predicted), len(actual))
            # print()
            if actual is None or len(actual) == 0:
                # print('Просмотры для user_uid={} отсутствует'.format(user_uid))
                a_a += 1
                continue
            elif predicted is None:
                p_a += 1

                if nonpredicted:
                    relevance = [0] * len(actual)
                else:
                    continue
                # raise ValueError('Предсказание для user_uid={} отсутствует'.format(user_uid))
                # continue
            else:
                try:
                    relevance = compute_relevance_vector(actual, predicted, k)
                    p_p += 1
                except Exception as e:
                    print(e)
                    continue
            relevance_dict[user_uid] = relevance

        print('Просмотры отсутствуют: {}, '
              'Предсказание отсутствует: {}, '
              'Предсказано: {}'
              ''.format(a_a, p_a, p_p))

        return relevance_dict

    def relevance_list(self, val_user_uids: list, k: int, nonpredicted: bool) -> List[List[int]]:
        relevance_list = []
        relevance_dict = self.full_relevance_dict(val_user_uids, k, nonpredicted)

        for user_uid, relevance in relevance_dict.items():
            relevance_list.append(relevance)

        return relevance_list


def get_metrics_dict(
        predict_dict: Dict,
        actual_dict: Dict,
        current_predict_user_uids: List[int],
        k: int,
        nonpredicted: bool
) -> Tuple[Dict, Dict]:
    metrics_dict = defaultdict(lambda: defaultdict(list))

    e = Evaluator(predict_dict, actual_dict)
    relevance_list = e.relevance_list(current_predict_user_uids, k=k, nonpredicted=nonpredicted)

    metrics_dict['mean_recall'] = mean_recall(relevance_list)
    metrics_dict['map'] = mean_average_precision(relevance_list)
    metrics_dict['mean_ndcg'] = (
        np.mean([ndcg_at_k(r, min(len(r), k)) for r in relevance_list])
    )
    metrics_dict = dict(metrics_dict)

    ideal_metrics_dict = defaultdict(lambda: defaultdict(list))
    # Идеальное ранжирование
    new_relevance_list = []
    for rel in relevance_list:
        try:
            if sum(rel) > 0:
                len_rel = len(rel)
                sum_rel = sum(rel)
                rel = [1] * sum_rel
                rel.extend([0] * (len_rel - sum_rel))
            new_relevance_list.append(rel)
        except Exception:
            pass
        new_relevance_list.append(rel)
    relevance_list = new_relevance_list

    ideal_metrics_dict['mean_recall'] = mean_recall(relevance_list)
    ideal_metrics_dict['map'] = mean_average_precision(relevance_list)
    ideal_metrics_dict['mean_ndcg'] = (
        np.mean([ndcg_at_k(r, min(len(r), k)) for r in relevance_list])
    )
    ideal_metrics_dict = dict(ideal_metrics_dict)

    return metrics_dict, ideal_metrics_dict

In [3]:
import json
import os
os.environ['OPENBLAS_NUM_THREADS'] = '1'

from implicit.nearest_neighbours import ItemItemRecommender, BM25Recommender, TFIDFRecommender, bm25_weight
import numpy as np
import pandas as pd
import scipy.sparse as sp
from tqdm.auto import tqdm

RAW_DATA_PATH = './raw_data/'
DATA_PATH = './data'
RANDOM_STATE = 42
TOP_N = 100

# Данные

In [4]:
train_clicks = pd.read_hdf(os.path.join(DATA_PATH, 'step1_enriched_train_clicks.h5'), index=None, key='step1')
train_likes = pd.read_hdf(os.path.join(DATA_PATH, 'step1_enriched_train_likes.h5'), index=None, key='step1')
train_shares = pd.read_hdf(os.path.join(DATA_PATH, 'step1_enriched_train_shares.h5'), index=None, key='step1')

In [5]:
val_clicks = pd.read_hdf(os.path.join(DATA_PATH, 'step1_val_clicks.h5'), index=None, key='step1')

In [6]:
with open(os.path.join(DATA_PATH, 'val1_users.json'), 'r') as f:
    val1_users = set(json.load(f)['users'])
    
with open(os.path.join(DATA_PATH, 'val2_users.json'), 'r') as f:
    val2_users = set(json.load(f)['users'])
    
val_users = val1_users | val2_users

In [7]:
user_mapping = pd.read_csv(os.path.join(DATA_PATH, 'step0_user_mapping.csv'))
picture_mapping = pd.read_csv(os.path.join(DATA_PATH, 'step0_picture_mapping.csv'))

user_mapping_dict = user_mapping.groupby('old')['new'].first().to_dict()
picture_mapping_dict = picture_mapping.groupby('old')['new'].first().to_dict()
inv_picture_mapping_dict = dict([(v, k)for k, v in picture_mapping_dict.items()])

In [8]:
def filter_users_pictures(_df):
    df = _df.copy()
    df_user_actions_count = df.groupby('user_id')['picture_id'].count()
    selected_users = df_user_actions_count[df_user_actions_count > 1].index.tolist()
    df = df[df['user_id'].isin(selected_users)]

    df_picture_actions_count = df.groupby('picture_id')['user_id'].count()
    selected_pictures = df_picture_actions_count[df_picture_actions_count > 1].index.tolist()
    df = df[df['picture_id'].isin(selected_pictures)]
    
    return df

In [9]:
# cutted_train_clicks = train_clicks.copy()
# cutted_train_clicks = filter_users_pictures(cutted_train_clicks)
# cutted_train_clicks.shape

selected_train_clics = train_clicks

user_ids = selected_train_clics['user_id'].map(user_mapping.set_index('old').new)
picture_ids = selected_train_clics['picture_id'].map(picture_mapping.set_index('old').new)

train_picture_user_clicks_matrix = sp.csr_matrix(
    (np.tile(1, selected_train_clics.shape[0]),
        (
            picture_ids,
            user_ids
        )
    ),
#     shape=(picture_ids.max() + 1, user_ids.max() + 1),
    shape=(len(picture_mapping_dict) + 1, len(user_mapping_dict) + 1),
    dtype=np.float
)
train_picture_user_clicks_matrix.shape

(1497835, 446189)

# Evaluate

In [10]:
val_one_true = (
    val_clicks[val_clicks['user_id'].isin(val1_users)].groupby('user_id')['picture_id'].apply(list).to_dict()
)

### Popular

In [11]:
val_one_popular = (
    val_clicks[val_clicks['user_id'].isin(val1_users)]
    .groupby('picture_id')['user_id']
    .count()
    .sort_values(ascending=False)[:TOP_N]
).index.to_list()

In [12]:
val_one_pred_popular = dict()
for user_id in list(val1_users):
    _pred = val_one_popular.copy()
    val_one_pred_popular[user_id] = _pred

In [13]:
get_metrics_dict(
    predict_dict=val_one_pred_popular, 
    actual_dict=val_one_true, 
    current_predict_user_uids=val1_users,
    k=TOP_N,
    nonpredicted=False
)

HBox(children=(IntProgress(value=0, max=8275), HTML(value='')))


Просмотры отсутствуют: 0, Предсказание отсутствует: 0, Предсказано: 8275


({'map': 2.0391888025618255e-05,
  'mean_ndcg': 0.0005757853858061449,
  'mean_recall': 9.07763178511357e-05},
 {'map': 9.07763178511357e-05,
  'mean_ndcg': 0.0013293051359516616,
  'mean_recall': 9.07763178511357e-05})

### Item2Item

In [14]:
model = ItemItemRecommender(K=200)
model.fit(train_picture_user_clicks_matrix)

100%|██████████| 1497835/1497835 [00:09<00:00, 155892.11it/s]


In [15]:
val_one_pred_item2item = dict()
for user_id in list(val1_users):
    user_index = user_mapping_dict[user_id]
    _pred = model.recommend(user_index, train_picture_user_clicks_matrix.T)
    _pred = [inv_picture_mapping_dict[x[0]] for x in _pred]
    _pred.extend(val_one_popular[:TOP_N - len(_pred)])
    
    val_one_pred_item2item[user_id] = _pred

In [16]:
get_metrics_dict(
    predict_dict=val_one_pred_item2item, 
    actual_dict=val_one_true, 
    current_predict_user_uids=val1_users,
    k=TOP_N,
    nonpredicted=False
)

HBox(children=(IntProgress(value=0, max=8275), HTML(value='')))


Просмотры отсутствуют: 0, Предсказание отсутствует: 0, Предсказано: 8275


({'map': 6.347410050260148e-06,
  'mean_ndcg': 0.000265134597088794,
  'mean_recall': 4.487928101754297e-05},
 {'map': 4.487928101754297e-05,
  'mean_ndcg': 0.0008459214501510574,
  'mean_recall': 4.487928101754297e-05})

### ALS

In [27]:
from implicit.als import AlternatingLeastSquares

np.random.seed(RANDOM_STATE)
# als_model = AlternatingLeastSquares(factors=64, iterations=100, use_gpu=False, regularization=0.01)
als_model = AlternatingLeastSquares(factors=192, iterations=100, use_gpu=True, regularization=0.01)
als_model.fit(train_picture_user_clicks_matrix)

100%|██████████| 100.0/100 [01:32<00:00,  1.16it/s]


In [28]:
print(
    len(val1_users & set(train_clicks['user_id'].unique().tolist())) / len(val1_users),
#     len(val1_users & set(cutted_train_clicks['user_id'].unique().tolist())) / len(val1_users),
    len(val1_users & set(val_one_true.keys())) / len(val1_users),
)

0.9680966767371602 1.0


In [29]:
val_one_pred_als = dict()

for user_id in tqdm(list(val1_users)):
    user_index = user_mapping_dict[user_id]
    
    if als_model.user_factors[user_index][0] == 0:
        continue
    
    _pred = als_model.recommend(user_index, train_picture_user_clicks_matrix.T, TOP_N)
    
    _pred = [inv_picture_mapping_dict[x[0]] for x in _pred]
    
    val_one_pred_als[user_id] = _pred

HBox(children=(IntProgress(value=0, max=8275), HTML(value='')))




In [31]:
get_metrics_dict(
    predict_dict=val_one_pred_als, 
    actual_dict=val_one_true, 
    current_predict_user_uids=val1_users,
    k=TOP_N,
    nonpredicted=False
)

HBox(children=(IntProgress(value=0, max=8275), HTML(value='')))


Просмотры отсутствуют: 0, Предсказание отсутствует: 264, Предсказано: 8011


({'map': 0.000974841883227078,
  'mean_ndcg': 0.0035204697905248192,
  'mean_recall': 0.0015133894462884255},
 {'map': 0.0015133894462884255,
  'mean_ndcg': 0.005742104606166521,
  'mean_recall': 0.0015133894462884255})

In [None]:
# ({'map': 0.0006754847755774471,
#   'mean_ndcg': 0.002348070146136011,
#   'mean_recall': 0.0008974340168676217},
#  {'map': 0.0008974340168676216,
#   'mean_ndcg': 0.0036200224691049806,
#   'mean_recall': 0.0008974340168676216})

In [None]:
# ({'map': 0.000978976469498359,
#   'mean_ndcg': 0.00336726540704797,
#   'mean_recall': 0.001473669945827444},
#  {'map': 0.0014736699458274443,
#   'mean_ndcg': 0.005367619523155661,
#   'mean_recall': 0.0014736699458274443})

In [70]:
# import pickle

# with open(os.path.join(DATA_PATH, 'step3_clicks_als.pkl'), 'wb') as f:
#     pickle.dump(als_model, f)

# LightFM

##### Picture features

In [37]:
_dim = train_picture_user_clicks_matrix.shape[0]
# _dim = len(picture_mapping_dict)
clicks_picture_eye_features = sp.csr_matrix(
    (
            np.tile(1, _dim), 
            (
                np.arange(_dim), 
                np.arange(_dim)
            )
    ),
    shape=(_dim, _dim),
    dtype=np.float
)
clicks_picture_eye_features

<1497835x1497835 sparse matrix of type '<class 'numpy.float64'>'
	with 1497835 stored elements in Compressed Sparse Row format>

In [33]:
raw_picture_descriptions = pd.read_csv(os.path.join(RAW_DATA_PATH, 'descriptions.csv'))
raw_picture_descriptions.head()

from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(
    input='content', 
    encoding='utf-8',
    decode_error='strict', 
    strip_accents=None, 
    lowercase=True,
    preprocessor=None, 
    tokenizer=None, 
    analyzer='word',
    stop_words=None, 
    token_pattern=r"(?u)\b\w\w+\b",
    ngram_range=(1, 3), 
    max_df=1.0, 
    min_df=2,
    max_features=1000, 
    vocabulary=None, 
    binary=False,
    dtype=np.float32, 
    norm='l2', 
    use_idf=True, 
    smooth_idf=True,
    sublinear_tf=False
)
# picture_descriptions_embeds = tf.fit_transform(raw_picture_descriptions['description'])

In [34]:
_inv_picture_desc_dict = raw_picture_descriptions[['picture_id']].to_dict()['picture_id']
picture_desc_dict = dict([(v, k) for k,v in _inv_picture_desc_dict.items()])

def generate_picture_descs(pairs):
    global raw_picture_descriptions
    global picture_descriptions_embeds
    global picture_desc_dict
    
    _pairs = pairs.merge(raw_picture_descriptions, on=['picture_id'], how='left').fillna('')
    gb = _pairs.groupby('description')['description'].first()
    p_dict = dict(zip(gb.values, range(len(gb))))
    
    selected_picture_descriptions_embeds_pairs = (
        picture_descriptions_embeds[_pairs['description'].apply(lambda x: p_dict[x]).values, :]
    )
    
    return selected_picture_descriptions_embeds_pairs

In [35]:
clicks_picture_desc_features = generate_picture_descs(train_clicks)

In [39]:
clicks_picture_desc_features

<2314748x1000 sparse matrix of type '<class 'numpy.float32'>'
	with 9697719 stored elements in Compressed Sparse Row format>

In [38]:
clicks_picture_full_features = sp.hstack(
    [
        clicks_picture_eye_features,
        clicks_picture_desc_features
    ]
).tocsr()
clicks_picture_full_features

ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,1].shape[0] == 2314748, expected 1497835.

##### User features

In [438]:
_dim = train_picture_user_clicks_matrix.shape[1]
# _dim = len(user_mapping_dict)

clicks_user_eye_features = sp.csr_matrix(
    (
            np.tile(1, _dim), 
            (
                np.arange(_dim), 
                np.arange(_dim)
            )
    ),
    shape=(_dim ,_dim ),
    dtype=np.float
)
clicks_user_eye_features

<446189x446189 sparse matrix of type '<class 'numpy.float64'>'
	with 446189 stored elements in Compressed Sparse Row format>

In [439]:
clicks_user_full_features = sp.hstack(
    [
        clicks_user_eye_features
    ]
).tocsr()
clicks_user_full_features

<446189x446189 sparse matrix of type '<class 'numpy.float64'>'
	with 446189 stored elements in Compressed Sparse Row format>

In [457]:
from lightfm import LightFM

lightfm_model = LightFM(
    no_components=200, # 10
#     k=5,
#     n=100, # 10
    learning_schedule="adadelta",
    loss="warp",
    learning_rate=0.07,
    rho=0.95,
    epsilon=1e-6,
    item_alpha=0, # 0.0
    user_alpha=0,
    max_sampled=100, # 10
    random_state=RANDOM_STATE #
)

lightfm_model.fit(
    train_picture_user_clicks_matrix.T, 
    epochs=20, 
    num_threads=8,
    verbose=True,
    item_features=clicks_picture_full_features,
    user_features=clicks_user_full_features,
#     sample_weight=_train_sample_weight_matrix.T.tocoo()
)

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19


<lightfm.lightfm.LightFM at 0x7f14e2d8ba90>

In [443]:
from typing import Set

def get_lightfm_predict_dict(
        predict_user_uids: List[int],
        user_uid_to_cat: Dict[int, int],
        element_cat_to_uid: Dict[int, int],
        model: LightFM,
        item_features: sp.csr_matrix,
        user_features: sp.csr_matrix,
        filter_dict: Dict[int, Set[int]],
        top_n: int
) -> Dict[int, Tuple[List[int], List[np.float32]]]:
    # Предсказания в виде плотной матрицы
    new_predict_user_cats = []
    # Список uids, которые у нас есть в индексе ПОЛНЫХ данных обучения
    new_predict_user_uids = []
    for user_uid in predict_user_uids:
        user_cat = user_uid_to_cat.get(user_uid)
        if user_cat is not None:
            new_predict_user_uids.append(user_uid)
            new_predict_user_cats.append(user_cat)

    # Получаем вектора предсказаний для пользователя
    predict_dict = {}

#     _element_cats = np.array(range(0, len(element_cat_to_uid)))
    _element_cats = np.array(range(0, len(element_cat_to_uid) + 1))

    for user_cat, user_uid in tqdm(zip(new_predict_user_cats, new_predict_user_uids), total=len(new_predict_user_uids)):
        filter_elements = filter_dict[user_uid]
        max_needed_item_count = len(filter_elements) + top_n

        _user_cats = np.repeat(user_cat, len(_element_cats))
        _user_cats = [1]
#         _element_cats = [1, 2]
        user_vec = model.predict(
            user_ids=_user_cats,
            item_ids=_element_cats,
            item_features=item_features,
            user_features=user_features,
            num_threads=0
        )

        # Ищем наибольшие индексы с помощью argpartition
        user_unsorted_pred = np.argpartition(
            -user_vec,
            max_needed_item_count
        )[: max_needed_item_count + 1]

        # Сортируем только наибольшие индексы
        user_sorted_pred = [y[0] for y in sorted(
            list(
                zip(
                    user_unsorted_pred,
                    user_vec[user_unsorted_pred]
                )), key=lambda x: -x[1]
        )]

        user_filtered_final_pred = []
        user_filtered_final_pred_weights = []

        for element_cat in user_sorted_pred:
            if element_cat not in filter_elements:
                user_filtered_final_pred_weights.append(user_vec[element_cat])
                user_filtered_final_pred.append(element_cat_to_uid.get(element_cat))

            if len(user_filtered_final_pred) == top_n:
                break

        predict_dict[user_uid] = (user_filtered_final_pred, user_filtered_final_pred_weights)

    return predict_dict

In [458]:
from collections import defaultdict

current_predict_user_uids = val1_users

full_val_one_pred_lightfm = get_lightfm_predict_dict(
#     list(current_predict_user_uids)[:100],
    current_predict_user_uids,
    user_mapping_dict,
    inv_picture_mapping_dict,
    lightfm_model,
    item_features=clicks_picture_full_features,
    user_features=clicks_user_full_features,
    filter_dict=defaultdict(list),
    top_n=100
)

HBox(children=(IntProgress(value=0, max=8275), HTML(value='')))

In [466]:
# Выкидываем веса для подачи в Evaluator
val_one_pred_lightfm = {}
for k, v in full_val_one_pred_lightfm.items():
    val_one_pred_lightfm[k] = v[0]

In [467]:
get_metrics_dict(
    predict_dict=val_one_pred_lightfm, 
    actual_dict=val_one_true, 
    current_predict_user_uids=val1_users,
    k=TOP_N,
    nonpredicted=False
)

HBox(children=(IntProgress(value=0, max=8275), HTML(value='')))

Просмотры отсутствуют: 0, Предсказание отсутствует: 0, Предсказано: 8275


({'map': 6.240380659604701e-05,
  'mean_ndcg': 0.000430302183004857,
  'mean_recall': 0.00015449815374286673},
 {'map': 0.00015449815374286673,
  'mean_ndcg': 0.0009667673716012085,
  'mean_recall': 0.00015449815374286673})

In [221]:
# Popular
# ({'map': 2.0391888025618255e-05,
#   'mean_ndcg': 0.0005757853858061449,
#   'mean_recall': 9.07763178511357e-05},
#  {'map': 9.07763178511357e-05,
#   'mean_ndcg': 0.0013293051359516616,
#   'mean_recall': 9.07763178511357e-05})

# Item2Item + Popular
# ({'map': 1.9532606006026568e-05,
#   'mean_ndcg': 0.0005380947885125098,
#   'mean_recall': 8.27199230877923e-05},
#  {'map': 8.27199230877923e-05,
#   'mean_ndcg': 0.0012084592145015106,
#   'mean_recall': 8.27199230877923e-05})

# ALS (GPU x32)
# ({'map': 0.00029929270789502937,
#   'mean_ndcg': 0.0013584427545974098,
#   'mean_recall': 0.000578502283034005},
#  {'map': 0.000578502283034005,
#   'mean_ndcg': 0.0027794561933534743,
#   'mean_recall': 0.000578502283034005})

# ALS (СPU x10)
# ({'map': 1.4473761406501112e-05,
#   'mean_ndcg': 0.00024672182460939323,
#   'mean_recall': 4.299331928525759e-05},
#  {'map': 4.299331928525759e-05,
#   'mean_ndcg': 0.0006042296072507553,
#   'mean_recall': 4.299331928525759e-05})

# ALS (GPU x48==x64)
# ({'map': 0.0009478596897275725,
#   'mean_ndcg': 0.0029713081448600725,
#   'mean_recall': 0.0012379757400905436},
#  {'map': 0.0012379757400905436,
#   'mean_ndcg': 0.004350453172205438,
#   'mean_recall': 0.0012379757400905436})

# ALS (GPU x64)!
# ({'map': 0.0009478596897275725,
#   'mean_ndcg': 0.0029713081448600725,
#   'mean_recall': 0.0012379757400905436},
#  {'map': 0.0012379757400905436,
#   'mean_ndcg': 0.004350453172205438,
#   'mean_recall': 0.0012379757400905436})

# ALS (CPU x64), 100 iter
# ({'map': 0.000978976469498359,
#   'mean_ndcg': 0.00336726540704797,
#   'mean_recall': 0.001473669945827444},
#  {'map': 0.0014736699458274443,
#   'mean_ndcg': 0.005367619523155661,
#   'mean_recall': 0.0014736699458274443})

# ALS (CPU x96, 30iter)
# ({'map': 0.0008789522700955309,
#   'mean_ndcg': 0.003111409247274019,
#   'mean_recall': 0.0012420469313687357},
#  {'map': 0.0012420469313687357,
#   'mean_ndcg': 0.004833836858006042,
#   'mean_recall': 0.0012420469313687357})

# ALS (GPU x96, 100iter)
# ({'map': 0.0008687408695060607,
#   'mean_ndcg': 0.003040543044411595,
#   'mean_recall': 0.0012984416947121394},
#  {'map': 0.0012984416947121394,
#   'mean_ndcg': 0.004954682779456193,
#   'mean_recall': 0.0012984416947121394})

# LightFM (Default, no_components=20, epochs=10)
# ({'map': 1.2449460992815006e-05,
#   'mean_ndcg': 0.0002660486737075211,
#   'mean_recall': 5.551605060224218e-05},
#  {'map': 5.551605060224218e-05,
#   'mean_ndcg': 0.0007250755287009063,
#   'mean_recall': 5.551605060224218e-05})

# LightFM (Default, no_components=64, epochs=20)
# ({'map': 6.240380659604701e-05,
#   'mean_ndcg': 0.000430302183004857,
#   'mean_recall': 0.00015449815374286673},
#  {'map': 0.00015449815374286673,
#   'mean_ndcg': 0.0009667673716012085,
#   'mean_recall': 0.00015449815374286673})