In [103]:
!pip install implicit
!pip install metrics-eval

import implicit
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from functools import reduce



Reading data

In [104]:
# Load only the three columns used below from the tab-separated behaviour log.
behaviors = pd.read_csv(
    'behaviors.tsv',
    sep='\t',
    usecols=['user_id', 'history', 'impressions'],
)
behaviors.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73152 entries, 0 to 73151
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_id      73152 non-null  object
 1   history      70938 non-null  object
 2   impressions  73152 non-null  object
dtypes: object(3)
memory usage: 1.7+ MB


In [105]:
# Drop every row that has a missing value in any of the loaded columns. Per the .info() output
# above only `history` contains nulls, so this removes the rows without a click history.
# Note that the name `behaviors` is rebound: the unfiltered frame is no longer reachable.
behaviors = behaviors.dropna()
behaviors.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 70938 entries, 0 to 73151
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_id      70938 non-null  object
 1   history      70938 non-null  object
 2   impressions  70938 non-null  object
dtypes: object(3)
memory usage: 2.2+ MB


In [106]:
def extract_target(news_impression, splitter='-') -> tuple:
    """
    Split one impression token such as ``"N123-1"`` into ``(news, target)``.

    Parameters
    ----------
    news_impression : str
        A single ``<news><splitter><label>`` token.
    splitter : str
        Separator between the news id and its label.

    Returns
    -------
    tuple
        ``(news, target)`` where ``news`` is the id string and ``target`` is the
        label converted to a built-in ``int``.

    Raises
    ------
    ValueError
        If ``splitter`` does not occur in the token or the label is not an integer.
    """
    # Split on the LAST separator only, so an id that itself contains the
    # splitter still yields exactly two parts.
    news, target = news_impression.rsplit(splitter, 1)
    # `np.int` was only an alias of the built-in and has been removed from
    # recent NumPy releases; the built-in `int` gives the same value.
    return news, int(target)

def extract_targets(news_impressions, use_news=False, use_target=False) -> "dict | list":
    """
    Parse a whitespace-separated impression string into its news ids and labels.

    Every token is parsed with ``extract_target``. Tokens are collected into a
    dict, so if a news id occurs more than once only its last label is kept.

    Parameters
    ----------
    news_impressions : str
        Whitespace-separated impression tokens.
    use_news : bool
        If true, return only the list of news ids (takes precedence over
        ``use_target``).
    use_target : bool
        If true (and ``use_news`` is false), return only the list of labels.

    Returns
    -------
    dict or list
        ``{news: target}`` by default, otherwise a list of keys or of values in
        the order the news ids first appeared.
    """
    impression_dict = dict(map(extract_target, news_impressions.split()))
    if use_news:
        return list(impression_dict)
    if use_target:
        return list(impression_dict.values())

    return impression_dict

In [107]:
# Turn the whitespace-separated history string of every row into a list of news ids.
behaviors['history'] = behaviors['history'].apply(str.split).to_list()

# Parse each impression string once into a {news: target} dict ...
behaviors['impression_target'] = behaviors['impressions'].apply(extract_targets)
# ... and reuse that dict for the news list instead of parsing the string a second time.
# Iterating a dict yields its keys in insertion order, which is exactly what
# extract_targets(x, use_news=True) returns.
behaviors['impression_news'] = behaviors['impression_target'].apply(lambda targets: list(targets))

grouped_users = behaviors.groupby('user_id')

In [108]:
# Every distinct news id that occurs in a click history or in an impression list.
tags = {
    tag
    for column in ('history', 'impression_news')
    for user_tags in behaviors[column]
    for tag in user_tags
}

# The iteration order of a set of strings changes between interpreter runs (string hashing
# is randomised per process), so number a sorted copy instead: the integer ids are then
# identical after every kernel restart.
ordered_tags = sorted(tags)
news_to_id = {tag: idx for idx, tag in enumerate(ordered_tags)}
id_to_news = dict(enumerate(ordered_tags))

In [109]:
def merge_dicts(dicts):
    """
    Merge an iterable of dicts into one new dict.

    Keys keep the position of their first occurrence; when a key occurs in
    several dicts the value from the last one wins. The inputs are not modified
    and an empty iterable yields an empty dict.
    """
    merged = {}
    # One in-place update per input dict is linear in the total number of
    # items; rebuilding `{**acc, **d}` at every step recopies the accumulator.
    for current in dicts:
        merged.update(current)
    return merged

def merge_history_tags(tags):
    """
    Build a ``{news: 1}`` dict covering every news id in the given histories.

    ``tags`` is a collection of news-id sequences; each sequence is turned into
    a dict that maps its ids to 1, and those dicts are combined with
    ``merge_dicts``.
    """
    per_history_flags = [dict.fromkeys(history, 1) for history in tags]
    return merge_dicts(per_history_flags)

In [110]:
# One {news: target} dict per user: the impression dicts of all rows of a user are collected
# into a list and combined with merge_dicts. The result is indexed by user_id.
user_targets = grouped_users['impression_target'].apply(list).apply(merge_dicts)

In [111]:
# Number the users 0..n-1 in the order of user_targets' index (one entry per groupby key,
# hence unique), and keep the inverse lookup from integer id back to the user name.
user_to_id = {user: idx for idx, user in enumerate(user_targets.index)}
id_to_user = {idx: user for user, idx in user_to_id.items()}

In [112]:
# One {news: 1} dict per user: the history lists of all rows of a user are collected and
# passed to merge_history_tags, which marks every news id that appears in them with 1.
history_targets = grouped_users['history'].apply(list).apply(merge_history_tags)
history_targets.head()

user_id
U1        {'N23571': 1, 'N58267': 1, 'N25682': 1, 'N1064...
U10       {'N27612': 1, 'N36699': 1, 'N64777': 1, 'N9120...
U10000    {'N10059': 1, 'N46978': 1, 'N53234': 1, 'N3345...
U10002    {'N27448': 1, 'N10059': 1, 'N28467': 1, 'N6477...
U10004    {'N38118': 1, 'N55805': 1, 'N43482': 1, 'N1540...
Name: history, dtype: object

In [113]:
# Join the per-user impression labels with the per-user history flags on the user id.
users = pd.merge(user_targets, history_targets, on='user_id')

# Combined view per user: start from the history flags and let the impression labels override
# them for news ids present in both. Zipping the two columns addresses them by name; the
# previous row-wise apply used x[0] / x[1], i.e. integer positions on a string-labelled row,
# which recent pandas deprecates.
users['target'] = [
    {**history, **impressions}
    for history, impressions in zip(users['history'], users['impression_target'])
]
users = users.reset_index()
users.head()

Unnamed: 0,user_id,impression_target,history,target
0,U1,"{'N14637': 0, 'N20036': 1}","{'N23571': 1, 'N58267': 1, 'N25682': 1, 'N1064...","{'N23571': 1, 'N58267': 1, 'N25682': 1, 'N1064..."
1,U10,"{'N33397': 0, 'N46917': 0, 'N11930': 0, 'N5861...","{'N27612': 1, 'N36699': 1, 'N64777': 1, 'N9120...","{'N27612': 1, 'N36699': 1, 'N64777': 1, 'N9120..."
2,U10000,"{'N50775': 1, 'N31958': 1, 'N53572': 0, 'N5472...","{'N10059': 1, 'N46978': 1, 'N53234': 1, 'N3345...","{'N10059': 1, 'N46978': 1, 'N53234': 1, 'N3345..."
3,U10002,"{'N759': 0, 'N17647': 0, 'N3159': 0, 'N19990':...","{'N27448': 1, 'N10059': 1, 'N28467': 1, 'N6477...","{'N27448': 1, 'N10059': 1, 'N28467': 1, 'N6477..."
4,U10004,"{'N31958': 0, 'N42844': 0, 'N19990': 0, 'N5940...","{'N38118': 1, 'N55805': 1, 'N43482': 1, 'N1540...","{'N38118': 1, 'N55805': 1, 'N43482': 1, 'N1540..."


In [114]:
def get_user_news_sparse_matrix(liked_target=1):
    """
    Build the sparse user x news matrix from the ``history`` column of ``users``.

    For every user, each news id whose history value equals ``liked_target``
    produces one stored entry at (user id, news id) with value ``liked_target``.
    Row and column indices come from the module-level ``user_to_id`` and
    ``news_to_id`` mappings, which also determine the matrix shape.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape ``(len(user_to_id), len(news_to_id))``.
    """
    row_idx, col_idx = [], []
    for user, targets in zip(users['user_id'], users['history']):
        liked = [news_to_id[news] for news, label in targets.items() if label == liked_target]
        row_idx.extend([user_to_id[user]] * len(liked))
        col_idx.extend(liked)

    values = [liked_target] * len(row_idx)
    n_users, n_news = len(user_to_id), len(news_to_id)
    return csr_matrix((values, (row_idx, col_idx)), shape=(n_users, n_news))

# User x news interaction matrix built from the click histories (default liked_target=1).
user_news = get_user_news_sparse_matrix()
# Transposed (news x user) matrix; this is what gets passed to model.fit as `item_users`.
news_user = user_news.T

# Display the matrix summary (shape, dtype, number of stored entries).
user_news

<48593x42396 sparse matrix of type '<class 'numpy.longlong'>'
	with 1259580 stored elements in Compressed Sparse Row format>

In [115]:
# Configure an ALS matrix-factorisation model from the `implicit` package.
# NOTE(review): regularization=0 together with 512 factors leaves the fit unregularised --
# confirm this is intended and not a leftover from experimentation.
# NOTE(review): no seed is passed here; if the library initialises the factors randomly the
# metrics below will differ between runs -- confirm and fix a seed if supported.
model = implicit.als.AlternatingLeastSquares(factors=512, 
                                             regularization=0,
                                             iterations=40,
                                             calculate_training_loss=True)

# Train on the news x user matrix.
# NOTE(review): the `item_users` keyword appears to be specific to older `implicit` releases
# (newer ones reportedly expect `user_items`) -- confirm against the installed version.
model.fit(item_users=news_user)

HBox(children=(FloatProgress(value=0.0, max=40.0), HTML(value='')))




In [125]:
from sklearn.metrics import ndcg_score, roc_auc_score
from metrics_eval import mrr, utils

def _impression_labels_and_scores(user):
    """
    Return ``(y_true, y_score)`` for the impressions of one user.

    ``user`` is the integer user id (a row of ``model.user_factors``). The id is
    mapped back to the user name via ``id_to_user`` to look up that user's
    ``impression_target`` dict in ``users``. ``y_true`` is the list of labels and
    ``y_score`` holds the dot product of the user's factor vector with the
    factor vector of every impression news item, in the same order.
    """
    user_name = id_to_user[user]
    # First (and, with one row per user, only) matching row of the users frame.
    targets = users.loc[users['user_id'] == user_name, 'impression_target'].iloc[0]
    news = [news_to_id[news_name] for news_name in targets]
    y_true = list(targets.values())
    y_score = model.user_factors[user] @ model.item_factors[news].T
    return y_true, y_score

def user_auc(user):
    """ROC AUC of the model scores against the impression labels of one user (integer id)."""
    y_true, y_score = _impression_labels_and_scores(user)
    return roc_auc_score(y_true, y_score)

def user_ndcg(user, k=5):
    """NDCG@k of the model scores against the impression labels of one user (integer id)."""
    y_true, y_score = _impression_labels_and_scores(user)
    # ndcg_score expects 2-D inputs: one row per query, here a single user.
    return ndcg_score([y_true], [y_score], k=k)

def user_mrr(user):
    """Value of ``mrr`` for the model scores and impression labels of one user (integer id)."""
    y_true, y_score = _impression_labels_and_scores(user)
    # `mrr` is called with the labels as a NumPy array.
    return mrr(np.array(y_true), y_score)

In [126]:
from tqdm import trange

def calc_metric(metric, k=None):
    """
    Average a per-user metric over every row position of ``users``.

    ``metric`` is called as ``metric(u)`` when ``k`` is None and as
    ``metric(u, k)`` otherwise, for ``u`` in ``0 .. users.shape[0] - 1`` (with a
    tqdm progress bar). Returns the arithmetic mean of the collected scores.
    """
    extra_args = () if k is None else (k,)
    scores = [metric(user, *extra_args) for user in trange(users.shape[0])]
    return np.mean(scores)

In [127]:
def mrr_score():
    """Return the mean of ``user_mrr`` over all users, as computed by ``calc_metric``."""
    return calc_metric(metric=user_mrr)

def auc():
    """Return the mean of ``user_auc`` over all users, as computed by ``calc_metric``."""
    return calc_metric(metric=user_auc)

def ndcg(k):
    """Return the mean of ``user_ndcg`` at cutoff ``k`` over all users, via ``calc_metric``."""
    return calc_metric(metric=user_ndcg, k=k)

In [119]:
# Mean per-user NDCG@5.
# NOTE(review): this cell's execution count (119) is lower than those of the cells that define
# the metric functions (125-127), so the value shown may stem from earlier definitions --
# re-run the notebook top to bottom to confirm it.
ndcg(k=5)

100%|██████████| 48593/48593 [04:41<00:00, 172.86it/s]


0.18723350823000268

In [120]:
# Mean per-user NDCG@10.
# NOTE(review): executed (count 120) before the metric functions were last defined (125-127);
# re-run in order to confirm the value.
ndcg(k=10)

100%|██████████| 48593/48593 [04:39<00:00, 174.16it/s]


0.24578133234775634

In [121]:
# Mean per-user ROC AUC.
# NOTE(review): executed (count 121) before the metric functions were last defined (125-127);
# re-run in order to confirm the value.
auc()

100%|██████████| 48593/48593 [05:13<00:00, 155.01it/s]


0.549

In [128]:
# Mean per-user MRR.
mrr_score()

100%|██████████| 48593/48593 [04:12<00:00, 192.19it/s]


0.224