In [1]:
import datetime
import os
import pickle
import requests
import urllib
from itertools import groupby
from operator import itemgetter
from typing import Dict, Tuple
from multiprocessing import Pool

import numpy as np
import pandas as pd
import sqlalchemy as sa
from scipy.sparse import save_npz, load_npz, csr_matrix
from tqdm import tqdm

os.environ['MKL_NUM_THREADS'] = '1'
DT = datetime.datetime.now().strftime('%Y-%m-%d')

# Загружаем csv

`user_id` - результат рандома
`item_id` - результат рандома

In [32]:
user_item_views_df = pd.read_csv('data/user_item_views.zip', compression='zip')
print(user_item_views_df.shape)
user_item_views_df.head(3)

(1975696, 4)


Unnamed: 0,user_id,item_id,show_timestamp,show_duration
0,912948920,1587935070,1119307,323
1,1882728205,1466874188,1115796,1428
2,382105433,276839040,1116585,921


Для трансформации в csr создаём индексы

In [3]:
unique_users = user_item_views_df.user_id.unique()
unique_items = user_item_views_df.item_id.unique()
item_to_id = {j: i for i, j in enumerate(unique_items)}
id_to_item = {j: i for i, j in item_to_id.items()}
user_to_id = {j: i for i, j in enumerate(unique_users)}
print('Индекс создан: %d строк %d столбцов' % (len(user_to_id), len(item_to_id)))

Индекс создан: 168756 строк 9991 столбцов


## Трансформация в csr

Для каждого пользователя оставляем top-20 последних просмотров

In [4]:
%%time
HISTORY_TOP = 20
user_item_views_df['rank'] = (
    user_item_views_df
    .groupby(by=['user_id'])['show_timestamp']
    .rank(method='first', ascending=False)
)
ui_slim_df = user_item_views_df[user_item_views_df['rank'] < HISTORY_TOP][['user_id', 'item_id']]
num_rows = len(user_to_id)
num_cols = len(item_to_id)
entries = np.ones(ui_slim_df.shape[0])
rows = tuple(user_to_id[i] for i in ui_slim_df.user_id.values)
cols = tuple(item_to_id[i] for i in ui_slim_df.item_id.values)

train_set_csr = csr_matrix(
    (entries, (rows, cols)),
    shape=(num_rows, num_cols),
    dtype=np.float32
)
train_set_csr
save_npz(f'train_set_{DT}.npz', train_set_csr)
print('Данные сохранены в %s' % f'train_set_{DT}.npz')

Данные сохранены в train_set_2020-11-26.npz
CPU times: user 5.49 s, sys: 325 ms, total: 5.82 s
Wall time: 5.83 s


# Обучение модели

In [5]:
!pip install implicit==0.4.2

Collecting implicit==0.4.2
  Downloading implicit-0.4.2.tar.gz (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 251 kB/s eta 0:00:01
Building wheels for collected packages: implicit
  Building wheel for implicit (setup.py) ... [?25ldone
[?25h  Created wheel for implicit: filename=implicit-0.4.2-cp38-cp38-linux_x86_64.whl size=4756868 sha256=65219971a60e1c0a08e849073d851c7b30d6c4da75d19f1a188380c7c9ff0f76
  Stored in directory: /home/jovyan/.cache/pip/wheels/97/dd/5f/df702090a221c1b1cc4683950b6d086eeee98d37a547f20f8f
Successfully built implicit
Installing collected packages: implicit
Successfully installed implicit-0.4.2


In [6]:
from implicit.als import AlternatingLeastSquares

implict_als_params = {'factors': 4, 'iterations': 1}
model = AlternatingLeastSquares(**implict_als_params)
# транспонируем обязательно!
model.fit(train_set_csr.T.tocsr())

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))




## Пример работы модели

In [7]:
def id_to_content_df(ids: np.array, content_df: pd.DataFrame, id_to_item):
    items = tuple(id_to_item[i] for i in ids)
    result_df = content_catalog[
        content_catalog.item_id.isin(items)
    ]
    return result_df

content_catalog = pd.read_csv('data/content_catalog.zip', compression='zip')

random_history = train_set_csr[np.random.randint(low=0, high=train_set_csr.shape[0])]
id_to_content_df(random_history.nonzero()[1], content_catalog, id_to_item)

Unnamed: 0,item_id,title
6820,1622781290,Гладиатор
10323,1666115004,Илон Маск: Настоящий железный человек
12414,166044284,Карибские острова 3D: Погружение с акулами
15345,1593139110,Тачки 3
19707,1700203196,Разрушитель
28254,1507387779,Вольт
29777,160084426,Темные воды


Проверим, что рекоммендует модель

In [8]:
recommends = model.recommend(
            userid = 0,
            user_items=random_history,
            N=10,
            filter_already_liked_items=True,
            recalculate_user=True
)
id_to_content_df([rec[0] for rec in recommends], content_catalog, id_to_item)

Unnamed: 0,item_id,title
4098,984316348,Гарри Поттер и Принц-полукровка
10630,784227936,Эверест
12028,125673055,Камуфляж и шпионаж
13380,1757140863,Корпорация монстров
25068,1055805812,Тачки
26561,1729092206,Гарри Поттер и узник Азкабана
27442,157249585,Капоне. Лицо со шрамом
28064,1129147321,Волк с Уолл-стрит
28796,538560695,Разговорник
29850,1858036661,Фиксики против Кработов


# Валидация модели

In [9]:
with open('data/ground_truth_dataset.pkl', 'rb') as f:
    ground_truth_dataset = pickle.load(f)
with open('data/test_dataset.pkl', 'rb') as f:
    test_dataset = pickle.load(f)
print(len(test_dataset), len(ground_truth_dataset))

13163 13163


In [10]:
def get_als_action_history_vector(item_to_id: Dict[int, int], action_history, binary=True) -> np.ndarray:
    """Получить историю действий для ALS

    :param item_to_id: справочник контента ALS
    :return:
    """
    als_action_history_vector = np.zeros(len(item_to_id), dtype=int)
    for iid, item_attr in action_history.items():
        if iid in item_to_id.keys():
            if binary:
                als_action_history_vector[item_to_id[iid]] = 1
            else:
                als_action_history_vector[item_to_id[iid]] = item_attr
    return als_action_history_vector

def vectorize_action_history(action_history):
    res = get_als_action_history_vector(item_to_id, action_history)
    return res

with Pool(5) as p:
    test_dataset_vectors = p.map(vectorize_action_history, test_dataset)
    ground_truth_dataset_vectors = p.map(vectorize_action_history, ground_truth_dataset)
print(len(test_dataset_vectors))

13163


Готовим данные для мультипроцессинга - объединяем в один массив историю пользователя и валидационные просмотры

In [11]:
train_valid_pairs = []
for test_user_id in range(len(test_dataset_vectors)):
    train_valid_pairs.append((
        csr_matrix(test_dataset_vectors[test_user_id]),
        ground_truth_dataset_vectors[test_user_id].nonzero()[0]
    ))

In [12]:
%%time

N = 40
testing_model = model

def top_n_recommends(watch_history):
    top_n_result = testing_model.recommend(
            userid = 0,
            user_items=watch_history[0],
            N=N,
            filter_already_liked_items=True,
            recalculate_user=True
    )
    hit = 0
    if len(watch_history[1]) > 0 and np.intersect1d(watch_history[1], top_n_result).size > 0:
        hit = 1
    return hit

with Pool(5) as p:
    hits = p.map(top_n_recommends, train_valid_pairs)
print(sum(hits)/len(hits))

0.17412443971738964
CPU times: user 1.08 s, sys: 213 ms, total: 1.3 s
Wall time: 41.1 s


# Бейзлайны

top 100 популярного

In [13]:
%%time

N = 40
content_popularity = np.asarray(train_set_csr.sum(axis=0)).reshape(-1)
top_100_popular_items = np.argsort(-content_popularity)[:100]

def top_n_recommends(watch_history):
    top_n_result = top_100_popular_items[:N]
    hit = 0
    if len(watch_history[1]) > 0 and np.intersect1d(watch_history[1], top_n_result).size > 0:
        hit = 1
    return hit

with Pool(5) as p:
    hits = p.map(top_n_recommends, train_valid_pairs)
print(sum(hits)/len(hits))

0.18438046038137204
CPU times: user 534 ms, sys: 274 ms, total: 807 ms
Wall time: 978 ms


Рандом

In [14]:
%%time

N = 50
content_popularity = np.asarray(train_set_csr.sum(axis=0)).reshape(-1)
all_content = np.array(list(id_to_item.keys()))

def top_n_recommends(watch_history):
    top_n_result = np.random.choice(all_content, size=N, replace=True)
    hit = 0
    if len(watch_history[1]) > 0 and np.intersect1d(watch_history[1], top_n_result).size > 0:
        hit = 1
    return hit

with Pool(5) as p:
    hits = p.map(top_n_recommends, train_valid_pairs)
print(sum(hits)/len(hits))

0.008432728101496619
CPU times: user 544 ms, sys: 228 ms, total: 772 ms
Wall time: 1.04 s


Пример с обучением модели

In [15]:
implict_als_params = {'factors': 20, 'iterations': 30}
model = AlternatingLeastSquares(**implict_als_params)
# транспонируем обязательно!
model.fit(train_set_csr.T.tocsr())

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=30.0), HTML(value='')))




In [16]:
%%time

N = 50
testing_model = model

def top_n_recommends(watch_history):
    top_n_result = testing_model.recommend(
            userid = 0,
            user_items=watch_history[0],
            N=N,
            filter_already_liked_items=True,
            recalculate_user=True
    )
    hit = 0
    if len(watch_history[1]) > 0 and np.intersect1d(watch_history[1], top_n_result).size > 0:
        hit = 1
    return hit

with Pool(5) as p:
    hits = p.map(top_n_recommends, train_valid_pairs)
print(sum(hits)/len(hits))

0.25845172073235584
CPU times: user 1.19 s, sys: 157 ms, total: 1.35 s
Wall time: 52.5 s


Проверяем рекомендации на обученной модели

In [17]:
recommends = model.recommend(
            userid = 0,
            user_items=random_history,
            N=10,
            filter_already_liked_items=True,
            recalculate_user=True
)
id_to_content_df([rec[0] for rec in recommends], content_catalog, id_to_item)

Unnamed: 0,item_id,title
4688,446911865,Фиксики: Большой секрет
7185,465257123,Основатель
8382,866816808,Тачки 2
13380,1757140863,Корпорация монстров
16205,1687588185,Щенячий патруль: Мегащенки
25068,1055805812,Тачки
26592,821523592,В поисках Немо
27811,442883224,Университет монстров
28064,1129147321,Волк с Уолл-стрит
29850,1858036661,Фиксики против Кработов


# Загружаем JSON

Нужно распаковать архив и подготовить его для загрузки в Mongo

In [50]:
import tarfile
with tarfile.open('data/json_views.tar.gz', 'r') as json_tar:
    json_tar.extractall('data')