In [1]:
import os

In [2]:
os.environ["OPENBLAS_NUM_THREADS"] = "1"  # For implicit ALS

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
# !pip install numpy==1.21.6
# !pip install pandas==1.3.5
!pip install lightfm==1.16
!pip install nmslib==2.1.1
!pip install rectools==0.2.0
!pip install seaborn==0.12.0
!pip install ipykernel==6.16.0
!pip install Jinja2==3.1.2
!pip install hnswlib==0.6.2
!pip install annoy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting lightfm==1.16
  Downloading lightfm-1.16.tar.gz (310 kB)
[K     |████████████████████████████████| 310 kB 5.7 MB/s 
Building wheels for collected packages: lightfm
  Building wheel for lightfm (setup.py) ... [?25l[?25hdone
  Created wheel for lightfm: filename=lightfm-1.16-cp38-cp38-linux_x86_64.whl size=746141 sha256=3394b96d46787b02b5f5c5c9cf7db1c205296457d0b2b144ff144bcb4b01a545
  Stored in directory: /root/.cache/pip/wheels/ec/bb/51/9c487d021c1373b691d13cadca0b65b6852627b1f3f43550fa
Successfully built lightfm
Installing collected packages: lightfm
Successfully installed lightfm-1.16
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting nmslib==2.1.1
  Downloading nmslib-2.1.1-cp38-cp38-manylinux2010_x86_64.whl (13.4 MB)
[K     |████████████████████████████████| 13.4 MB 7.4 MB/s 
Collecting pybind11<2.6.2
  Download

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting Jinja2==3.1.2
  Downloading Jinja2-3.1.2-py3-none-any.whl (133 kB)
[K     |████████████████████████████████| 133 kB 6.4 MB/s 
Installing collected packages: Jinja2
  Attempting uninstall: Jinja2
    Found existing installation: Jinja2 2.11.3
    Uninstalling Jinja2-2.11.3:
      Successfully uninstalled Jinja2-2.11.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
notebook 5.7.16 requires jinja2<=3.0.0, but you have jinja2 3.1.2 which is incompatible.
google-colab 1.0.0 requires ipykernel~=5.3.4, but you have ipykernel 6.16.0 which is incompatible.
google-colab 1.0.0 requires ipython~=7.9.0, but you have ipython 8.7.0 which is incompatible.
google-colab 1.0.0 requires tornado~=6.0.4, but you have tornado 6.2 which is incompatible.
flask 1.1.4 requ

In [5]:
import pandas as pd
import numpy as np

from implicit.als import AlternatingLeastSquares

from rectools.metrics import Precision, Recall, MAP, calc_metrics
from rectools.models import PopularModel, RandomModel, ImplicitALSWrapperModel
from rectools import Columns
from rectools.dataset import Dataset
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel

import matplotlib.pyplot as plt
import seaborn as sns

import matplotlib.pyplot as plt
from pathlib import Path
import typing as tp
from tqdm import tqdm

from lightfm import LightFM

from implicit.bpr import BayesianPersonalizedRanking

from implicit.lmf import LogisticMatrixFactorization

# LOAD DATA 

In [6]:
# download dataset by chunks
import requests
url = "https://storage.yandexcloud.net/itmo-recsys-public-data/kion_train.zip"

req = requests.get(url, stream=True)

with open('kion_train.zip', "wb") as fd:
    total_size_in_bytes = int(req.headers.get('Content-Length', 0))
    progress_bar = tqdm(desc='kion dataset download', total=total_size_in_bytes, unit='iB', unit_scale=True)
    for chunk in req.iter_content(chunk_size=2 ** 20):
        progress_bar.update(len(chunk))
        fd.write(chunk)

kion dataset download: 100%|█████████▉| 78.6M/78.8M [00:05<00:00, 17.8MiB/s]

In [7]:
!unzip kion_train.zip

Archive:  kion_train.zip
   creating: kion_train/
  inflating: kion_train/interactions.csv  
  inflating: __MACOSX/kion_train/._interactions.csv  
  inflating: kion_train/users.csv    
  inflating: __MACOSX/kion_train/._users.csv  
  inflating: kion_train/items.csv    
  inflating: __MACOSX/kion_train/._items.csv  


In [8]:
%%time
interactions = pd.read_csv('kion_train/interactions.csv')
users = pd.read_csv('kion_train/users.csv')
items = pd.read_csv('kion_train/items.csv')

CPU times: user 3.42 s, sys: 347 ms, total: 3.76 s
Wall time: 5.24 s


# Preprocess

In [9]:
Columns.Datetime = 'last_watch_dt'

In [10]:
interactions.drop(interactions[interactions[Columns.Datetime].str.len() != 10].index, inplace=True)

In [11]:
interactions[Columns.Datetime] = pd.to_datetime(interactions[Columns.Datetime], format='%Y-%m-%d')

In [12]:
max_date = interactions[Columns.Datetime].max()

In [13]:
interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 3, 1)

In [14]:
train = interactions[interactions[Columns.Datetime] < max_date - pd.Timedelta(days=7)].copy()
test = interactions[interactions[Columns.Datetime] >= max_date - pd.Timedelta(days=7)].copy()

print(f"train: {train.shape}")
print(f"test: {test.shape}")

train: (4985269, 6)
test: (490982, 6)


In [15]:
train.drop(train.query("total_dur < 300").index, inplace=True)

In [16]:
# отфильтруем холодных пользователей из теста
cold_users = set(test[Columns.User]) - set(train[Columns.User])

kion dataset download: 100%|██████████| 78.8M/78.8M [00:20<00:00, 17.8MiB/s]

In [17]:
test.drop(test[test[Columns.User].isin(cold_users)].index, inplace=True)

# Prepare features

## User features

In [18]:
users.isnull().sum()

user_id         0
age         14095
income      14776
sex         13831
kids_flg        0
dtype: int64

In [19]:
users.fillna('Unknown', inplace=True)

In [20]:
users.nunique()

user_id     840197
age              7
income           7
sex              3
kids_flg         2
dtype: int64

In [21]:
users = users.loc[users[Columns.User].isin(train[Columns.User])].copy()

In [22]:
users

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,1
1,962099,age_18_24,income_20_40,М,0
3,721985,age_45_54,income_20_40,Ж,0
4,704055,age_35_44,income_60_90,Ж,0
5,1037719,age_45_54,income_60_90,М,0
...,...,...,...,...,...
840184,529394,age_25_34,income_40_60,Ж,0
840186,80113,age_25_34,income_40_60,Ж,0
840188,312839,age_65_inf,income_60_90,Ж,0
840189,191349,age_45_54,income_40_60,М,1


In [23]:
user_features_frames = []
for feature in ["sex", "age", "income"]:
    feature_frame = users.reindex(columns=[Columns.User, feature])
    feature_frame.columns = ["id", "value"]
    feature_frame["feature"] = feature
    user_features_frames.append(feature_frame)
user_features = pd.concat(user_features_frames)
user_features.head()

Unnamed: 0,id,value,feature
0,973171,М,sex
1,962099,М,sex
3,721985,Ж,sex
4,704055,Ж,sex
5,1037719,М,sex


In [24]:
user_features.query(f"id == 973171")

Unnamed: 0,id,value,feature
0,973171,М,sex
0,973171,age_25_34,age
0,973171,income_60_90,income


# Item features

In [25]:
items.isnull().sum()

item_id             0
content_type        0
title               0
title_orig       4745
release_year       98
genres              0
countries          37
for_kids        15397
age_rating          2
studios         14898
directors        1509
actors           2619
description         2
keywords          423
dtype: int64

In [26]:
items = items.loc[items[Columns.Item].isin(train[Columns.Item])].copy()

In [27]:
items.head()

Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
0,10711,film,Поговори с ней,Hable con ella,2002.0,"драмы, зарубежные, детективы, мелодрамы",Испания,,16.0,,Педро Альмодовар,"Адольфо Фернандес, Ана Фернандес, Дарио Гранди...",Мелодрама легендарного Педро Альмодовара «Пого...,"Поговори, ней, 2002, Испания, друзья, любовь, ..."
1,2508,film,Голые перцы,Search Party,2014.0,"зарубежные, приключения, комедии",США,,16.0,,Скот Армстронг,"Адам Палли, Брайан Хаски, Дж.Б. Смув, Джейсон ...",Уморительная современная комедия на популярную...,"Голые, перцы, 2014, США, друзья, свадьбы, прео..."
2,10716,film,Тактическая сила,Tactical Force,2011.0,"криминал, зарубежные, триллеры, боевики, комедии",Канада,,16.0,,Адам П. Калтраро,"Адриан Холмс, Даррен Шалави, Джерри Вассерман,...",Профессиональный рестлер Стив Остин («Все или ...,"Тактическая, сила, 2011, Канада, бандиты, ганг..."
3,7868,film,45 лет,45 Years,2015.0,"драмы, зарубежные, мелодрамы",Великобритания,,16.0,,Эндрю Хэй,"Александра Риддлстон-Барретт, Джеральдин Джейм...","Шарлотта Рэмплинг, Том Кортни, Джеральдин Джей...","45, лет, 2015, Великобритания, брак, жизнь, лю..."
4,16268,film,Все решает мгновение,,1978.0,"драмы, спорт, советские, мелодрамы",СССР,,12.0,Ленфильм,Виктор Садовский,"Александр Абдулов, Александр Демьяненко, Алекс...",Расчетливая чаровница из советского кинохита «...,"Все, решает, мгновение, 1978, СССР, сильные, ж..."


In [28]:
items.nunique()

item_id         14019
content_type        2
title           13454
title_orig       9724
release_year      104
genres           2559
countries         666
for_kids            2
age_rating          6
studios            38
directors        7414
actors          11830
description     13791
keywords        13583
dtype: int64

### Genre

In [29]:
# Explode genres to flatten table
items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
genre_feature = items[["item_id", "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"
genre_feature.head()

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre


### Content

In [30]:
content_feature = items.reindex(columns=[Columns.Item, "content_type"])
content_feature.columns = ["id", "value"]
content_feature["feature"] = "content_type"

In [31]:
content_feature

Unnamed: 0,id,value,feature
0,10711,film,content_type
1,2508,film,content_type
2,10716,film,content_type
3,7868,film,content_type
4,16268,film,content_type
...,...,...,...
15958,6443,series,content_type
15959,2367,series,content_type
15960,10632,series,content_type
15961,4538,series,content_type


In [32]:
item_features = pd.concat((genre_feature, content_feature))

In [33]:
item_features

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre
...,...,...,...
15958,6443,series,content_type
15959,2367,series,content_type
15960,10632,series,content_type
15961,4538,series,content_type


# Metrics

In [34]:
metrics_name = {
#     'Precision': Precision,
#     'Recall': Recall,
    'MAP': MAP,
}

metrics = {}
for metric_name, metric in metrics_name.items():
    for k in range(10, 11):
        metrics[f'{metric_name}@{k}'] = metric(k=k)

In [35]:
metrics

{'MAP@10': MAP(k=10, divide_by_k=False)}

# Models

## 1. Перебор  моделей  матричной факторизации и их  гиперпараметров


In [36]:
K_RECOS = 10
RANDOM_STATE = 42
NUM_THREADS = [16, 32]
N_FACTORS = (32, 64)
N_EPOCHS = 1 # Lightfm
USER_ALPHA = 0 # Lightfm
ITEM_ALPHA = 0 # Lightfm
LEARNING_RATE = [0.05, 0.1] # Lightfm

In [110]:
# models = {
#     'popular': PopularModel(),
# }
models = dict()

In [111]:
implicit_models = {
    'ALS': AlternatingLeastSquares,
}

for implicit_name, implicit_model in implicit_models.items():
    for n_factors in N_FACTORS:
        for n_threads in NUM_THREADS:
            models[f"{implicit_name}_{n_factors}_{n_threads}"] = (
                ImplicitALSWrapperModel(
                    model=implicit_model(
                        factors=n_factors, 
                        random_state=RANDOM_STATE, 
                        num_threads=n_threads,
                    ),
                    fit_features_together=True,
                )
            )

In [115]:
lightfm_losses = ('bpr', 'warp')

# for loss in lightfm_losses[0]:
for n_factors in N_FACTORS:
    for lr in LEARNING_RATE:
        for n_threads in NUM_THREADS:
            models[f"LightFM_warp_{n_factors}_{lr}_{n_threads}"] = LightFMWrapperModel(
                LightFM(
                    no_components=n_factors, 
                    loss='warp', 
                    random_state=RANDOM_STATE,
                    learning_rate=lr,
                    user_alpha=USER_ALPHA,
                    item_alpha=ITEM_ALPHA,
                ),
                epochs=N_EPOCHS,
                num_threads=n_threads,
            )

In [116]:
models

{'ALS_32_16': <rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7f069bd23970>,
 'ALS_32_32': <rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7f069bd23ac0>,
 'ALS_64_16': <rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7f069bd23be0>,
 'ALS_64_32': <rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7f069bd23910>,
 'LightFM_warp_32_0.05_16': <rectools.models.lightfm.LightFMWrapperModel at 0x7f069bc76b80>,
 'LightFM_warp_32_0.05_32': <rectools.models.lightfm.LightFMWrapperModel at 0x7f069bc76fd0>,
 'LightFM_warp_32_0.1_16': <rectools.models.lightfm.LightFMWrapperModel at 0x7f069bc55550>,
 'LightFM_warp_32_0.1_32': <rectools.models.lightfm.LightFMWrapperModel at 0x7f076ce5b040>,
 'LightFM_warp_64_0.05_16': <rectools.models.lightfm.LightFMWrapperModel at 0x7f069bd23a60>,
 'LightFM_warp_64_0.05_32': <rectools.models.lightfm.LightFMWrapperModel at 0x7f069bd23580>,
 'LightFM_warp_64_0.1_16': <rectools.models.lightfm.LightFMWrapperModel at 0x7f069bd23460>,


In [39]:
%%time
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

CPU times: user 1.81 s, sys: 19.3 ms, total: 1.83 s
Wall time: 1.83 s


In [46]:
TEST_USERS = test[Columns.User].unique()

In [119]:
%%time
results = []
for model_name, model in models.items():
    print(f"Fitting model {model_name}...")
    model_quality = {'model': model_name}

    model.fit(dataset)
    recos = model.recommend(
        users=TEST_USERS,
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )
    metric_values = calc_metrics(metrics, recos, test, train)
    model_quality.update(metric_values)
    results.append(model_quality)

Fitting model ALS_32_16...
Fitting model ALS_32_32...
Fitting model ALS_64_16...
Fitting model ALS_64_32...
Fitting model LightFM_warp_32_0.05_16...
Fitting model LightFM_warp_32_0.05_32...
Fitting model LightFM_warp_32_0.1_16...
Fitting model LightFM_warp_32_0.1_32...
Fitting model LightFM_warp_64_0.05_16...
Fitting model LightFM_warp_64_0.05_32...
Fitting model LightFM_warp_64_0.1_16...
Fitting model LightFM_warp_64_0.1_32...
CPU times: user 31min 42s, sys: 11min 39s, total: 43min 22s
Wall time: 23min 17s


In [120]:
df_quality = pd.DataFrame(results).T

df_quality.columns = df_quality.iloc[0]

df_quality.drop('model', inplace=True)

In [121]:
df_quality.style.highlight_max(color='lightgreen', axis=1)

model,ALS_32_16,ALS_32_32,ALS_64_16,ALS_64_32,LightFM_warp_32_0.05_16,LightFM_warp_32_0.05_32,LightFM_warp_32_0.1_16,LightFM_warp_32_0.1_32,LightFM_warp_64_0.05_16,LightFM_warp_64_0.05_32,LightFM_warp_64_0.1_16,LightFM_warp_64_0.1_32
MAP@10,0.074298,0.074298,0.074455,0.074455,0.075969,0.07745,0.065249,0.06007,0.078488,0.077088,0.057556,0.059803


## 2. Метод приближенного поиска соседей для выдачи рекомендаций

In [93]:
# from annoy import AnnoyIndex

### 2.1. nmslib

In [37]:
import time
import nmslib

In [40]:
model = LightFMWrapperModel(
                LightFM(
                    no_components=64, 
                    loss='warp', 
                    random_state=RANDOM_STATE,
                    learning_rate=0.05,
                    user_alpha=USER_ALPHA,
                    item_alpha=ITEM_ALPHA,
                ),
                epochs=N_EPOCHS,
                num_threads=16,
            )

model.fit(dataset)

<rectools.models.lightfm.LightFMWrapperModel at 0x7f8c872f7cd0>

In [43]:
# import pickle
# with open('LightFM_warp_64_05_16.pickle', 'wb') as f:
#       pickle.dump(model, f)

In [44]:
user_embeddings, item_embeddings = model.get_vectors(dataset)

In [49]:
def augment_inner_product(factors):
    normed_factors = np.linalg.norm(factors, axis=1)
    max_norm = normed_factors.max()
    
    extra_dim = np.sqrt(max_norm ** 2 - normed_factors ** 2).reshape(-1, 1)
    augmented_factors = np.append(factors, extra_dim, axis=1)
    return max_norm, augmented_factors

In [51]:
print('pre shape: ', item_embeddings.shape)
max_norm, augmented_item_embeddings = augment_inner_product(item_embeddings)
augmented_item_embeddings.shape

pre shape:  (14019, 66)


(14019, 67)

In [53]:
extra_zero = np.zeros((user_embeddings.shape[0], 1))
augmented_user_embeddings = np.append(user_embeddings, extra_zero, axis=1)
augmented_user_embeddings.shape

(756562, 67)

In [82]:
M = 32 # 48
efC = 32 # 100

# Number of neighbors 
K = 7 # 10

# Space name should correspond to the space name 
# used for brute-force search
space_name='negdotprod'

num_threads = 4
index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post' : 0}
print('Index-time parameters', index_time_params)

Index-time parameters {'M': 32, 'indexThreadQty': 4, 'efConstruction': 32, 'post': 0}


In [83]:
# Intitialize the library, specify the space, the type of the vector and add data points 
index = nmslib.init(method='hnsw', space=space_name, data_type=nmslib.DataType.DENSE_VECTOR) 
index.addDataPointBatch(augmented_item_embeddings) 

14019

In [84]:
# Create an index
start = time.time()
index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC}
index.createIndex(index_time_params) 
end = time.time() 
print('Index-time parameters', index_time_params)
print('Indexing time = %f' % (end-start))

Index-time parameters {'M': 32, 'indexThreadQty': 4, 'efConstruction': 32}
Indexing time = 0.246432


In [85]:
# Setting query-time parameters
efS = 32 # 100
query_time_params = {'efSearch': efS}
print('Setting query-time parameters', query_time_params)
index.setQueryTimeParams(query_time_params)

Setting query-time parameters {'efSearch': 32}


In [86]:
query_matrix = augmented_user_embeddings[:1000, :]

In [87]:
# Querying
query_qty = query_matrix.shape[0]
start = time.time() 
nbrs = index.knnQueryBatch(query_matrix, k=K, num_threads=num_threads)
end = time.time() 
print('kNN time total=%f (sec), per query=%f (sec), per query adjusted for thread number=%f (sec)' % 
      (end-start, float(end-start)/query_qty, num_threads*float(end-start)/query_qty)) 

kNN time total=0.017231 (sec), per query=0.000017 (sec), per query adjusted for thread number=0.000069 (sec)


In [88]:
def recommend_all(query_factors, index_factors, topn=10):
    output = query_factors.dot(index_factors.T)
    argpartition_indices = np.argpartition(output, -topn)[:, -topn:]

    x_indices = np.repeat(np.arange(output.shape[0]), topn)
    y_indices = argpartition_indices.flatten()
    top_value = output[x_indices, y_indices].reshape(output.shape[0], topn)
    top_indices = np.argsort(top_value)[:, ::-1]

    y_indices = top_indices.flatten()
    top_indices = argpartition_indices[x_indices, y_indices]
    labels = top_indices.reshape(-1, topn)
    distances = output[x_indices, top_indices].reshape(-1, topn)
    return labels, distances

In [89]:
recommend_all(user_embeddings[[0], :], item_embeddings)

(array([[ 8867, 12982, 11758,  3527,  8287,  3174,  4153,  2256,   118,
          4036]]),
 array([[-182.33955196, -182.36987167, -182.55194864, -182.62783094,
         -182.64876535, -182.76404395, -182.99606975, -183.08485546,
         -183.15278364, -183.27976094]]))

In [90]:
query_matrix_not_augmented = user_embeddings[:1000, :]

In [91]:
%%timeit
recommend_all(query_matrix_not_augmented, item_embeddings)

264 ms ± 7.57 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [92]:
%%timeit
index.knnQueryBatch(query_matrix, k=K, num_threads=num_threads)

12.3 ms ± 3.8 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


### 2.2. hnswlib

In [97]:
import hnswlib

In [98]:
%%time
max_elements, dim = augmented_item_embeddings.shape
hnsw = hnswlib.Index("ip", dim)

hnsw.init_index(max_elements, M, efC)

hnsw.add_items(augmented_item_embeddings)

CPU times: user 401 ms, sys: 4.95 ms, total: 406 ms
Wall time: 215 ms


In [99]:
hnsw.set_ef(efS)

In [100]:
label, distance = hnsw.knn_query(query_matrix, k=k)

In [102]:
labels, distances = recommend_all(user_embeddings[:1000, :], item_embeddings)
print(labels)
print(distances)

[[ 8867 12982 11758 ...  2256   118  4036]
 [12982  8867  3174 ...  4153  7342   118]
 [ 2698  9969 13721 ...  4605 13806 11040]
 ...
 [ 8287  8867 11758 ... 11022 12139  3776]
 [ 8867 12982 11758 ...  4153  7342   118]
 [12982  8867  3527 ...  2256   118 10348]]
[[-182.33955196 -182.36987167 -182.55194864 ... -183.08485546
  -183.15278364 -183.27976094]
 [-218.54709345 -218.59873073 -218.7252877  ... -219.14839301
  -219.37792362 -219.38977312]
 [-196.58125302 -196.61233336 -196.74333421 ... -196.85070718
  -196.94088354 -196.97503472]
 ...
 [-199.52515645 -199.59756615 -199.65654775 ... -200.03982201
  -200.05097809 -200.05238019]
 [-217.30829224 -217.63007305 -217.73481768 ... -218.1789587
  -218.31211969 -218.34698054]
 [-190.20496296 -190.20882417 -190.33773205 ... -190.82189735
  -191.05412825 -191.18851875]]


## 3. Добавить 3 "аватаров" (искусственных пользователей) и посмотреть рекомендации итоговой модели на них. Объяснить почему добавили именно таких пользователей

In [62]:
# 1 - a user who watched films of only one genre ('детективы')
TEST_USER_1 = test[Columns.User].unique().max() + 1

user_features = user_features.append({'id': TEST_USER_1, 'value': 'М', 'feature': 'sex'},
                                     ignore_index=True)
user_features = user_features.append({'id': TEST_USER_1, 'value': 'age_25_34', 'feature': 'age'},
                                     ignore_index=True)
user_features = user_features.append({'id': TEST_USER_1, 'value': 'income_20_40', 'feature': 'income'},
                                     ignore_index=True)

In [63]:
# 2 - a user who has an account for a child (only watches cartoons)
TEST_USER_2 = test[Columns.User].unique().max() + 2

user_features = user_features.append({'id': TEST_USER_2, 'value': 'Ж', 'feature': 'sex'},
                                     ignore_index=True)
user_features = user_features.append({'id': TEST_USER_2, 'value': 'age_25_34', 'feature': 'age'},
                                     ignore_index=True)
user_features = user_features.append({'id': TEST_USER_2, 'value': 'income_40_60', 'feature': 'income'},
                                     ignore_index=True)

In [66]:
# 3 - the user is like the first one, but with different parameters
# (a user who watched films of only one genre ('детективы'))
TEST_USER_3 = test[Columns.User].unique().max() + 3

user_features = user_features.append({'id': TEST_USER_3, 'value': 'Ж', 'feature': 'sex'},
                                     ignore_index=True)
user_features = user_features.append({'id': TEST_USER_3, 'value': 'age_55_64', 'feature': 'age'},
                                     ignore_index=True)
user_features = user_features.append({'id': TEST_USER_3, 'value': 'income_90_150', 'feature': 'income'},
                                     ignore_index=True)

In [67]:
user_features.tail(9)

Unnamed: 0,id,value,feature
1760004,1097545,М,sex
1760005,1097545,age_25_34,age
1760006,1097545,income_20_40,income
1760007,1097546,Ж,sex
1760008,1097546,age_25_34,age
1760009,1097546,income_40_60,income
1760010,1097547,Ж,sex
1760011,1097547,age_55_64,age
1760012,1097547,income_90_150,income


In [77]:
# interactions for users who have watched detectives
detectives_df = item_features.query("value=='детективы'")

TEST_USER_1_interactions = list(detectives_df.sample(n=20, random_state=RANDOM_STATE)['id'])
TEST_USER_3_interactions = list(detectives_df.sample(n=20, random_state=RANDOM_STATE+1)['id'])

In [78]:
# interactions for user who have watched cartoons
cartoons_df = item_features.query("value=='мультфильм'")

TEST_USER_2_interactions = list(cartoons_df.sample(n=20, random_state=RANDOM_STATE)['id'])

In [94]:
total_dur	= train['total_dur'].median()
weight = train['weight'].mode().values[0]
total_dur, weight

(5462.0, 3)

In [109]:
dates_df = train.last_watch_dt.sample(n=20, random_state=RANDOM_STATE)

In [114]:
# add interactions for 1 user
for film_id, last_watch_dt in zip(TEST_USER_1_interactions, dates_df):
  train = train.append({'user_id': TEST_USER_1,
                        'item_id': film_id,
                        'last_watch_dt': last_watch_dt,
                        'total_dur': total_dur,
                        'watched_pct': 100.0,
                        'weight': weight,
                        },
                       ignore_index=True)

In [116]:
# add interactions for 2 user
for film_id, last_watch_dt in zip(TEST_USER_2_interactions, dates_df):
  train = train.append({'user_id': TEST_USER_2,
                        'item_id': film_id,
                        'last_watch_dt': last_watch_dt,
                        'total_dur': total_dur,
                        'watched_pct': 100.0,
                        'weight': weight,
                        },
                       ignore_index=True)

In [117]:
# add interactions for 3 user
for film_id, last_watch_dt in zip(TEST_USER_3_interactions, dates_df):
  train = train.append({'user_id': TEST_USER_3,
                        'item_id': film_id,
                        'last_watch_dt': last_watch_dt,
                        'total_dur': total_dur,
                        'watched_pct': 100.0,
                        'weight': weight,
                        },
                       ignore_index=True)

In [118]:
train.tail()

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct,weight
3832766,1097547,1575,2021-06-08,5462.0,100.0,3
3832767,1097547,15669,2021-06-21,5462.0,100.0,3
3832768,1097547,9544,2021-07-05,5462.0,100.0,3
3832769,1097547,6353,2021-07-13,5462.0,100.0,3
3832770,1097547,2529,2021-06-21,5462.0,100.0,3


In [119]:
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

In [120]:
model.fit(dataset)

<rectools.models.lightfm.LightFMWrapperModel at 0x7f8c872f7cd0>

In [121]:
recos = model.recommend(
    users=[TEST_USER_1, TEST_USER_2, TEST_USER_3],
    dataset=dataset,
    k=K_RECOS,
    filter_viewed=True,
)

In [123]:
metric_values = calc_metrics(metrics, recos, test, train)
metric_values

{'MAP@10': 0.0}

In [137]:
rec_2 = recos[10:21]['item_id'].values
items.query("item_id in @rec_2").genres

1554                      мультфильм, фэнтези, приключения
1882                                   триллеры, детективы
3951     мультфильм, приключения, фантастика, семейное,...
4265             мультфильм, фэнтези, приключения, комедии
5712                          мультфильм, фэнтези, комедии
9698           приключения, мультфильм, детективы, комедии
9921                          мультфильм, фэнтези, мюзиклы
12035            мультфильм, фэнтези, приключения, комедии
12471                         мультфильм, фэнтези, комедии
12620                      фантастика, мультфильм, комедии
15352                     фэнтези, мультфильм, музыкальные
Name: genres, dtype: object

In [138]:
rec_1 = recos[:10]['item_id'].values
items.query("item_id in @rec_1").genres

767                                       драмы, мелодрамы
1882                                   триллеры, детективы
4955     криминал, детективы, фантастика, триллеры, бое...
6346                                     боевики, триллеры
6411      популярное, фантастика, триллеры, боевики, ужасы
6501                           драмы, военные, приключения
6689                                               комедии
9938                            ужасы, триллеры, мелодрамы
12050                                              комедии
13615                                      драмы, триллеры
Name: genres, dtype: object

In [139]:
rec_3 = recos[21:]['item_id'].values

In [140]:
rec_1, rec_3

(array([10440, 15297,  9728,  2657,  4151, 14317,  3734, 14431, 13865,
        15464]),
 array([15297, 13865,  4151, 12192, 16228,  9728,  6382,  9996,  4880]))

### Вывод:

- для 2 пользовотеля, который смотрел только мультфильмы, модель рекомендовала мультфильмы.
- для 1 и 3 пользовотелей, которые смотрели только детективы, модель рекомендовала детективы. При этом модель учла разный пол, возраст и зарплату пользователей и дала разные рекомендации. (возможно это еще зависело и от сопутствующих жанров и пр.)

## 4. Придумать как можно обработать рекомендации для холодных пользователей.

In [161]:
# popular for all time
popular_recs_all_time = list(interactions.item_id.value_counts()[:10])
popular_recs_all_time

[202457, 193123, 132865, 122119, 91167, 74803, 68581, 55043, 45367, 40372]

In [178]:
# popular for last mounth
import datetime as DT

interactions_month = interactions[interactions['last_watch_dt'] >
                                  (interactions.last_watch_dt.max() - DT.timedelta(days=30))]

popular_recs_month = list(interactions_month.item_id.value_counts()[:10])
popular_recs_month

[59226, 54373, 54115, 27752, 23812, 23253, 20967, 18909, 18878, 13305]

In [179]:
# popular for last week
import datetime as DT

interactions_week = interactions[interactions['last_watch_dt'] >
                                  (interactions.last_watch_dt.max() - DT.timedelta(days=7))]

popular_recs_week = list(interactions_week.item_id.value_counts()[:10])
popular_recs_week

[13068, 12636, 12534, 8029, 7024, 6335, 5458, 5253, 5116, 4844]