In [4]:
import os

import warnings
warnings.filterwarnings('ignore')

os.environ["OPENBLAS_NUM_THREADS"] = "1"  # For implicit ALS


import pandas as pd
import numpy as np

from implicit.als import AlternatingLeastSquares

from rectools.metrics import Precision, Recall, MAP, calc_metrics
from rectools.models import PopularModel, RandomModel, ImplicitALSWrapperModel
from rectools import Columns
from rectools.dataset import Dataset
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel, PureSVDModel

import matplotlib.pyplot as plt
import seaborn as sns

import matplotlib.pyplot as plt
from pathlib import Path
import typing as tp
from tqdm import tqdm

from lightfm import LightFM

from implicit.bpr import BayesianPersonalizedRanking

from implicit.lmf import LogisticMatrixFactorization

In [8]:
! pwd



DATA_PATH = Path("kion_train")
DATA_PATH

/home/iuliiasolomennikova/!!!RecSysService/notebooks


PosixPath('kion_train')

In [19]:
users = pd.read_csv(DATA_PATH / 'users.csv')
items = pd.read_csv(DATA_PATH / 'items.csv')
interactions = pd.read_csv(DATA_PATH / 'interactions.csv')

In [20]:
Columns.Datetime = 'last_watch_dt'


# примеведем все к Datetime
interactions.drop(interactions[interactions[Columns.Datetime].str.len() != 10].index, inplace=True)
interactions[Columns.Datetime] = pd.to_datetime(interactions[Columns.Datetime], format='%Y-%m-%d')
max_date = interactions[Columns.Datetime].max()

# if user wathed more than 10, 3, 1, percent - set 3 балла этому взаим-ию
# if less - 1 балл
interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 3, 1)


In [21]:
# отправим в test  max  interaction for the last week
train = interactions[interactions[Columns.Datetime] < max_date - pd.Timedelta(days=7)].copy()
test = interactions[interactions[Columns.Datetime] >= max_date - pd.Timedelta(days=7)].copy()

print(f"train: {train.shape}")
print(f"test: {test.shape}")

train: (4985269, 6)
test: (490982, 6)


In [22]:
# drop записи где длина less 300
train.drop(train.query("total_dur < 300").index, inplace=True)



In [23]:
# отфильтруем cold пользователей from test
cold_users = set(test[Columns.User]) - set(train[Columns.User])

# in test only users with info about interaction
test.drop(test[test[Columns.User].isin(cold_users)].index, inplace=True)


### User features

In [24]:
users.isnull().sum()


user_id         0
age         14095
income      14776
sex         13831
kids_flg        0
dtype: int64

In [25]:
users.fillna('Unknown', inplace=True)
users.nunique()


user_id     840197
age              7
income           7
sex              3
kids_flg         2
dtype: int64

In [26]:
users = users.loc[users[Columns.User].isin(train[Columns.User])].copy()


In [27]:
users


Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,1
1,962099,age_18_24,income_20_40,М,0
3,721985,age_45_54,income_20_40,Ж,0
4,704055,age_35_44,income_60_90,Ж,0
5,1037719,age_45_54,income_60_90,М,0
...,...,...,...,...,...
840184,529394,age_25_34,income_40_60,Ж,0
840186,80113,age_25_34,income_40_60,Ж,0
840188,312839,age_65_inf,income_60_90,Ж,0
840189,191349,age_45_54,income_40_60,М,1


In [28]:
user_features_frames = []
for feature in ["sex", "age", "income"]:
    feature_frame = users.reindex(columns=[Columns.User, feature])
    feature_frame.columns = ["id", "value"]
    feature_frame["feature"] = feature
    user_features_frames.append(feature_frame)
user_features = pd.concat(user_features_frames)
user_features.head(10)

Unnamed: 0,id,value,feature
0,973171,М,sex
1,962099,М,sex
3,721985,Ж,sex
4,704055,Ж,sex
5,1037719,М,sex
8,846063,Ж,sex
9,401219,Ж,sex
11,312520,Ж,sex
12,555088,Ж,sex
13,382508,М,sex


### Item features

In [29]:
items.isnull().sum()


item_id             0
content_type        0
title               0
title_orig       4745
release_year       98
genres              0
countries          37
for_kids        15397
age_rating          2
studios         14898
directors        1509
actors           2619
description         2
keywords          423
dtype: int64

In [30]:
items = items.loc[items[Columns.Item].isin(train[Columns.Item])].copy()
items.nunique()


item_id         14019
content_type        2
title           13454
title_orig       9724
release_year      104
genres           2559
countries         666
for_kids            2
age_rating          6
studios            38
directors        7414
actors          11830
description     13791
keywords        13583
dtype: int64

In [31]:
items.head(3)


Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
0,10711,film,Поговори с ней,Hable con ella,2002.0,"драмы, зарубежные, детективы, мелодрамы",Испания,,16.0,,Педро Альмодовар,"Адольфо Фернандес, Ана Фернандес, Дарио Гранди...",Мелодрама легендарного Педро Альмодовара «Пого...,"Поговори, ней, 2002, Испания, друзья, любовь, ..."
1,2508,film,Голые перцы,Search Party,2014.0,"зарубежные, приключения, комедии",США,,16.0,,Скот Армстронг,"Адам Палли, Брайан Хаски, Дж.Б. Смув, Джейсон ...",Уморительная современная комедия на популярную...,"Голые, перцы, 2014, США, друзья, свадьбы, прео..."
2,10716,film,Тактическая сила,Tactical Force,2011.0,"криминал, зарубежные, триллеры, боевики, комедии",Канада,,16.0,,Адам П. Калтраро,"Адриан Холмс, Даррен Шалави, Джерри Вассерман,...",Профессиональный рестлер Стив Остин («Все или ...,"Тактическая, сила, 2011, Канада, бандиты, ганг..."


### Genre

In [32]:
# make the table

items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
genre_feature = items[["item_id", "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"
genre_feature.head(4)

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre


In [34]:
content_feature = items.reindex(columns=[Columns.Item, "content_type"])
content_feature.columns = ["id", "value"]
content_feature["feature"] = "content_type"

content_feature.head(3)

Unnamed: 0,id,value,feature
0,10711,film,content_type
1,2508,film,content_type
2,10716,film,content_type


In [35]:
item_features = pd.concat((genre_feature, content_feature))
item_features.head(3)

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre


### Метрики:

In [36]:
metrics_name = {
    'Precision': Precision,
    'Recall': Recall,
    'MAP': MAP,
}

metrics = {}
for metric_name, metric in metrics_name.items():
    for k in range(1, 11):
        metrics[f'{metric_name}@{k}'] = metric(k=k)

In [37]:
metrics

{'Precision@1': Precision(k=1),
 'Precision@2': Precision(k=2),
 'Precision@3': Precision(k=3),
 'Precision@4': Precision(k=4),
 'Precision@5': Precision(k=5),
 'Precision@6': Precision(k=6),
 'Precision@7': Precision(k=7),
 'Precision@8': Precision(k=8),
 'Precision@9': Precision(k=9),
 'Precision@10': Precision(k=10),
 'Recall@1': Recall(k=1),
 'Recall@2': Recall(k=2),
 'Recall@3': Recall(k=3),
 'Recall@4': Recall(k=4),
 'Recall@5': Recall(k=5),
 'Recall@6': Recall(k=6),
 'Recall@7': Recall(k=7),
 'Recall@8': Recall(k=8),
 'Recall@9': Recall(k=9),
 'Recall@10': Recall(k=10),
 'MAP@1': MAP(k=1, divide_by_k=False),
 'MAP@2': MAP(k=2, divide_by_k=False),
 'MAP@3': MAP(k=3, divide_by_k=False),
 'MAP@4': MAP(k=4, divide_by_k=False),
 'MAP@5': MAP(k=5, divide_by_k=False),
 'MAP@6': MAP(k=6, divide_by_k=False),
 'MAP@7': MAP(k=7, divide_by_k=False),
 'MAP@8': MAP(k=8, divide_by_k=False),
 'MAP@9': MAP(k=9, divide_by_k=False),
 'MAP@10': MAP(k=10, divide_by_k=False)}

## Модель:

LightFM

In [36]:
K_RECOS = 10      # num of recomendations
RAND_ST = 777
THREADS = 16
N_FACTORS = (32, 64, 128, 160)      # num of factors
ITERATIONS = (15, 20)

best_components = 17
best_loss = 'logistic'
best_rho = 0.93
best_lr = 0.05

epsilon = 3

dataset = Dataset.construct(
        interactions_df=train)

lightfm = LightFMWrapperModel(
        model = LightFM(
            no_components = best_components,
            learning_schedule = 'adadelta',
            loss = best_loss,
            rho = best_rho,
            epsilon = epsilon,
            learning_rate = best_lr,
            random_state = RAND_ST
        )
    )
lightfm.fit(dataset)

recs = lightfm.recommend(
    users=test[Columns.User].unique(),
    dataset=dataset,
    k=K_RECOS,
    filter_viewed=True,
)

metric_values = calc_metrics(metrics, recs, test, train)
model = lightfm

In [39]:
! pip install nmslib 



In [41]:
import nmslib
#  методом приближенного поиска соседей для выдачи рекомендаций

import time


**Non-Metric Space Library** (*NMSLIB*) is an efficient cross-platform similarity search library and a toolkit for evaluation of similarity search methods. The core-library does not have any third-party dependencies.

Goal - searching in generic and non-metric spaces.


**parameters**:

- `ef` - the size of the dynamic list for the nearest neighbors (used during the search)
- `k` - number of nearest neighbors 
- `M` - the number of bi-directional links created for every new element during construction
- `ef_construction` - controls the index_time/index_accuracy
- `num_elements` - defines the maximum number of elements in the index






In [45]:
user_embeddings, item_embeddings = model.get_vectors(dataset)
user_embeddings.shape, item_embeddings.shape


((756562, 19), (14019, 19))

In [46]:
def aug_inner_product(factors):
    normed_factors = np.linalg.norm(factors, axis=1)
    max_norm = normed_factors.max()
    
    extra_dim = np.sqrt(max_norm ** 2 - normed_factors ** 2).reshape(-1, 1)
    augmented_factors = np.append(factors, extra_dim, axis=1)
    return max_norm, augmented_factors

In [48]:
max_norm, augmented_item_embeddings = aug_inner_product(item_embeddings)
augmented_item_embeddings.shape

print('initial item shape: ', item_embeddings.shape)
print('augmented item shape: ', augmented_item_embeddings.shape)



initial item shape:  (14019, 19)
augmented item shape:  (14019, 20)


In [49]:
extra_zero = np.zeros((user_embeddings.shape[0], 1))
augmented_user_embeddings = np.append(user_embeddings, extra_zero, axis=1)
augmented_user_embeddings.shape
print('augmented users shape: ', augmented_item_embeddings.shape)




augmented users shape:  (14019, 20)


In [50]:
user_id = 30

In [51]:
print(user_embeddings[user_id])
print(augmented_user_embeddings[user_id])

[ 4.16040421e-05  1.00000000e+00 -3.95760126e-03 -1.83540378e-02
 -2.04288643e-02 -1.30786849e-02 -4.57997806e-03 -2.14250293e-02
 -2.50754282e-02  4.01278085e-04 -1.04984147e-02 -1.43987418e-03
  1.75281353e-02  4.32534050e-03 -2.50964090e-02 -2.82217208e-02
  1.71797909e-03 -1.08068986e-02  1.51321515e-02]
[ 4.16040421e-05  1.00000000e+00 -3.95760126e-03 -1.83540378e-02
 -2.04288643e-02 -1.30786849e-02 -4.57997806e-03 -2.14250293e-02
 -2.50754282e-02  4.01278085e-04 -1.04984147e-02 -1.43987418e-03
  1.75281353e-02  4.32534050e-03 -2.50964090e-02 -2.82217208e-02
  1.71797909e-03 -1.08068986e-02  1.51321515e-02  0.00000000e+00]


In [52]:
item_id = 0

print(item_embeddings[item_id])
print(augmented_item_embeddings[item_id]
)

[ 1.00000000e+00  9.00898933e+00 -4.25672494e-02 -3.99182662e-02
 -9.40565672e-03 -1.85679644e-02  8.05287098e-04 -9.72260721e-04
 -2.37680078e-02  3.34606804e-02 -1.96115337e-02 -3.65121849e-02
 -4.95851599e-02 -8.79757181e-02 -1.40575450e-02 -6.44353703e-02
 -1.09365452e-02  4.64421100e-05 -1.43443458e-02]
[ 1.00000000e+00  9.00898933e+00 -4.25672494e-02 -3.99182662e-02
 -9.40565672e-03 -1.85679644e-02  8.05287098e-04 -9.72260721e-04
 -2.37680078e-02  3.34606804e-02 -1.96115337e-02 -3.65121849e-02
 -4.95851599e-02 -8.79757181e-02 -1.40575450e-02 -6.44353703e-02
 -1.09365452e-02  4.64421100e-05 -1.43443458e-02  9.38623852e+00]


### Set index parameters


In [54]:
M = 64
efc = 128
threads = 4

# Number of neighbors 
K=10

# Space name must correspond space name used from brute-force search
space_name='negdotprod'



index_time_params = {'M': M, 'indexThreadQty': threads, 'efConstruction': efc, 'post' : 0}
print('Index-time parameters', index_time_params)

Index-time parameters {'M': 64, 'indexThreadQty': 4, 'efConstruction': 128, 'post': 0}


In [56]:
# start library, specify  space,  type of vector and add data points

index = nmslib.init(method='hnsw', space=space_name, data_type=nmslib.DataType.DENSE_VECTOR) 
index.addDataPointBatch(augmented_item_embeddings) 

14019

In [57]:
# Create  index

start = time.time()
index_time_params = {'M': M, 'indexThreadQty': threads, 'efConstruction': efc}

index.createIndex(index_time_params) 
end = time.time() 

print('Index-time parameters', index_time_params)
print('Index time = %f' % (end-start))

Index-time parameters {'M': 64, 'indexThreadQty': 4, 'efConstruction': 128}
Index time = 0.612103


In [58]:
# set 

# Setting query-time parameters
efs = 128
query_time_params = {'efSearch': efs}

print('Setting query-time parameters', query_time_params)

index.setQueryTimeParams(query_time_params)

Setting query-time parameters {'efSearch': 128}


In [60]:
# Querying

query_matrix = augmented_user_embeddings

query_qty = query_matrix.shape[0]
start = time.time() 
nbrs = index.knnQueryBatch(query_matrix, k = K, num_threads = threads)

end = time.time() 
print('kNN time overall=%f (seconds), per query=%f (sec), per query adjusted for number threads=%f (sec)' % 
      (end-start, float(end-start)/query_qty, threads*float(end-start)/query_qty))

kNN time overall=9.231518 (seconds), per query=0.000012 (sec), per query adjusted for number threads=0.000049 (sec)


In [61]:
def recom_all(query_factors, index_factors, topn=10):
    output = query_factors.dot(index_factors.T)
    argpart_indices = np.argpartition(output, -topn)[:, -topn:]

    x_indices = np.repeat(np.arange(output.shape[0]), topn)
    y_indices = argpart_indices.flatten()
    top_value = output[x_indices, y_indices].reshape(output.shape[0], topn)
    top_indices = np.argsort(top_value)[:, ::-1]

    y_indices = top_indices.flatten()
    top_indices = argpart_indices[x_indices, y_indices]
    labels = top_indices.reshape(-1, topn)
    
    distance = output[x_indices, top_indices].reshape(-1, topn)
    return labels, distance

In [64]:
recom_all(user_embeddings[[0], :], item_embeddings)


(array([[ 31,  19,  32,  43, 121,  62, 173, 268,  86, 120]]),
 array([[13.42027971, 13.31198084, 12.91483647, 12.84448338, 12.59528044,
         12.4585795 , 12.22174866, 11.86915594, 11.75324082, 11.5832679 ]]))

In [65]:
query_matrix_not_augmented = user_embeddings[:1000, :]


In [66]:
recom_all(query_matrix_not_augmented, item_embeddings)


(array([[ 31,  19,  32, ..., 268,  86, 120],
        [ 31,  19,  32, ..., 268,  86, 120],
        [ 31,  19,  32, ..., 268,  86, 120],
        ...,
        [ 31,  19,  32, ..., 268,  86, 120],
        [ 31,  19,  32, ..., 268,  86, 120],
        [ 31,  19,  32, ..., 268,  86, 120]]),
 array([[13.42027971, 13.31198084, 12.91483647, ..., 11.86915594,
         11.75324082, 11.5832679 ],
        [14.03862191, 13.92197333, 13.51989205, ..., 12.47808338,
         12.35877675, 12.19037274],
        [13.02669595, 12.91184065, 12.51605045, ..., 11.46585733,
         11.35368119, 11.18538683],
        ...,
        [13.67071498, 13.54837689, 13.1581406 , ..., 12.11172517,
         11.9903559 , 11.82591483],
        [14.00382051, 13.89097907, 13.50001558, ..., 12.45062528,
         12.33538802, 12.16623413],
        [13.037969  , 12.92424304, 12.52638511, ..., 11.47838891,
         11.3676066 , 11.19984053]]))

In [67]:
index.knnQueryBatch(query_matrix, k = K, num_threads = threads)


[(array([ 31,  19,  32,  43, 121,  62, 173, 268,  86, 120], dtype=int32),
  array([-13.42028 , -13.311979, -12.914836, -12.844482, -12.595281,
         -12.45858 , -12.221749, -11.869156, -11.753242, -11.583268],
        dtype=float32)),
 (array([ 31,  19,  32,  43, 121,  62, 173, 268,  86, 120], dtype=int32),
  array([-14.038622, -13.921972, -13.519891, -13.451093, -13.206717,
         -13.066833, -12.830372, -12.478082, -12.358776, -12.190372],
        dtype=float32)),
 (array([ 31,  19,  32,  43, 121,  62, 173, 268,  86, 120], dtype=int32),
  array([-13.026694 , -12.91184  , -12.51605  , -12.447765 , -12.202946 ,
         -12.060916 , -11.822695 , -11.4658575, -11.353682 , -11.185387 ],
        dtype=float32)),
 (array([ 31,  19,  32,  43, 121,  62, 173, 268,  86, 120], dtype=int32),
  array([-13.011549 , -12.895891 , -12.499902 , -12.42857  , -12.187348 ,
         -12.0404625, -11.809804 , -11.454793 , -11.337338 , -11.16734  ],
        dtype=float32)),
 (array([ 31,  19,  32,  43,

In [69]:
! pip install hnswlib

import hnswlib


Collecting hnswlib
  Downloading hnswlib-0.7.0.tar.gz (33 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: hnswlib
  Building wheel for hnswlib (pyproject.toml) ... [?25ldone
[?25h  Created wheel for hnswlib: filename=hnswlib-0.7.0-cp39-cp39-linux_x86_64.whl size=182538 sha256=dc8f2dc05e877b7b1d439cb0288a80b2ee9bf006662001a3e7771ce3e2403472
  Stored in directory: /home/iuliiasolomennikova/.cache/pip/wheels/ba/26/61/fface6c407f56418b3140cd7645917f20ba6b27d4e32b2bd20
Successfully built hnswlib
Installing collected packages: hnswlib
Successfully installed hnswlib-0.7.0


In [70]:
max_elements, dim = augmented_item_embeddings.shape
hnsw = hnswlib.Index("ip", dim) # might be L2, cosine or ip

# Init index - the max number of elems must be known beforehand
hnsw.init_index(max_elements, M, efc)

# Element insertion (can be called several times)
hnsw.add_items(augmented_item_embeddings)

In [72]:
# Control recall by setting ef, where ef > k
hnsw.set_ef(efc)

label, distance = hnsw.knn_query(query_matrix, k=k)


In [75]:
labels, distances = recom_all(user_embeddings[:1000, :], item_embeddings)

print(labels)
print(distances)

[[ 31  19  32 ... 268  86 120]
 [ 31  19  32 ... 268  86 120]
 [ 31  19  32 ... 268  86 120]
 ...
 [ 31  19  32 ... 268  86 120]
 [ 31  19  32 ... 268  86 120]
 [ 31  19  32 ... 268  86 120]]
[[13.42027971 13.31198084 12.91483647 ... 11.86915594 11.75324082
  11.5832679 ]
 [14.03862191 13.92197333 13.51989205 ... 12.47808338 12.35877675
  12.19037274]
 [13.02669595 12.91184065 12.51605045 ... 11.46585733 11.35368119
  11.18538683]
 ...
 [13.67071498 13.54837689 13.1581406  ... 12.11172517 11.9903559
  11.82591483]
 [14.00382051 13.89097907 13.50001558 ... 12.45062528 12.33538802
  12.16623413]
 [13.037969   12.92424304 12.52638511 ... 11.47838891 11.3676066
  11.19984053]]


In [78]:
# Recommen-s for user

user_id = 0
! p

/home/iuliiasolomennikova/Desktop/RecoServiceTemplate_2023/notebooks


In [79]:
# Save item_embeddings
# save user_embeddings


! mkdir /home/iuliiasolomennikova/Desktop/RecoServiceTemplate_2023/notebooks/data/kion_train/offline_reco_df

import joblib


                 
                 
joblib.dump(item_embeddings, '/home/iuliiasolomennikova/Desktop/RecoServiceTemplate_2023/notebooks/data/kion_train/offline_reco_df/ANN_item_embeddings.sav')
joblib.dump(user_embeddings, '/home/iuliiasolomennikova/Desktop/RecoServiceTemplate_2023/notebooks/data/kion_train/offline_reco_df/ANN_user_embeddings.sav')


joblib.dump(label, '/home/iuliiasolomennikova/Desktop/RecoServiceTemplate_2023/notebooks/data/kion_train/offline_reco_df/ANN_label.sav')
joblib.dump(distance, '/home/iuliiasolomennikova/Desktop/RecoServiceTemplate_2023/notebooks/data/kion_train/offline_reco_df/ANN_distance.sav')

['/home/iuliiasolomennikova/Desktop/RecoServiceTemplate_2023/notebooks/data/kion_train/offline_reco_df/ANN_distance.sav']