In [None]:
!pip install rectools optuna > None

In [None]:
import os

import pandas as pd
import numpy as np

import requests
from tqdm.auto import tqdm

from rectools.metrics import Precision, Recall, MAP, calc_metrics, MeanInvUserFreq, Serendipity
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel
from rectools.models.popular import PopularModel
from rectools import Columns
from rectools.dataset import Dataset

from lightfm import LightFM

from implicit.als import AlternatingLeastSquares
from implicit.bpr import BayesianPersonalizedRanking
from implicit.lmf import LogisticMatrixFactorization

from sklearn.model_selection import train_test_split

import optuna
import dill

In [None]:
import warnings
# Blanket filter: every warning is suppressed for the rest of the session.
warnings.filterwarnings('ignore')

In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive (requires the google.colab runtime).
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
%cd gdrive/MyDrive/recsys/

[Errno 2] No such file or directory: 'gdrive/MyDrive/recsys/'
/content/gdrive/MyDrive/recsys


In [None]:
# Request a single OpenBLAS thread.
# NOTE(review): numpy and implicit are already imported at this point; OpenBLAS presumably
# reads this variable only when it is first loaded — confirm the setting takes effect, or
# move this assignment above the import cell.
os.environ["OPENBLAS_NUM_THREADS"] = "1"

In [None]:
SEED = 2022  # random_state shared by the user splits and the ALS / LightFM models
K_RECOS = 10  # NOTE(review): not referenced anywhere in the visible part of the notebook

## Get data

In [None]:
def load_data(data_dir='kion_train', watched_pct_threshold=10):
    """Read the interactions, users and items tables and prepare interactions.

    Parameters
    ----------
    data_dir : str, default 'kion_train'
        Directory containing ``interactions.csv``, ``users.csv`` and ``items.csv``.
    watched_pct_threshold : int or float, default 10
        Interactions with ``watched_pct`` strictly above this value get weight 3,
        every other interaction (including a missing ``watched_pct``) gets weight 1.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(interactions, users, items)``. In ``interactions`` the column
        ``last_watch_dt`` is renamed to ``datetime`` and parsed to datetimes, and
        ``total_dur`` is renamed to ``weight``.
    """
    interactions = pd.read_csv(os.path.join(data_dir, 'interactions.csv'))
    users = pd.read_csv(os.path.join(data_dir, 'users.csv'))
    items = pd.read_csv(os.path.join(data_dir, 'items.csv'))

    # Non-inplace rename keeps the frame returned by read_csv untouched.
    interactions = interactions.rename(
        columns={
            'last_watch_dt': 'datetime',
            'total_dur': 'weight',
        }
    )
    interactions['datetime'] = pd.to_datetime(interactions['datetime'])

    # The weight column (the renamed total_dur) is overwritten here: the watch
    # duration itself is not kept, only a two-level weight based on watched_pct.
    interactions[Columns.Weight] = np.where(
        interactions['watched_pct'] > watched_pct_threshold, 3, 1
    )
    return interactions, users, items

In [None]:
# Read the three raw tables from disk; interactions come back with datetime and weight columns prepared.
interactions, users, items = load_data()

## Train/test split

In [None]:
# Time span covered by the interaction log; max_date anchors the later time-based split.
watch_dates = interactions[Columns.Datetime]
min_date, max_date = watch_dates.min(), watch_dates.max()

print(f'min дата в interactions: {min_date}')
print(f'max дата в interactions: {max_date}')
print(f'Продолжительность: {max_date - min_date}')

min дата в interactions: 2021-03-13 00:00:00
max дата в interactions: 2021-08-22 00:00:00
Продолжительность: 162 days 00:00:00


In [None]:
# Fractions of the ranker-window users assigned to each group (they sum to 1).
ranker_train_size = 0.7
ranker_val_size = 0.15
ranker_test_size = 0.15

In [None]:
# The last `ranker_days_count` days of the log (boundary day included) are held out.
ranker_days_count = 30
ranker_start_date = max_date - pd.Timedelta(days=ranker_days_count)
ranker_data = interactions[interactions[Columns.Datetime] >= ranker_start_date]

In [None]:
# Split the users seen in the ranker window into train / val / test groups.
ranker_user_ids = ranker_data['user_id'].unique()
train_val_users, test_users = train_test_split(
    ranker_user_ids,
    test_size=ranker_test_size,
    random_state=SEED,
)

# The validation share is re-expressed relative to what remains once the test users are removed.
val_share = ranker_val_size / (ranker_train_size + ranker_val_size)
train_users, val_users = train_test_split(
    train_val_users,
    test_size=val_share,
    random_state=SEED,
)

In [None]:
# Interactions strictly before the ranker window; this is the data the Dataset is built from.
# ranker_days_count is intentionally not re-assigned here: it is defined once, where
# ranker_data is sliced, so the two complementary slices cannot drift apart.
train = interactions[
    interactions[Columns.Datetime] < max_date - pd.Timedelta(days=ranker_days_count)
]

## Prepare user features

In [None]:
# Count missing values per column of the users table.
users.isna().sum()

user_id         0
age         14095
income      14776
sex         13831
kids_flg        0
dtype: int64

In [None]:
# Replace every missing value in the users table with the literal 'Unknown'.
users = users.fillna('Unknown')

In [None]:
# Keep only users that have at least one interaction in train.
is_train_user = users[Columns.User].isin(train[Columns.User])
users = users[is_train_user].copy()
users.head()

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,1
1,962099,age_18_24,income_20_40,М,0
3,721985,age_45_54,income_20_40,Ж,0
4,704055,age_35_44,income_60_90,Ж,0
5,1037719,age_45_54,income_60_90,М,0


In [None]:
# Build a long ("flatten") table with one row per (user, feature) pair: columns id / value / feature.
user_features_frames = []
for feature in ["sex", "age", "income"]:
    feature_frame = (
        users.reindex(columns=[Columns.User, feature])
        .rename(columns={Columns.User: "id", feature: "value"})
        .assign(feature=feature)
    )
    user_features_frames.append(feature_frame)

user_features = pd.concat(user_features_frames)
user_features.head()

Unnamed: 0,id,value,feature
0,973171,М,sex
1,962099,М,sex
3,721985,Ж,sex
4,704055,Ж,sex
5,1037719,М,sex


## Prepare item features

In [None]:
# Count missing values per column of the items table.
items.isna().sum()

item_id             0
content_type        0
title               0
title_orig       4745
release_year       98
genres              0
countries          37
for_kids        15397
age_rating          2
studios         14898
directors        1509
actors           2619
description         2
keywords          423
dtype: int64

In [None]:
# Keep only items that occur in the train interactions.
is_train_item = items[Columns.Item].isin(train[Columns.Item])
items = items[is_train_item].copy()
items.head()

Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
0,10711,film,Поговори с ней,Hable con ella,2002.0,"драмы, зарубежные, детективы, мелодрамы",Испания,,16.0,,Педро Альмодовар,"Адольфо Фернандес, Ана Фернандес, Дарио Гранди...",Мелодрама легендарного Педро Альмодовара «Пого...,"Поговори, ней, 2002, Испания, друзья, любовь, ..."
1,2508,film,Голые перцы,Search Party,2014.0,"зарубежные, приключения, комедии",США,,16.0,,Скот Армстронг,"Адам Палли, Брайан Хаски, Дж.Б. Смув, Джейсон ...",Уморительная современная комедия на популярную...,"Голые, перцы, 2014, США, друзья, свадьбы, прео..."
2,10716,film,Тактическая сила,Tactical Force,2011.0,"криминал, зарубежные, триллеры, боевики, комедии",Канада,,16.0,,Адам П. Калтраро,"Адриан Холмс, Даррен Шалави, Джерри Вассерман,...",Профессиональный рестлер Стив Остин («Все или ...,"Тактическая, сила, 2011, Канада, бандиты, ганг..."
3,7868,film,45 лет,45 Years,2015.0,"драмы, зарубежные, мелодрамы",Великобритания,,16.0,,Эндрю Хэй,"Александра Риддлстон-Барретт, Джеральдин Джейм...","Шарлотта Рэмплинг, Том Кортни, Джеральдин Джей...","45, лет, 2015, Великобритания, брак, жизнь, лю..."
4,16268,film,Все решает мгновение,,1978.0,"драмы, спорт, советские, мелодрамы",СССР,,12.0,Ленфильм,Виктор Садовский,"Александр Абдулов, Александр Демьяненко, Алекс...",Расчетливая чаровница из советского кинохита «...,"Все, решает, мгновение, 1978, СССР, сильные, ж..."


### Genre

In [None]:
# Turn the comma-separated genres string into a list per item ...
items["genre"] = (
    items["genres"]
    .str.lower()
    .str.replace(", ", ",", regex=False)
    .str.split(",")
)
# ... then emit one (id, value, feature) row per individual genre.
genre_feature = (
    items[["item_id", "genre"]]
    .explode("genre")
    .rename(columns={"item_id": "id", "genre": "value"})
    .assign(feature="genre")
)
genre_feature.head()

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre


### Content

In [None]:
# One (id, value, feature) row per item carrying its content_type.
content_feature = (
    items.reindex(columns=[Columns.Item, "content_type"])
    .rename(columns={Columns.Item: "id", "content_type": "value"})
    .assign(feature="content_type")
)
content_feature.head()

Unnamed: 0,id,value,feature
0,10711,film,content_type
1,2508,film,content_type
2,10716,film,content_type
3,7868,film,content_type
4,16268,film,content_type


### Binned release_year

In [None]:
# Preview the interval each item falls into when release_year is cut into 10 quantile bins.
pd.qcut(items['release_year'], q=10).head()

0      (1984.0, 2004.0]
1      (2012.0, 2014.0]
2      (2009.0, 2012.0]
3      (2014.0, 2016.0]
4    (1896.999, 1984.0]
Name: release_year, dtype: category
Categories (10, interval[float64, right]): [(1896.999, 1984.0] < (1984.0, 2004.0] < (2004.0, 2009.0] <
                                            (2009.0, 2012.0] ... (2016.0, 2018.0] <
                                            (2018.0, 2019.0] < (2019.0, 2020.0] < (2020.0, 2021.0]]

In [None]:
# Label each item with the index (0..9) of its release_year quantile bin.
# NOTE(review): rows with a missing release_year stay NaN after qcut — confirm that is acceptable downstream.
n_year_bins = 10
items['binned_r_year'] = pd.qcut(
    items['release_year'],
    q=n_year_bins,
    labels=list(range(n_year_bins)),
)

In [None]:
# One (id, value, feature) row per item carrying its release-year bin.
release_year_feature = (
    items.reindex(columns=[Columns.Item, "binned_r_year"])
    .rename(columns={Columns.Item: "id", "binned_r_year": "value"})
    .assign(feature="binned_r_year")
)
release_year_feature.head()

Unnamed: 0,id,value,feature
0,10711,1,binned_r_year
1,2508,4,binned_r_year
2,10716,3,binned_r_year
3,7868,5,binned_r_year
4,16268,0,binned_r_year


### Countries

In [None]:
# Turn the comma-separated countries string into a list per item ...
items["country"] = (
    items["countries"]
    .str.lower()
    .str.replace(", ", ",", regex=False)
    .str.split(",")
)
# ... then emit one (id, value, feature) row per individual country.
country_feature = (
    items[["item_id", "country"]]
    .explode("country")
    .rename(columns={"item_id": "id", "country": "value"})
    .assign(feature="country")
)
country_feature.head()

Unnamed: 0,id,value,feature
0,10711,испания,country
1,2508,сша,country
2,10716,канада,country
3,7868,великобритания,country
4,16268,ссср,country


### Combine all

In [None]:
# Stack all per-feature tables into a single long item-feature table.
item_feature_frames = [genre_feature, content_feature, country_feature, release_year_feature]
item_features = pd.concat(item_feature_frames)

# Spot check: all feature rows of a single item.
item_features[item_features['id'] == 10711]

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
0,10711,film,content_type
0,10711,испания,country
0,10711,1,binned_r_year


## Init dataset

In [None]:
# Feature names that Dataset.construct is told to treat as categorical.
cat_user_feature_names = ["sex", "age", "income"]
cat_item_feature_names = ["genre", "country", 'binned_r_year', 'content_type']

# The dataset is built from the pre-ranker-window interactions only.
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=cat_user_feature_names,
    item_features_df=item_features,
    cat_item_features=cat_item_feature_names,
)

## Models

### ImplicitALS

In [None]:
# Underlying implicit ALS estimator, seeded for reproducibility.
als_base = AlternatingLeastSquares(
    factors=128,
    regularization=0.21,
    random_state=SEED,
)
# RecTools wrapper around the estimator, fitted on the dataset built above.
model_obj = ImplicitALSWrapperModel(model=als_base, fit_features_together=True)

model_obj.fit(dataset)

In [None]:
# Persist the model currently bound to model_obj (the fitted ALS wrapper when cells run in order).
with open('implicit_als_base.dill', 'wb') as f:
   dill.dump(model_obj, f)

### LightFM

In [None]:
# Underlying LightFM estimator (WARP loss), seeded for reproducibility.
lightfm_base = LightFM(
    no_components=32,
    loss='warp',
    learning_rate=0.05,
    user_alpha=0.1,
    item_alpha=0.1,
    random_state=SEED,
)
# RecTools wrapper: two training epochs on a single thread.
model_obj = LightFMWrapperModel(model=lightfm_base, epochs=2, num_threads=1)

model_obj.fit(dataset)

In [None]:
# Persist the model currently bound to model_obj (the fitted LightFM wrapper when cells run in order).
with open('lightfm_base.dill', 'wb') as f:
   dill.dump(model_obj, f)

### Popular

In [None]:
# Popularity baseline with default settings; the trailing ';' hides the cell output.
popular_model = PopularModel()
popular_model.fit(dataset);

In [None]:
# Persist the fitted popularity model.
with open('popular_base.dill', 'wb') as f:
   dill.dump(popular_model, f)

## Get candidates

In [None]:
def calc_metrics_(candidates_df, rank_col, k=10):
    """Compute MAP@k and Recall@k of candidates for the held-out test users.

    Parameters
    ----------
    candidates_df : pandas.DataFrame
        Recommendations with user, item and rank columns.
    rank_col : str
        Name of the rank column in ``candidates_df``; it is renamed to
        ``Columns.Rank`` before the metrics are computed.
    k : int, default 10
        Cut-off passed to both metrics.

    Returns
    -------
    Whatever ``calc_metrics`` returns for the metrics ``'map@k'`` and ``'recall@k'``.

    Notes
    -----
    Reads the module-level ``test_users``, ``ranker_data``, ``train`` and ``items``.
    """
    interaction_cols = [Columns.User, Columns.Item, Columns.Datetime, Columns.Weight]

    # Recommendations restricted to test users, with the rank column normalised.
    reco_is_test = candidates_df[Columns.User].isin(test_users)
    reco = candidates_df.rename(columns={rank_col: Columns.Rank}).loc[
        reco_is_test, [Columns.User, Columns.Item, Columns.Rank]
    ]

    # Ground truth: what test users did inside the ranker window.
    truth_is_test = ranker_data[Columns.User].isin(test_users)
    test_interactions = ranker_data.loc[truth_is_test, interaction_cols]

    # History: what the same users did before the ranker window.
    history_is_test = train[Columns.User].isin(test_users)
    test_history = train.loc[history_is_test, interaction_cols]

    return calc_metrics(
        metrics={
            'map@k': MAP(k=k),
            'recall@k': Recall(k=k),
        },
        reco=reco,
        interactions=test_interactions,
        prev_interactions=test_history,
        catalog=items['item_id'].unique(),
    )

In [None]:
def get_candidates(top_n, model):
    """Return the top ``top_n`` recommendations of ``model`` for every user in train.

    Items a user has already interacted with are filtered out. Reads the
    module-level ``train`` and ``dataset``.
    """
    known_users = train['user_id'].unique()
    return model.recommend(
        users=known_users,
        dataset=dataset,
        k=top_n,
        filter_viewed=True,
    )

### LightFM

In [None]:
# Reload the LightFM model saved earlier in this notebook.
# NOTE: unpickling executes code stored in the file — only load .dill files you created yourself.
with open('lightfm_base.dill', 'rb') as f:
   model = dill.load(f)

In [None]:
%%time

# Sweep the candidate-list size and report MAP@n / Recall@n on the test users.
# The recommendation frame already stores its rank in the Columns.Rank column, so that
# name is passed instead of a non-existent 'lfm_rank' (which only worked because
# renaming a missing column is a silent no-op).
for n in [20, 30, 40, 50]:
    candidates = get_candidates(n, model)
    metric_values = calc_metrics_(candidates, Columns.Rank, k=n)
    print(f'n: {n}, metric_values: {metric_values}')

n: 20, metric_values: {'recall@k': 0.10679115810044539, 'map@k': 0.04464958784948523}
n: 30, metric_values: {'recall@k': 0.11752778159657461, 'map@k': 0.04531977217082661}
n: 40, metric_values: {'recall@k': 0.12244546439203735, 'map@k': 0.045549830592495136}
n: 50, metric_values: {'recall@k': 0.12551395189878733, 'map@k': 0.04566728509052326}
CPU times: user 39min 21s, sys: 18min 16s, total: 57min 38s
Wall time: 31min 18s


Best n

In [None]:
# Regenerate the top-30 LightFM candidates (this re-runs inference) and save them to CSV.
candidates = get_candidates(30, model)
candidates.to_csv('lfm_candidates.csv', index=False)
candidates.head()

Unnamed: 0,user_id,item_id,score,rank
0,176549,10440,-4.1e-05,1
1,176549,15297,-4.1e-05,2
2,176549,4151,-4.1e-05,3
3,176549,13865,-4.1e-05,4
4,176549,142,-4.2e-05,5


### ImplicitALS

In [None]:
# Reload the ALS model saved earlier in this notebook; `model` is rebound from LightFM to ALS here.
# NOTE: unpickling executes code stored in the file — only load .dill files you created yourself.
with open('implicit_als_base.dill', 'rb') as f:
   model = dill.load(f)

In [None]:
%%time

# Sweep the candidate-list size and report MAP@n / Recall@n on the test users.
# The recommendation frame already stores its rank in the Columns.Rank column, so that
# name is passed instead of a non-existent 'als_rank' (which only worked because
# renaming a missing column is a silent no-op).
for n in [20, 30, 40, 50]:
    candidates = get_candidates(n, model)
    metric_values = calc_metrics_(candidates, Columns.Rank, k=n)
    print(f'n: {n}, metric_values: {metric_values}')

n: 20, metric_values: {'recall@k': 0.09358493476354579, 'map@k': 0.04433747328099608}
n: 30, metric_values: {'recall@k': 0.10999208653481576, 'map@k': 0.04531793934351448}
n: 40, metric_values: {'recall@k': 0.1204941502858859, 'map@k': 0.0457975186124}
n: 50, metric_values: {'recall@k': 0.12928640572729708, 'map@k': 0.04612312566194038}
CPU times: user 2h 48min 37s, sys: 32min 30s, total: 3h 21min 7s
Wall time: 1h 48min 12s


In [None]:
# Regenerate the top-30 ALS candidates (this re-runs inference) and save them to CSV.
candidates = get_candidates(30, model)
candidates.to_csv('als_candidates.csv', index=False)
candidates.head()

Unnamed: 0,user_id,item_id,score,rank
0,176549,13865,0.413584,1
1,176549,10440,0.405836,2
2,176549,15297,0.361396,3
3,176549,7571,0.266977,4
4,176549,4151,0.250057,5


### Popular

Let's obtain scores and ranks for all items

In [None]:
# Number of distinct items in the (train-filtered) items table; used as the list length below.
k = items['item_id'].nunique()

In [None]:
# Ask the popularity model for a list of length k for a single user (the first external id),
# without filtering out items that user has already seen.
# NOTE(review): presumably one user is enough because the unfiltered popular ranking is the
# same for everyone — confirm before reusing these scores for other users.
candidates = popular_model.recommend(
    dataset.user_id_map.external_ids[:1], 
    dataset=dataset, 
    k=k, 
    filter_viewed=False,
)

In [None]:
# Save the popularity scores and ranks to CSV and preview the top rows.
candidates.to_csv('popular_candidates.csv', index=False)
candidates.head()

Unnamed: 0,user_id,item_id,score,rank
0,176549,10440,141889.0,1
1,176549,15297,137128.0,2
2,176549,13865,93403.0,3
3,176549,9728,76978.0,4
4,176549,4151,69641.0,5
