In [1]:
import os

In [2]:
# Pin OpenBLAS to a single thread (for implicit ALS).
os.environ.update({"OPENBLAS_NUM_THREADS": "1"})

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
import pandas as pd
import numpy as np

from implicit.als import AlternatingLeastSquares

from rectools.metrics import Precision, Recall, MAP, calc_metrics
from rectools.models import PopularModel, RandomModel, ImplicitALSWrapperModel
from rectools import Columns
from rectools.dataset import Dataset
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel

import matplotlib.pyplot as plt
import seaborn as sns

import matplotlib.pyplot as plt
from pathlib import Path
import typing as tp
from tqdm import tqdm

from lightfm import LightFM

from implicit.bpr import BayesianPersonalizedRanking

from implicit.lmf import LogisticMatrixFactorization

# Load data

In [6]:
# Download the KION dataset archive in 1 MiB chunks, showing a progress bar.
import requests
url = "https://storage.yandexcloud.net/itmo-recsys-public-data/kion_train.zip"

# The context managers close the HTTP connection, the output file and the
# progress bar even if the transfer fails; closing the bar also lets tqdm
# render its final state inside this cell instead of a later one.
with requests.get(url, stream=True, timeout=60) as req:
    # Fail fast on 4xx/5xx instead of saving an error page as a .zip file.
    req.raise_for_status()
    total_size_in_bytes = int(req.headers.get('Content-Length', 0))
    with open('kion_train.zip', "wb") as fd, tqdm(
        desc='kion dataset download', total=total_size_in_bytes, unit='iB', unit_scale=True
    ) as progress_bar:
        for chunk in req.iter_content(chunk_size=2 ** 20):
            fd.write(chunk)
            progress_bar.update(len(chunk))

kion dataset download:  97%|█████████▋| 76.5M/78.8M [00:04<00:00, 21.1MiB/s]

In [7]:
!unzip kion_train.zip

Archive:  kion_train.zip
   creating: kion_train/
  inflating: kion_train/interactions.csv  
  inflating: __MACOSX/kion_train/._interactions.csv  
  inflating: kion_train/users.csv    
  inflating: __MACOSX/kion_train/._users.csv  
  inflating: kion_train/items.csv    
  inflating: __MACOSX/kion_train/._items.csv  


In [8]:
%%time
interactions = pd.read_csv('kion_train/interactions.csv')
users = pd.read_csv('kion_train/users.csv')
items = pd.read_csv('kion_train/items.csv')

CPU times: user 3.63 s, sys: 344 ms, total: 3.97 s
Wall time: 5.32 s


# Preprocess

In [9]:
# Point rectools' datetime column name at this dataset's 'last_watch_dt' column.
# NOTE(review): this reassigns an attribute on the imported `Columns` class, so it
# affects every later use of `Columns.Datetime` in this kernel. Renaming the column
# in `interactions` may be the less invasive option — confirm against rectools.
Columns.Datetime = 'last_watch_dt'

In [10]:
# Remove rows whose date string is not exactly 10 characters long
# (the length of a 'YYYY-MM-DD' value); `interactions` is modified in place.
malformed_date_rows = interactions.loc[interactions[Columns.Datetime].str.len() != 10].index
interactions.drop(index=malformed_date_rows, inplace=True)

In [11]:
# Parse the remaining 'YYYY-MM-DD' strings into datetime values.
interactions[Columns.Datetime] = interactions[Columns.Datetime].pipe(pd.to_datetime, format='%Y-%m-%d')

In [12]:
# Latest interaction date in the data; the train/test split below is anchored on it.
max_date = interactions[Columns.Datetime].max()

In [13]:
# Interaction weight: 3 when more than 10% of the title was watched, otherwise 1.
watched_over_10_pct = interactions['watched_pct'] > 10
interactions[Columns.Weight] = np.where(watched_over_10_pct, 3, 1)

In [14]:
# Time-based split: the last 7 days (inclusive of the boundary) form the test set,
# everything strictly earlier forms the train set.
split_date = max_date - pd.Timedelta(days=7)
before_split = interactions[Columns.Datetime] < split_date
from_split_on = interactions[Columns.Datetime] >= split_date

train = interactions[before_split].copy()
test = interactions[from_split_on].copy()

print(f"train: {train.shape}")
print(f"test: {test.shape}")

kion dataset download: 100%|██████████| 78.8M/78.8M [00:19<00:00, 21.1MiB/s]

train: (4985269, 6)
test: (490982, 6)


In [15]:
# Replace the time-based train split with a copy of ALL interactions.
# NOTE(review): `train` now also contains the rows that make up `test`, so any
# metric computed later against `test` is not a held-out estimate. Presumably this
# is intentional (fit on all data to produce the final recommendations) — confirm,
# and treat the reported MAP@10 accordingly.
train = interactions.copy()
print(f"train: {train.shape}")
print(f"test: {test.shape}")

train: (5476251, 6)
test: (490982, 6)


In [16]:
# Discard train interactions with total_dur below 300 (presumably seconds — confirm);
# `train` is modified in place.
short_watch_rows = train.loc[train["total_dur"] < 300].index
train.drop(index=short_watch_rows, inplace=True)

In [17]:
# Find cold users: present in test but with no interactions left in train.
cold_users = set(test[Columns.User]).difference(train[Columns.User])

In [18]:
# Remove the cold users' rows from test (in place).
cold_test_rows = test.loc[test[Columns.User].isin(cold_users)].index
test.drop(index=cold_test_rows, inplace=True)

# Prepare features

## User features

In [19]:
# Missing-value count per user column.
users.isna().sum()

user_id         0
age         14095
income      14776
sex         13831
kids_flg        0
dtype: int64

In [20]:
# Replace every missing value, in all columns, with the literal 'Unknown'
# (mutates `users` in place).
users.fillna('Unknown', inplace=True)

In [21]:
# Distinct-value count per user column ('Unknown' now counts as a value).
users.nunique()

user_id     840197
age              7
income           7
sex              3
kids_flg         2
dtype: int64

In [22]:
# Keep only the users that have at least one interaction in train.
has_train_history = users[Columns.User].isin(train[Columns.User])
users = users[has_train_history].copy()

In [23]:
# Display the filtered user table (pandas truncates the rendered rows).
users

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,1
1,962099,age_18_24,income_20_40,М,0
3,721985,age_45_54,income_20_40,Ж,0
4,704055,age_35_44,income_60_90,Ж,0
5,1037719,age_45_54,income_60_90,М,0
...,...,...,...,...,...
840184,529394,age_25_34,income_40_60,Ж,0
840186,80113,age_25_34,income_40_60,Ж,0
840188,312839,age_65_inf,income_60_90,Ж,0
840189,191349,age_45_54,income_40_60,М,1


In [24]:
# Reshape the user table into long format: one (id, value, feature) row per
# user and per categorical attribute, then stack the per-attribute frames.
user_features_frames = [
    users.reindex(columns=[Columns.User, feature])
    .rename(columns={Columns.User: "id", feature: "value"})
    .assign(feature=feature)
    for feature in ["sex", "age", "income"]
]
user_features = pd.concat(user_features_frames)
user_features.head()

Unnamed: 0,id,value,feature
0,973171,М,sex
1,962099,М,sex
3,721985,Ж,sex
4,704055,Ж,sex
5,1037719,М,sex


In [25]:
# Spot-check: all feature rows of a single user.
user_features[user_features["id"] == 973171]

Unnamed: 0,id,value,feature
0,973171,М,sex
0,973171,age_25_34,age
0,973171,income_60_90,income


## Item features

In [26]:
# Missing-value count per item column.
items.isna().sum()

item_id             0
content_type        0
title               0
title_orig       4745
release_year       98
genres              0
countries          37
for_kids        15397
age_rating          2
studios         14898
directors        1509
actors           2619
description         2
keywords          423
dtype: int64

In [27]:
# Keep only the items that appear in train interactions.
seen_in_train = items[Columns.Item].isin(train[Columns.Item])
items = items[seen_in_train].copy()

In [28]:
# Preview the first rows of the filtered item table.
items.head()

Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
0,10711,film,Поговори с ней,Hable con ella,2002.0,"драмы, зарубежные, детективы, мелодрамы",Испания,,16.0,,Педро Альмодовар,"Адольфо Фернандес, Ана Фернандес, Дарио Гранди...",Мелодрама легендарного Педро Альмодовара «Пого...,"Поговори, ней, 2002, Испания, друзья, любовь, ..."
1,2508,film,Голые перцы,Search Party,2014.0,"зарубежные, приключения, комедии",США,,16.0,,Скот Армстронг,"Адам Палли, Брайан Хаски, Дж.Б. Смув, Джейсон ...",Уморительная современная комедия на популярную...,"Голые, перцы, 2014, США, друзья, свадьбы, прео..."
2,10716,film,Тактическая сила,Tactical Force,2011.0,"криминал, зарубежные, триллеры, боевики, комедии",Канада,,16.0,,Адам П. Калтраро,"Адриан Холмс, Даррен Шалави, Джерри Вассерман,...",Профессиональный рестлер Стив Остин («Все или ...,"Тактическая, сила, 2011, Канада, бандиты, ганг..."
3,7868,film,45 лет,45 Years,2015.0,"драмы, зарубежные, мелодрамы",Великобритания,,16.0,,Эндрю Хэй,"Александра Риддлстон-Барретт, Джеральдин Джейм...","Шарлотта Рэмплинг, Том Кортни, Джеральдин Джей...","45, лет, 2015, Великобритания, брак, жизнь, лю..."
4,16268,film,Все решает мгновение,,1978.0,"драмы, спорт, советские, мелодрамы",СССР,,12.0,Ленфильм,Виктор Садовский,"Александр Абдулов, Александр Демьяненко, Алекс...",Расчетливая чаровница из советского кинохита «...,"Все, решает, мгновение, 1978, СССР, сильные, ж..."


In [29]:
# Distinct-value count per item column.
items.nunique()

item_id         14163
content_type        2
title           13582
title_orig       9797
release_year      104
genres           2575
countries         669
for_kids            2
age_rating          6
studios            38
directors        7468
actors          11942
description     13929
keywords        13706
dtype: int64

### Genre

In [30]:
# Explode genres to flatten table
items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
genre_feature = items[["item_id", "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"
genre_feature.head()

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre


### Content

In [31]:
# Long-format frame with each item's content type as an (id, value, feature) row.
content_feature = (
    items.reindex(columns=[Columns.Item, "content_type"])
    .rename(columns={Columns.Item: "id", "content_type": "value"})
    .assign(feature="content_type")
)

In [32]:
# Display the content-type feature frame (pandas truncates the rendered rows).
content_feature

Unnamed: 0,id,value,feature
0,10711,film,content_type
1,2508,film,content_type
2,10716,film,content_type
3,7868,film,content_type
4,16268,film,content_type
...,...,...,...
15958,6443,series,content_type
15959,2367,series,content_type
15960,10632,series,content_type
15961,4538,series,content_type


In [33]:
# Stack genre and content-type rows into a single item feature table.
item_features = pd.concat([genre_feature, content_feature], axis=0)

In [34]:
# Display the combined item feature table (pandas truncates the rendered rows).
item_features

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre
...,...,...,...
15958,6443,series,content_type
15959,2367,series,content_type
15960,10632,series,content_type
15961,4538,series,content_type


# Metrics

In [35]:
# Metric classes to evaluate, keyed by display name
# (Precision and Recall are currently switched off).
metrics_name = {
#     'Precision': Precision,
#     'Recall': Recall,
    'MAP': MAP,
}

# One metric instance per (metric, k) pair; the range currently yields only k=10.
metrics = {
    f'{metric_name}@{k}': metric(k=k)
    for metric_name, metric in metrics_name.items()
    for k in range(10, 11)
}

In [36]:
# Show the configured metric instances.
metrics

{'MAP@10': MAP(k=10, divide_by_k=False)}

# Models

In [37]:
# Experiment configuration.
# NOTE(review): NUM_THREADS, N_FACTORS, N_EPOCHS and LEARNING_RATE are not referenced
# by the visible model cell, which hard-codes 16 / 64 / 1 / 0.05. They are presumably
# leftovers of a parameter grid — confirm, then wire them in or remove them.
K_RECOS = 10  # number of recommendations requested per user (passed as `k`)
RANDOM_STATE = 42  # seed passed to LightFM
NUM_THREADS = [16, 32]  # candidate thread counts
N_FACTORS = (32, 64)  # candidate embedding sizes
N_EPOCHS = 1 # LightFM
USER_ALPHA = 0 # LightFM
ITEM_ALPHA = 0 # LightFM
LEARNING_RATE = [0.05, 0.1] # LightFM

In [38]:
%%time
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

CPU times: user 2.01 s, sys: 62.1 ms, total: 2.07 s
Wall time: 2.08 s


In [39]:
# Unique user ids in the (cold-user-filtered) test set; these users get
# recommendations for the evaluation below.
TEST_USERS = test[Columns.User].unique()

In [40]:
# LightFM with WARP loss and 64 latent components, wrapped for rectools;
# fitted for a single epoch on 16 threads.
lightfm_core = LightFM(
    no_components=64,
    loss='warp',
    random_state=RANDOM_STATE,
    learning_rate=0.05,
    user_alpha=USER_ALPHA,
    item_alpha=ITEM_ALPHA,
)
model = LightFMWrapperModel(lightfm_core, epochs=1, num_threads=16)

In [8]:
# Old model: alternative to fitting below — load a previously pickled model.
# import pickle
# model = pickle.load(open('/content/LightFMWrapperModel_w.pickle', 'rb'))

In [41]:
# Train the wrapped LightFM model on the dataset constructed above.
model.fit(dataset)

<rectools.models.lightfm.LightFMWrapperModel at 0x7fbd6603c640>

In [42]:
# Recommend K_RECOS items to each test user and score them against `test`.
# filter_viewed=True presumably removes items the user already interacted with
# in `dataset` — confirm against the rectools documentation.
# NOTE(review): `dataset` is built from `train`, which was reassigned to a copy of
# all interactions (including the test period), so this score is not a held-out
# estimate — confirm whether that is intended.
recos = model.recommend(
        users=TEST_USERS,
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )

calc_metrics(metrics, recos, test, train)

{'MAP@10': 0.010535155003694265}

In [43]:
# All distinct users that have interactions in train, and how many there are.
uniq_users = train["user_id"].unique()
len(uniq_users)

809577

In [44]:
# Recommend K_RECOS items to every user found in train, with the same
# settings as for the test users.
r = model.recommend(
          uniq_users,
          dataset=dataset,
          k=K_RECOS,
          filter_viewed=True,
      )

In [45]:
# Display the recommendation table (pandas truncates the rendered rows).
r

Unnamed: 0,user_id,item_id,score,rank
0,176549,13865,-205.902826,1
1,176549,10440,-206.083098,2
2,176549,3734,-206.097284,3
3,176549,7626,-206.269442,4
4,176549,12173,-206.303773,5
...,...,...,...,...
8095765,697262,3734,-146.388407,6
8095766,697262,4880,-146.604653,7
8095767,697262,2657,-146.669451,8
8095768,697262,4740,-146.699518,9


In [47]:
# Collect each user's recommended item ids into a list.
# With pandas' default `to_dict()` orientation the result is nested as
# {"item_id": {user_id: [item_id, ...]}}, so lookups need the outer "item_id" key.
# NOTE(review): the lists presumably keep the rank order of rows in `r` — confirm.
recos_all_users = r.groupby("user_id").agg({"item_id": list}).to_dict()

In [51]:
import pickle

# Persist the recommendations; the context manager guarantees the file is
# flushed and closed before anything tries to read it back.
with open("recos_all_users.pickle", 'wb') as recos_file:
    pickle.dump(recos_all_users, recos_file)

In [53]:
# Reload the saved recommendations as a round-trip check; the context manager
# closes the file handle instead of leaving it to garbage collection.
with open("recos_all_users.pickle", 'rb') as recos_file:
    recos_all_users = pickle.load(recos_file)
# recos_all_users