In [1]:
# pip install catboost
# pip install -U torch
# pip install selenium

# 0. Configuration

In [2]:
# links to shared data MovieLens
# source on kaggle: https://www.kaggle.com/code/quangnhatbui/movie-recommender/data
RATINGS_SMALL_URL = 'https://drive.google.com/file/d/1BlZfCLLs5A13tbNSJZ1GPkHLWQOnPlE4/view?usp=share_link'
MOVIES_METADATA_URL = 'https://drive.google.com/file/d/19g6-apYbZb5D-wRj4L7aYKhxS-fDM4Fb/view?usp=share_link'

# 1. Modules and functions

In [3]:
# just to make it available to download w/o SSL verification
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

import random
import numpy as np
import pandas as pd
import datetime as dt
from itertools import permutations, chain

import torch
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')


RANDOM_STATE = 42

## 1.1. Helper functions to avoid copy paste

In [4]:
def read_csv_from_gdrive(url):
    """
    gets csv data from a given url (taken from file -> share -> copy link)
    :url: example https://drive.google.com/file/d/1BlZfCLLs5A13tbNSJZ1GPkHLWQOnPlE4/view?usp=share_link
    """
    file_id = url.split('/')[-2]
    file_path = 'https://drive.google.com/uc?export=download&id=' + file_id
    data = pd.read_csv(file_path)

    return data

# 2. Main

## 2.1. RankNet

In [5]:
class RankNet(torch.nn.Module):
    def __init__(self, input_features_len, hidden_dim = 10):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.model = torch.nn.Sequential(
            torch.nn.Linear(input_features_len, self.hidden_dim), # y = ax + b,
            torch.nn.ReLU(),
            torch.nn.Linear(self.hidden_dim, 1),
        )
        
        self.out_activation = torch.nn.Sigmoid() # this transformation equiavalent to RankNet formula

    def forward(self, input_1, input_2):
        logits_1 = self.predict(input_1)
        logits_2 = self.predict(input_2)
        
        logits_diff = logits_1 - logits_2
        out = self.out_activation(logits_diff)

        return out
    
    def predict(self, inp):
        logits = self.model(inp)
        return logits

In [6]:
model = RankNet(input_features_len = 8)
model

RankNet(
  (model): Sequential(
    (0): Linear(in_features=8, out_features=10, bias=True)
    (1): ReLU()
    (2): Linear(in_features=10, out_features=1, bias=True)
  )
  (out_activation): Sigmoid()
)

In [7]:
input_1, input_2 = torch.rand(4, 8), torch.rand(4, 8)
input_2

tensor([[0.1416, 0.1595, 0.6878, 0.6527, 0.1364, 0.6421, 0.9856, 0.3914],
        [0.6234, 0.1256, 0.8645, 0.0542, 0.4770, 0.0501, 0.0797, 0.0486],
        [0.8362, 0.8594, 0.6269, 0.7115, 0.3312, 0.7281, 0.6260, 0.6810],
        [0.2511, 0.9560, 0.2472, 0.1402, 0.5264, 0.8363, 0.5836, 0.6499]])

In [8]:
input_1

tensor([[0.1974, 0.1278, 0.3878, 0.2634, 0.2498, 0.2429, 0.1142, 0.6649],
        [0.9958, 0.2707, 0.9122, 0.0067, 0.1628, 0.4796, 0.6739, 0.4640],
        [0.8498, 0.2913, 0.8699, 0.9376, 0.9358, 0.8015, 0.8049, 0.0130],
        [0.9117, 0.6116, 0.3862, 0.8582, 0.1365, 0.2872, 0.7022, 0.0552]])

$$P_{ij} \equiv P(U_{i}>U_{j}) \equiv \frac{1}{1 + \exp^{-\sigma(s_{i} - s{j})}}$$


In [9]:
preds = torch.sort(model(input_1, input_2), descending = True, dim = 0)
preds[0]

tensor([[0.5232],
        [0.4989],
        [0.4902],
        [0.4834]], grad_fn=<SortBackward>)

## 2.2. ListNet

In [10]:
movies_to_rank = {'The Godfather', 'Avatar', 'Ozark'}
permutations_list = list(permutations(movies_to_rank))

for i in permutations_list:
    print(i)

('The Godfather', 'Ozark', 'Avatar')
('The Godfather', 'Avatar', 'Ozark')
('Ozark', 'The Godfather', 'Avatar')
('Ozark', 'Avatar', 'The Godfather')
('Avatar', 'The Godfather', 'Ozark')
('Avatar', 'Ozark', 'The Godfather')


In [11]:
pi = random.choice(permutations_list)
print(pi)

('Avatar', 'The Godfather', 'Ozark')


In [12]:
np.random.seed(RANDOM_STATE)
scores_dict = {x: np.random.randn(1)[0] for x in movies_to_rank}  
print(scores_dict)

# unpack pi and assign movies to scores
score_movie_pos_1, score_movie_pos_2, score_movie_pos_3 = scores_dict[pi[0]], scores_dict[pi[1]], scores_dict[pi[2]]


{'The Godfather': 0.4967141530112327, 'Ozark': -0.13826430117118466, 'Avatar': 0.6476885381006925}


In [13]:
first_term = np.exp(score_movie_pos_1) / (np.exp(score_movie_pos_1) + np.exp(score_movie_pos_2)\
                                         + np.exp(score_movie_pos_3))

second_term = np.exp(score_movie_pos_2) / (np.exp(score_movie_pos_2) + np.exp(score_movie_pos_3))

third_term = np.exp(score_movie_pos_3) / np.exp(score_movie_pos_3)

print(f'First term is: {first_term}')
print(f'Second term is: {second_term}')
print(f'Third term is: {third_term}')

First term is: 0.4318619033836114
Second term is: 0.6536174529063914
Third term is: 1.0


$P_{s}(<The Godfather, Avatar, Ozark>) = \prod^3_{j = 1} \frac {\phi(s_{\pi(j)})} {\sum^3_{k = j} \phi(s_{\pi(k)})}$ which is equal to

In [14]:
permutation_proba = first_term * second_term * third_term

print(f'Permutation probability is: {permutation_proba}')


Permutation probability is: 0.2822724772969022


## 2.3. CatBoost Ranker

### 2.3.1. Load Data

`interactions` dataset shows list of movies that users watched, along with given ratings:

In [15]:
# interactions data
interactions = read_csv_from_gdrive(RATINGS_SMALL_URL)
interactions.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


`movies_metadata` dataset shows the list of movies existing on OKKO platform:

In [16]:
# information about films etc
movies_metadata = read_csv_from_gdrive(MOVIES_METADATA_URL)
movies_metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [17]:
movies_metadata['id'] = movies_metadata['id'].astype(str)
interactions['movieId'] = interactions['movieId'].astype(str)

In [18]:
movies_metadata.rename(columns = {'id': 'movieId'}, inplace = True)

In [19]:
# leave only those films that intersect with each other
interactions_filtered = interactions.loc[interactions['movieId'].isin(movies_metadata['movieId'])]
print(interactions.shape, interactions_filtered.shape)

(100004, 4) (44989, 4)


### 2.3.2 Data Preparation

In [20]:
TEST_SIZE = .25

In [21]:
# convert timestamp to date
interactions_filtered['dttm'] = interactions_filtered['timestamp']\
                                .apply(lambda x: pd.to_datetime(dt.datetime.fromtimestamp(x).strftime('%Y-%m-%d')))

Let's calculate some basic features, but keep in mind that our data of historical ratings depends on time.
We need to avoid data leak -- use future values in past data

In [22]:
ITEM_FEATURES_LIST = ['revenue', 'budget', 'runtime']

In [23]:
# calculate avg ratings by users and items daily
daily_users_feature = interactions_filtered.groupby(['userId', 'dttm']).agg({'rating': 'mean',
                                              'movieId': 'count'})\
                                  .reset_index().sort_values(['userId', 'dttm'])\
                                  .rename(columns = {'rating': 'user_mean_rating',
                                                     'movieId': 'user_watch_count'})


daily_users_feature['dttm'] = daily_users_feature['dttm'].apply(lambda x: x + dt.timedelta(days = 1))
daily_users_feature.loc[daily_users_feature['userId'] == 3]

Unnamed: 0,userId,dttm,user_mean_rating,user_watch_count
2,3,2011-03-01,3.541667,24
3,3,2011-03-02,3.5,1


In [24]:
interactions_filtered.loc[interactions_filtered['userId'] == 3]

Unnamed: 0,userId,movieId,rating,timestamp,dttm
97,3,110,4.0,1298922049,2011-02-28
98,3,247,3.5,1298861637,2011-02-28
99,3,267,3.0,1298861761,2011-02-28
100,3,296,4.5,1298862418,2011-02-28
101,3,318,5.0,1298862121,2011-02-28
104,3,377,2.5,1298923242,2011-02-28
105,3,527,3.0,1298862528,2011-02-28
106,3,588,3.0,1298922100,2011-02-28
107,3,592,3.0,1298923247,2011-02-28
108,3,593,3.0,1298921840,2011-02-28


In [25]:
daily_users_feature[daily_users_feature.isna().any(axis=1)]

Unnamed: 0,userId,dttm,user_mean_rating,user_watch_count


In [26]:
cumulative_total_cnt = daily_users_feature.set_index('dttm').groupby(['userId'])['user_watch_count']\
                        .rolling(window = 3, min_periods = 1).sum()\
                        .reset_index()[['userId', 'dttm', 'user_watch_count']]\
                        .rename(columns = {'user_watch_count': 'user_total_watch_count_last_3_days'})

In [27]:
cumulative_total_cnt[cumulative_total_cnt.isna().any(axis=1)]

Unnamed: 0,userId,dttm,user_total_watch_count_last_3_days


In [28]:
# merge item features
main_df = pd.merge(
    interactions_filtered, movies_metadata[['movieId'] + ITEM_FEATURES_LIST],
    how = 'left', on = 'movieId'
                   ).drop_duplicates().reset_index(drop = True)
assert main_df.shape[0] == interactions_filtered.shape[0]

In [29]:
main_df[main_df.isna().any(axis=1)]

Unnamed: 0,userId,movieId,rating,timestamp,dttm,revenue,budget,runtime
11150,176,63276,3.5,1340916711,2012-06-28,0.0,0,
18943,299,63276,4.0,1344178309,2012-08-05,0.0,0,
31284,481,63179,4.5,1437004063,2015-07-16,0.0,0,
34653,531,61361,3.0,1240402775,2009-04-22,0.0,0,


In [30]:
movies_metadata[movies_metadata['movieId'].isin(list(set(main_df[main_df.isna().any(axis=1)]['movieId'].to_list())))]

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,movieId,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
18916,False,,0,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",,61361,tt0339482,fr,L'outremangeur,,...,2002-09-10,0.0,,[],Released,,L'outremangeur,False,8.0,2.0
33003,False,,0,"[{'id': 18, 'name': 'Drama'}]",,63276,tt0140428,it,L'odore della notte,,...,1998-01-01,0.0,,"[{'iso_639_1': 'it', 'name': 'Italiano'}]",Released,,L'odore della notte,False,6.4,9.0
44959,False,"{'id': 421566, 'name': 'Totò Collection', 'pos...",0,"[{'id': 35, 'name': 'Comedy'}]",,63179,tt0043059,it,Totò Sceicco,,...,1950-11-30,0.0,,"[{'iso_639_1': 'it', 'name': 'Italiano'}]",Released,,Totò Sceicco,False,6.5,8.0


In [31]:
ind_runtime_pairs = zip(main_df[main_df.isna().any(axis=1)]['movieId'].index.to_list(), [90.0, 90.0, 105.0, 95.0])

for p in ind_runtime_pairs:
    main_df.loc[p[0], 'runtime'] = p[1]

In [32]:
main_df[main_df.isna().any(axis=1)]

Unnamed: 0,userId,movieId,rating,timestamp,dttm,revenue,budget,runtime


In [33]:
main_df = main_df.sort_values('dttm').reset_index(drop = True)
daily_users_feature = daily_users_feature.sort_values('dttm').reset_index(drop = True)
cumulative_total_cnt = cumulative_total_cnt.sort_values('dttm').reset_index(drop = True)

In [34]:
# merge user features with watch count
main_df = pd.merge_asof(
    main_df, daily_users_feature,
    on = 'dttm', by = 'userId',
    direction = 'backward',
    allow_exact_matches = True
    )
assert main_df.shape[0] == interactions_filtered.shape[0]

In [35]:
main_df[main_df.isna().any(axis=1)].head()

Unnamed: 0,userId,movieId,rating,timestamp,dttm,revenue,budget,runtime,user_mean_rating,user_watch_count
0,383,21,3.0,789652009,1995-01-09,0.0,0,95.0,,
1,224,427,3.0,828214011,1996-03-30,0.0,0,110.0,,
2,224,335,4.0,828214012,1996-03-30,5321508.0,5000000,175.0,,
3,224,337,4.0,828214012,1996-03-30,11576431.0,0,94.0,,
4,224,339,4.0,828214012,1996-03-30,2015810.0,3500000,129.0,,


In [36]:
main_df = pd.merge_asof(
    main_df, cumulative_total_cnt,
    on = 'dttm', by = 'userId',
    direction = 'backward',
    allow_exact_matches = True
    )
assert main_df.shape[0] == interactions_filtered.shape[0]

In [37]:
# tmp  = main_df.loc[main_df['userId'] == 671][['userId', 'dttm']]
# pd.merge_asof(
#     tmp.sort_values('dttm'), cumulative_total_cnt.sort_values('dttm'),
#     on = 'dttm', by = 'userId', direction = 'backward',
#     allow_exact_matches = True).sort_values('dttm')

In [38]:
# anyway we left some NaN
main_df.isnull().sum() / len(main_df) 

userId                                0.000000
movieId                               0.000000
rating                                0.000000
timestamp                             0.000000
dttm                                  0.000000
revenue                               0.000000
budget                                0.000000
runtime                               0.000000
user_mean_rating                      0.616729
user_watch_count                      0.616729
user_total_watch_count_last_3_days    0.616729
dtype: float64

In [39]:
FINAL_FEATURES_LIST = ['revenue', 'budget', 'runtime', 'user_mean_rating',
                       'user_watch_count', 'user_total_watch_count_last_3_days']

In [40]:
ID_COLS = ['userId', 'movieId']

In [41]:
TARGET = 'rating'

In [42]:
X = main_df[ID_COLS + FINAL_FEATURES_LIST]
y = main_df[TARGET]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = TEST_SIZE,
    random_state = RANDOM_STATE)

print(f'Shape of train set X, y: {X_train.shape}, {len(y_train)}')
print(f'Shape of train set X, y: {X_test.shape}, {len(y_test)}')

Shape of train set X, y: (33741, 8), 33741
Shape of train set X, y: (11248, 8), 11248


### 2.3.3. Train Model

In [43]:
# init model
model = CatBoostRegressor(
    loss_function = 'MAE',
    iterations = 2000,
    learning_rate = .1,
    depth = 6,
    verbose = False
)

In [44]:
model.fit(
    X_train, y_train,
    eval_set=(X_test, y_test),
    early_stopping_rounds = 20 # to avoid overfitting,
)

<catboost.core.CatBoostRegressor at 0x1754f27c910>

In [45]:
model.best_score_

{'learn': {'MAE': 0.669018848903783},
 'validation': {'MAE': 0.7302817897495212}}

# TODO
- Add baseline comparison from the model (well, we discussed what is baseline for MAE metric -- now, you have to define, how you are going to calculate it)

Baseline regression models includes mean or median, therefore, the average and median values were calculated and entered into the formula instead of y_pred.

In [46]:
from sklearn.metrics import mean_absolute_error

In [47]:
y_pred = model.predict(X_test)

y_baseline_mean = np.full((len(y_test), ), y_test.mean()).tolist()
y_baseline_median = np.full((len(y_test), ), y_test.median()).tolist()

mae_model = mean_absolute_error(y_test, y_pred)
mae_baseline_mean = mean_absolute_error(y_test, y_baseline_mean)
mae_baseline_median = mean_absolute_error(y_test, y_baseline_median)

print(f'Baseline MAE (mean): {mae_baseline_mean}, Model MAE: {mae_model} (ABS difference: {abs(mae_model - mae_baseline_mean)})')
print(f'Baseline MAE (median): {mae_baseline_median}, Model MAE: {mae_model} (ABS difference: {abs(mae_model - mae_baseline_median)})')

Baseline MAE (mean): 0.8602094932255382, Model MAE: 0.7302827897495214 (ABS difference: 0.12992670347601676)
Baseline MAE (median): 0.8440167140825036, Model MAE: 0.7302827897495214 (ABS difference: 0.11373392433298213)


In [48]:
def fill_na(df: pd.DataFrame, column: str):
    df[column] = df[column].fillna(df[column].mean())
    return df

In [49]:
for col in ['user_mean_rating', 'user_watch_count', 'user_total_watch_count_last_3_days']:
    X_train = fill_na(X_train, col)
    X_test = fill_na(X_test, col)

In [50]:
X_train[X_train.isna().any(axis=1)]

Unnamed: 0,userId,movieId,revenue,budget,runtime,user_mean_rating,user_watch_count,user_total_watch_count_last_3_days


In [51]:
X_test[X_test.isna().any(axis=1)]

Unnamed: 0,userId,movieId,revenue,budget,runtime,user_mean_rating,user_watch_count,user_total_watch_count_last_3_days


In [52]:
from sklearn.linear_model import LinearRegression

In [53]:
# Create linear regression object
regr = LinearRegression()

# Train the model using the train set
regr.fit(X_train, y_train)

# Make predictions using the test set
y_baseline_regr = regr.predict(X_test)

In [54]:
mae_model = mean_absolute_error(y_test, y_pred)
mae_baseline_regr = mean_absolute_error(y_test, y_baseline_regr)

print(f'Baseline MAE (linear regression): {mae_baseline_regr}, Model MAE: {mae_model} (ABS difference: {abs(mae_model - mae_baseline_regr)})')

Baseline MAE (linear regression): 0.8442452358856042, Model MAE: 0.7302827897495214 (ABS difference: 0.11396244613608275)
