# 0. Configuration

In [1]:
# links to shared data MovieLens
# source on kaggle: https://www.kaggle.com/code/quangnhatbui/movie-recommender/data
RATINGS_SMALL_URL = 'https://drive.google.com/file/d/1BlZfCLLs5A13tbNSJZ1GPkHLWQOnPlE4/view?usp=share_link'
MOVIES_METADATA_URL = 'https://drive.google.com/file/d/19g6-apYbZb5D-wRj4L7aYKhxS-fDM4Fb/view?usp=share_link'

# 1. Modules and functions

In [2]:
# just to make it available to download w/o SSL verification
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

import numpy as np
import pandas as pd
from math import log2
import scipy.sparse as sp

from itertools import islice, cycle, product

from tqdm import tqdm_notebook
import warnings
warnings.filterwarnings('ignore')

## 1. 1. Helper functions to avoid copy paste

In [3]:
def read_csv_from_gdrive(url):
    """
    gets csv data from a given url (taken from file -> share -> copy link)
    :url: example https://drive.google.com/file/d/1BlZfCLLs5A13tbNSJZ1GPkHLWQOnPlE4/view?usp=share_link
    """
    file_id = url.split('/')[-2]
    file_path = 'https://drive.google.com/uc?export=download&id=' + file_id
    data = pd.read_csv(file_path)

    return data

# 2. Main

## 2.1. Load Data

`interactions` dataset shows list of movies that users watched, along with given ratings:

In [4]:
# interactions data
interactions = read_csv_from_gdrive(RATINGS_SMALL_URL)
interactions.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


`movies_metadata` dataset shows the list of movies existing on OKKO platform:

In [5]:
# information about films etc
movies_metadata = read_csv_from_gdrive(MOVIES_METADATA_URL)
movies_metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [6]:
movies_metadata['id'] = movies_metadata['id'].astype(str)
interactions['movieId'] = interactions['movieId'].astype(str)

In [7]:
# leave only those films that intersect with each other
interactions_filtered = interactions.loc[interactions['movieId'].isin(movies_metadata['id'])]
print(interactions.shape, interactions_filtered.shape)

(100004, 4) (44989, 4)


## 2.2 Data preparation using LightFM Dataset

To use implicit kNN method `fit` we need a sparse matrix in COOrdinate format. To achieve that we will use `scipy.sparse.coo_matrix` from scipy;


In [8]:
def get_coo_matrix(
        df: pd.DataFrame, 
        user_col: str,
        item_col: str, 
        users_mapping: dict, 
        movies_mapping: dict,
        weight_col: str = None
        ):
    if weight_col is None:
        weights = np.ones(len(df), dtype=np.float32)
    else:
        weights = df[weight_col].astype(np.float32)
    interaction_matrix = sp.coo_matrix((
        weights, 
        (
            df[user_col].map(users_mapping.get), 
            df[item_col].map(movies_mapping.get)
        )
    ))
    return interaction_matrix


In [9]:
# define users mapping
users_inv_mapping = dict(enumerate(interactions_filtered['userId'].unique()))
users_mapping = {v: k for k, v in users_inv_mapping.items()}
len(users_mapping)


671

In [10]:
# define movies mapping
movies_inv_mapping = dict(enumerate(interactions_filtered['movieId'].unique()))
movies_mapping = {v: k for k, v in movies_inv_mapping.items()}
len(movies_mapping)


2830

In [11]:
# defining train set on the whole interactions dataset (as HW you will have to split into test and train for evaluation)
train_mat = get_coo_matrix(
    interactions_filtered,
    user_col = 'userId',
    item_col = 'movieId',
    users_mapping = users_mapping,
    movies_mapping = movies_mapping
    ).tocsr()


In [12]:
train_mat

<671x2830 sparse matrix of type '<class 'numpy.float32'>'
	with 44989 stored elements in Compressed Sparse Row format>

## 2.3. Model Training & Evaluation

In [`implicit`](https://pypi.org/project/implicit/), there are various models and can be groupped into:
- Item-to-Item: KNN based on various similarities - CosineRecommender, BM25Recommender, TFIDFRecommender
- implicit ALS;
- Logistic Matrix Factorization;
- Bayesian Personalized Ranking (BPR)


### 2.3.1. Train Model

In [13]:
from implicit.nearest_neighbours import (
    CosineRecommender,
    BM25Recommender,
    TFIDFRecommender
    )


Note that in item-to-item models we need to provide matrix in the form of item-user by transposing initial COO matrix user-item


In [14]:
# fit the model
cosine_model = CosineRecommender(K = 20)
cosine_model.fit(train_mat.T)


  0%|          | 0/671 [00:00<?, ?it/s]

### 2.3.2. Evaluate the Model

In [15]:
# let's make sense-check
top_N = 10
user_id = interactions_filtered['userId'].iloc[0]
row_id = users_mapping[user_id]
print(f'Rekko for user {user_id}, row number in matrix - {row_id}')

Rekko for user 1, row number in matrix - 0


In [16]:
# create mapper for movieId and title names
movie_name_mapper = dict(zip(movies_metadata['id'], movies_metadata['original_title']))

In [17]:
recs = cosine_model.recommend(
    row_id,
    train_mat,
    N = top_N,
    filter_already_liked_items = True
    )
recs = pd.DataFrame(recs).T.rename(columns = {0: 'col_id', 1: 'similarity'})
recs['inv_movie_id'] = recs['col_id'].astype(int)
recs['movieId'] = recs['inv_movie_id'].map(movies_inv_mapping.get)
recs['title'] = recs['movieId'].map(movie_name_mapper)
recs


Unnamed: 0,col_id,similarity,inv_movie_id,movieId,title
0,653.0,0.861587,653,74458,Mere Brother Ki Dulhan
1,129.0,0.844531,129,1994,The Most Dangerous Game
2,606.0,0.654064,606,8011,Highlander III: The Sorcerer
3,294.0,0.625141,294,70,Million Dollar Baby
4,337.0,0.593856,337,170,28 Days Later
5,648.0,0.577499,648,68954,Longitude
6,579.0,0.571681,579,5956,Joshua
7,399.0,0.561442,399,1088,Whale Rider
8,278.0,0.561442,278,1584,School of Rock
9,150.0,0.557086,150,2100,The Last Castle


# TODO
- Make global train/ global test split -- train the model appropiately and predict on test set;
- Wrap up in function recommendations - lfm_recommend();
- Calculate `NDCG@10` on test set

## Task 1. 
* Make global train/ global test split -- train the model appropiately and predict on test set.

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
interactions_filtered.sample(5)

Unnamed: 0,userId,movieId,rating,timestamp
2605,15,132961,3.0,1458507347
76072,529,926,5.0,960086287
96117,640,260,5.0,860950973
5827,30,4146,1.0,1039070344
45366,321,1683,4.0,939420971


In [20]:
def train_test_splitting(df: pd.DataFrame, test_size: float, random_state: int = 42, user_col: str = 'userId'):
    '''
    df: pd.DataFrame for splitting
    test_size: size of the test sample (from 0 to 1)
    user_col: column for stratifying
    '''
    global_train, global_test = train_test_split(df, 
                                                 test_size=test_size, 
                                                 random_state=random_state,
                                                 stratify=df[user_col])

    return global_train, global_test

In [21]:
TEST_SIZE = .2
USER_COL = 'userId'

In [22]:
global_train, global_test = train_test_splitting(interactions_filtered, 
                                                 test_size=TEST_SIZE,
                                                 user_col=USER_COL)

In [23]:
lvl1_train, validation_set = train_test_splitting(global_train, 
                                                  test_size=TEST_SIZE,
                                                  user_col=USER_COL)

## Task 2.
* Wrap up in function recommendations - implicit_recommend().

In [24]:
def get_users_mapping(data: pd.DataFrame, user_col: str = 'userId', item_col: str = 'movieId'):
    
    # define users mapping
    users_inv_mapping = dict(enumerate(data[user_col].unique()))
    users_mapping = {v: k for k, v in users_inv_mapping.items()}
    
    return users_inv_mapping, users_mapping

def get_movies_mapping(data: pd.DataFrame, user_col: str = 'userId', item_col: str = 'movieId'):
    
    # define movies mapping
    movies_inv_mapping = dict(enumerate(data[item_col].unique()))
    movies_mapping = {v: k for k, v in movies_inv_mapping.items()}
    
    return movies_inv_mapping, movies_mapping

In [25]:
def data_preparation(data: pd.DataFrame, user_col: str = 'userId', item_col: str = 'movieId'):
    
    # define users mapping
    _, users_mapping = get_users_mapping(data=data, user_col=user_col, item_col=item_col)
    
    # define movies mapping
    _, movies_mapping = get_movies_mapping(data=data, user_col=user_col, item_col=item_col)
    
    # define sparse matrix
    matrix = get_coo_matrix(
        data,
        user_col = user_col,
        item_col = item_col,
        users_mapping = users_mapping,
        movies_mapping = movies_mapping
        ).tocsr()
    
    return matrix

In [26]:
def implicit_recommend(data: pd.DataFrame, model, user_id: int, users_mapping: dict, movies_inv_mapping: dict,
                       top_N: int = 10, filter_already_liked_items: bool = True):
    
    # let's make sense-check
    row_id = users_mapping[user_id] 
    recs = model.recommend(row_id, data, N = top_N, filter_already_liked_items = filter_already_liked_items)
    
    recs = pd.DataFrame(recs).T.rename(columns = {0: 'col_id', 1: 'similarity'})
    recs['inv_movie_id'] = recs['col_id'].astype(int)
    recs['movieId'] = recs['inv_movie_id'].map(movies_inv_mapping.get)
    recs['title'] = recs['movieId'].map(movie_name_mapper)
    
    return recs

In [27]:
train_mat = data_preparation(lvl1_train)
validation_mat = data_preparation(validation_set)
test_mat = data_preparation(global_test)

In [28]:
# fit the model
cosine_model = CosineRecommender(K = 20)
cosine_model.fit(train_mat.T)

  0%|          | 0/671 [00:00<?, ?it/s]

In [None]:
# define users mapping
_, users_mapping = get_users_mapping(lvl1_train)

# define movies mapping
movies_inv_mapping, _ = get_movies_mapping(lvl1_train)

recs = implicit_recommend(train_mat, cosine_model, 1, users_mapping, movies_inv_mapping)
recs

In [None]:
# define users mapping
_, users_mapping = get_users_mapping(validation_set)

recs = implicit_recommend(validation_mat, cosine_model, 1, users_mapping, movies_inv_mapping)
recs

In [None]:
train_mat = data_preparation(global_train)

In [None]:
# fit the model
cosine_model = CosineRecommender(K = 20)
cosine_model.fit(train_mat.T)

In [None]:
# define users mapping
_, users_mapping = get_users_mapping(global_train)

# define movies mapping
movies_inv_mapping, _ = get_movies_mapping(global_train)

recs = implicit_recommend(train_mat, cosine_model, 1, users_mapping, movies_inv_mapping)
recs

In [None]:
# define users mapping
_, users_mapping = get_users_mapping(global_test)

recs = implicit_recommend(test_mat, cosine_model, 1, users_mapping, movies_inv_mapping)
recs

## Task 3.
* Calculate `NDCG@10` on test set.

In [None]:
def compute_gain(y_value: float, gain_scheme: str) -> float:
    
    gain = {'exp2': 2 ** y_value - 1,
            'const': y_value}

    return float(gain[gain_scheme])


In [None]:
def dcg(y_true: np.array, y_pred: np.array, gain_scheme: str, k: int) -> float:
    
    dcg = 0
    argsort = np.argsort(y_pred)[::-1]
    y_true_sorted = y_true[argsort]

    if k is not None:
        argsort = argsort[:k]
        y_true_sorted = y_true_sorted[:k]
        
    for idx, val in enumerate(y_true_sorted, 1):
        gain = compute_gain(val, gain_scheme)
        dcg += gain / log2(idx + 1)
        
    return dcg


In [None]:
def ndcg(y_true: np.array, ys_pred: np.array, gain_scheme: str = 'const', k: int = None) -> float:
    
    # pred dcg then we calc the same to find max possible
    preds_dcg = dcg(y_true, ys_pred, gain_scheme, k)
    max_possible_dcg = dcg(y_true, y_true, gain_scheme, k)

    return preds_dcg / max_possible_dcg


In [None]:
y_true = recs['similarity'].values
y_pred = np.array(y_true * 0)

`NDCG@10` for the first user.

In [None]:
ndcg(y_true, y_pred, 'exp2', 10)