In [1]:
# pip install --upgrade gensim

In [2]:
# import gensim
# gensim.__version__

In [3]:
# pip install --upgrade scikit-learn

# 0. Configuration

In [4]:
# links to shared data MovieLens
# source on kaggle: https://www.kaggle.com/code/quangnhatbui/movie-recommender/data
KEYWORDS_URL = 'https://drive.google.com/file/d/16TV3-KX9EYCcEcajvTh8MJqOij6EF67W/view?usp=share_link'
RATINGS_SMALL_URL = 'https://drive.google.com/file/d/1BlZfCLLs5A13tbNSJZ1GPkHLWQOnPlE4/view?usp=share_link'
MOVIES_METADATA_URL = 'https://drive.google.com/file/d/19g6-apYbZb5D-wRj4L7aYKhxS-fDM4Fb/view?usp=share_link'

# 1. Modules and functions

In [5]:
import re
import nltk
import collections
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
from ast import literal_eval
from pymystem3 import Mystem
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

import umap.umap_ as umap

import warnings
warnings.filterwarnings('ignore')

# download stop words beforehand
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## 1.1. Helper functions to avoid copypaste

In [6]:
def read_csv_from_gdrive(url):
    """
    gets csv data from a given url (taken from file -> share -> copy link)
    :url: example https://drive.google.com/file/d/1BlZfCLLs5A13tbNSJZ1GPkHLWQOnPlE4/view?usp=share_link
    """
    file_id = url.split('/')[-2]
    file_path = 'https://drive.google.com/uc?export=download&id=' + file_id
    data = pd.read_csv(file_path)

    return data

In [7]:
# init lemmatizer to avoid slow performance
mystem = Mystem() 

def word_tokenize_clean(doc: str, stop_words: list):
    '''
    tokenize from string to list of words
    '''

    # split into lower case word tokens \w lemmatization
    tokens = list(set(mystem.lemmatize(doc.lower())))
  
    # remove tokens that are not alphabetic (including punctuation) and not a stop word
    tokens = [word for word in tokens if word.isalpha() and not word in stop_words \
              not in list(punctuation)]
    return tokens

In [8]:
def additional_metadata(df: pd.DataFrame, id_row: str, target_row: str):
    metadata_prep = {}
    for index, row in df.iterrows():
        metadata_prep[row[id_row]] = ', '.join([v.get('name') for v in eval(row[target_row])])
    output = pd.DataFrame({id_row: metadata_prep.keys(), target_row + '_prep': metadata_prep.values()}, dtype=str)
    return output

# 2. Main

## 2.1. Data Preparation

`interactions` dataset shows list of movies that users watched, along with given ratings:

In [9]:
# interactions data
interactions = read_csv_from_gdrive(RATINGS_SMALL_URL)
interactions.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


`movies_metadata` dataset shows the list of movies existing on OKKO platform:

In [10]:
# read csv information about films etc
movies_metadata = read_csv_from_gdrive(MOVIES_METADATA_URL)
movies_metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [11]:
# let's see what columns we have
movies_metadata.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

`keywords` dataset shows the list of movies keywords existing on OKKO platform:

In [12]:
# keywords data
keywords = read_csv_from_gdrive(KEYWORDS_URL)
keywords.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


To get accurate results we need to preprocess text a bit. The pipeline will be as follows:

- Filter only necessary columns from movies_metadada : id, original_title, overview;
- Define `model_index` for model to match back with `id` column;
- Text cleaning: removing stopwords & punctuation, lemmatization for further tokenization and tagged document creatin required for gensim.Doc2Vec

In [13]:
# filter cols
sample = movies_metadata[['id', 'original_title', 'overview']].copy()
sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              45466 non-null  object
 1   original_title  45466 non-null  object
 2   overview        44512 non-null  object
dtypes: object(3)
memory usage: 1.0+ MB


In [14]:
# as you see from above, we have missing overview in some cases -- let's fill it with the original title
sample.loc[sample['overview'].isnull(), 'overview'] = sample.loc[sample['overview'].isnull(), 'original_title']
sample.isnull().sum()

id                0
original_title    0
overview          0
dtype: int64

In [15]:
# define model_index and make it as string
sample = sample.reset_index().rename(columns = {'index': 'model_index'})
sample['model_index'] = sample['model_index'].astype(str)

In [16]:
# create mapper with title and model_idnex to use it further in evaluation
movies_inv_mapper = dict(zip(sample['original_title'].str.lower(), sample['model_index'].astype(int)))

In [17]:
len(movies_inv_mapper) #???

43330

In [18]:
name_mapper = dict(zip(sample['model_index'].astype(int), sample['original_title'].str.lower()))

In [19]:
print([item for item, count in collections.Counter(sample['original_title'].to_list()).items() if count > 1])



In [20]:
movies_metadata[movies_metadata['original_title'] == 'It Takes Two']

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
37,False,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10751, '...",,33689,tt0113442,en,It Takes Two,Identical 9-year-olds from very different back...,...,1995-11-17,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Two identical strangers. Two different worlds....,It Takes Two,False,6.1,149.0
29129,False,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",,110538,tt0095384,en,It Takes Two,Travis is due to marry Stephanie in a few days...,...,1988-07-13,0.0,78.0,[],Released,,It Takes Two,False,5.4,5.0


In [21]:
# preprocess by removing non-character data, stopwords
tags_corpus = sample['overview'].values
tags_corpus = [re.sub('-[!/()0-9]', '', x) for x in tags_corpus]
stop_words = stopwords.words('english')

tags_doc = [word_tokenize_clean(description, stop_words) for description in tags_corpus]
tags_corpus[:1]

["Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences."]

In [22]:
# prepare data as model input for Word2Vec
## it takes some time to execute
tags_doc = [TaggedDocument(words = word_tokenize_clean(D, stop_words), tags = [str(i)]) for i, D in enumerate(tags_corpus)]

In [23]:
# let's check what do we have
## tag = movie index
tags_doc[1]

TaggedDocument(words=['room', 'discover', 'find', 'judy', 'trapped', 'rhinoceroses', 'monkeys', 'siblings', 'unwittingly', 'years', 'magical', 'door', 'running', 'inside', 'living', 'opens', 'risky', 'giant', 'peter', 'adult', 'finish', 'invite', 'game', 'alan', 'board', 'hope', 'freedom', 'proves', 'evil', 'enchanted', 'terrifying', 'creatures', 'three', 'world'], tags=['1'])

# 2.2. Model Training and Evaluation

In [24]:
VEC_SIZE = 50
ALPHA = .02
MIN_ALPHA = .00025
MIN_COUNT = 5
EPOCHS = 20

In [25]:
# initialize
model = Doc2Vec(vector_size = VEC_SIZE,
                alpha = ALPHA, 
                min_alpha = MIN_ALPHA,
                min_count = MIN_COUNT,
                dm = 0)

In [26]:
# generate vocab from all tag docs
model.build_vocab(tags_doc)

In [27]:
# train model
model.train(tags_doc,
            total_examples = model.corpus_count,
            epochs = EPOCHS)

## 2.3. Evaluate the model

Let's assume that we watched movie `batman` and based on that generate recommendation similar to it's description.

To do that we need
- To extract movie id from `movies_inv_mapper` we created to map back titles from model output
- Load embeddings from trained model
- Use built-in most_similar() method to get most relevant recommendations based on film embedding
- Finally, map title names for sense-check

In [28]:
# get id
movie_id = [k for k, v in name_mapper.items() if v == 'batman']
movie_id
# :)

[585, 8603]

In [29]:
movies_metadata[movies_metadata['original_title'] == 'Batman']

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
585,False,"{'id': 120794, 'name': 'Batman Collection', 'p...",35000000,"[{'id': 14, 'name': 'Fantasy'}, {'id': 28, 'na...",,268,tt0096895,en,Batman,The Dark Knight of Gotham City begins his war ...,...,1989-06-23,411348924.0,126.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Have you ever danced with the devil in the pal...,Batman,False,7.0,2145.0
8603,False,,1377800,"[{'id': 10751, 'name': 'Family'}, {'id': 12, '...",,2661,tt0060153,en,Batman,The Dynamic Duo faces four super-villains who ...,...,1966-07-30,0.0,105.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,He's Here Big As Life In A Real Bat-Epic,Batman,False,6.1,209.0


In [30]:
movie_id = movie_id[1]

In [31]:
# load trained embeddings 
movies_vectors = model.dv.vectors

In [32]:
movie_embeddings = movies_vectors[movie_id]

In [33]:
# get recommendations
similars = model.docvecs.most_similar(positive = [movie_embeddings], topn = 20)
output = pd.DataFrame(similars, columns = ['model_index', 'model_score'])
output.head()

Unnamed: 0,model_index,model_score
0,8603,1.0
1,7772,0.961806
2,5713,0.961601
3,2053,0.956457
4,44366,0.954832


In [34]:
output['title_name'] = output['model_index'].astype(int).map(name_mapper)
output

Unnamed: 0,model_index,model_score,title_name
0,8603,1.0,batman
1,7772,0.961806,this island earth
2,5713,0.961601,rollover
3,2053,0.956457,the neverending story ii: the next chapter
4,44366,0.954832,"abraxas, guardian of the universe"
5,28001,0.954607,reach me
6,43165,0.952945,the zookeeper's wife
7,44262,0.951588,quest of the delta knights
8,26340,0.950922,the siege of firebase gloria
9,44339,0.950405,the underground world


# TODO

- Add `original_title`, `keywords`, `tagline` and other metadata to train sample and then retrain embeddings;
- Make visualization of embeddings with links of films with each other;
- Compare results with the embeddings we created in lecture
- Write function get_recommendations() which takes arguments we used 2.3., but such that we can use embeddings of several watched films to get recommendations

## Task 1.
* Add original_title, keywords, tagline and other metadata to train sample and then retrain embedding.

In [35]:
# filter cols
sample = movies_metadata[['id', 'original_title', 'title', 'overview', 'tagline']].copy()

In [36]:
sample = pd.merge(sample, additional_metadata(df=keywords, id_row='id', target_row='keywords'), how='left', on='id')
sample.head()

Unnamed: 0,id,original_title,title,overview,tagline,keywords_prep
0,862,Toy Story,Toy Story,"Led by Woody, Andy's toys live happily in his ...",,"jealousy, toy, boy, friendship, friends, rival..."
1,8844,Jumanji,Jumanji,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,"board game, disappearance, based on children's..."
2,15602,Grumpier Old Men,Grumpier Old Men,A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,"fishing, best friend, duringcreditsstinger, ol..."
3,31357,Waiting to Exhale,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,"based on novel, interracial relationship, sing..."
4,11862,Father of the Bride Part II,Father of the Bride Part II,Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,"baby, midlife crisis, confidence, aging, daugh..."


In [37]:
sample = pd.merge(sample, additional_metadata(df=movies_metadata, id_row='id', target_row='genres'), how='left', on='id')
sample.head()

Unnamed: 0,id,original_title,title,overview,tagline,keywords_prep,genres_prep
0,862,Toy Story,Toy Story,"Led by Woody, Andy's toys live happily in his ...",,"jealousy, toy, boy, friendship, friends, rival...","Animation, Comedy, Family"
1,8844,Jumanji,Jumanji,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,"board game, disappearance, based on children's...","Adventure, Fantasy, Family"
2,15602,Grumpier Old Men,Grumpier Old Men,A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,"fishing, best friend, duringcreditsstinger, ol...","Romance, Comedy"
3,31357,Waiting to Exhale,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,"based on novel, interracial relationship, sing...","Comedy, Drama, Romance"
4,11862,Father of the Bride Part II,Father of the Bride Part II,Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,"baby, midlife crisis, confidence, aging, daugh...",Comedy


In [38]:
sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45466 entries, 0 to 45465
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              45466 non-null  object
 1   original_title  45466 non-null  object
 2   title           45460 non-null  object
 3   overview        44512 non-null  object
 4   tagline         20412 non-null  object
 5   keywords_prep   45462 non-null  object
 6   genres_prep     45466 non-null  object
dtypes: object(7)
memory usage: 2.8+ MB


In [39]:
sample[['overview', 'tagline', 'keywords_prep']] = sample[['overview', 'tagline', 'keywords_prep']].fillna(' ')
sample.loc[sample['title'].isnull(), 'title'] = sample.loc[sample['title'].isnull(), 'original_title']
sample.isnull().sum()

id                0
original_title    0
title             0
overview          0
tagline           0
keywords_prep     0
genres_prep       0
dtype: int64

In [40]:
sample['merged_columns'] = sample['title'] + '. ' + sample['overview'] + '. ' + sample['tagline'] + '. ' + sample['keywords_prep'] + '. ' + sample['genres_prep']

In [41]:
# define model_index and make it as string
sample = sample.reset_index().rename(columns = {'index': 'model_index'})
sample['model_index'] = sample['model_index'].astype(str)

In [42]:
name_mapper = dict(zip(sample['model_index'].astype(int), sample['original_title'].str.lower()))

In [43]:
def get_clean_tags_array(agg_tags: pd.DataFrame,
                         text_col = 'tag'):
    '''text preprocessing
    '''
    tags_corpus = agg_tags[text_col].values
    tags_corpus = [re.sub('-[!/()0-9]', '', x) for x in tags_corpus]
    stop_words = stopwords.words('english')

    # preprocess corpus of movie tags before feeding it into Doc2Vec model
    tags_doc = [TaggedDocument(words = word_tokenize_clean(D, stop_words), tags = [str(i)]) for i, D in enumerate(tags_corpus)]

    return tags_doc

In [44]:
# let's check what do we have
## tag = movie index
tags_doc = get_clean_tags_array(sample, 'merged_columns')
tags_doc[1]

TaggedDocument(words=['children', 'find', 'trapped', 'adventure', 'monkeys', 'years', 'book', 'magical', 'home', 'living', 'risky', 'giant', 'invite', 'alan', 'hope', 'fantasy', 'room', 'dice', 'recluse', 'siblings', 'opens', 'freedom', 'creatures', 'roll', 'discover', 'rhinoceroses', 'based', 'running', 'jumanji', 'adult', 'unleash', 'evil', 'terrifying', 'three', 'world', 'family', 'judy', 'unwittingly', 'insect', 'excitement', 'door', 'inside', 'new', 'disappearance', 'peter', 'finish', 'game', 'proves', 'board', 'enchanted'], tags=['1'])

In [45]:
def train_embeddings(tags_doc: np.array,
                     epochs: int = 20,
                     vec_size: int = 50,
                     alpha: float = .02,
                     min_alpha: float =  0.00025,
                     min_count: int = 5,
                     save_path: str = None):
    """
    fit doc2vec model to prepared corpus
    :tags_doc: result of get_clean_tags_array()
    :max_epocs: int
    :vec_size: int
    :alpha: float
    """
    #initialize
    model = Doc2Vec(vector_size = vec_size,
                    alpha = alpha, 
                    min_alpha = min_alpha,
                    min_count = min_count,
                    dm = 0)
    
    #generate vocab from all tag docs
    model.build_vocab(tags_doc)
    
    #train model
    model.train(tags_doc,
                total_examples = model.corpus_count,
                epochs = epochs)
    
    #save model to dir
    if save_path:
        model.save(f'{save_path}/d2v_model.pkl')
    
    return model

In [46]:
model = train_embeddings(tags_doc)

In [47]:
# get id
movie_id = [k for k, v in name_mapper.items() if v == 'batman']
movie_id
# :)

[585, 8603]

In [48]:
movie_id = movie_id[1]

In [49]:
# load trained embeddings 
movies_vectors = model.dv.vectors

In [50]:
movie_embeddings = movies_vectors[movie_id]

In [51]:
# get recommendations
similars = model.docvecs.most_similar(positive = [movie_embeddings], topn = 20)
output = pd.DataFrame(similars, columns = ['model_index', 'model_score'])
output.head()

Unnamed: 0,model_index,model_score
0,8603,1.0
1,21124,0.90944
2,19114,0.907853
3,35983,0.9053
4,16279,0.904289


In [52]:
output['title_name'] = output['model_index'].astype(int).map(name_mapper)
output

Unnamed: 0,model_index,model_score,title_name
0,8603,1.0,batman
1,21124,0.90944,lego batman: the movie - dc super heroes unite
2,19114,0.907853,doragon bōru zetto: gekitotsu!! hyaku-oku pawā...
3,35983,0.9053,batman: bad blood
4,16279,0.904289,tron: legacy
5,42756,0.903131,scavengers
6,24545,0.903024,lego hero factory: savage planet
7,21679,0.898712,電人ザボーガー
8,6417,0.898706,spy kids 3-d: game over
9,13635,0.898611,x-men origins: wolverine


## Task 2.
* Make visualization of embeddings with links of films with each other.

In [53]:
from sklearn.manifold import TSNE

In [54]:
tsne = TSNE(n_components=2, verbose=1, random_state=123)
XY2 = tsne.fit_transform(movies_vectors)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 45466 samples in 0.002s...
[t-SNE] Computed neighbors for 45466 samples in 19.392s...
[t-SNE] Computed conditional probabilities for sample 1000 / 45466
[t-SNE] Computed conditional probabilities for sample 2000 / 45466
[t-SNE] Computed conditional probabilities for sample 3000 / 45466
[t-SNE] Computed conditional probabilities for sample 4000 / 45466
[t-SNE] Computed conditional probabilities for sample 5000 / 45466
[t-SNE] Computed conditional probabilities for sample 6000 / 45466
[t-SNE] Computed conditional probabilities for sample 7000 / 45466
[t-SNE] Computed conditional probabilities for sample 8000 / 45466
[t-SNE] Computed conditional probabilities for sample 9000 / 45466
[t-SNE] Computed conditional probabilities for sample 10000 / 45466
[t-SNE] Computed conditional probabilities for sample 11000 / 45466
[t-SNE] Computed conditional probabilities for sample 12000 / 45466
[t-SNE] Computed conditional probabilities for sa

In [55]:
import plotly.graph_objects as go

In [56]:
fig = go.Figure(data=go.Scatter(
    x = XY2[:, 0],
    y = XY2[:, 1],
    mode='markers',
    hovertemplate = '<b>ID:</b> ' + movies_metadata['id'] + '<br><b>Original title:</b> ' + movies_metadata['original_title'] + '<br><b>Genres:</b> ' + movies_metadata['genres'] + '<extra></extra>'
))

fig.update_layout(
    title='2D Doc2Vec Embedding t-SNE Plot',
    width=1000,
    height=1000
)

fig.show()

In [None]:
tsne = TSNE(n_components=3, verbose=1, random_state=123)
XY3 = tsne.fit_transform(movies_vectors)

In [None]:
fig = go.Figure(data=[go.Scatter3d(
    x = XY3[:, 0], 
    y = XY3[:, 1], 
    z = XY3[:, 2],
    mode='markers',
    hovertemplate = '<b>ID:</b> ' + movies_metadata['id'] + '<br><b>Original title:</b> ' + movies_metadata['original_title'] + '<br><b>Genres:</b> ' + movies_metadata['genres'] + '<extra></extra>'
)])

fig.update_layout(
    title='3D Doc2Vec Embedding t-SNE Plot',
    width=1000,
    height=1000
)

fig.show()

## Task 4.
* Write function get_recommendations() which takes arguments we used 2.3., but such that we can use embeddings of several watched films to get recommendations (for one metadata).

In [57]:
interactions['rating'].describe()['max']

5.0

In [58]:
def get_recommendatios(
    sample: pd.DataFrame,
    movie_title_col: str,
    metadata: str,
    users_watched: pd.DataFrame = None,
    user_id: int = None,
    user_col: str = None,
    item_col: str = None,
    watched_films: list = None):

    if users_watched is not None and user_id is None:
        raise ValueError('Put "user_id" as int parameter.')
    elif users_watched is not None and user_col is None:
        raise ValueError('Put "user_col" as str parameter.')
    elif users_watched is not None and item_col is None:
        raise ValueError('Put "item_col" as str parameter.')
    elif users_watched is not None and len(users_watched.columns) < 3:
        raise ValueError('Provide a table with the movies viewed and the user ratings on them in the format \
                          pd.DataFrame with columns ["userId", "movieId", "rating"].')
    elif type(watched_films) is not str and type(watched_films) is not list and users_watched is None:
        raise ValueError('"watched_films" should be list (or str if one movie is passed to the function) type. \
                           Or provide a table with the movies viewed and the user ratings on them in the format \
                           pd.DataFrame with columns ["userId", "movieId", "rating"].')
    if movie_title_col not in sample.columns:
        raise ValueError('Put parameter "movie_title_col" as column in sample DataFrame.')

    if type(watched_films) is str:
        watched_films = [watched_films]
    if users_watched is not None and user_id is not None:
        users = users_watched[[user_col]].drop_duplicates().reset_index(drop=True)
        users_watched = users_watched[users_watched['rating'] >= 3] # хотя можно было сделать, чтобы выбирался
        users['watched'] = users_watched.groupby(user_col)[item_col].apply(list).to_dict().values()
        movie_ids_from_data = users[users[user_col] == user_id]['watched'].values[0]
        if len(movie_ids_from_data) < 1 and watched_films is None:
            raise ValueError('Change rating threshold or add "watched_films".') #хотя можно добавить базовые рекоммендации from baseline

    # define model_index and make it as string
    sample = sample.reset_index().rename(columns = {'index': 'model_index'})
    sample['model_index'] = sample['model_index'].astype(str)

    if watched_films is not None:
        name_mapper = dict(zip(sample['model_index'].astype(int), sample[movie_title_col].str.lower()))
        movie_ids = []
        for wm in watched_films:
            movie_id = [k for k, v in name_mapper.items() if v == wm.lower()]
            if len(movie_id) > 1:
                for i in movie_id:
                    movie_ids.append(i)
            else:
                movie_ids.append(*movie_id)

    if watched_films is not None and users_watched is not None:
        movie_ids = movie_ids + [int(x) for x in movie_ids_from_data]
    
    if type(metadata) is str:
        tags_doc = get_clean_tags_array(sample, metadata)
    elif type(metadata) is list:
        cols_for_merge = [c for c in sample.columns.to_list() if c in metadata]
        sample['merged_columns'] = sample[cols_for_merge].apply(lambda x: '. '.join(x.dropna().astype(str)), axis=1)
        tags_doc = get_clean_tags_array(sample, 'merged_columns')

    model = train_embeddings(tags_doc)

    # load trained embeddings 
    movies_vectors = model.dv.vectors
    movie_embeddings = np.mean(movies_vectors[movie_ids], axis=0)

    # get recommendations
    similars = model.docvecs.most_similar(positive = [movie_embeddings], topn = 20)
    output = pd.DataFrame(similars, columns=['model_index', 'model_score'])

    for ids in movie_ids:
        output = output[output['model_index'] != ids]
    output['title_name'] = output['model_index'].astype(int).map(name_mapper)

    return output

In [59]:
output = get_recommendatios(sample=sample[['original_title', 'overview']],
                            movie_title_col='original_title',
                            metadata='overview',
                            watched_films=['batman', 'captain america'])

In [60]:
output

Unnamed: 0,model_index,model_score,title_name
0,18119,0.975973,бумажный солдат
1,3657,0.974396,project moon base
2,27477,0.974103,mortdecai
3,34858,0.973836,wonder woman
4,344,0.973778,clear and present danger
5,15924,0.973707,countdown
6,13585,0.973021,zoku sugata sanshiro
7,5363,0.972692,our man flint
8,15413,0.972121,wing and a prayer
9,21402,0.971993,creature with the atom brain


In [61]:
# align data in both dataframes to merge
interactions['movieId'] = interactions['movieId'].astype(str)
movies_metadata.rename(columns = {'id': 'movieId'}, inplace=True)

In [62]:
interactions_filtered = interactions.loc[interactions['movieId'].isin(movies_metadata['movieId'])]

In [63]:
data = pd.merge(interactions_filtered, movies_metadata[['movieId', 'original_title']].drop_duplicates(), how='left', on='movieId')

In [64]:
ITEM_COLUMN = 'movieId'
USER_COLUMN = 'userId'

In [65]:
output = get_recommendatios(sample=sample[['original_title', 'overview']],
                            metadata='overview',
                            movie_title_col='original_title',
                            users_watched=data,
                            user_id=246,
                            user_col=USER_COLUMN,
                            item_col=ITEM_COLUMN,
                            watched_films=['batman', 'captain america'])

In [66]:
output

Unnamed: 0,model_index,model_score,title_name
0,17097,0.989597,dead kids
1,10531,0.989486,night of the lepus
2,34895,0.988714,recep i̇vedik
3,11883,0.988635,l'ivresse du pouvoir
4,34515,0.988308,cosmic scrat-tastrophe
5,30850,0.988169,aloha scooby-doo!
6,20618,0.987603,outlaw of gor
7,44665,0.9876,entre calais et douvres
8,28253,0.98757,green ice
9,8703,0.987544,emperor of the north pole


In [67]:
output = get_recommendatios(sample=sample[['original_title', 'overview', 'title', 'tagline', 'keywords_prep', 'genres_prep']],
                            metadata='overview',
                            movie_title_col='original_title',
                            users_watched=data,
                            user_id=246,
                            user_col=USER_COLUMN,
                            item_col=ITEM_COLUMN,
                            watched_films=['batman', 'captain america'])

In [68]:
output

Unnamed: 0,model_index,model_score,title_name
0,44801,0.98985,don't hug me i'm scared 5
1,28098,0.989703,le avventure dell'incredibile ercole
2,14505,0.989491,munyurangabo
3,32578,0.989185,peter pan
4,10531,0.988791,night of the lepus
5,16355,0.988538,the tiger's tail
6,23844,0.988421,sexual life
7,44251,0.988196,riding with death
8,34515,0.988189,cosmic scrat-tastrophe
9,44158,0.987788,tikhaya zastava


# Appendix

Here, we wrap up all pipeline into functions to re-use if needed and it is just prettier to code this way :)

In [None]:
def get_clean_tags_array(agg_tags: pd.DataFrame,
                         text_col = 'tag'):
    '''text preprocessing
    '''
    tags_corpus = agg_tags[text_col].values
    tags_corpus = [re.sub('-[!/()0-9]', '', x) for x in tags_corpus]
    stop_words = stopwords.words('english')

    # preprocess corpus of movie tags before feeding it into Doc2Vec model
    tags_doc = [TaggedDocument(words = word_tokenize_clean(D, stop_words), tags = [str(i)]) for i, D in enumerate(tags_corpus)]

    return tags_doc

In [None]:
def train_embeddings(tags_doc: np.array,
                     epochs = 20,
                     vec_size = 50,
                     alpha = .02,
                     min_alpha =  0.00025,
                     min_count = 5,
                     save_path: str = None):
    """
    fit doc2vec model to prepared corpus
    :tags_doc: result of get_clean_tags_array()
    :max_epocs: int
    :vec_size: int
    :alpha: float
    """
    #initialize
    model = Doc2Vec(vector_size = vec_size,
                    alpha = alpha, 
                    min_alpha = min_alpha,
                    min_count = min_count,
                    dm = 0)
    
    #generate vocab from all tag docs
    model.build_vocab(tags_doc)
    
    #train model
    model.train(tags_doc,
                total_examples = model.corpus_count,
                epochs = epochs)
    
    #save model to dir
    if save_path:
        model.save(f'{save_path}/d2v_model.pkl')
    
    return model