<a href="https://colab.research.google.com/github/nmarkin/Rec-Sys-Okko/blob/main/doc2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 0. Configuration

In [1]:
# links to shared data MovieLens
# source on kaggle: https://www.kaggle.com/code/quangnhatbui/movie-recommender/data
MOVIES_METADATA_URL = 'https://drive.google.com/file/d/19g6-apYbZb5D-wRj4L7aYKhxS-fDM4Fb/view?usp=share_link'

In [2]:
!pip install gensim --upgrade

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!pip install umap-learn[plot]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# 1. Modules and functions

In [4]:
import re
import nltk
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
from ast import literal_eval
from pymystem3 import Mystem
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

import warnings
warnings.filterwarnings('ignore')

# download stop words beforehand
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## 1.1. Helper functions to avoid copypaste

In [5]:
def read_csv_from_gdrive(url):
    """
    gets csv data from a given url (taken from file -> share -> copy link)
    :url: example https://drive.google.com/file/d/1BlZfCLLs5A13tbNSJZ1GPkHLWQOnPlE4/view?usp=share_link
    """
    file_id = url.split('/')[-2]
    file_path = 'https://drive.google.com/uc?export=download&id=' + file_id
    data = pd.read_csv(file_path)

    return data

In [6]:
# init lemmatizer to avoid slow performance
mystem = Mystem() 

def word_tokenize_clean(doc: str, stop_words: list):
    '''
    tokenize from string to list of words
    '''

    # split into lower case word tokens \w lemmatization
    tokens = list(set(mystem.lemmatize(doc.lower())))
  
    # remove tokens that are not alphabetic (including punctuation) and not a stop word
    tokens = [word for word in tokens if word.isalpha() and not word in stop_words \
              not in list(punctuation)]
    return tokens

# 2. Main

## 2.1. Data Preparation

In [7]:
# read csv information about films etc
movies_metadata = read_csv_from_gdrive(MOVIES_METADATA_URL)
movies_metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [8]:
# let's see what columns we have
movies_metadata.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

To get accurate results we need to preprocess text a bit. The pipeline will be as follows:

- Filter only necessary columns from movies_metadada : id, original_title, overview;
- Define `model_index` for model to match back with `id` column;
- Text cleaning: removing stopwords & punctuation, lemmatization for further tokenization and tagged document creatin required for gensim.Doc2Vec

In [9]:
# filter cols
sample = movies_metadata[['id', 'original_title', 'overview']].copy()
sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              45466 non-null  object
 1   original_title  45466 non-null  object
 2   overview        44512 non-null  object
dtypes: object(3)
memory usage: 1.0+ MB


In [10]:
# as you see from above, we have missing overview in some cases -- let's fill it with the original title
sample.loc[sample['overview'].isnull(), 'overview'] = sample.loc[sample['overview'].isnull(), 'original_title']
sample.isnull().sum()

id                0
original_title    0
overview          0
dtype: int64

In [11]:
# define model_index and make it as string
sample = sample.reset_index().rename(columns = {'index': 'model_index'})
sample['model_index'] = sample['model_index'].astype(str)

In [12]:
# create mapper with title and model_idnex to use it further in evaluation
movies_inv_mapper = dict(zip(sample['original_title'].str.lower(), sample['model_index'].astype(int)))

In [13]:
# preprocess by removing non-character data, stopwords
tags_corpus = sample['overview'].values
tags_corpus = [re.sub('-[!/()0-9]', '', x) for x in tags_corpus]
stop_words = stopwords.words('english')

tags_doc = [word_tokenize_clean(description, stop_words) for description in tags_corpus]
tags_corpus[:1]

["Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences."]

In [14]:
# prepare data as model input for Word2Vec
## it takes some time to execute
tags_doc = [TaggedDocument(words = word_tokenize_clean(D, stop_words), tags = [str(i)]) for i, D in enumerate(tags_corpus)]

In [15]:
# let's check what do we have
## tag = movie index
tags_doc[1]

TaggedDocument(words=['door', 'finish', 'creatures', 'three', 'siblings', 'world', 'peter', 'trapped', 'magical', 'risky', 'monkeys', 'freedom', 'years', 'discover', 'board', 'unwittingly', 'proves', 'room', 'invite', 'terrifying', 'rhinoceroses', 'giant', 'alan', 'opens', 'living', 'evil', 'adult', 'inside', 'game', 'find', 'enchanted', 'judy', 'hope', 'running'], tags=['1'])

In [16]:
sample

Unnamed: 0,model_index,id,original_title,overview
0,0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,1,8844,Jumanji,When siblings Judy and Peter discover an encha...
2,2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...
...,...,...,...,...
45461,45461,439050,رگ خواب,Rising and falling between a man and woman.
45462,45462,111109,Siglo ng Pagluluwal,An artist struggles to finish his work while a...
45463,45463,67758,Betrayal,"When one of her hits goes wrong, a professiona..."
45464,45464,227506,Satana likuyushchiy,"In a small town live two brothers, one a minis..."


# 2.2. Model Training and Evaluation

In [17]:
VEC_SIZE = 50
ALPHA = .02
MIN_ALPHA = .00025
MIN_COUNT = 5
EPOCHS = 20

In [18]:
# initialize
model = Doc2Vec(vector_size = VEC_SIZE,
                alpha = ALPHA, 
                min_alpha = MIN_ALPHA,
                min_count = MIN_COUNT,
                dm = 0)

In [19]:
# generate vocab from all tag docs
model.build_vocab(tags_doc)

In [20]:
# train model
model.train(tags_doc,
            total_examples = model.corpus_count,
            epochs = EPOCHS)

## 2.3. Evaluate the model

Let's assume that we watched movie `batman` and based on that generate recommendation similar to it's description.

To do that we need
- To extract movie id from `movies_inv_mapper` we created to map back titles from model output
- Load embeddings from trained model
- Use built-in most_similar() method to get most relevant recommendations based on film embedding
- Finally, map title names for sense-check

In [21]:
# get id
movie_id = movies_inv_mapper['batman']
movie_id

8603

In [22]:
# load trained embeddings 
movies_vectors = model.dv.vectors

In [23]:
movie_embeddings = movies_vectors[movie_id]

In [24]:
# get recommendations
similars = model.docvecs.most_similar(positive = [movie_embeddings], topn = 20)
output = pd.DataFrame(similars, columns = ['model_index', 'model_score'])
output.head()

Unnamed: 0,model_index,model_score
0,8603,1.0
1,13835,0.963661
2,5713,0.961881
3,8916,0.957187
4,43165,0.953609


In [25]:
# reverse values and indices to map names in dataframe
name_mapper = {v: k for k, v in movies_inv_mapper.items()}

In [26]:
output['title_name'] = output['model_index'].astype(int).map(name_mapper)
output


Unnamed: 0,model_index,model_score,title_name
0,8603,1.0,batman
1,13835,0.963661,k2
2,5713,0.961881,rollover
3,8916,0.957187,killer klowns from outer space
4,43165,0.953609,the zookeeper's wife
5,37792,0.952661,creature
6,43461,0.951796,megafault
7,35181,0.949791,конек-горбунок
8,35604,0.949132,public enemies
9,3536,0.947949,quatermass and the pit


# TODO

- Add `original_title`, `keywords`, `tagline` and other metadata to train sample and then retrain embeddings;
- Make visualization of embeddings with links of films with each other;
- Compare results with the embeddings we created in lecture
- Write function get_recommendations() which takes arguments we used 2.3., but such that we can use embeddings of several watched films to get recommendations

### Data Preparation

In [27]:
# read csv information about films etc
from ast import literal_eval

movies_metadata = read_csv_from_gdrive(MOVIES_METADATA_URL)
movies_metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [28]:
# let's see what columns we have
movies_metadata.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [29]:
# filter cols
sample = movies_metadata[['id', 'original_title', 'overview', 'tagline', 'genres', 'belongs_to_collection']].copy()
sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   id                     45466 non-null  object
 1   original_title         45466 non-null  object
 2   overview               44512 non-null  object
 3   tagline                20412 non-null  object
 4   genres                 45466 non-null  object
 5   belongs_to_collection  4494 non-null   object
dtypes: object(6)
memory usage: 2.1+ MB


In [30]:
# fillnan
sample.loc[sample['overview'].isnull(), 'overview'] = sample.loc[sample['overview'].isnull(), 'original_title']
sample['tagline'] = sample['tagline'].fillna('No tagline')
sample['belongs_to_collection'] = sample['belongs_to_collection'].fillna('{\'name\': \'No collection\'}')
sample.isnull().sum()

id                       0
original_title           0
overview                 0
tagline                  0
genres                   0
belongs_to_collection    0
dtype: int64

In [31]:
sample['genres'] = sample['genres'].apply(literal_eval)
sample['belongs_to_collection'] = sample['belongs_to_collection'].apply(literal_eval)

In [32]:
# clean genres
sample['genres'] = sample['genres'].apply(lambda x: ' Genres ' + ' '.join([el['name'] for el in x]))
sample['genres']

0          Genres Animation Comedy Family
1         Genres Adventure Fantasy Family
2                   Genres Romance Comedy
3             Genres Comedy Drama Romance
4                           Genres Comedy
                       ...               
45461                 Genres Drama Family
45462                        Genres Drama
45463        Genres Action Drama Thriller
45464                             Genres 
45465                             Genres 
Name: genres, Length: 45466, dtype: object

In [33]:
# clean belongs_to_collection
sample['belongs_to_collection'] = sample['belongs_to_collection'].apply(lambda x: ' Collection ' + x['name'] + ' ' if type(x) == dict else ' No collection ')
sample['belongs_to_collection']

0                   Collection Toy Story Collection 
1                          Collection No collection 
2              Collection Grumpy Old Men Collection 
3                          Collection No collection 
4         Collection Father of the Bride Collection 
                            ...                     
45461                      Collection No collection 
45462                      Collection No collection 
45463                      Collection No collection 
45464                      Collection No collection 
45465                      Collection No collection 
Name: belongs_to_collection, Length: 45466, dtype: object

In [34]:
# define model_index and make it as string
sample = sample.reset_index().rename(columns = {'index': 'model_index'})
sample['model_index'] = sample['model_index'].astype(str)

In [35]:
sample['overview'] = sample['original_title'] + sample['genres'] + sample['belongs_to_collection'] + sample['overview']
sample['overview']

0        Toy Story Genres Animation Comedy Family Colle...
1        Jumanji Genres Adventure Fantasy Family Collec...
2        Grumpier Old Men Genres Romance Comedy Collect...
3        Waiting to Exhale Genres Comedy Drama Romance ...
4        Father of the Bride Part II Genres Comedy Coll...
                               ...                        
45461    رگ خواب Genres Drama Family Collection No coll...
45462    Siglo ng Pagluluwal Genres Drama Collection No...
45463    Betrayal Genres Action Drama Thriller Collecti...
45464    Satana likuyushchiy Genres  Collection No coll...
45465    Queerama Genres  Collection No collection 50 y...
Name: overview, Length: 45466, dtype: object

In [36]:
# create mapper with title and model_idnex to use it further in evaluation
movies_inv_mapper = dict(zip(sample['original_title'].str.lower(), sample['model_index'].astype(int)))

In [37]:
# preprocess by removing non-character data, stopwords
tags_corpus = sample['overview'].values
tags_corpus = [re.sub('-[!/()0-9]', '', x) for x in tags_corpus]
stop_words = stopwords.words('english')

tags_doc = [word_tokenize_clean(description, stop_words) for description in tags_corpus]
tags_corpus[:1]

["Toy Story Genres Animation Comedy Family Collection Toy Story Collection Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences."]

In [38]:
# prepare data as model input for Word2Vec
## it takes some time to execute
tags_doc = [TaggedDocument(words = word_tokenize_clean(D, stop_words), tags = [str(i)]) for i, D in enumerate(tags_corpus)]

In [39]:
# let's check what do we have
## tag = movie index
tags_doc[1]

TaggedDocument(words=['door', 'finish', 'collection', 'three', 'genres', 'siblings', 'creatures', 'fantasy', 'world', 'peter', 'trapped', 'magical', 'risky', 'monkeys', 'freedom', 'jumanji', 'years', 'discover', 'board', 'unwittingly', 'proves', 'room', 'invite', 'terrifying', 'rhinoceroses', 'giant', 'alan', 'opens', 'living', 'adventure', 'evil', 'adult', 'inside', 'game', 'find', 'enchanted', 'judy', 'family', 'hope', 'running'], tags=['1'])

In [40]:
sample

Unnamed: 0,model_index,id,original_title,overview,tagline,genres,belongs_to_collection
0,0,862,Toy Story,Toy Story Genres Animation Comedy Family Colle...,No tagline,Genres Animation Comedy Family,Collection Toy Story Collection
1,1,8844,Jumanji,Jumanji Genres Adventure Fantasy Family Collec...,Roll the dice and unleash the excitement!,Genres Adventure Fantasy Family,Collection No collection
2,2,15602,Grumpier Old Men,Grumpier Old Men Genres Romance Comedy Collect...,Still Yelling. Still Fighting. Still Ready for...,Genres Romance Comedy,Collection Grumpy Old Men Collection
3,3,31357,Waiting to Exhale,Waiting to Exhale Genres Comedy Drama Romance ...,Friends are the people who let you be yourself...,Genres Comedy Drama Romance,Collection No collection
4,4,11862,Father of the Bride Part II,Father of the Bride Part II Genres Comedy Coll...,Just When His World Is Back To Normal... He's ...,Genres Comedy,Collection Father of the Bride Collection
...,...,...,...,...,...,...,...
45461,45461,439050,رگ خواب,رگ خواب Genres Drama Family Collection No coll...,Rising and falling between a man and woman,Genres Drama Family,Collection No collection
45462,45462,111109,Siglo ng Pagluluwal,Siglo ng Pagluluwal Genres Drama Collection No...,No tagline,Genres Drama,Collection No collection
45463,45463,67758,Betrayal,Betrayal Genres Action Drama Thriller Collecti...,A deadly game of wits.,Genres Action Drama Thriller,Collection No collection
45464,45464,227506,Satana likuyushchiy,Satana likuyushchiy Genres Collection No coll...,No tagline,Genres,Collection No collection


### Model Training

In [41]:
VEC_SIZE = 50
ALPHA = .02
MIN_ALPHA = .00025
MIN_COUNT = 5
EPOCHS = 20

In [42]:
# initialize
model = Doc2Vec(vector_size = VEC_SIZE,
                alpha = ALPHA, 
                min_alpha = MIN_ALPHA,
                min_count = MIN_COUNT,
                dm = 0)

In [43]:
# generate vocab from all tag docs
model.build_vocab(tags_doc)

In [44]:
# train model
model.train(tags_doc,
            total_examples = model.corpus_count,
            epochs = EPOCHS)

### Evaluate the model

In [45]:
# load trained embeddings 
movies_vectors_new = model.dv.vectors

In [67]:
movie_embeddings_new = movies_vectors_new[movie_id]
movie_embeddings_new

array([ 0.02610032,  0.0983834 ,  0.2911161 , -0.130118  , -0.0791775 ,
        0.155675  , -0.27082962,  0.12944512, -0.0008263 ,  0.07356373,
       -0.30255178, -0.10524218, -0.06815115, -0.20901138,  0.11384367,
       -0.14403443,  0.47265053, -0.33359542,  0.15665263, -0.10688655,
        0.02229136,  0.06337205,  0.22179082,  0.21460328,  0.11989243,
        0.25572547,  0.03074471, -0.11144271,  0.19928718,  0.05845369,
       -0.0605369 , -0.06422596,  0.0419288 ,  0.13988827, -0.07086156,
       -0.00450677,  0.08879308,  0.02068744,  0.25151628,  0.3652191 ,
       -0.13872463,  0.12602226, -0.10161744,  0.24764714,  0.18513668,
       -0.45469564, -0.20842868, -0.17351568, -0.39435187,  0.37493384],
      dtype=float32)

In [47]:
# get recommendations
similars = model.docvecs.most_similar(positive = [movie_embeddings_new], topn = 20)
output = pd.DataFrame(similars, columns = ['model_index', 'model_score'])
output.head()

Unnamed: 0,model_index,model_score
0,8603,1.0
1,3318,0.940509
2,4590,0.935053
3,42020,0.930219
4,29872,0.929527


In [48]:
# reverse values and indices to map names in dataframe
name_mapper = {v: k for k, v in movies_inv_mapper.items()}

In [49]:
output['title_name'] = output['model_index'].astype(int).map(name_mapper)
output


Unnamed: 0,model_index,model_score,title_name
0,8603,1.0,batman
1,3318,0.940509,
2,4590,0.935053,osmosis jones
3,42020,0.930219,death race 2050
4,29872,0.929527,angels die hard
5,44227,0.929101,the wild world of batwoman
6,30930,0.928415,teen titans: trouble in tokyo
7,6417,0.92602,spy kids 3-d: game over
8,3580,0.924119,mad max
9,17204,0.919904,attack the block


### Visualizations, compare results

In [50]:
import umap
import umap.plot
import matplotlib.pyplot as plt

In [51]:
mapper = umap.UMAP().fit(movies_vectors)
hover_data = sample[['id', 'original_title']]

umap.plot.output_notebook()
p = umap.plot.interactive(mapper, hover_data=hover_data, point_size=2)
umap.plot.show(p)

In [53]:
mapper = umap.UMAP().fit(movies_vectors_new)
hover_data = sample[['id', 'original_title', 'genres']]

umap.plot.output_notebook()
p = umap.plot.interactive(mapper,
                          labels=hover_data['genres'].apply(lambda x: x.strip().split()[1] if len(x.strip().split()) > 1 else 'Unknown'),
                          hover_data=hover_data, point_size=2)
umap.plot.show(p)

We can notice from the results of both the graphs and the predictions for the 'batman' movie that the newer model is better at finding simmilar movies. It can now understand the genres and group them together, which resulted in superhero/action movies recommended from the batman embedings


### Improved recommendations

In [75]:
movie_id = movies_inv_mapper['batman']
movie_id

8603

In [78]:
# assuming we get the IDs as above
def get_reccomendations(watched_movies: list):
    pos = []
    for id in watched_movies:
        pos.append(movies_vectors_new[id])

    # get recommendations
    similars = model.docvecs.most_similar(positive = pos, topn = 20)
    output = pd.DataFrame(similars, columns = ['model_index', 'model_score'])

    # reverse values and indices to map names in dataframe
    name_mapper = {v: k for k, v in movies_inv_mapper.items()}

    output['title_name'] = output['model_index'].astype(int).map(name_mapper)
    return output

In [79]:
get_reccomendations([8603])

Unnamed: 0,model_index,model_score,title_name
0,8603,1.0,batman
1,3318,0.940509,
2,4590,0.935053,osmosis jones
3,42020,0.930219,death race 2050
4,29872,0.929527,angels die hard
5,44227,0.929101,the wild world of batwoman
6,30930,0.928415,teen titans: trouble in tokyo
7,6417,0.92602,spy kids 3-d: game over
8,3580,0.924119,mad max
9,17204,0.919904,attack the block


In [80]:
movie_id = movies_inv_mapper['man of steel']
movie_id

21068

In [81]:
get_reccomendations([8603, 21068])

Unnamed: 0,model_index,model_score,title_name
0,21068,0.965587,man of steel
1,43162,0.96326,地球[テラ]へ...
2,22893,0.956076,bionicle: mask of light
3,34515,0.953095,cosmic scrat-tastrophe
4,8603,0.952179,batman
5,14178,0.950391,battle for terra
6,26559,0.950243,avatar 2
7,20807,0.950095,planet hulk
8,3582,0.949748,mad max beyond thunderdome
9,44219,0.948925,manhunt in space


# Appendix

Here, we wrap up all pipeline into functions to re-use if needed and it is just prettier to code this way :)

In [None]:
def get_clean_tags_array(agg_tags: pd.DataFrame,
                         text_col = 'tag'):
    '''text preprocessing
    '''
    tags_corpus = agg_tags[text_col].values
    tags_corpus = [re.sub('-[!/()0-9]', '', x) for x in tags_corpus]
    stop_words = stopwords.words('english')


    # preprocess corpus of movie tags before feeding it into Doc2Vec model
    tags_doc = [TaggedDocument(words = word_tokenize_clean(D, stop_words), tags = [str(i)]) for i, D in enumerate(tags_corpus)]

    return tags_doc


In [None]:
def train_embeddings(tags_doc: np.array,
                     epochs = 20,
                     vec_size = 50,
                     alpha = .02,
                     min_alpha =  0.00025,
                     min_count = 5,
                     save_path: str = None):
    """
    fit doc2vec model to prepared corpus
    :tags_doc: result of get_clean_tags_array()
    :max_epocs: int
    :vec_size: int
    :alpha: float
    """
    #initialize
    model = Doc2Vec(vector_size = vec_size,
                    alpha = alpha, 
                    min_alpha = min_alpha,
                    min_count = min_count,
                    dm = 0)
    
    #generate vocab from all tag docs
    model.build_vocab(tags_doc)
    
    #train model
    model.train(tags_doc,
                total_examples = model.corpus_count,
                epochs = epochs)
    
    #save model to dir
    if save_path:
        model.save(f'{save_path}/d2v_model.pkl')
    
    return model