In [2]:
from constants import *
import pandas as pd
import os
from pathlib import Path

# Ratings file from the MovieLens 25M dataset (https://grouplens.org/datasets/movielens/).
# Build the path under data_path and resolve it to an absolute path in one step.
rating_csv_path = Path(f'{data_path}/ratings.csv').resolve()

In [3]:
# Read the whole ratings CSV into a DataFrame.
df_rating = pd.read_csv(rating_csv_path)
# Last expression of the cell: display the dtype pandas inferred for each column.
df_rating.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [4]:
# Fixed seed so the preview shows the same 10 rows on every Restart & Run All.
df_rating.sample(10, random_state=42)

Unnamed: 0,userId,movieId,rating,timestamp
459054,3150,36401,2.0,1216309530
17430804,112941,362,2.0,849887784
13163864,85232,4226,3.5,1456668372
16435342,106614,2424,1.0,1237759280
10390703,67415,49272,3.5,1558286496
5482479,35538,2692,4.0,942335764
3512962,23169,4317,2.0,1110381616
17798245,115367,954,3.5,1066712372
18627613,120708,99636,3.5,1462188548
22289104,144959,3996,3.0,1005266539


In [5]:
# dropna=False makes nunique() count exactly what len(unique()) counts.
f'number of unique users = {df_rating["userId"].nunique(dropna=False)}'

'number of unique users = 162541'

Find the movies liked by each user. For example, if user 1 liked 3 movies 'a', 'b' and 'c', then that list of movies acts like a sentence in a conventional NLP problem, and each movie acts like a word. So the more frequently 2 movies appear in each other's vicinity, the more similar they must be. In the item2vec case, the "vicinity" is the whole sentence, i.e. the full list of movies liked.

In [6]:
# Keep ratings of 4 or more, then collect each user's movieIds into one list
# (one row per userId, with the list in the 'movieId' column).
df_user_likes = (
    df_rating.loc[df_rating['rating'] >= 4]
    .groupby('userId')['movieId']
    .agg(list)
    .reset_index()
)
df_user_likes

Unnamed: 0,userId,movieId
0,1,"[296, 307, 665, 1088, 1237, 1250, 1653, 2351, ..."
1,2,"[110, 150, 151, 236, 260, 318, 333, 349, 356, ..."
2,3,"[1, 29, 32, 50, 111, 172, 214, 260, 293, 296, ..."
3,4,"[296, 541, 589, 924, 1036, 1136, 1196, 1197, 1..."
4,5,"[1, 19, 32, 36, 47, 50, 88, 104, 141, 147, 150..."
...,...,...
162337,162537,"[31, 88, 207, 216, 282, 318, 356, 361, 466, 52..."
162338,162538,"[17, 39, 47, 48, 111, 215, 337, 356, 500, 527,..."
162339,162539,"[110, 161, 356, 480, 541, 608, 750, 780, 912, ..."
162340,162540,"[32, 1721, 2205, 3005, 3980, 4167, 4306, 4310,..."


Same as above, for the movies each user disliked (rated below 4).

In [7]:
# Keep ratings below 4, then collect each user's movieIds into one list
# (one row per userId, with the list in the 'movieId' column).
df_user_dislikes = (
    df_rating.loc[df_rating['rating'] < 4]
    .groupby('userId')['movieId']
    .agg(list)
    .reset_index()
)
df_user_dislikes

Unnamed: 0,userId,movieId
0,1,"[306, 899, 1175, 1217, 1260, 2011, 2012, 2068,..."
1,2,"[1, 62, 261, 266, 380, 480, 524, 553, 588, 653..."
2,3,"[173, 442, 480, 780, 1127, 1198, 1270, 1320, 1..."
3,4,"[1, 260, 780, 1080, 1200, 1201, 1210, 1214, 12..."
4,5,"[39, 95, 113, 122, 153, 191, 218, 219, 235, 23..."
...,...,...
161506,162537,"[48, 376, 505, 551, 700, 1005, 1556, 1566, 158..."
161507,162538,"[1, 260, 296, 551, 597, 902, 1183, 1230, 1244,..."
161508,162539,"[466, 1748, 4370, 4446]"
161509,162540,"[169, 519, 1125, 1367, 2384, 2986, 3159, 3869,..."


Combine both the liked and the disliked lists into a single list, which will be our corpus for training.

In [8]:
# Training corpus: every per-user "liked" list followed by every per-user "disliked" list.
movie_groups = df_user_likes['movieId'].tolist() + df_user_dislikes['movieId'].tolist()
len(movie_groups)

323853

The next step is to train a Word2Vec model. On my system, training with skip-gram took 1 hour 45 mins for 5 epochs, while CBOW took only 16 mins. CBOW also seems to recommend more relevant items.

Setting logging to Info as Word2Vec doesn't print any progress log without it.

Also, training the models for more than 5 epochs doesn't seem to improve the model outputs for either SG or CBOW. So clearly CBOW won!

In [77]:
from gensim.models import Word2Vec
import datetime

# Word2Vec doesn't have an "overfitting" concept. So using more epochs.

import logging
# Word2Vec only reports training progress through INFO-level log records.
logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)

start_time = datetime.datetime.now()
# Window as long as the longest list, so a movie's context is its whole list.
max_window = max(len(group) for group in movie_groups)
prev_model = None # 'movie_rec_2023-05-20 03:12:43.294816'
use_skipgram = False
train_epochs = 5    # epochs used when training a fresh model
resume_epochs = 10  # extra epochs used when continuing from prev_model

if prev_model:
    # Continue training a previously saved model on the same corpus.
    model = Word2Vec.load(f'{output_path}/{prev_model}')
    print(f'resuming training: worker={model.workers}')
    model.train(movie_groups, epochs=resume_epochs, total_examples=model.corpus_count)
    # Label with the algorithm stored in the loaded model, not with use_skipgram,
    # which has no effect on an already-built model.
    algo = 'sg' if model.sg else 'cbow'
    run_tag = f'{algo}_resumed_epoch{resume_epochs}'
else:
    model = Word2Vec(movie_groups, epochs=train_epochs, window=max_window,
                     min_count=10, workers=7,
                     sg=1 if use_skipgram else 0, hs=0, negative=5)
    algo = 'sg' if use_skipgram else 'cbow'
    run_tag = f'{algo}_epoch{train_epochs}'

end_time = datetime.datetime.now()
print(f'training time = {end_time-start_time}')
# The suffix now reflects what was actually trained; with the default settings
# it is still '_cbow_epoch5', so the file naming is unchanged for that case.
model.save(f'{output_path}/movie_rec_{end_time}_{run_tag}')

INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 752872 words, keeping 16469 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #20000, processed 1517195 words, keeping 21102 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #30000, processed 2293768 words, keeping 24467 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #40000, processed 3069414 words, keeping 26783 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #50000, processed 3832289 words, keeping 28567 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #60000, processed 4587089 words, keeping 29782 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #70000, processed 5364753 words, keeping 30935 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #80000, processed 6147106 words, ke

training time = 0:14:49.034797


In [50]:
# Movie metadata read from the same data_path directory as the ratings;
# the helper functions below use it to look up titles by movieId.
df_movies = pd.read_csv(f'{data_path}/movies.csv')
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [74]:
def get_movie_name(movieId):
    """Return the title of the first ``df_movies`` row whose ``movieId`` matches.

    Raises IndexError when no row has the given ``movieId``.
    """
    matching_titles = df_movies.loc[df_movies['movieId'] == movieId, 'title']
    return matching_titles.iloc[0]

def get_recom(movieIds, model, topn=10):
    """Print the most similar movies for each movie id in ``movieIds``.

    Args:
        movieIds: iterable of movie ids to query, one report per id.
        model: object exposing ``wv.most_similar(key, topn=...)``; the
            Word2Vec models loaded in this notebook are used here.
        topn: how many neighbours to request per query. Defaults to 10,
            the value that used to be hard-coded.

    Returns:
        None. The report is written to stdout.

    Any exception raised by ``most_similar`` or ``get_movie_name`` propagates
    to the caller.
    """
    for movie_id in movieIds:
        # Query the model first so an id the model rejects fails before any title lookup.
        neighbours = model.wv.most_similar(movie_id, topn=topn)
        query = get_movie_name(movie_id)
        results = [(get_movie_name(rec_id), rec_id, score) for rec_id, score in neighbours]

        print(f'query movie: {query}({movie_id})\nresults (top {len(results)}):')
        for i, (title, rec_id, score) in enumerate(results):
            # "confidence" is the score returned by most_similar for that neighbour.
            print(f'{i}. {title} (movieId={rec_id}, confidence={score})')

def get_recom_by_name(movie_name, model):
    """Print recommendations for the first movie whose title contains ``movie_name``.

    The match is a case-insensitive literal substring match, so titles with
    regex metacharacters (for example the parenthesised year in
    'Toy Story (1995)') can be passed as-is. Rows with a missing title never
    match.

    Args:
        movie_name: text to look for inside the ``title`` column of ``df_movies``.
        model: model forwarded unchanged to ``get_recom``.

    Raises:
        IndexError: if no title contains ``movie_name``.
    """
    title_matches = df_movies['title'].str.contains(
        movie_name, case=False, regex=False, na=False)
    matching_ids = df_movies.loc[title_matches, 'movieId']
    if matching_ids.empty:
        raise IndexError(f'no movie title contains {movie_name!r}')
    # As before, only the first match (in df_movies row order) is queried.
    get_recom([matching_ids.iloc[0]], model)

In [78]:
from gensim.models import Word2Vec

# Load three previously saved models from output_path for side-by-side comparison.
# The file names are timestamps of earlier training runs, so these files must
# already exist on disk. The comparison cell labels them as: model_1 = SG 5 epochs,
# model_2 = SG 15 epochs, model_cbow = CBOW 5 epochs.
model_1 = Word2Vec.load(f'{output_path}/movie_rec_2023-05-20 03:12:43.294816')
model_2 = Word2Vec.load(f'{output_path}/movie_rec_2023-05-20 19:21:56.098519_epoch15')
model_cbow = Word2Vec.load(f'{output_path}/movie_rec_2023-05-20 23:42:42.849724_cbow_epoch5')
    

INFO:gensim.utils:loading Word2Vec object from ../outputs/movie_rec_2023-05-20 03:12:43.294816
INFO:gensim.utils:loading wv recursively from ../outputs/movie_rec_2023-05-20 03:12:43.294816.wv.* with mmap=None
INFO:gensim.utils:setting ignored attribute cum_table to None
INFO:gensim.utils:Word2Vec lifecycle event {'fname': '../outputs/movie_rec_2023-05-20 03:12:43.294816', 'datetime': '2023-05-20T23:46:59.573997', 'gensim': '4.3.1', 'python': '3.8.13 (default, Mar 28 2022, 11:38:47) \n[GCC 7.5.0]', 'platform': 'Linux-5.19.0-41-generic-x86_64-with-glibc2.17', 'event': 'loaded'}
INFO:gensim.utils:loading Word2Vec object from ../outputs/movie_rec_2023-05-20 19:21:56.098519_epoch15
INFO:gensim.utils:loading wv recursively from ../outputs/movie_rec_2023-05-20 19:21:56.098519_epoch15.wv.* with mmap=None
INFO:gensim.utils:setting ignored attribute cum_table to None
INFO:gensim.utils:Word2Vec lifecycle event {'fname': '../outputs/movie_rec_2023-05-20 19:21:56.098519_epoch15', 'datetime': '2023-

In [62]:
num_queries = 20  # stop once this many movies have been queried successfully
count = 0

# Walk the first 100 movies and print recommendations for those the model knows.
for queryId in df_movies['movieId'].head(100):
    # Explicit vocabulary check instead of a bare except: only a genuinely
    # missing movie is reported as missing, and any other error surfaces.
    if queryId in model.wv.key_to_index:
        get_recom([queryId], model)
        count = count + 1
    else:
        print(f'{get_movie_name(queryId)}({queryId}) is not present in model')
    print('\n\n')
    # '>=' so exactly num_queries successful queries are shown ('>' allowed one extra).
    if count >= num_queries:
        break

query movie: Toy Story (1995)(1)
results (top 5):
0. Braveheart (1995) (movieId=110, confidence=0.9528072476387024)
1. Star Wars: Episode IV - A New Hope (1977) (movieId=260, confidence=0.9519764184951782)
2. Twelve Monkeys (a.k.a. 12 Monkeys) (1995) (movieId=32, confidence=0.9381943941116333)
3. Forrest Gump (1994) (movieId=356, confidence=0.9331863522529602)
4. Lion King, The (1994) (movieId=364, confidence=0.9301167726516724)



query movie: Jumanji (1995)(2)
results (top 5):
0. Mask, The (1994) (movieId=367, confidence=0.9698435068130493)
1. Speed (1994) (movieId=377, confidence=0.9634350538253784)
2. Ace Ventura: When Nature Calls (1995) (movieId=19, confidence=0.9631405472755432)
3. Dumb & Dumber (Dumb and Dumber) (1994) (movieId=231, confidence=0.9625581502914429)
4. Batman Forever (1995) (movieId=153, confidence=0.9610676765441895)



query movie: Grumpier Old Men (1995)(3)
results (top 5):
0. Father of the Bride Part II (1995) (movieId=5, confidence=0.9781428575515747)
1. Mult

In [84]:
movie_name = '1408'

# (heading label, model) pairs to compare on the same query, in display order.
comparisons = [
    ('SG Model (5 epochs)', model_1),
    ('SG Model (15 epochs)', model_2),
    ('CBOW Model (5 epochs)', model_cbow),
]

for position, (label, candidate) in enumerate(comparisons):
    # Blank line between sections, but not before the first one.
    separator = '\n' if position else ''
    print(f'{separator}recommendations by {label}')
    get_recom_by_name(movie_name, candidate)

recommendations by SG Model (5 epcohs)
query movie: 1408 (2007)(53953)
results (top 10):
0. Planet Terror (2007) (movieId=54995, confidence=0.9412971138954163)
1. Mist, The (2007) (movieId=56145, confidence=0.9394574165344238)
2. Death Proof (2007) (movieId=53519, confidence=0.9375334978103638)
3. Cloverfield (2008) (movieId=57368, confidence=0.930331826210022)
4. Hostel (2005) (movieId=42723, confidence=0.9293372631072998)
5. 28 Weeks Later (2007) (movieId=53000, confidence=0.9239894151687622)
6. Number 23, The (2007) (movieId=51086, confidence=0.9172055721282959)
7. 2012 (2009) (movieId=72378, confidence=0.9161345362663269)
8. Descent, The (2005) (movieId=40732, confidence=0.9110991954803467)
9. Terminator Salvation (2009) (movieId=68791, confidence=0.9097989797592163)

recommendations by SG Model (15 epcohs)
query movie: 1408 (2007)(53953)
results (top 10):
0. Mist, The (2007) (movieId=56145, confidence=0.9493451714515686)
1. Hostel (2005) (movieId=42723, confidence=0.94635629653930