<a href="https://colab.research.google.com/github/nickgreenquist/recsys/blob/main/MovieLens_Two_Tower_Embedding_NN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import torch
import torch.nn.functional as F
import random

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

In [7]:
# Upper bound on how many rating rows are parsed from the CSV.
num_ratings_to_read = 500_000

# `nrows` caps the number of rows read; a file with fewer rows than the cap is loaded in full.
df_ratings = pd.read_csv('ratings.csv', nrows=num_ratings_to_read)

In [8]:
# sanity check: number of rating rows actually loaded
len(df_ratings)

100836

In [9]:
# Clean the ratings data: discard rows that contain missing values, then
# make sure movieId is stored as an integer column.
df_ratings = df_ratings.dropna().astype({'movieId': int})

In [10]:
# peek at the first three rating rows
df_ratings.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224


In [11]:
# Movie metadata; the cells below read its movieId, title and genres columns.
df_movies = pd.read_csv('movies.csv')

In [12]:
# peek at the first three movie rows
df_movies.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


# Movie Feature Processing

In [31]:
# Restrict the catalogue to the movies that received the most ratings.

num_movies_to_keep = 500

# Tally ratings per movie in a single pass. Keys are inserted in order of
# first appearance in the ratings, so tie ordering below is well defined.
movieId_list = df_ratings.movieId.tolist()

movieId_to_num_ratings = {}
for movieId in movieId_list:
  movieId_to_num_ratings[movieId] = movieId_to_num_ratings.get(movieId, 0) + 1

# Rank by rating count, highest first. sorted() is stable, so movies with
# equal counts keep their first-appearance order.
ranked_counts = sorted(movieId_to_num_ratings.items(), key=lambda pair: pair[1], reverse=True)

# Keep only the ids of the leading slice.
top_movies = [movieId for movieId, _count in ranked_counts[:num_movies_to_keep]]

In [32]:
# Build title lookups in both directions, restricted to the kept movies.
movieId_to_title = {}
title_to_movieId = {}

# `top_movies` is a list, so testing membership against it directly would
# rescan the list for every row of df_movies. A set gives constant-time
# lookups with identical results.
top_movie_ids = set(top_movies)

movieId_list = df_movies.movieId.tolist()
title_list = df_movies.title.tolist()

for movieId, title in zip(movieId_list, title_list):
  if movieId not in top_movie_ids:
    continue

  movieId_to_title[movieId] = title
  # NOTE: if two kept movies share the same title, the later row wins here.
  title_to_movieId[title] = movieId

In [33]:
# Show the ten most-rated movies together with their rating counts.
for movieId in top_movies[:10]:
  title = movieId_to_title[movieId]
  num_ratings = movieId_to_num_ratings[movieId]
  print(title, num_ratings)

Forrest Gump (1994) 329
Shawshank Redemption, The (1994) 317
Pulp Fiction (1994) 307
Silence of the Lambs, The (1991) 279
Matrix, The (1999) 278
Star Wars: Episode IV - A New Hope (1977) 251
Jurassic Park (1993) 238
Braveheart (1995) 237
Terminator 2: Judgment Day (1991) 224
Schindler's List (1993) 220


In [34]:
# Map every kept movieId to the set of its genres, and collect the set of
# all genres seen across the kept movies.
genres = set()
movieId_to_genres = {}

# `top_movies` is a list, so testing membership against it directly would
# rescan the list for every row of df_movies. A set gives constant-time
# lookups with identical results.
top_movie_ids = set(top_movies)

movieId_list = df_movies.movieId.tolist()
genre_list = df_movies.genres.tolist()

for movieId, genre_field in zip(movieId_list, genre_list):
  if movieId not in top_movie_ids:
    continue

  # genres are stored as a single '|'-separated string per movie
  movie_genres = genre_field.split('|')
  movieId_to_genres[movieId] = set(movie_genres)
  genres.update(movie_genres)

In [35]:
# spot check: genre set recorded for one known title
movieId_to_genres[title_to_movieId['Matrix, The (1999)']]

{'Action', 'Sci-Fi', 'Thriller'}

In [36]:
# Rank the kept movies inside each genre by rating count. The per-genre
# rankings are used to make sure the user 'watch history' feature vector
# has enough movies from every genre.

# genre -> {movieId: num_ratings}
genre_to_movie_to_num_ratings = {genre: {} for genre in genres}

for movieId, movie_genres in movieId_to_genres.items():
  for genre in movie_genres:
    genre_to_movie_to_num_ratings[genre][movieId] = movieId_to_num_ratings[movieId]

# Replace each inner dict with a list of (movieId, num_ratings) pairs, most
# rated first. Only values of existing keys are reassigned, so the dict can
# be iterated while this happens.
for genre, movie_counts in genre_to_movie_to_num_ratings.items():
  genre_to_movie_to_num_ratings[genre] = sorted(movie_counts.items(), key=lambda pair: pair[1], reverse=True)

# genre -> movieIds only, in the same ranked order
genre_to_top_movies = {
  genre: [movieId for movieId, _count in ranked_pairs]
  for genre, ranked_pairs in genre_to_movie_to_num_ratings.items()
}

In [37]:
# Assemble the pool of movies used for the user watch-history feature by
# taking the most-rated movies of every genre.
num_movies_per_genre = 10
user_context_movies = set()

# A movie that ranks highly in several genres is stored only once, so the
# pool can be smaller than num_movies_per_genre * len(genres).
for ranked_movies in genre_to_top_movies.values():
  user_context_movies.update(ranked_movies[:num_movies_per_genre])

len(genres), len(user_context_movies)

(19, 102)

In [38]:
# Build the ITEM movieId embedding mapping: every kept movie gets an
# embedding row index equal to its position in `top_movies`.
item_emb_movieId_to_i = {movieId: index for index, movieId in enumerate(top_movies)}

# Reverse lookup (embedding row index -> movieId). Invert the forward map
# directly; wrapping `.items()` in enumerate() would instead produce
# ((movieId, index), position) entries keyed by tuples.
item_emb_i_to_movieId = {index: movieId for movieId, index in item_emb_movieId_to_i.items()}

In [39]:
# Build the ITEM genre feature context: assign every genre a position and
# keep the reverse lookup alongside it.
# NOTE: `genres` is a set, so positions follow its iteration order rather
# than any sorted order.
genre_to_i = dict(zip(genres, range(len(genres))))
i_to_genre = {position: genre for genre, position in genre_to_i.items()}

# User Feature Processing

In [40]:
# For every user, collect the kept movies they rated: userId -> {movieId: rating}.
# Every user present in the ratings gets an entry, so a user who rated none
# of the kept movies maps to an empty dict.
user_to_movie_to_rating = {userId: {} for userId in df_ratings.userId.unique().tolist()}

userId_list = df_ratings.userId.tolist()
movieId_list = df_ratings.movieId.tolist()
rating_list = df_ratings.rating.tolist()

# `top_movies` is a list, so testing membership against it directly would
# rescan the list for every rating row. A set gives constant-time lookups
# with identical results.
top_movie_ids = set(top_movies)

for userId, movieId, rating in zip(userId_list, movieId_list, rating_list):
  if movieId not in top_movie_ids:
    continue

  # if a (user, movie) pair occurs more than once, the last row wins
  user_to_movie_to_rating[userId][movieId] = rating

In [42]:
# Build the USER context: one slot per context movie plus one slot per genre.
user_context_size = len(user_context_movies) + len(genres)

# Context movie <-> slot index. The reverse map inverts the forward map
# directly; wrapping `.items()` in enumerate() would instead produce
# ((movieId, index), position) entries keyed by tuples.
user_context_movieId_to_i = {movieId: index for index, movieId in enumerate(user_context_movies)}
user_context_i_to_movieId = {index: movieId for movieId, index in user_context_movieId_to_i.items()}

# Genre <-> slot index, inverted the same way.
# NOTE(review): genre indices start at 0, just like the movie indices; any
# offset needed to place both in one vector of user_context_size must be
# applied by the consumer - confirm against the code that builds the vector.
user_context_genre_to_i = {genre: index for index, genre in enumerate(genres)}
user_context_i_to_genre = {index: genre for genre, index in user_context_genre_to_i.items()}

# Generate Training Examples