# Visualizing Embeddings for Movie Recommendation in TensorBoard

(This notebook is based on the one written by [nahidalam](https://github.com/nahidalam/blog/blob/master/Recommender%20Systems%20from%20Learned%20Embedding.ipynb))

## Library Imports

In [1]:
import os
import time

import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import backend as K

from sklearn.model_selection import train_test_split, KFold
from sklearn.neighbors import KNeighborsClassifier


from urllib.request import urlretrieve
import zipfile
import collections

## Loading the Datasets

In [2]:
## Download Datasets

print("Downloading movielens data...")
urlretrieve("http://files.grouplens.org/datasets/movielens/ml-100k.zip", "movielens.zip")
zip_ref = zipfile.ZipFile('movielens.zip', "r")
zip_ref.extractall()
print("Download complete.")


## Load Datasets into Data Frames

# Users
users_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=users_cols, encoding='latin-1')

# Movies
genre_cols = [
    "genre_unknown", "Action", "Adventure", "Animation", "Children", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"
]
movies_cols = ['movie_id', 'title', 'release_date', "video_release_date", "imdb_url"] + genre_cols
movies = pd.read_csv('ml-100k/u.item', sep='|', names=movies_cols, encoding='latin-1')
   
# Ratings
ratings_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=ratings_cols, encoding='latin-1')

# Since the ids start at 1, we shift them to start at 0.
users["user_id"] = users["user_id"].apply(lambda x: int(x-1))
movies["movie_id"] = movies["movie_id"].apply(lambda x: int(x-1))
movies["year"] = movies['release_date'].apply(lambda x: str(x).split('-')[-1])
ratings["movie_id"] = ratings["movie_id"].apply(lambda x: int(x-1))
ratings["user_id"] = ratings["user_id"].apply(lambda x: int(x-1))
ratings["rating"] = ratings["rating"].apply(lambda x: float(x))

Downloading movielens data...
Download complete.


In [3]:
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,0,24,M,technician,85711
1,1,53,F,other,94043
2,2,23,M,writer,32067
3,3,24,M,technician,43537
4,4,33,F,other,15213


In [4]:
movies.head()

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url,genre_unknown,Action,Adventure,Animation,Children,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,year
0,0,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,1995
1,1,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,1,0,0,1995
2,2,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1995
3,3,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1995
4,4,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1995


In [5]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,195,241,3.0,881250949
1,185,301,3.0,891717742
2,21,376,1.0,878887116
3,243,50,2.0,880606923
4,165,345,1.0,886397596


In [6]:
# Merge datasets into one
data = ratings.merge(movies, on='movie_id').merge(users, on='user_id')

In [7]:
data.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,title,release_date,video_release_date,imdb_url,genre_unknown,Action,...,Romance,Sci-Fi,Thriller,War,Western,year,age,sex,occupation,zip_code
0,195,241,3.0,881250949,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,0,1997,49,M,writer,55105
1,195,256,2.0,881251577,Men in Black (1997),04-Jul-1997,,http://us.imdb.com/M/title-exact?Men+in+Black+...,0,1,...,0,1,0,0,0,1997,49,M,writer,55105
2,195,110,4.0,881251793,"Truth About Cats & Dogs, The (1996)",26-Apr-1996,,http://us.imdb.com/M/title-exact?Truth%20About...,0,0,...,1,0,0,0,0,1996,49,M,writer,55105
3,195,24,4.0,881251955,"Birdcage, The (1996)",08-Mar-1996,,"http://us.imdb.com/M/title-exact?Birdcage,%20T...",0,0,...,0,0,0,0,0,1996,49,M,writer,55105
4,195,381,4.0,881251843,"Adventures of Priscilla, Queen of the Desert, ...",01-Jan-1994,,http://us.imdb.com/M/title-exact?Adventures%20...,0,0,...,0,0,0,0,0,1994,49,M,writer,55105


In [8]:
# Train/Test Split
train, test = train_test_split(data, test_size=0.1)

In [9]:
train

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,title,release_date,video_release_date,imdb_url,genre_unknown,Action,...,Romance,Sci-Fi,Thriller,War,Western,year,age,sex,occupation,zip_code
85998,88,110,4.0,879441452,"Truth About Cats & Dogs, The (1996)",26-Apr-1996,,http://us.imdb.com/M/title-exact?Truth%20About...,0,0,...,1,0,0,0,0,1996,43,F,administrator,68106
89615,735,747,2.0,878708465,"Saint, The (1997)",14-Mar-1997,,http://us.imdb.com/M/title-exact?Saint%2C%20Th...,0,1,...,1,0,1,0,0,1997,48,F,writer,94618
60726,405,194,5.0,882480710,"Terminator, The (1984)",01-Jan-1984,,"http://us.imdb.com/M/title-exact?Terminator,%2...",0,1,...,0,1,1,0,0,1984,52,M,educator,93109
76618,599,3,4.0,888451908,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,...,0,0,0,0,0,1995,34,M,programmer,02320
68822,325,565,4.0,879877073,Clear and Present Danger (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?Clear%20and%2...,0,1,...,0,0,1,0,0,1994,41,M,administrator,15235
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28713,176,86,4.0,880130931,Searching for Bobby Fischer (1993),01-Jan-1993,,http://us.imdb.com/M/title-exact?Searching%20f...,0,0,...,0,0,0,0,0,1993,20,M,programmer,19104
38121,882,153,4.0,891754985,Monty Python's Life of Brian (1979),01-Jan-1979,,http://us.imdb.com/M/title-exact?Life%20of%20B...,0,0,...,0,0,0,0,0,1979,49,M,librarian,50266
43756,647,1059,2.0,882212373,"Adventures of Pinocchio, The (1996)",26-Jul-1996,,http://us.imdb.com/M/title-exact?Adventures%20...,0,0,...,0,0,0,0,0,1996,43,M,engineer,91351
37861,114,55,5.0,881171409,Pulp Fiction (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?Pulp%20Fictio...,0,0,...,0,0,0,0,0,1994,31,M,engineer,17110


## Starting up TensorBoard

In [10]:
# Loading TensorBoard Module
%load_ext tensorboard

In [11]:
%tensorboard --logdir %logs

Reusing TensorBoard on port 6006 (pid 7428), started 5:56:14 ago. (Use '!kill 7428' to kill it.)

## Generating the Embeddings

In [12]:
# Define Model Properties
EMBEDDING_SIZE = 10
NUM_USERS = data['user_id'].nunique()
NUM_MOVIES = data['movie_id'].nunique()
ROW_COUNT = train.shape[0]


# Movie Embedding Sub-Model
movie_input = keras.Input(shape=(1,), name='movie_id')

movie_emb = layers.Embedding(output_dim=EMBEDDING_SIZE, input_dim=NUM_MOVIES, input_length=ROW_COUNT, name='movie_emb')(movie_input)
movie_vec = layers.Flatten(name='FlattenMovie')(movie_emb)

movie_model = keras.Model(inputs=movie_input, outputs=movie_vec)


# User Embedding Sub-Model
user_input = keras.Input(shape=(1,), name='user_id')

user_emb = layers.Embedding(output_dim=EMBEDDING_SIZE, input_dim=NUM_USERS, input_length=ROW_COUNT, name='user_emb')(user_input)
user_vec = layers.Flatten(name='FlattenUser')(user_emb)

user_model = keras.Model(inputs=user_input, outputs=user_vec)


# Merged Model
merged = layers.Dot(name = 'dot_product', normalize = True, axes = 2)([movie_emb, user_emb])
merged_dropout = layers.Dropout(0.2)(merged)


dense_1 = layers.Dense(70,name='FullyConnected-1')(merged)
dropout_1 = layers.Dropout(0.2,name='Dropout_1')(dense_1)

dense_2 = layers.Dense(50,name='FullyConnected-2')(dropout_1)
dropout_2 = layers.Dropout(0.2,name='Dropout_2')(dense_2)

dense_3 = keras.layers.Dense(20,name='FullyConnected-3')(dropout_2)
dropout_3 = keras.layers.Dropout(0.2,name='Dropout_3')(dense_3)

dense_4 = keras.layers.Dense(10,name='FullyConnected-4', activation='relu')(dropout_3)

result = layers.Dense(1, name='result', activation="relu") (dense_4)

adam = keras.optimizers.Adam(lr=0.001)
model = keras.Model([movie_input, user_input], result)
model.compile(optimizer=adam,
              loss= 'mean_absolute_error',
              metrics=['accuracy', tf.keras.metrics.MeanSquaredError()])


## Training the Model

In [13]:
# Callback for TensorBoard Visualizations
tensorboard_callback = tf.keras.callbacks.TensorBoard(
                           log_dir='logs', histogram_freq=5, 
                           write_graph=True, write_images=True, 
                           update_freq='epoch', embeddings_freq=5
                       )

# Model Training
callbacks = [
              keras.callbacks.EarlyStopping('val_loss', patience=20),
              tensorboard_callback
            ]

start = time.time()
history = model.fit([train.movie_id, train.user_id],train.rating, batch_size=100,
                              epochs =50, validation_data = ([test.movie_id, test.user_id],test.rating),
                              verbose = 1, callbacks = callbacks)
end = time.time()


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50


## Tensorflow Summaries

In [14]:
# Create the SummaryWriter Object
summary_writer = tf.summary.create_file_writer('logs')

In [15]:
# Create the Text Summary Object
with summary_writer.as_default():
    tf.summary.text(name='Training Statistics', data='Time Taken: ' + str(end-start) + ' seconds', step=0)

 ## Recommending Movies

In [16]:
# Extract Movie Embeddings
MOVIE_EMBEDDING_LIST = []
MOVIE_EMBED_MAP = collections.defaultdict()

UNIQUE_MOVIE_IDS = data['movie_id'].unique()

for _id in UNIQUE_MOVIE_IDS:
    emb = movie_model.predict(np.array([_id]))
    val = list(emb.reshape(1,-1))[0]
    MOVIE_EMBEDDING_LIST.insert(_id, val)
    MOVIE_EMBED_MAP[_id] = val

In [17]:
# Train KNN Classifier
knn_train_label = UNIQUE_MOVIE_IDS

clf = KNeighborsClassifier(n_neighbors=11)
clf.fit(MOVIE_EMBEDDING_LIST, knn_train_label)

KNeighborsClassifier(n_neighbors=11)

In [18]:
def recommend_movies(embedding):
    distances, indices = clf.kneighbors(embedding.reshape(1, -1),  n_neighbors=10)
    indices = indices.reshape(10,1)
    df_indices = pd.DataFrame(indices, columns = ['movie_id'])
    return df_indices.merge(movies,on='movie_id',how='inner',suffixes=['_u', '_m'])['title']

In [19]:
# User Request
TEST_USER_ID = 200

In [20]:
# Get User Embedding
user_embedding = user_model.predict([TEST_USER_ID]).reshape(1,-1)[0]

In [21]:
# Recommend Movies using KNN Classifier
recommend_movies(user_embedding)

0                                 Anna Karenina (1997)
1                     Shawshank Redemption, The (1994)
2                                 Feast of July (1995)
3                                  White Squall (1996)
4                                 Shallow Grave (1994)
5    Wonderful, Horrible Life of Leni Riefenstahl, ...
6                                     Fair Game (1995)
7                                   Money Talks (1997)
8                                  Losing Chase (1996)
9                                    Sgt. Bilko (1996)
Name: title, dtype: object