<a href="https://colab.research.google.com/github/maryamq/MLToys/blob/main/Movie_Rec_cosine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Reference: https://github.com/DOsinga/deep_learning_cookbook/blob/master/04.2%20Build%20a%20recommender%20system%20based%20on%20outgoing%20Wikipedia%20links.ipynb
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import random
import json


In [None]:
# Read one JSON record per line. Every line is lower-cased before parsing, so
# all strings in the resulting records are lower-case. The encoding is pinned
# to UTF-8 so decoding does not depend on the platform's locale default.
with open('wp_movies_10k.ndjson', encoding='utf-8') as fin:
    movies = [json.loads(l.lower()) for l in fin]


We want to collect the most frequently occurring outgoing links (publications, review sites, categories, and other sources) across the movies' Wikipedia pages.

In [None]:
# Tally every entry of movie[2] across all movies.
link_counts = Counter(link for movie in movies for link in movie[2])
print("Most common: " , link_counts.most_common(10))

Most common:  [('rotten tomatoes', 9393), ('category:english-language films', 5883), ('category:american films', 5867), ('variety (magazine)', 5453), ('metacritic', 5117), ('box office mojo', 4191), ('the new york times', 3823), ('the hollywood reporter', 3557), ('roger ebert', 2707), ('los angeles times', 2454)]


In [None]:
# Keep only links seen at least 5 times, then give links and movies integer ids.
top_links = [name for name, count in link_counts.items() if count >= 5]
link_to_idx = dict(zip(top_links, range(len(top_links))))
movie_to_idx = {}
for position, record in enumerate(movies):
  movie_to_idx[record[0]] = position

# pairs:          [link_id, movie_id] for every retained link of every movie
# targets:        one label of 1 per entry in pairs
# positive_pairs: movie_id -> list of retained link ids for that movie
pairs = []
targets = []
positive_pairs = {}
for movie in movies:
  movie_idx = movie_to_idx[movie[0]]
  link_ids = [link_to_idx[link] for link in movie[2] if link in link_to_idx]
  for link_id in link_ids:
    pairs.append([link_id, movie_idx])
  targets.extend([1] * len(link_ids))
  positive_pairs[movie_idx] = link_ids

len(pairs), len(top_links), len(movie_to_idx)

(864754, 39673, 9999)

In [None]:
import random
def negative_samples(num_negative=10, positives=None, num_links=None):
  """Draw random (link, movie) index pairs that are not known positives.

  Pairs are drawn uniformly and rejected when the link is listed for the
  movie, so the loop does not terminate if every possible pair is positive.

  Args:
    num_negative: Number of negative pairs to return. Values <= 0 give [].
    positives: Mapping from movie index to the link indices recorded for that
      movie. Defaults to the notebook-level `positive_pairs`.
    num_links: Link indices are drawn from range(num_links). Defaults to
      `len(link_to_idx)`.

  Returns:
    A list of `[link_index, movie_index]` lists of length `num_negative`.

  Raises:
    ValueError: If there are no movies or no links to draw from.
  """
  if num_negative <= 0:
    return []
  if positives is None:
    positives = positive_pairs
  if num_links is None:
    num_links = len(link_to_idx)
  num_movies = len(positives)

  # Set lookups keep the rejection test O(1) even for movies with many links.
  positive_sets = {movie_id: set(links) for movie_id, links in positives.items()}

  samples = []
  while len(samples) < num_negative:
    movie_id = random.randrange(num_movies)
    link_id = random.randrange(num_links)
    # A movie id without an entry has no recorded positives, so any link is negative.
    if link_id not in positive_sets.get(movie_id, ()):
      samples.append([link_id, movie_id])
  return samples
  

In [None]:
# Spot-check the link indices recorded for one movie index.
positive_pairs[8483]

[8, 3120, 26, 660]

In [None]:
# Positive examples: every observed [link, movie] pair, labelled 1.
# `pairs` is appended movie by movie, so the shuffle buffer covers the whole
# dataset; a small buffer would only reorder examples within a narrow window
# of neighbouring movies.
positive_features = np.asarray(pairs, dtype=np.int32).reshape(-1, 2)
positive_labels = np.ones(len(pairs))
positive_dataset = tf.data.Dataset.from_tensor_slices(
    (positive_features, positive_labels)).shuffle(max(1, len(pairs)))

# Negative examples: three random non-linked pairs per positive, labelled 0.
# They are drawn independently at random, so a small shuffle buffer is enough.
negative_data = negative_samples(len(pairs) * 3)
negative_features = np.asarray(negative_data, dtype=np.int32).reshape(-1, 2)
negative_labels = np.zeros(len(negative_data))
negative_set = tf.data.Dataset.from_tensor_slices(
    (negative_features, negative_labels)).shuffle(500)

In [None]:
# Target mix of positive / negative examples in the training stream.
pos_ratio = 0.4
neg_ratio = 1 - pos_ratio
batch_size=256
# stop_on_empty_dataset=True ends an epoch as soon as either source runs out.
# With the default (False), sampling carries on from the remaining dataset
# alone, so the tail of every epoch would contain a single class and the
# ratio above would no longer hold.
train_ds = tf.data.Dataset.sample_from_datasets(
    [positive_dataset, negative_set],
    weights=[pos_ratio, neg_ratio],
    stop_on_empty_dataset=True).shuffle(500)
train_set = train_ds.batch(batch_size)
   





In [None]:
# Display the batched dataset to check its element spec.
train_set


<BatchDataset element_spec=(TensorSpec(shape=(None, 2), dtype=tf.int32, name=None), TensorSpec(shape=(None,), dtype=tf.float64, name=None))>

In [None]:
embed_size=32
# Each example is a [link index, movie index] pair; split it into two columns.
model_input = tf.keras.layers.Input(name='input', shape=(2,))
link = tf.gather(model_input, 0, axis=1)
movie = tf.gather(model_input, 1, axis=1)
link_embedding = tf.keras.layers.Embedding(name='link_embedding',
                                    input_dim=len(top_links),
                                    output_dim=embed_size)(link)
# Movie indices are positions in `movies`, so the table needs len(movies) rows.
# len(movie_to_idx) is smaller whenever two records share a title, which would
# leave the highest movie indices outside the embedding table.
movie_embedding = tf.keras.layers.Embedding(name='movie_embedding',
                                    input_dim=len(movies),
                                    output_dim=embed_size)(movie)
# normalize=True L2-normalizes both vectors, so the dot product is their cosine similarity.
dotted = tf.keras.layers.Dot(name="dot_product", normalize=True, axes=1)([link_embedding, movie_embedding])
# A single sigmoid unit rescales the similarity into a probability-like score.
merged =  tf.keras.layers.Dense(1, activation="sigmoid")(dotted)
model_dot = tf.keras.Model(inputs=model_input, outputs=merged)
model_dot.compile(optimizer='nadam', loss='binary_crossentropy', metrics=['accuracy'])
model_dot.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input (InputLayer)             [(None, 2)]          0           []                               
                                                                                                  
 tf.compat.v1.gather (TFOpLambd  (None,)             0           ['input[0][0]']                  
 a)                                                                                               
                                                                                                  
 tf.compat.v1.gather_1 (TFOpLam  (None,)             0           ['input[0][0]']                  
 bda)                                                                                             
                                                                                              

In [None]:
# Train on the batched tf.data pipeline (TensorFlow data API).
history = model_dot.fit(
    train_set,
    epochs=15,
    verbose=2,
)

Epoch 1/15
13512/13512 - 135s - loss: 0.3656 - accuracy: 0.8324 - 135s/epoch - 10ms/step
Epoch 2/15
13512/13512 - 128s - loss: 0.3437 - accuracy: 0.8472 - 128s/epoch - 9ms/step
Epoch 3/15
13512/13512 - 126s - loss: 0.2626 - accuracy: 0.8923 - 126s/epoch - 9ms/step
Epoch 4/15
13512/13512 - 126s - loss: 0.2190 - accuracy: 0.9123 - 126s/epoch - 9ms/step
Epoch 5/15
13512/13512 - 126s - loss: 0.1859 - accuracy: 0.9284 - 126s/epoch - 9ms/step
Epoch 6/15
13512/13512 - 127s - loss: 0.1598 - accuracy: 0.9407 - 127s/epoch - 9ms/step
Epoch 7/15
13512/13512 - 124s - loss: 0.1384 - accuracy: 0.9502 - 124s/epoch - 9ms/step
Epoch 8/15
13512/13512 - 124s - loss: 0.1213 - accuracy: 0.9576 - 124s/epoch - 9ms/step
Epoch 9/15
13512/13512 - 125s - loss: 0.1072 - accuracy: 0.9635 - 125s/epoch - 9ms/step
Epoch 10/15
13512/13512 - 124s - loss: 0.0954 - accuracy: 0.9684 - 124s/epoch - 9ms/step
Epoch 11/15
13512/13512 - 125s - loss: 0.0857 - accuracy: 0.9722 - 125s/epoch - 9ms/step
Epoch 12/15
13512/13512 - 124

In [None]:
# Persist the trained model to disk so it can be reloaded later.
model_dot.save('movie_rec_embed.h5')

### Look up similar movies

In [None]:


# Fetch the learned movie embedding matrix and scale each row to unit length,
# so a dot product between two rows equals their cosine similarity.
movie = model_dot.get_layer('movie_embedding')
movie_weights = movie.get_weights()[0]
movie_lengths = np.linalg.norm(movie_weights, axis=1)
normalized_movies = movie_weights / movie_lengths[:, np.newaxis]

def similar_movies(movie, top_k=10):
    """Print the movies whose embeddings are most similar to `movie`.

    Similarity is the dot product between rows of `normalized_movies`.

    Args:
        movie: Title to look up; must be a key of `movie_to_idx`.
        top_k: Number of rows to print, best match first. Values <= 0 print
            nothing.

    Raises:
        KeyError: If `movie` is not in `movie_to_idx`.
    """
    dists = np.dot(normalized_movies, normalized_movies[movie_to_idx[movie]])
    # argsort is ascending, so reverse it and keep the leading top_k entries.
    closest = np.argsort(dists)[::-1][:max(top_k, 0)]
    for c in closest:
        print(c, movies[c][0], dists[c])

similar_movies('the matrix')



83 the matrix 1.0
520 the matrix (franchise) 0.90385664
576 the matrix reloaded 0.87560666
779 the matrix revolutions 0.76862645
516 pearl harbor (film) 0.7670082
240 black hawk down (film) 0.7660147
61 man of steel (film) 0.73709345
32 blade runner 0.7211242
1918 the animatrix 0.707326
155 gladiator (2000 film) 0.7069038


### Look up similar links

In [None]:
# Fetch the learned link embedding matrix and scale each row to unit length,
# so a dot product between two rows equals their cosine similarity.
link = model_dot.get_layer('link_embedding')
link_weights = link.get_weights()[0]
link_lengths = np.linalg.norm(link_weights, axis=1)
normalized_links = link_weights / link_lengths[:, np.newaxis]

def similar_links(link, top_k=10):
    """Print the links whose embeddings are most similar to `link`.

    Similarity is the dot product between rows of `normalized_links`.

    Args:
        link: Link name to look up; must be a key of `link_to_idx`.
        top_k: Number of rows to print, best match first. Values <= 0 print
            nothing.

    Raises:
        KeyError: If `link` is not in `link_to_idx`.
    """
    dists = np.dot(normalized_links, normalized_links[link_to_idx[link]])
    # argsort is ascending, so reverse it and keep the leading top_k entries.
    closest = np.argsort(dists)[::-1][:max(top_k, 0)]
    for c in closest:
        print(c, top_links[c], dists[c])

similar_links('rotten tomatoes')

157 rotten tomatoes 1.0000001
192 category:english-language films 0.9766381
461 the new york times 0.9732829
190 category:american films 0.9732761
21 variety (magazine) 0.9604075
290 dvd 0.95995855
22 box office mojo 0.9530238
974 roger ebert 0.9508972
158 metacritic 0.9418756
1402 flixster 0.94064975


# Cluster embeddings to find similar movies

In [None]:
# Reload the saved model and rebuild the unit-length movie embedding matrix.
trained_model = tf.keras.models.load_model("movie_rec_embed.h5")
movie = trained_model.get_layer('movie_embedding')
movie_weights = movie.get_weights()[0]
movie_lengths = np.linalg.norm(movie_weights, axis=1)
normalized_movies = movie_weights / movie_lengths[:, np.newaxis]


In [None]:
from sklearn.mixture import GaussianMixture

# Fit one Gaussian mixture per candidate component count, keep each fitted
# model by its component count, and print its BIC on the same data.
gm_models = {}
for n_components in (10, 20, 50, 60, 80, 100, 120):
  gm = GaussianMixture(n_components=n_components).fit(normalized_movies)
  gm_models[n_components] = gm
  print("Component: ", n_components, " BIC:" , gm.bic(normalized_movies))

Component:  10  BIC: -280719.4458233196
Component:  20  BIC: -241773.5986047623
Component:  50  BIC: -115986.71353594115
Component:  60  BIC: -71764.98743919481
Component:  80  BIC: 17584.50585651485
Component:  100  BIC: 108122.54596649861
Component:  120  BIC: 197284.8715086562


In [None]:
# Show the weights_ of the 10-component mixture, rounded to 2 decimals.
np.round(gm_models[10].weights_, 2)

array([0.08, 0.12, 0.07, 0.08, 0.07, 0.17, 0.12, 0.16, 0.07, 0.05])

In [None]:
# Select the 10-component mixture as the working model `gm`.
gm = gm_models[10]

In [None]:
# Predict a mixture component label for every row of normalized_movies.
all_preds = gm.predict(normalized_movies)

In [None]:
def predict_gm(movie_name):
  """Return the titles of all movies assigned to the same cluster as `movie_name`.

  Titles are read from `movies` by embedding-row index, the same way
  `movie_to_idx` values are defined. Iterating over the keys of `movie_to_idx`
  instead would drift out of step with the rows whenever two records share a
  title, because the dict then has fewer keys than there are rows.

  Args:
    movie_name: Title to look up; must be a key of `movie_to_idx`.

  Returns:
    A list of titles whose entry in `all_preds` equals that of `movie_name`.

  Raises:
    KeyError: If `movie_name` is not in `movie_to_idx`.
  """
  pred = all_preds[movie_to_idx[movie_name]]
  matching_rows = np.flatnonzero(all_preds == pred)
  print("Cluster: ", pred, " num_elements=", len(matching_rows))
  return [movies[row][0] for row in matching_rows]


predict_gm('rogue one')
  

Cluster:  5  num_elements= 299


['deadpool (film)',
 'suicide squad (film)',
 'spectre (2015 film)',
 'warcraft (film)',
 'the martian (film)',
 'list of marvel cinematic universe films',
 'x-men (film series)',
 'the jungle book (2016 film)',
 '10 cloverfield lane',
 'dc universe animated original movies',
 'star trek beyond',
 'interstellar (film)',
 'ant-man (film)',
 'everest (2015 film)',
 'jurassic world',
 'gods of egypt (film)',
 'star wars sequel trilogy',
 'the conjuring 2',
 'rogue one',
 'finding dory',
 'harry potter (film series)',
 'doctor strange (film)',
 'furious 7',
 'avatar (2009 film)',
 'guardians of the galaxy (film)',
 'the avengers (2012 film)',
 'crimson peak',
 'fantastic four (2015 film)',
 'london has fallen',
 'divergent (film)',
 'bourne (film series)',
 'pirates of the caribbean (film series)',
 'the man from u.n.c.l.e. (film)',
 'man of steel (film)',
 'fantastic beasts and where to find them (film)',
 'the fast and the furious',
 'krampus (film)',
 'skyfall',
 'dc extended universe',

### I am feeling lucky

In [None]:
def feeling_lucky():
  """Sample a point from `gm` and print the cluster of the best-matching movie.

  The best match is the movie whose row in `normalized_movies` has the largest
  dot product with the sampled point.
  """
  points, _ = gm.sample()
  scores = normalized_movies @ points[0]
  best = np.argsort(scores)[-1]
  title = movies[best][0]
  print("Closest movie: ", title)
  print(predict_gm(title))

feeling_lucky()


Closest movie:  khuda kay liye
Cluster:  7  num_elements= 398
['airlift (film)', 'fan (film)', 'tamasha (film)', 'hate story 3', 'sanam teri kasam (2016 film)', 'sanam re', 'housefull 3', 'neerja', 'great grand masti', 'azhar (film)', 'welcome back (film)', 'kyaa kool hain hum 3', 'singh is bliing', 'fitoor', 'rocky handsome', 'baaghi (2016 film)', 'wazir (film)', 'talvar (film)', 'shaandaar', 'pk (film)', 'dishoom', 'slumdog millionaire', 'mastizaade', 'pyaar ka punchnama 2', '2.0 (film)', 'phantom (2015 film)', 'drishyam (2015 film)', 'brothers (2015 film)', 'jazbaa', 'hero (2015 hindi film)', 'rustom (film)', 'kis kisko pyaar karoon', 'ki & ka', 'do lafzon ki kahani (film)', 'one night stand (2016 film)', 'calendar girls (2015 film)', 'te3n', 'katti batti', 'raman raghav 2.0', 'a flying jatt', 'sarbjit (film)', 'hamari adhuri kahani', 'piku', 'badlapur (film)', 'happy new year (2014 film)', 'main aur charles', 'dum laga ke haisha', 'aashiqui 2', 'aligarh (film)', 'yeh jawaani hai de