In [1]:
import json
from collections import Counter
from keras.models import Model
from keras.layers import Embedding, Input, Reshape, Dot
from sklearn.linear_model import LinearRegression
import numpy as np
import random
from sklearn import svm

In [2]:
with open('data/wp_movies_10k.ndjson') as fin:
    movies = [json.loads(l) for l in fin]

In [5]:
movies

[['Deadpool (film)',
  {'image': 'Deadpool poster.jpg',
   'name': 'Deadpool',
   'cinematography': 'Ken Seng',
   'Software Used': 'Adobe Premier Pro',
   'alt': "Official poster shows the titular hero Deadpool standing in front of the viewers, with hugging his hands, and donning his traditional black and red suit and mask, and the film's name, credits and billing below him.",
   'distributor': '20th Century Fox',
   'caption': 'Theatrical release poster',
   'gross': '$783.1 million',
   'country': 'United States',
   'director': 'Tim Miller',
   'runtime': '108 minutes',
   'editing': 'Julian Clarke',
   'language': 'English',
   'music': 'Tom Holkenborg',
   'budget': '$58 million'},
  ['Tim Miller (director)',
   'Simon Kinberg',
   'Ryan Reynolds',
   'Lauren Shuler Donner',
   'Rhett Reese',
   'Paul Wernick',
   'Deadpool',
   'Fabian Nicieza',
   'Rob Liefeld',
   'Morena Baccarin',
   'Ed Skrein',
   'T.J. Miller',
   'Gina Carano',
   'Leslie Uggams',
   'Brianna Hildebrand'

In [3]:
link_counts = Counter()
for movie in movies:
    link_counts.update(movie[2])
link_counts.most_common(10)

[('Rotten Tomatoes', 9393),
 ('Category:English-language films', 5882),
 ('Category:American films', 5867),
 ('Variety (magazine)', 5450),
 ('Metacritic', 5112),
 ('Box Office Mojo', 4186),
 ('The New York Times', 3818),
 ('The Hollywood Reporter', 3553),
 ('Roger Ebert', 2707),
 ('Los Angeles Times', 2454)]

In [4]:
link_counts

Counter({'Rotten Tomatoes': 9393,
         'Category:English-language films': 5882,
         'Category:American films': 5867,
         'Variety (magazine)': 5450,
         'Metacritic': 5112,
         'Box Office Mojo': 4186,
         'The New York Times': 3818,
         'The Hollywood Reporter': 3553,
         'Roger Ebert': 2707,
         'Los Angeles Times': 2454,
         'Entertainment Weekly': 2375,
         'British Board of Film Classification': 2236,
         'Chicago Sun-Times': 1826,
         'Deadline.com': 1814,
         'The Guardian': 1528,
         'American Film Institute': 1516,
         'DVD': 1502,
         'Category:Indian films': 1452,
         'Warner Bros.': 1424,
         'New York City': 1293,
         'The Times of India': 1221,
         'drama film': 1168,
         '20th Century Fox': 1147,
         'Category:Directorial debut films': 1142,
         'Principal photography': 1119,
         'Paramount Pictures': 1094,
         'Category:British films': 1073,
 

In [5]:
top_links = [link for link, c in link_counts.items() if c >= 3]
link_to_idx = {link: idx for idx, link in enumerate(top_links)}
movie_to_idx = {movie[0]: idx for idx, movie in enumerate(movies)}
pairs = []
for movie in movies:
    pairs.extend((link_to_idx[link], movie_to_idx[movie[0]]) for link in movie[2] if link in link_to_idx)
pairs_set = set(pairs)
len(pairs), len(top_links), len(movie_to_idx)

(949544, 66913, 10000)

In [8]:
pairs

[(0, 0),
 (1, 0),
 (2, 0),
 (3, 0),
 (4, 0),
 (5, 0),
 (6, 0),
 (7, 0),
 (8, 0),
 (9, 0),
 (10, 0),
 (11, 0),
 (12, 0),
 (13, 0),
 (14, 0),
 (15, 0),
 (16, 0),
 (17, 0),
 (3, 0),
 (18, 0),
 (19, 0),
 (20, 0),
 (21, 0),
 (22, 0),
 (23, 0),
 (0, 0),
 (4, 0),
 (5, 0),
 (24, 0),
 (6, 0),
 (25, 0),
 (2, 0),
 (7, 0),
 (8, 0),
 (9, 0),
 (10, 0),
 (11, 0),
 (12, 0),
 (13, 0),
 (26, 0),
 (27, 0),
 (19, 0),
 (28, 0),
 (29, 0),
 (30, 0),
 (31, 0),
 (32, 0),
 (33, 0),
 (34, 0),
 (35, 0),
 (36, 0),
 (37, 0),
 (38, 0),
 (39, 0),
 (40, 0),
 (41, 0),
 (6, 0),
 (42, 0),
 (43, 0),
 (44, 0),
 (45, 0),
 (46, 0),
 (47, 0),
 (48, 0),
 (49, 0),
 (50, 0),
 (51, 0),
 (52, 0),
 (53, 0),
 (54, 0),
 (55, 0),
 (2, 0),
 (6, 0),
 (56, 0),
 (28, 0),
 (57, 0),
 (7, 0),
 (43, 0),
 (58, 0),
 (8, 0),
 (45, 0),
 (59, 0),
 (0, 0),
 (9, 0),
 (49, 0),
 (1, 0),
 (60, 0),
 (10, 0),
 (46, 0),
 (11, 0),
 (50, 0),
 (61, 0),
 (12, 0),
 (52, 0),
 (53, 0),
 (62, 0),
 (63, 0),
 (64, 0),
 (65, 0),
 (13, 0),
 (51, 0),
 (66, 0),
 (67, 0

In [9]:
len(pairs_set)

671403

In [10]:
def movie_embedding_model(embedding_size=50):
    link = Input(name='link', shape=(1,))
    movie = Input(name='movie', shape=(1,))
    link_embedding = Embedding(name='link_embedding', 
                               input_dim=len(top_links), 
                               output_dim=embedding_size)(link)
    movie_embedding = Embedding(name='movie_embedding', 
                                input_dim=len(movie_to_idx), 
                                output_dim=embedding_size)(movie)
    dot = Dot(name='dot_product', normalize=True, axes=2)([link_embedding, movie_embedding])
    merged = Reshape((1,))(dot)
    model = Model(inputs=[link, movie], outputs=[merged])
    model.compile(optimizer='nadam', loss='mse')
    return model

model = movie_embedding_model()
model.summary()

In [11]:
random.seed(5)

def batchifier(pairs, positive_samples=50, negative_ratio=10):
    batch_size = positive_samples * (1 + negative_ratio)
    batch = np.zeros((batch_size, 3))
    while True:
        for idx, (link_id, movie_id) in enumerate(random.sample(pairs, positive_samples)):
            batch[idx, :] = (link_id, movie_id, 1)
        idx = positive_samples
        while idx < batch_size:
            movie_id = random.randrange(len(movie_to_idx))
            link_id = random.randrange(len(top_links))
            if not (link_id, movie_id) in pairs_set:
                batch[idx, :] = (link_id, movie_id, -1)
                idx += 1
        np.random.shuffle(batch)
        yield {'link': batch[:, 0], 'movie': batch[:, 1]}, batch[:, 2]

next(batchifier(pairs, positive_samples=3, negative_ratio=2))

({'link': array([ 3801., 13365.,  1313., 48731., 22418., 32318., 31254., 32643.,
         20558.]),
  'movie': array([5874., 6238., 7236., 1854., 1529., 7685., 5530., 7628.,  849.])},
 array([-1., -1.,  1., -1.,  1., -1.,  1., -1., -1.]))

In [13]:
positive_samples_per_batch = 512

model.fit(
    batchifier(pairs, positive_samples=positive_samples_per_batch, negative_ratio=10),
    epochs=15,
    steps_per_epoch=len(pairs) // positive_samples_per_batch,
    verbose=2
)


Epoch 1/15
1854/1854 - 23s - 13ms/step - loss: 0.4614
Epoch 2/15
1854/1854 - 24s - 13ms/step - loss: 0.2288
Epoch 3/15
1854/1854 - 23s - 13ms/step - loss: 0.2213
Epoch 4/15
1854/1854 - 23s - 12ms/step - loss: 0.2188
Epoch 5/15
1854/1854 - 24s - 13ms/step - loss: 0.2171
Epoch 6/15
1854/1854 - 23s - 13ms/step - loss: 0.2161
Epoch 7/15
1854/1854 - 25s - 13ms/step - loss: 0.2154
Epoch 8/15
1854/1854 - 24s - 13ms/step - loss: 0.2148
Epoch 9/15
1854/1854 - 24s - 13ms/step - loss: 0.2144
Epoch 10/15
1854/1854 - 23s - 13ms/step - loss: 0.2140
Epoch 11/15
1854/1854 - 24s - 13ms/step - loss: 0.2135
Epoch 12/15
1854/1854 - 24s - 13ms/step - loss: 0.2136
Epoch 13/15
1854/1854 - 29s - 15ms/step - loss: 0.2132
Epoch 14/15
1854/1854 - 27s - 14ms/step - loss: 0.2130
Epoch 15/15
1854/1854 - 25s - 14ms/step - loss: 0.2129


<keras.src.callbacks.history.History at 0x31d3d3150>

In [23]:
movie = model.get_layer('movie_embedding')
movie_weights = movie.get_weights()[0]
movie_lengths = np.linalg.norm(movie_weights, axis=1)
normalized_movies = (movie_weights.T / movie_lengths).T

def similar_movies(movie):
    dists = np.dot(normalized_movies, normalized_movies[movie_to_idx[movie]])
    closest = np.argsort(dists)[-10:]
    for c in reversed(closest):
        print(c, movies[c][0], dists[c])

similar_movies('Interstellar (film)')

19 Interstellar (film) 0.9999999
29 Rogue One 0.977658
181 Pacific Rim (film) 0.9771952
372 The Amazing Spider-Man (2012 film) 0.9746711
1159 Cowboys & Aliens 0.97178423
37 Avatar (2009 film) 0.970487
3349 Star Wars: The Force Awakens 0.9672259
22 Jurassic World 0.96393156
154 Star Trek (film) 0.9621705
727 The Lone Ranger (2013 film) 0.95978314


In [24]:
similar_movies('Jurassic World')

22 Jurassic World 0.9999999
42 The Avengers (2012 film) 0.98259676
143 Iron Man 3 0.982248
39 Guardians of the Galaxy (film) 0.9798821
182 The Amazing Spider-Man 2 0.9795326
372 The Amazing Spider-Man (2012 film) 0.97858477
7 List of Marvel Cinematic Universe films 0.977698
34 Doctor Strange (film) 0.97717416
154 Star Trek (film) 0.9769297
1364 Captain America: Civil War 0.9669252


In [28]:
link = model.get_layer('link_embedding')
link_weights = link.get_weights()[0]
link_lengths = np.linalg.norm(link_weights, axis=1)
normalized_links = (link_weights.T / link_lengths).T

def similar_links(link):
    dists = np.dot(normalized_links, normalized_links[link_to_idx[link]])
    closest = np.argsort(dists)[-20:]
    for c in reversed(closest):
        print(c, top_links[c], dists[c])

similar_links('Playboy')

4759 Playboy 0.99999994
9779 Homage (arts) 0.94761646
3040 Simon & Schuster 0.9419767
4986 HarperCollins 0.9403036
9823 production designer 0.9366136
1000 sound stage 0.9340045
4315 backlot 0.9331386
1671 dailies 0.9323585
7731 A Clockwork Orange (film) 0.930023
2828 2001: A Space Odyssey (film) 0.9296802
2810 film treatment 0.92345124
1481 Writers Guild of America, West 0.91913766
1659 AMC (TV channel) 0.9157436
3176 Star Wars (film) 0.9154033
11429 Greenwood Publishing Group 0.9126805
57 fourth wall 0.9087169
976 Hugo Award for Best Dramatic Presentation 0.9085569
4360 Cinefantastique 0.90833324
24850 Danny Peary 0.9074262
2983 Hugo Award 0.90700555


In [29]:
best = ['Star Wars: The Force Awakens', 'The Martian (film)', 'Tangerine (film)', 'Straight Outta Compton (film)',
        'Brooklyn (film)', 'Carol (film)', 'Spotlight (film)']
worst = ['American Ultra', 'The Cobbler (2014 film)', 'Entourage (film)', 'Fantastic Four (2015 film)',
         'Get Hard', 'Hot Pursuit (2015 film)', 'Mortdecai (film)', 'Serena (2014 film)', 'Vacation (2015 film)']
y = np.asarray([1 for _ in best] + [0 for _ in worst])
X = np.asarray([normalized_movies[movie_to_idx[movie]] for movie in best + worst])
X.shape

(16, 50)

In [31]:
X

array([[-2.20538881e-02,  5.12575470e-02, -3.31239887e-02,
         1.91676512e-01,  7.38116130e-02,  3.43104475e-04,
        -1.00619182e-01, -1.51380077e-01, -2.77176261e-01,
        -8.83824658e-03, -1.94983631e-01, -1.27983853e-01,
         4.38903868e-02, -5.08565940e-02, -4.46352437e-02,
         2.93649375e-01,  8.26580301e-02,  4.60713096e-02,
         1.26945821e-03,  1.07219473e-01,  2.48396352e-01,
        -2.11541384e-01,  3.31399255e-02, -3.69578660e-01,
         1.27033532e-01, -9.20466036e-02, -1.82430312e-01,
        -2.97499169e-03,  1.18851088e-01, -1.36928678e-01,
         8.24035406e-02,  1.80509016e-01,  1.03530951e-01,
        -1.46639004e-01, -4.68623452e-03, -1.19649820e-01,
         5.70928119e-02,  7.03196973e-04, -1.56227723e-01,
         1.52963907e-01, -8.24916884e-02, -7.81723261e-02,
         1.21591397e-01,  5.37102576e-03, -2.01641694e-02,
         1.92222029e-01,  1.20587401e-01,  3.45896304e-01,
         2.77402885e-02, -6.41815364e-02],
       [-1.16

In [32]:
clf = svm.SVC(kernel='linear')
clf.fit(X, y) 

In [34]:
estimated_movie_ratings = clf.decision_function(normalized_movies)
best = np.argsort(estimated_movie_ratings)
print('best:')
for c in reversed(best[-5:]):
    print(c, movies[c][0], estimated_movie_ratings[c])

print('worst:')
for c in best[:5]:
    print(c, movies[c][0], estimated_movie_ratings[c])


best:
481 The Devil Wears Prada (film) 1.3668093392832792
66 Skyfall 1.3357702481774065
307 Les Misérables (2012 film) 1.149987369596318
458 Hugo (film) 1.1457964457652572
3 Spectre (2015 film) 1.0863986505143692
worst:
1782 Scooby-Doo! WrestleMania Mystery -1.5936027524413396
5097 Ready to Rumble -1.5532665401699348
1878 The Little Rascals (film) -1.5468364579573728
9595 Speed Zone -1.5391414330643278
8559 Air Buddies -1.519867513186747


In [37]:
rotten_y = np.asarray([float(movie[-2][:-1]) / 100 for movie in movies if movie[-2]])
rotten_X = np.asarray([normalized_movies[movie_to_idx[movie[0]]] for movie in movies if movie[-2]])

In [38]:
TRAINING_CUT_OFF = int(len(rotten_X) * 0.8)
regr = LinearRegression()
regr.fit(rotten_X[:TRAINING_CUT_OFF], rotten_y[:TRAINING_CUT_OFF])

In [39]:
error = (regr.predict(rotten_X[TRAINING_CUT_OFF:]) - rotten_y[TRAINING_CUT_OFF:])
'mean square error %2.2f' % np.mean(error ** 2)

'mean square error 0.06'

In [40]:
error = (np.mean(rotten_y[:TRAINING_CUT_OFF]) - rotten_y[TRAINING_CUT_OFF:])
'mean square error %2.2f' % np.mean(error ** 2)

'mean square error 0.09'

In [41]:
def gross(movie):
    v = movie[1].get('gross')
    if not v or not ' ' in v:
        return None
    v, unit = v.split(' ', 1)
    unit = unit.lower()
    if not unit in ('million', 'billion'):
        return None
    if not v.startswith('$'):
        return None
    try:
        v = float(v[1:])
    except ValueError:
        return None
    if unit == 'billion':
        v *= 1000
    return v

movie_gross = [gross(m) for m in movies]
movie_gross = np.asarray([gr for gr in movie_gross if gr is not None])
highest = np.argsort(movie_gross)[-10:]
for c in reversed(highest):
    print(c, movies[c][0], movie_gross[c])

6 The Martian (film) 10900.0
7 List of Marvel Cinematic Universe films 4300.0
49 Back to the Future 3900.0
71 The Conjuring 2932.0
162 Thor (film) 2464.0
36 Furious 7 2340.0
30 Finding Dory 2187.0
1906 Jane Eyre (2011 film) 2068.0
19 Interstellar (film) 1670.0
2251 An American Werewolf in London 1655.0


In [42]:
gross_y = np.asarray([gr for gr in movie_gross if gr])
gross_X = np.asarray([normalized_movies[movie_to_idx[movie[0]]] for movie, gr in zip(movies, movie_gross) if gr])

In [43]:
TRAINING_CUT_OFF = int(len(gross_X) * 0.8)
regr = LinearRegression()
regr.fit(gross_X[:TRAINING_CUT_OFF], gross_y[:TRAINING_CUT_OFF])

In [44]:
error = (regr.predict(gross_X[TRAINING_CUT_OFF:]) - gross_y[TRAINING_CUT_OFF:])
'mean square error %2.2f' % np.mean(error ** 2)

'mean square error 8883.07'

In [45]:
error = (np.mean(gross_y[:TRAINING_CUT_OFF]) - gross_y[TRAINING_CUT_OFF:])
'mean square error %2.2f' % np.mean(error ** 2)

'mean square error 14115.59'