In [1]:
from collections import Counter
import json
import random

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn import svm
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Input, Reshape
from tensorflow.keras.layers import Dot

In [2]:
# Each non-blank line of the NDJSON file is one JSON-encoded movie record.
# Decode explicitly as UTF-8 (movie titles contain non-ASCII characters) instead
# of relying on the platform's default encoding, and skip blank lines so that a
# stray empty line does not make json.loads raise.
with open('data/wp_movies_10k.ndjson', encoding='utf-8') as fin:
    movies = [json.loads(line) for line in fin if line.strip()]

In [3]:
# Tally how often each outgoing link (element 2 of a movie record) occurs
# across the whole corpus, then show the three most frequent ones.
link_counts = Counter()
for record in movies:
    outgoing_links = record[2]
    link_counts.update(outgoing_links)
link_counts.most_common(3)

[('Rotten Tomatoes', 9393),
 ('Category:English-language films', 5882),
 ('Category:American films', 5867)]

In [4]:
# Keep only the links that occur at least 3 times.
top_links = [link for link, count in link_counts.items() if count >= 3]

# Reverse lookups: link -> position in top_links, movie[0] -> position in movies.
link_to_idx = {link: position for position, link in enumerate(top_links)}
movie_to_idx = {movie[0]: position for position, movie in enumerate(movies)}

# One (link index, movie index) tuple for every retained link of every movie.
pairs = [
    (link_to_idx[link], movie_to_idx[movie[0]])
    for movie in movies
    for link in movie[2]
    if link in link_to_idx
]
# Set copy of the pairs for fast membership tests.
pairs_set = set(pairs)

In [5]:
# Sanity check: number of (link, movie) pairs, retained links, and indexed movies.
len(pairs), len(top_links), len(movie_to_idx)

(949544, 66913, 10000)

In [6]:
def movie_embedding_model(embedding_size=50):
    """Build and compile the link/movie embedding model.

    The model takes a link index and a movie index, looks each up in its own
    embedding table, and outputs the normalized dot product of the two
    embedding vectors as a single value per sample.

    Args:
        embedding_size: Dimensionality of both embedding tables.

    Returns:
        A compiled Keras ``Model`` with inputs named 'link' and 'movie',
        using the 'nadam' optimizer and mean squared error loss.
    """
    link_input = Input(shape=(1,), name='link')
    movie_input = Input(shape=(1,), name='movie')

    # One embedding row per retained link and per indexed movie.
    embed_links = Embedding(input_dim=len(top_links),
                            output_dim=embedding_size,
                            name='link_embedding')
    embed_movies = Embedding(input_dim=len(movie_to_idx),
                             output_dim=embedding_size,
                             name='movie_embedding')

    # normalize=True L2-normalizes both vectors before the dot product.
    similarity = Dot(axes=2, normalize=True, name='dot_product')(
        [embed_links(link_input), embed_movies(movie_input)])
    # Collapse the dot-product output to one value per sample.
    output = Reshape((1,))(similarity)

    net = Model(inputs=[link_input, movie_input], outputs=[output])
    net.compile(optimizer='nadam', loss='mse')
    return net

# Build the model with the default embedding size and print its layer summary.
model = movie_embedding_model()
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
link (InputLayer)               [(None, 1)]          0                                            
__________________________________________________________________________________________________
movie (InputLayer)              [(None, 1)]          0                                            
__________________________________________________________________________________________________
link_embedding (Embedding)      (None, 1, 50)        3345650     link[0][0]                       
__________________________________________________________________________________________________
movie_embedding (Embedding)     (None, 1, 50)        500000      movie[0][0]                      
______________________________________________________________________________________________

In [7]:
# Seed both RNGs used by batchifier: `random` drives the sampling of positive
# and negative pairs, `np.random` drives the final shuffle. Seeding only one of
# them leaves the batch order non-reproducible.
random.seed(5)
np.random.seed(5)

def batchifier(pairs, positive_samples=50, negative_ratio=10):
    """Endlessly generate training batches of positive and negative pairs.

    Each batch holds ``positive_samples`` pairs drawn from ``pairs`` (label 1)
    and ``positive_samples * negative_ratio`` randomly drawn (link, movie)
    index pairs that are not in the module-level ``pairs_set`` (label -1),
    shuffled together.

    Args:
        pairs: Sequence of (link index, movie index) tuples to sample
            positives from.
        positive_samples: Number of positive pairs per batch.
        negative_ratio: Number of negative pairs per positive pair.

    Yields:
        ``({'link': link_ids, 'movie': movie_ids}, labels)`` where each value
        is a 1-D float array of length ``positive_samples * (1 + negative_ratio)``.

    Raises:
        ValueError: From ``random.sample`` if ``positive_samples`` exceeds
            ``len(pairs)``.

    Note:
        Negative sampling is by rejection, so it does not terminate if every
        possible (link, movie) combination is in ``pairs_set``.
    """
    batch_size = positive_samples * (1 + negative_ratio)

    while True:
        # Allocate a fresh buffer for every batch. The yielded arrays are views
        # of this buffer, so reusing a single buffer would overwrite batches
        # that a consumer is still holding (e.g. in a prefetch queue).
        batch = np.zeros((batch_size, 3))

        for idx, (link_id, movie_id) in enumerate(random.sample(pairs, positive_samples)):
            batch[idx, :] = (link_id, movie_id, 1)
        idx = positive_samples

        # Fill the remainder with random pairs that are not known positives.
        while idx < batch_size:
            movie_id = random.randrange(len(movie_to_idx))
            link_id = random.randrange(len(top_links))
            if (link_id, movie_id) not in pairs_set:
                batch[idx, :] = (link_id, movie_id, -1)
                idx += 1

        # Shuffle rows so positives and negatives are interleaved.
        np.random.shuffle(batch)
        yield {'link': batch[:, 0], 'movie': batch[:, 1]}, batch[:, 2]

In [8]:
# Smoke-test the generator: draw one small batch (3 positives, 2 negatives per positive).
next(batchifier(pairs, positive_samples=3, negative_ratio=2))

({'link': array([31254.,  3801., 32643., 32318., 48731., 20558., 22418.,  1313.,
         13365.]),
  'movie': array([5530., 5874., 7628., 7685., 1854.,  849., 1529., 7236., 6238.])},
 array([ 1., -1., -1., -1., -1., -1.,  1.,  1., -1.]))

In [9]:
positive_samples_per_batch = 512

# Train from the endless batch generator. Model.fit accepts Python generators
# directly; Model.fit_generator is deprecated in TensorFlow 2.x in favour of it.
# Because the generator never ends, steps_per_epoch defines an epoch: the number
# of batches whose positive samples add up to roughly len(pairs).
model.fit(
    batchifier(pairs, positive_samples=positive_samples_per_batch, negative_ratio=10),
    epochs=15,
    steps_per_epoch=len(pairs) // positive_samples_per_batch,
    verbose=2
)

Epoch 1/15
1854/1854 - 48190s - loss: 0.5236
Epoch 2/15
1854/1854 - 87s - loss: 0.2376
Epoch 3/15
1854/1854 - 87s - loss: 0.2277
Epoch 4/15
1854/1854 - 88s - loss: 0.2243
Epoch 5/15
1854/1854 - 88s - loss: 0.2225
Epoch 6/15
1854/1854 - 87s - loss: 0.2204
Epoch 7/15
1854/1854 - 88s - loss: 0.2207
Epoch 8/15
1854/1854 - 88s - loss: 0.2210
Epoch 9/15
1854/1854 - 88s - loss: 0.2204
Epoch 10/15
1854/1854 - 88s - loss: 0.2203
Epoch 11/15
1854/1854 - 88s - loss: 0.2211
Epoch 12/15
1854/1854 - 88s - loss: 0.2209
Epoch 13/15
1854/1854 - 88s - loss: 0.2202
Epoch 14/15
1854/1854 - 88s - loss: 0.2211
Epoch 15/15
1854/1854 - 88s - loss: 0.2212


<tensorflow.python.keras.callbacks.History at 0x7f10f36442b0>

In [10]:
# Extract the trained movie embedding matrix (first weight array of the
# 'movie_embedding' layer) and scale every row to unit L2 length.
movie_embedding = model.get_layer('movie_embedding')
movie_weights = movie_embedding.get_weights()[0]
movie_lengths = np.linalg.norm(movie_weights, axis=1)
# Broadcast the per-row norms across the columns of each row.
normalized_movie_embedding = movie_weights / movie_lengths[:, np.newaxis]

def similar_movies(movie, top_n=10):
    """Print the movies whose embeddings are most similar to ``movie``.

    Similarity is the dot product between rows of the module-level
    ``normalized_movie_embedding``; higher means more similar. The query movie
    itself is part of the ranking.

    Args:
        movie: Key of the query movie in ``movie_to_idx``.
        top_n: Number of results to print, most similar first. Values of zero
            or less print nothing.

    Raises:
        KeyError: If ``movie`` is not a key of ``movie_to_idx``.
    """
    similarities = np.dot(normalized_movie_embedding, normalized_movie_embedding[movie_to_idx[movie]])
    # argsort is ascending, so reverse it before taking the first top_n entries.
    # Clamp at zero because a negative slice bound would count from the end.
    closest = np.argsort(similarities)[::-1][:max(top_n, 0)]
    for c in closest:
        print(c, movies[c][0], similarities[c])

# Example query: print the movies whose embeddings are closest to 'Rogue One'.
similar_movies('Rogue One')

29 Rogue One 0.99999994
19 Interstellar (film) 0.97400594
245 Gravity (film) 0.9713407
25 Star Wars sequel trilogy 0.96186566
3349 Star Wars: The Force Awakens 0.96114933
101 Prometheus (2012 film) 0.95846266
659 Rise of the Planet of the Apes 0.9571362
37 Avatar (2009 film) 0.95703185
62 Fantastic Beasts and Where to Find Them (film) 0.9515644
181 Pacific Rim (film) 0.9513081


In [11]:
# Hand-picked movies used as positive (best) and negative (worst) examples.
best = [
    'Star Wars: The Force Awakens',
    'The Martian (film)',
    'Tangerine (film)',
    'Straight Outta Compton (film)',
    'Brooklyn (film)',
    'Carol (film)',
    'Spotlight (film)',
]
worst = [
    'American Ultra',
    'The Cobbler (2014 film)',
    'Entourage (film)',
    'Fantastic Four (2015 film)',
    'Get Hard',
    'Hot Pursuit (2015 film)',
    'Mortdecai (film)',
    'Serena (2014 film)',
    'Vacation (2015 film)',
]

# Labels: 1 for every "best" movie followed by 0 for every "worst" movie.
y = np.asarray([1] * len(best) + [0] * len(worst))
# Features: the normalized embedding row of each labelled movie, in the same order.
x = np.asarray([normalized_movie_embedding[movie_to_idx[title]] for title in best + worst])

In [12]:
# Fit a linear-kernel SVM on the embeddings (x) and labels (y) of the hand-labelled movies.
clf = svm.SVC(kernel='linear')
clf.fit(x, y) 

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [13]:
# Score every movie with the SVM decision function; higher values fall on the
# side of the label-1 examples the classifier was fitted on.
estimated_movie_ratings = clf.decision_function(normalized_movie_embedding)
# Movie indices in ascending score order. Stored under its own name so that the
# `best` list of titles is not overwritten, which would break re-running the
# cell that builds the training examples from `best + worst`.
rating_order = np.argsort(estimated_movie_ratings)

print('best:')
for c in reversed(rating_order[-5:]):
    print(c, movies[c][0], estimated_movie_ratings[c])

print('worst:')
for c in rating_order[:5]:
    print(c, movies[c][0], estimated_movie_ratings[c])

best:
481 The Devil Wears Prada (film) 1.3734077878642066
66 Skyfall 1.3071895358113084
458 Hugo (film) 1.1964433597583484
307 Les Misérables (2012 film) 1.1901157678426555
939 Changeling (film) 1.120875228619155
worst:
1878 The Little Rascals (film) -1.636526204767615
5097 Ready to Rumble -1.636464345542086
9595 Speed Zone -1.6181485998840797
6388 Bring It On Again -1.6147815272647175
5092 Extreme Movie -1.587607458439102


In [14]:
# Use only the movies whose second-to-last field is non-empty. That field is
# parsed as a number after dropping its final character (presumably a '%' sign
# on a Rotten Tomatoes score; verify against the data) and scaled by 1/100.
rated_movies = [movie for movie in movies if movie[-2]]
rotten_y = np.asarray([float(movie[-2][:-1]) / 100 for movie in rated_movies])
# Matching feature rows: the normalized embedding of each rated movie.
rotten_x = np.asarray([normalized_movie_embedding[movie_to_idx[movie[0]]] for movie in rated_movies])

In [15]:
# The first 80% of the rated movies (in file order) are used for training;
# the remainder is held out for evaluation.
TRAINING_CUT_OFF = int(0.8 * len(rotten_x))
train_x, train_y = rotten_x[:TRAINING_CUT_OFF], rotten_y[:TRAINING_CUT_OFF]
regr = LinearRegression()
regr.fit(train_x, train_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [16]:
# Mean squared error of the regression on the held-out rows.
predicted = regr.predict(rotten_x[TRAINING_CUT_OFF:])
error = predicted - rotten_y[TRAINING_CUT_OFF:]
'mean squared error %2.2f' % (error ** 2).mean()

'mean squared error 0.06'

In [17]:
# Baseline for comparison: always predict the mean of the training targets.
baseline = np.mean(rotten_y[:TRAINING_CUT_OFF])
error = baseline - rotten_y[TRAINING_CUT_OFF:]
'mean squared error %2.2f' % (error ** 2).mean()

'mean squared error 0.09'