In [1]:
# Import libraries
%matplotlib inline
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

In [2]:
ratings = pd.read_csv('../data/ml-100k/ratings.csv')

ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
ratings['userEmbId'] = ratings['userId'] - 1
ratings['movieEmbId'] = ratings['movieId'] - 1
# Set max_userid to the maximum user_id in the ratings
max_userid = ratings['userId'].drop_duplicates().max()
# Set max_movieid to the maximum movie_id in the ratings
max_movieid = ratings['movieId'].drop_duplicates().max()

ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,userEmbId,movieEmbId
0,1,1,4.0,964982703,0,0
1,1,3,4.0,964981247,0,2
2,1,6,4.0,964982224,0,5
3,1,47,5.0,964983815,0,46
4,1,50,5.0,964982931,0,49


In [44]:
movies = pd.read_csv('../data/ml-100k/movies.csv')

movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [55]:
links = pd.read_csv('../data/ml-100k/links.csv')

links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [4]:
shuffled_ratings = ratings.sample(frac=1., random_state=42)

# Shuffling users
Users = shuffled_ratings['userEmbId'].values
print('Users:', Users, ', shape =', Users.shape)

# Shuffling movies
Movies = shuffled_ratings['movieEmbId'].values
print('Movies:', Movies, ', shape =', Movies.shape)

# Shuffling ratings
Ratings = shuffled_ratings['rating'].values
print('Ratings:', Ratings, ', shape =', Ratings.shape)

Users: [431 287 598 ... 479   5 102] , shape = (100836,)
Movies: [77865   473  4350 ...  6866   980  6710] , shape = (100836,)
Ratings: [4.5 3.  3.  ... 4.  3.  5. ] , shape = (100836,)


In [5]:
import keras
from keras.layers import Input, Embedding, dot, Dense, Reshape
from keras.models import Model, Sequential

Using TensorFlow backend.


In [6]:
user_input = Input(shape=(1,), name='user_input')
x = Embedding(input_dim=max_userid, output_dim=100, input_length=1)(user_input)
user_output = Reshape((100,))(x)

In [7]:
movie_input = Input(shape=(1,), name='movie_input')
y = Embedding(input_dim=max_movieid, output_dim=100, input_length=1)(movie_input)
movie_output = Reshape((100,))(y)

In [8]:
z = dot([Dense(10)(user_output), Dense(10)(movie_output)], axes=1)

In [9]:
model = Model(inputs=[user_input, movie_input], outputs=z)

In [10]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_input (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
movie_input (InputLayer)        (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 100)       61000       user_input[0][0]                 
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 1, 100)       19360900    movie_input[0][0]                
__________________________________________________________________________________________________
reshape_1 

In [11]:
model.compile(loss='mse', optimizer='adamax')

In [12]:
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint

In [13]:
callbacks = [EarlyStopping('val_loss', patience=2),
            ModelCheckpoint('movie_recommender_100k_trained.h5', save_best_only=True)]

history=model.fit([Users, Movies], Ratings, epochs=30, validation_split=.1, verbose=2, callbacks=callbacks)

Train on 90752 samples, validate on 10084 samples
Epoch 1/30
 - 622s - loss: 1.6154 - val_loss: 0.8154
Epoch 2/30
 - 562s - loss: 0.7493 - val_loss: 0.7970
Epoch 3/30
 - 507s - loss: 0.7186 - val_loss: 0.7793
Epoch 4/30
 - 584s - loss: 0.6863 - val_loss: 0.7623
Epoch 5/30
 - 557s - loss: 0.6623 - val_loss: 0.7518
Epoch 6/30
 - 503s - loss: 0.6462 - val_loss: 0.7483
Epoch 7/30
 - 504s - loss: 0.6301 - val_loss: 0.7612
Epoch 8/30
 - 488s - loss: 0.6167 - val_loss: 0.7567


In [14]:
# Show the best validation RMSE
min_val_loss, idx = min((val, idx) for (idx, val) in enumerate(history.history['val_loss']))
print('Minimum RMSE at epoch', '{:d}'.format(idx+1), '=', '{:.4f}'.format(math.sqrt(min_val_loss)))

Minimum RMSE at epoch 6 = 0.8650


In [15]:
from keras.models import load_model

In [16]:
trained_model = load_model("movie_recommender_100k_trained.h5")

In [39]:
def predict_rating(user_id, movie_id):
    return trained_model.predict([np.array([user_id - 1]), np.array([movie_id - 1])])[0][0]

In [40]:
userId = 150

In [41]:
user_ratings = ratings[ratings['userId'] == userId][['userId', 'movieId', 'rating']]
# user_ratings['prediction'] = user_ratings.apply(lambda x: predict_rating(userId, x['movieId']), axis=1)
user_ratings.sort_values(by='rating', 
                         ascending=False).merge(movies,
                                                on='movieId',
                                                how='inner',
                                                suffixes=['_u', '_m'])

In [42]:
user_ratings['prediction'] = user_ratings.apply(lambda x: predict_rating(userId, x['movieId']), axis=1)

In [46]:
user_ratings.sort_values(by='rating', 
                         ascending=False).merge(movies,
                                                on='movieId',
                                                how='inner',
                                                suffixes=['_u', '_m'])

Unnamed: 0,userId,movieId,rating,prediction,title,genres
0,150,1356,5.0,3.828905,Star Trek: First Contact (1996),Action|Adventure|Sci-Fi|Thriller
1,150,32,5.0,4.00231,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
2,150,141,5.0,3.424587,"Birdcage, The (1996)",Comedy
3,150,648,4.0,3.500291,Mission: Impossible (1996),Action|Adventure|Mystery|Thriller
4,150,6,4.0,3.946426,Heat (1995),Action|Crime|Thriller
5,150,25,4.0,3.689686,Leaving Las Vegas (1995),Drama|Romance
6,150,36,4.0,3.747801,Dead Man Walking (1995),Crime|Drama
7,150,52,4.0,3.601269,Mighty Aphrodite (1995),Comedy|Drama|Romance
8,150,805,4.0,3.667263,"Time to Kill, A (1996)",Drama|Thriller
9,150,780,4.0,3.604608,Independence Day (a.k.a. ID4) (1996),Action|Adventure|Sci-Fi|Thriller


In [63]:
recommendations = ratings[ratings['movieId'].isin(user_ratings['movieId']) == False][['movieId']].drop_duplicates()

In [64]:
recommendations['prediction'] = recommendations.apply(lambda x: predict_rating(userId, x['movieId']), axis=1)

In [65]:
recommendations = recommendations.sort_values(by='prediction',
                                             ascending=False).merge(movies,
                                                                   on='movieId',
                                                                   how='inner',
                                                                   suffixes=['_u','_m'])
recommendations = pd.merge(recommendations, links, on='movieId')

In [67]:
recommendations['imdbId'] = 'tt0' + recommendations['imdbId'].astype(str)

In [68]:
recommendations

Unnamed: 0,movieId,prediction,title,genres,imdbId,tmdbId
0,3451,4.474097,Guess Who's Coming to Dinner (1967),Drama,tt061735,1879.0
1,5747,4.441415,Gallipoli (1981),Drama|War,tt082432,11646.0
2,6460,4.370301,"Trial, The (Procès, Le) (1962)",Drama,tt057427,3009.0
3,177593,4.365219,"Three Billboards Outside Ebbing, Missouri (2017)",Crime|Drama,tt05027774,359940.0
4,904,4.361011,Rear Window (1954),Mystery|Thriller,tt047396,567.0
5,2360,4.356462,"Celebration, The (Festen) (1998)",Drama,tt0154420,309.0
6,2959,4.355269,Fight Club (1999),Action|Crime|Drama|Thriller,tt0137523,550.0
7,8132,4.345494,Gladiator (1992),Action|Drama,tt0104346,16219.0
8,170705,4.310242,Band of Brothers (2001),Action|Drama|War,tt0185906,331214.0
9,1178,4.301063,Paths of Glory (1957),Drama|War,tt050825,975.0


In [69]:
top5 = recommendations[0:5]

In [75]:
movie_list=[]
for i, row in top5.iterrows():
    movie_data = {
        'movie_id': row[0],
        'movie_title': row[2]
    }
    movie_list.append(movie_data)

movie_list

[{'movie_id': 3451, 'movie_title': "Guess Who's Coming to Dinner (1967)"},
 {'movie_id': 5747, 'movie_title': 'Gallipoli (1981)'},
 {'movie_id': 6460, 'movie_title': 'Trial, The (Procès, Le) (1962)'},
 {'movie_id': 177593,
  'movie_title': 'Three Billboards Outside Ebbing, Missouri (2017)'},
 {'movie_id': 904, 'movie_title': 'Rear Window (1954)'}]

In [84]:
for movie in movie_list:
    title = movie['movie_title'][:-7]
    if ', The' in title:
        split_title = title.split(', ')
        title = split_title[1] + ' ' + split_title[0]
    year = movie['movie_title'][-6:]
    year = year.replace('(', '').replace(')', '')

1967
1981
1962
2017
1954
