In [1]:
import sys
import subprocess
import pkg_resources

# Distributions this notebook needs. Names are compared against the `key`
# of each entry in pkg_resources.working_set, so they must be distribution
# names rather than import names: `sklearn` (imported further down for
# train_test_split) is listed as 'scikit-learn'.
required = {'numpy', 'pandas', 'tensorflow', 'keras', 'scikit-learn'}
installed = {pkg.key for pkg in pkg_resources.working_set}
missing = required - installed

if missing:
    # Use the interpreter that runs this kernel so packages land in the
    # same environment; sorting keeps the pip command line deterministic.
    python = sys.executable
    subprocess.check_call([python, '-m', 'pip', 'install', *sorted(missing)], stdout=subprocess.DEVNULL)

In [2]:
import subprocess
import urllib.request
import zipfile
from pathlib import Path

import numpy as np
import pandas as pd
import tensorflow as tf

## Import dataset

hetrec2011-movielens-2k

https://files.grouplens.org/datasets/hetrec2011/hetrec2011-movielens-readme.txt

This dataset is an extension of the MovieLens10M dataset, published by the GroupLens
    research group.
    http://www.grouplens.org 

It draws on the following sources:

- the original Movielens dataset from GroupLens research group, http://www.grouplens.org
- IMDb website, http://www.imdb.com
- Rotten Tomatoes website, http://www.rottentomatoes.com

### - download data

In [3]:
DATASET_URL = 'https://files.grouplens.org/datasets/hetrec2011/hetrec2011-movielens-2k-v2.zip'
DATASET_PATH = './hetrec2011-movielens-2k-v2.zip'

# download and extract the data
subprocess.run(['wget', DATASET_URL])
subprocess.run(['unzip', f'{DATASET_PATH}'])
subprocess.run(['rm', 'hetrec2011-movielens-2k-v2.zip*'])
!ls

hetrec2011-movielens-2k-v2.zip	  movie_genres.dat
hetrec2011-movielens-2k-v2.zip.1  movie_locations.dat
hetrec2011-movielens-2k-v2.zip.2  movies.dat
hetrec2011-movielens-2k-v2.zip.3  movie_tags.dat
hetrec2011-movielens-2k-v2.zip.4  readme.txt
hetrec2011-movielens-2k-v2.zip.5  sample_data
hetrec2011-movielens-2k-v2.zip.6  tags.dat
hetrec2011-movielens-2k-v2.zip.7  user_ratedmovies.dat
movie_actors.dat		  user_ratedmovies-timestamps.dat
movie_countries.dat		  user_taggedmovies.dat
movie_directors.dat		  user_taggedmovies-timestamps.dat


### - open data

In [4]:
# Load the tab-separated ratings file into a DataFrame and display it.
ratings_file = 'user_ratedmovies.dat'
dataset = pd.read_csv(
    ratings_file,
    sep='\t',
)
dataset

Unnamed: 0,userID,movieID,rating,date_day,date_month,date_year,date_hour,date_minute,date_second
0,75,3,1.0,29,10,2006,23,17,16
1,75,32,4.5,29,10,2006,23,23,44
2,75,110,4.0,29,10,2006,23,30,8
3,75,160,2.0,29,10,2006,23,16,52
4,75,163,4.0,29,10,2006,23,29,30
...,...,...,...,...,...,...,...,...,...
855593,71534,44555,4.0,3,12,2007,3,5,38
855594,71534,46578,4.0,3,12,2007,2,56,44
855595,71534,48516,4.5,3,12,2007,2,53,46
855596,71534,61075,5.0,10,10,2008,9,56,5


### - apply preprocessing 

In [5]:
# drop redundant columns
# NOTE: this rebinds `dataset`, so running the cell a second time raises
# KeyError because the columns are already gone.
columns_to_drop = ['date_month', 'date_hour', 'date_minute', 'date_second']
dataset = dataset.drop(columns=columns_to_drop)

# remap userID to contiguous integers 0..n-1, numbered in order of first
# appearance. Every value is a key of the dict, so `map` is a plain lookup
# and avoids the slow general-purpose path of `replace` with a large dict.
userIDs = dataset.userID.unique()
userIDs_dict = dict(zip(userIDs, range(len(userIDs))))
dataset['userID'] = dataset['userID'].map(userIDs_dict)

# remap movieID the same way
movieIDs = dataset.movieID.unique()
movieIDs_dict = dict(zip(movieIDs, range(len(movieIDs))))
dataset['movieID'] = dataset['movieID'].map(movieIDs_dict)

dataset

Unnamed: 0,userID,movieID,rating,date_day,date_year
0,0,0,1.0,29,2006
1,0,1,4.5,29,2006
2,0,2,4.0,29,2006
3,0,3,2.0,29,2006
4,0,4,4.0,29,2006
...,...,...,...,...,...
855593,2112,1068,4.0,3,2007
855594,2112,504,4.0,3,2007
855595,2112,1196,4.5,3,2007
855596,2112,7487,5.0,10,2008


## Movie Recommendations NN  

### - train, test split 

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable across runs.
train, test = train_test_split(
    dataset,
    test_size=0.2,
    random_state=1,
)

### - define the model

In [7]:
from keras.layers import Input, Embedding, Flatten, Dot, Dense, Concatenate
from keras.models import Model

# Number of distinct (remapped) user and movie IDs; these size the embedding tables.
n_users = dataset['userID'].unique().size
n_movies = dataset['movieID'].unique().size

# Movie branch: one integer ID in, a flat 5-dimensional embedding out.
movie_input = Input(shape=(1,), name="Movie-Input")
movie_embedding = Embedding(
    input_dim=n_movies + 1,
    output_dim=5,
    name="Movie-Embedding",
)(movie_input)
movie_vec = Flatten(name="Flatten-Movies")(movie_embedding)

# User branch: built the same way as the movie branch.
user_input = Input(shape=(1,), name="User-Input")
user_embedding = Embedding(
    input_dim=n_users + 1,
    output_dim=5,
    name="User-Embedding",
)(user_input)
user_vec = Flatten(name="Flatten-Users")(user_embedding)

# The model output is the dot product of the two embedding vectors.
# Inputs are ordered [user, movie]; fit/predict calls must use the same order.
prod = Dot(name="Dot-Product", axes=1)([movie_vec, user_vec])
model = Model(inputs=[user_input, movie_input], outputs=prod)
model.compile(optimizer='adam', loss='mean_squared_error')


### - train the model

In [9]:
# Fit on the training split only; inputs follow the model's [user, movie] order
# and the target is the rating column.
history = model.fit(
    x=[train.userID, train.movieID],
    y=train.rating,
    epochs=10,
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### - make predictions

In [11]:
# Score every movie for a single user (remapped user ID 1) and report the
# five movies with the highest predicted rating.
movie_data = np.unique(dataset.movieID)  # sorted array of distinct movie IDs
user = np.full(len(movie_data), 1)       # the same user ID repeated once per movie
predictions = model.predict([user, movie_data])
predictions = predictions[:, 0]          # one prediction per row -> flat vector

# argsort returns positions within `movie_data`, not IDs, so translate the
# top positions back to movie IDs before reporting them. These are the
# remapped IDs held in dataset.movieID, not the original MovieLens IDs.
top_positions = (-predictions).argsort()[:5]
recommended_movie_ids = movie_data[top_positions]
print(recommended_movie_ids)
print(predictions[top_positions])

[8137 9105 6261 6090 7839]
[5.185059  5.135773  5.111074  5.069954  5.0625067]
