In [1]:
# Import libraries
%matplotlib inline
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

# Loading Data

In this section, we load and preview the data.

There are three dataframes we are working with: movies, users, and ratings.

Only the ratings are used to train the neural net; the users and movies dataframes are used to look up and label the results.

In [2]:
# Read the ratings table and discard the "Unnamed: 0" column that comes with the CSV
ratings = pd.read_csv(
    '../matrix_factorization/data/ml-100k/includes_team_ratings.csv'
).drop("Unnamed: 0", axis=1)

# Indices fed to the embedding layers: each id shifted down by one
ratings['user_emb_id'] = ratings['user_id'].sub(1)
ratings['movie_emb_id'] = ratings['movie_id'].sub(1)

# Largest user_id and movie_id present in the ratings
max_userid = ratings['user_id'].max()
max_movieid = ratings['movie_id'].max()

ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,user_emb_id,movie_emb_id
0,196,242,3,881250949.0,195,241
1,186,302,3,891717742.0,185,301
2,22,377,1,878887116.0,21,376
3,244,51,2,880606923.0,243,50
4,166,346,1,886397596.0,165,345


In [3]:
# Read the user table and discard the "Unnamed: 0" column that comes with the CSV
users = pd.read_csv(
    '../matrix_factorization/data/ml-100k/includes_team_users.csv'
).drop("Unnamed: 0", axis=1)

users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [4]:
# Column names for u.item: five descriptive fields followed by the genre columns
i_cols = [
    'movie_id', 'movie title', 'release date', 'video release date', 'IMDb URL',
] + [
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

# u.item is pipe-separated, latin-1 encoded, and the names are supplied explicitly
movies = pd.read_csv(
    '../matrix_factorization/data/ml-100k/u.item',
    sep='|',
    encoding='latin-1',
    names=i_cols,
)

movies.head()

Unnamed: 0,movie_id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


# Creating the training data

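Here, the ratings are shuffled (every row is kept, since `frac=1.`), and the user indices, movie indices, and rating values are extracted from the shuffled rows as arrays.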

In [5]:
# Shuffle the rating rows once; frac=1. keeps every row and random_state fixes the order
shuffled_ratings = ratings.sample(frac=1., random_state=42)

# Take the three columns from the same shuffled frame so they stay aligned row by row
Users = shuffled_ratings['user_emb_id'].values
Movies = shuffled_ratings['movie_emb_id'].values
Ratings = shuffled_ratings['rating'].values

# Preview each array together with its shape
print('Users:', Users, ', shape =', Users.shape)
print('Movies:', Movies, ', shape =', Movies.shape)
print('Ratings:', Ratings, ', shape =', Ratings.shape)

Users: [268 576  42 ... 436 283 221] , shape = (100105,)
Movies: [ 527 1043   13 ...  474  321  199] , shape = (100105,)
Ratings: [4 4 2 ... 3 3 3] , shape = (100105,)


`CFModel` is a class, imported from the local `CFModel` module, that packages the neural network.

It is constructed from the number of users, the number of items, and `k_factors`, the size of the latent-factor embedding learned for each user and each item.

In [6]:
# Import Keras libraries
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
# Import CF Model Architecture
from CFModel import CFModel

Using TensorFlow backend.


In [7]:
K_FACTORS = 100 # Dimensionality of the latent embeddings learned for movies and users
TEST_USER = 150 # user_id of the test user whose ratings and recommendations are inspected below

In [8]:
# Define model
# max_userid / max_movieid are the largest ids found in `ratings`; the embedding indices
# (user_emb_id / movie_emb_id) are id - 1, so these presumably size the two embedding tables.
model = CFModel(max_userid, max_movieid, K_FACTORS)
# NOTE: the model is not compiled in this cell; the compile call (MSE loss, AdaMax optimizer)
# is made in a separate cell further down.

In [9]:
# Print the model's layers with their output shapes and parameter counts
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_1 (Embedding)          (None, 1, 100)        94800       embedding_input_1[0][0]          
____________________________________________________________________________________________________
reshape_1 (Reshape)              (None, 100)           0           embedding_1[0][0]                
____________________________________________________________________________________________________
embedding_2 (Embedding)          (None, 1, 100)        168200      embedding_input_2[0][0]          
____________________________________________________________________________________________________
reshape_2 (Reshape)              (None, 100)           0           embedding_2[0][0]                
Total params: 263000
______________________________________________________________________

In [10]:
# Compile the model using MSE as the loss function and the AdaMax optimizer
model.compile(loss='mse', optimizer='adamax')

In [11]:
# Both callbacks watch the validation loss:
#   - EarlyStopping is configured on 'val_loss' with a patience of 2
#   - ModelCheckpoint writes weights.h5 only when the validation loss has improved
callbacks = [
    EarlyStopping('val_loss', patience=2),
    ModelCheckpoint('weights.h5', save_best_only=True),
]

# Train for up to 30 epochs, holding out 10% of the samples for validation
history = model.fit(
    [Users, Movies],
    Ratings,
    nb_epoch=30,
    validation_split=.1,
    verbose=2,
    callbacks=callbacks,
)

Train on 90094 samples, validate on 10011 samples
Epoch 1/30
16s - loss: 9.2182 - val_loss: 4.4780
Epoch 2/30
14s - loss: 3.1028 - val_loss: 2.3407
Epoch 3/30
14s - loss: 1.9272 - val_loss: 1.7375
Epoch 4/30
12s - loss: 1.5376 - val_loss: 1.4915
Epoch 5/30
10s - loss: 1.3657 - val_loss: 1.3743
Epoch 6/30
10s - loss: 1.2761 - val_loss: 1.3078
Epoch 7/30
11s - loss: 1.2244 - val_loss: 1.2701
Epoch 8/30
17s - loss: 1.1916 - val_loss: 1.2437
Epoch 9/30
25s - loss: 1.1677 - val_loss: 1.2232
Epoch 10/30
16s - loss: 1.1504 - val_loss: 1.2116
Epoch 11/30
17s - loss: 1.1384 - val_loss: 1.1965
Epoch 12/30
23s - loss: 1.1289 - val_loss: 1.1875
Epoch 13/30
17s - loss: 1.1216 - val_loss: 1.1830
Epoch 14/30
11s - loss: 1.1152 - val_loss: 1.1732
Epoch 15/30
15s - loss: 1.1106 - val_loss: 1.1685
Epoch 16/30
11s - loss: 1.1056 - val_loss: 1.1645
Epoch 17/30
12s - loss: 1.1024 - val_loss: 1.1610
Epoch 18/30
11s - loss: 1.0985 - val_loss: 1.1557
Epoch 19/30
15s - loss: 1.0967 - val_loss: 1.1557
Epoch 20/

In [13]:
# Report the lowest validation loss as an RMSE (the model is compiled with an MSE loss,
# hence the square root)
val_losses = history.history['val_loss']
min_val_loss, idx = min(zip(val_losses, range(len(val_losses))))

best_epoch = '{:d}'.format(idx + 1)  # idx is 0-based, epochs are reported 1-based
best_rmse = '{:.4f}'.format(math.sqrt(min_val_loss))
print('Minimum RMSE at epoch', best_epoch, '=', best_rmse)

Minimum RMSE at epoch 30 = 1.0675


In [14]:
# Build a second CFModel with the same dimensions as the model trained above
trained_model = CFModel(max_userid, max_movieid, K_FACTORS)
# Load the weights from weights.h5, the file the ModelCheckpoint callback writes during training
trained_model.load_weights('weights.h5')

In [15]:
# Show the profile row of the user with user_id 947
# NOTE(review): 947 is hard-coded and differs from TEST_USER (150) — confirm which user is intended
users[users['user_id'] == 947]

Unnamed: 0,user_id,age,sex,occupation,zip_code
946,947,25,M,scientist,78726


In [16]:
# Predict the rating for a (User ID, Movie ID) pair using the trained model
def predict_rating(user_id, movie_id):
    """Return the trained model's predicted rating for a user/movie pair.

    Both ids are shifted down by one before being passed to
    ``trained_model.rate``, mirroring how ``user_emb_id`` and
    ``movie_emb_id`` are derived from the ids in ``ratings``.

    Args:
        user_id: User id as it appears in the ``user_id`` column.
        movie_id: Movie id as it appears in the ``movie_id`` column.

    Returns:
        Whatever ``trained_model.rate`` returns for the shifted indices.
    """
    user_idx = user_id - 1
    movie_idx = movie_id - 1
    return trained_model.rate(user_idx, movie_idx)

In [17]:
# Ratings the test user has actually given. A single .loc selection replaces the chained
# df[mask][cols] indexing, and .copy() makes the result an independent frame so that adding
# the `prediction` column below unambiguously modifies `user_ratings` and not a slice of `ratings`.
user_ratings = ratings.loc[ratings['user_id'] == TEST_USER, ['user_id', 'movie_id', 'rating']].copy()

# Model prediction for every movie the test user has rated
user_ratings['prediction'] = user_ratings.apply(lambda x: predict_rating(TEST_USER, x['movie_id']), axis=1)

# Highest actual ratings first, joined with the movie metadata; show the top 20 rows
(user_ratings
 .sort_values(by='rating', ascending=False)
 .merge(movies, on='movie_id', how='inner', suffixes=['_u', '_m'])
 .head(20))

Unnamed: 0,user_id,movie_id,rating,prediction,movie title,release date,video release date,IMDb URL,unknown,Action,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,150,181,5,4.131965,Return of the Jedi (1983),14-Mar-1997,,http://us.imdb.com/M/title-exact?Return%20of%2...,0,1,...,0,0,0,0,0,1,1,0,1,0
1,150,127,5,4.125523,"Godfather, The (1972)",01-Jan-1972,,"http://us.imdb.com/M/title-exact?Godfather,%20...",0,1,...,0,0,0,0,0,0,0,0,0,0
2,150,276,5,4.084824,Leaving Las Vegas (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Leaving%20Las...,0,0,...,0,0,0,0,0,1,0,0,0,0
3,150,50,5,4.119919,Star Wars (1977),01-Jan-1977,,http://us.imdb.com/M/title-exact?Star%20Wars%2...,0,1,...,0,0,0,0,0,1,1,0,1,0
4,150,246,5,4.091963,Chasing Amy (1997),01-Jan-1997,,http://us.imdb.com/M/title-exact?Chasing+Amy+(...,0,0,...,0,0,0,0,0,1,0,0,0,0
5,150,268,5,4.106375,Chasing Amy (1997),01-Jan-1997,,http://us.imdb.com/M/title-exact?Chasing+Amy+(...,0,0,...,0,0,0,0,0,1,0,0,0,0
6,150,475,5,4.09093,Trainspotting (1996),19-Jul-1996,,http://us.imdb.com/Title?Trainspotting+(1996),0,0,...,0,0,0,0,0,0,0,0,0,0
7,150,410,4,4.111494,Kingpin (1996),12-Jul-1996,,http://us.imdb.com/M/title-exact?Kingpin%20(1996),0,0,...,0,0,0,0,0,0,0,0,0,0
8,150,458,4,4.113533,Nixon (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Nixon%20(1995),0,0,...,0,0,0,0,0,0,0,0,0,0
9,150,293,4,4.055636,Donnie Brasco (1997),28-Feb-1997,,http://us.imdb.com/M/title-exact?Donnie%20Bras...,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Movies that appear in `ratings` but that the test user has not rated, one row per movie.
# `~mask` replaces the `mask == False` comparison, a single .loc selection replaces the chained
# indexing, and .copy() makes the frame independent before a column is added to it.
not_rated_by_user = ~ratings['movie_id'].isin(user_ratings['movie_id'])
recommendations = ratings.loc[not_rated_by_user, ['movie_id']].drop_duplicates().copy()

# Predicted rating of the test user for each of those movies
recommendations['prediction'] = recommendations.apply(lambda x: predict_rating(TEST_USER, x['movie_id']), axis=1)

# Highest predictions first, joined with the movie metadata; show the top 20 rows
(recommendations
 .sort_values(by='prediction', ascending=False)
 .merge(movies, on='movie_id', how='inner', suffixes=['_u', '_m'])
 .head(20))

Unnamed: 0,movie_id,prediction,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1392,4.316131,"Locusts, The (1997)",01-Jan-1997,,http://us.imdb.com/M/title-exact?Locusts%2C+Th...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1391,4.296916,For Ever Mozart (1996),04-Jul-1997,,http://us.imdb.com/M/title-exact?For+Ever+Moza...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1296,4.288038,Indian Summer (1996),01-Jan-1996,,http://us.imdb.com/M/title-exact?Indian+Summer...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1323,4.282354,"Wooden Man's Bride, The (Wu Kui) (1994)",01-Jan-1994,,http://us.imdb.com/M/title-exact?Wu%20Kui%20(1...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1394,4.282018,Swept from the Sea (1997),01-Jan-1997,,http://us.imdb.com/M/title-exact?Swept+from+th...,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
5,1022,4.275234,"Fast, Cheap & Out of Control (1997)",01-Jan-1997,,"http://us.imdb.com/M/title-exact?Fast,+Cheap+&...",0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,992,4.272534,Head Above Water (1996),20-Jun-1997,,http://us.imdb.com/M/title-exact?Head+Above+Wa...,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
7,1560,4.271528,Clean Slate (Coup de Torchon) (1981),01-Jan-1981,,http://us.imdb.com/M/title-exact?Coup%20de%20t...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,119,4.268692,Maya Lin: A Strong Clear Vision (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?Maya%20Lin:%2...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,1351,4.268121,Lover's Knot (1996),12-Jul-1996,,http://us.imdb.com/M/title-exact?Lover's%20Kno...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
