In [1]:
# Import libraries
%matplotlib inline
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

# Loading Data

In this section, we load and preview the data.

We work with three dataframes: movies, users, and ratings.

Only the ratings are fed to the neural net for training; the users and movies tables are used afterwards to inspect the predictions.

In [2]:
# Load the ratings table; the CSV carries a stray "Unnamed: 0" column, which is discarded.
ratings_csv = '../matrix_factorization/data/ml-100k/includes_team_ratings.csv'
ratings = pd.read_csv(ratings_csv).drop(labels="Unnamed: 0", axis='columns')

# Zero-based counterparts of the one-based ids, used as embedding indices
ratings['user_emb_id'] = ratings['user_id'].sub(1)
ratings['movie_emb_id'] = ratings['movie_id'].sub(1)

# Largest user_id and movie_id present in the ratings
max_userid = ratings['user_id'].max()
max_movieid = ratings['movie_id'].max()

ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,user_emb_id,movie_emb_id
0,196,242,3,881250949.0,195,241
1,186,302,3,891717742.0,185,301
2,22,377,1,878887116.0,21,376
3,244,51,2,880606923.0,243,50
4,166,346,1,886397596.0,165,345


In [3]:
# Load the user attributes (age, sex, occupation, zip code); the stray "Unnamed: 0" column is discarded.
users_csv = '../matrix_factorization/data/ml-100k/includes_team_users.csv'
users = pd.read_csv(users_csv).drop(labels="Unnamed: 0", axis='columns')
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [4]:
# Reading movie file
# u.item is pipe-delimited; column names are supplied explicitly through `names`.
movie_info_cols = ['movie_id', 'movie title', 'release date', 'video release date', 'IMDb URL']
genre_cols = [
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
    'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
i_cols = movie_info_cols + genre_cols

movies = pd.read_csv(
    '../matrix_factorization/data/ml-100k/u.item',
    sep='|',
    encoding='latin-1',
    names=i_cols,
)

movies.head()

Unnamed: 0,movie_id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


# Creating the training data

Here, the ratings are shuffled (with a fixed seed) and the user, movie, and rating columns are pulled out as separate arrays.

In [5]:
# Create training set
# frac=1. samples every row, i.e. a full shuffle; the fixed random_state keeps the order repeatable.
shuffled_ratings = ratings.sample(frac=1., random_state=42)

# Pull the three row-aligned arrays out of the shuffled frame
Users = shuffled_ratings['user_emb_id'].values
Movies = shuffled_ratings['movie_emb_id'].values
Ratings = shuffled_ratings['rating'].values

# Preview each array together with its shape
for label, arr in (('Users:', Users), ('Movies:', Movies), ('Ratings:', Ratings)):
    print(label, arr, ', shape =', arr.shape)

Users: [884 392  89 ... 436 283 221] , shape = (100010,)
Movies: [659 375 480 ... 474 321 199] , shape = (100010,)
Ratings: [5 4 5 ... 3 3 3] , shape = (100010,)


`CFModel` is a class that packages the neural network.

Its constructor takes the number of users, the number of items, and `k_factors`, the size of the latent-factor embeddings.

In [6]:
# Import Keras libraries
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
# Import CF Model Architecture
from CFModel import CFModel

Using TensorFlow backend.


In [7]:
K_FACTORS = 100 # Dimensionality of the movie and user embeddings
TEST_USER = 150 # user_id of the user whose ratings and recommendations are inspected below

In [8]:
# Define model
# Sized by the largest user_id and movie_id found in the ratings; K_FACTORS is passed as the third argument.
model = CFModel(max_userid, max_movieid, K_FACTORS)
# The model is compiled in a later cell.

In [9]:
# Display the model object to confirm it was constructed
model

<CFModel.CFModel at 0xc1b308dd8>

In [10]:
# Compile the model using MSE as the loss function and the AdaMax optimizer
model.compile(loss='mse', optimizer='adamax')

In [11]:
# Callbacks monitor the validation loss
# Stop training once val_loss has gone 2 epochs without improving
early_stopping = EarlyStopping('val_loss', patience=2)
# Write weights.h5 only on epochs where the monitored quantity improves
checkpoint = ModelCheckpoint('weights.h5', save_best_only=True)
callbacks = [early_stopping, checkpoint]

# Use up to 30 epochs, 90% training data, 10% validation data
# NOTE(review): nb_epoch looks like the older Keras spelling of this argument (newer releases
# call it epochs) — confirm against the installed Keras version before renaming.
history = model.fit(
    [Users, Movies],
    Ratings,
    nb_epoch=30,
    validation_split=.1,
    verbose=2,
    callbacks=callbacks,
)

Train on 90009 samples, validate on 10001 samples
Epoch 1/30
14s - loss: 9.2326 - val_loss: 4.4842
Epoch 2/30
12s - loss: 3.0988 - val_loss: 2.3295
Epoch 3/30
14s - loss: 1.9231 - val_loss: 1.7256
Epoch 4/30
14s - loss: 1.5345 - val_loss: 1.4866
Epoch 5/30
17s - loss: 1.3641 - val_loss: 1.3697
Epoch 6/30
16s - loss: 1.2754 - val_loss: 1.3039
Epoch 7/30
14s - loss: 1.2227 - val_loss: 1.2631
Epoch 8/30
13s - loss: 1.1905 - val_loss: 1.2407
Epoch 9/30
14s - loss: 1.1665 - val_loss: 1.2191
Epoch 10/30
14s - loss: 1.1507 - val_loss: 1.2055
Epoch 11/30
14s - loss: 1.1385 - val_loss: 1.1917
Epoch 12/30
11s - loss: 1.1296 - val_loss: 1.1829
Epoch 13/30
13s - loss: 1.1216 - val_loss: 1.1785
Epoch 14/30
11s - loss: 1.1149 - val_loss: 1.1723
Epoch 15/30
11s - loss: 1.1102 - val_loss: 1.1655
Epoch 16/30
11s - loss: 1.1057 - val_loss: 1.1618
Epoch 17/30
11s - loss: 1.1022 - val_loss: 1.1591
Epoch 18/30
11s - loss: 1.0992 - val_loss: 1.1533
Epoch 19/30
11s - loss: 1.0966 - val_loss: 1.1518
Epoch 20/

In [12]:
# Display the History object returned by fit; its .history dict is read in the next cell
history

<keras.callbacks.History at 0xc1b65f668>

In [13]:
# Show the best validation RMSE
# The loss is MSE, so the square root of the smallest val_loss is the best RMSE.
val_losses = history.history['val_loss']
# Key on (loss, position) so ties resolve to the earliest epoch
idx, min_val_loss = min(enumerate(val_losses), key=lambda pair: (pair[1], pair[0]))
best_epoch = '{:d}'.format(idx + 1)
best_rmse = '{:.4f}'.format(math.sqrt(min_val_loss))
print('Minimum RMSE at epoch', best_epoch, '=', best_rmse)

Minimum RMSE at epoch 30 = 1.0667


In [14]:
# Use the pre-trained model
# Build a fresh CFModel with the same constructor arguments as the one that was trained
trained_model = CFModel(max_userid, max_movieid, K_FACTORS)
# Load weights
# weights.h5 is the file written by the ModelCheckpoint callback during training
trained_model.load_weights('weights.h5')

In [15]:
# Look up the test user's row in the users table
users.loc[users['user_id'] == TEST_USER]

Unnamed: 0,user_id,age,sex,occupation,zip_code
149,150,20,F,artist,2139


In [16]:
# Function to predict the ratings given User ID and Movie ID
def predict_rating(user_id, movie_id):
    """Return ``trained_model``'s rating for a (user, movie) pair.

    Both ids are shifted down by one before being passed to
    ``trained_model.rate``, mirroring how ``user_emb_id`` and
    ``movie_emb_id`` are derived from ``user_id`` and ``movie_id``.
    """
    user_index = user_id - 1
    movie_index = movie_id - 1
    return trained_model.rate(user_index, movie_index)

In [17]:
# Ratings the test user actually gave, next to the model's prediction for the same movie.
# A single .loc selection followed by .copy() yields an independent frame, so adding the
# 'prediction' column below does not trigger pandas' SettingWithCopyWarning.
user_ratings = ratings.loc[
    ratings['user_id'] == TEST_USER,
    ['user_id', 'movie_id', 'rating'],
].copy()
user_ratings['prediction'] = user_ratings.apply(
    lambda row: predict_rating(TEST_USER, row['movie_id']),
    axis=1,
)

# Highest actual ratings first, joined with the movie details; show the top 20
(
    user_ratings
    .sort_values(by='rating', ascending=False)
    .merge(movies, on='movie_id', how='inner', suffixes=['_u', '_m'])
    .head(20)
)

Unnamed: 0,user_id,movie_id,rating,prediction,movie title,release date,video release date,IMDb URL,unknown,Action,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,150,181,5,4.125782,Return of the Jedi (1983),14-Mar-1997,,http://us.imdb.com/M/title-exact?Return%20of%2...,0,1,...,0,0,0,0,0,1,1,0,1,0
1,150,127,5,4.080724,"Godfather, The (1972)",01-Jan-1972,,"http://us.imdb.com/M/title-exact?Godfather,%20...",0,1,...,0,0,0,0,0,0,0,0,0,0
2,150,276,5,3.95113,Leaving Las Vegas (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Leaving%20Las...,0,0,...,0,0,0,0,0,1,0,0,0,0
3,150,50,5,4.069466,Star Wars (1977),01-Jan-1977,,http://us.imdb.com/M/title-exact?Star%20Wars%2...,0,1,...,0,0,0,0,0,1,1,0,1,0
4,150,246,5,4.083027,Chasing Amy (1997),01-Jan-1997,,http://us.imdb.com/M/title-exact?Chasing+Amy+(...,0,0,...,0,0,0,0,0,1,0,0,0,0
5,150,268,5,4.037043,Chasing Amy (1997),01-Jan-1997,,http://us.imdb.com/M/title-exact?Chasing+Amy+(...,0,0,...,0,0,0,0,0,1,0,0,0,0
6,150,475,5,4.058831,Trainspotting (1996),19-Jul-1996,,http://us.imdb.com/Title?Trainspotting+(1996),0,0,...,0,0,0,0,0,0,0,0,0,0
7,150,410,4,4.107251,Kingpin (1996),12-Jul-1996,,http://us.imdb.com/M/title-exact?Kingpin%20(1996),0,0,...,0,0,0,0,0,0,0,0,0,0
8,150,458,4,4.103925,Nixon (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Nixon%20(1995),0,0,...,0,0,0,0,0,0,0,0,0,0
9,150,293,4,4.075466,Donnie Brasco (1997),28-Feb-1997,,http://us.imdb.com/M/title-exact?Donnie%20Bras...,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Candidate movies: every movie in the ratings that the test user has not rated
already_rated = ratings['movie_id'].isin(user_ratings['movie_id'])
recommendations = ratings.loc[~already_rated, ['movie_id']].drop_duplicates()

# Predicted rating of the test user for each candidate
recommendations['prediction'] = recommendations.apply(
    lambda row: predict_rating(TEST_USER, row['movie_id']),
    axis=1,
)

# Highest predictions first, joined with the movie details; show the top 20
ranked = recommendations.sort_values(by='prediction', ascending=False)
ranked.merge(movies, on='movie_id', how='inner', suffixes=['_u', '_m']).head(20)

Unnamed: 0,movie_id,prediction,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1555,4.387218,"Secret Adventures of Tom Thumb, The (1993)",01-Jan-1993,,http://us.imdb.com/M/title-exact?Secret%20Adve...,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1484,4.324187,"Jerky Boys, The (1994)",01-Jan-1994,,"http://us.imdb.com/M/title-exact?Jerky%20Boys,...",0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1529,4.320921,Underground (1995),29-Mar-1996,,http://us.imdb.com/M/title-exact?Underground%2...,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,1401,4.320077,M. Butterfly (1993),01-Jan-1993,,http://us.imdb.com/M/title-exact?M.%20Butterfl...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1158,4.30828,"Fille seule, La (A Single Girl) (1995)",30-Oct-1996,,http://us.imdb.com/M/title-exact?Fille%20seule...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1353,4.299327,1-900 (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?06%20(1994),0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
6,1612,4.295487,"Leading Man, The (1996)",16-Jan-1998,,http://us.imdb.com/M/title-exact?imdb-title-11...,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
7,936,4.29519,Brassed Off (1996),13-Jun-1997,,http://us.imdb.com/M/title-exact?Brassed%20Off...,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
8,1470,4.285138,Gumby: The Movie (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Gumby:%20The%...,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
9,1262,4.282104,Walking and Talking (1996),12-Jul-1996,,http://us.imdb.com/M/title-exact?Walking%20and...,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
