In [1]:
# Import libraries
%matplotlib inline
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

# Loading Data

In this section, we load and preview the data.

We work with three dataframes: `movies`, `users`, and `ratings`.

Only `ratings` is used to train the neural net; `users` and `movies` are used to look up user and movie details when inspecting the predictions.

In [2]:
# Read the ratings file (tab-separated), keeping only the columns used below
ratings = pd.read_csv('ratings.csv', sep='\t', encoding='latin-1',
                      usecols=['user_id', 'movie_id', 'user_emb_id', 'movie_emb_id', 'rating'])
# Largest raw user / movie IDs; these are passed to CFModel as the model sizes.
# `.max()` already ignores duplicates, so no de-duplication step is needed.
max_userid = ratings['user_id'].max()
max_movieid = ratings['movie_id'].max()

# Read the users file (demographic details for each user_id)
users = pd.read_csv('users.csv', sep='\t', encoding='latin-1',
                    usecols=['user_id', 'gender', 'zipcode', 'age_desc', 'occ_desc'])

# Read the movies file (title and genres for each movie_id)
movies = pd.read_csv('movies.csv', sep='\t', encoding='latin-1',
                     usecols=['movie_id', 'title', 'genres'])

In [3]:
# Preview the first rows of the ratings table
ratings.head()

Unnamed: 0,user_id,movie_id,rating,user_emb_id,movie_emb_id
0,1,1193,5,0,1192
1,1,661,3,0,660
2,1,914,3,0,913
3,1,3408,4,0,3407
4,1,2355,5,0,2354


In [4]:
# Preview the first rows of the users table
users.head()

Unnamed: 0,user_id,gender,zipcode,age_desc,occ_desc
0,1,F,48067,Under 18,K-12 student
1,2,M,70072,56+,self-employed
2,3,M,55117,25-34,scientist
3,4,M,2460,45-49,executive/managerial
4,5,M,55455,25-34,writer


In [49]:
# Preview the first 20 rows of the movies table
movies.head(20)

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [52]:
# Look up the title of movie_id 433 with a single .loc call (row mask + column)
# instead of chained indexing
movies.loc[movies['movie_id'] == 433, 'title']

429    Clean Slate (1994)
Name: title, dtype: object

In [40]:
# Peek at 20 randomly chosen titles (no random_state is set, so the
# selection changes every time this cell is run)
movies['title'].sample(n=20)

429                             Clean Slate (1994)
3227                       To Sir with Love (1967)
1723                     Further Gesture, A (1996)
2915                          On Any Sunday (1971)
2490                        King and I, The (1999)
3069                          Stealing Home (1988)
645                          Superweib, Das (1996)
5                                      Heat (1995)
3173                               Santitos (1997)
2945                          Bustin' Loose (1981)
2263                                  Belly (1998)
3394                            Last Resort (1994)
937                            East of Eden (1955)
2942         They Shoot Horses, Don't They? (1969)
1308                  Amityville Curse, The (1990)
3647                           Fatal Beauty (1987)
3717                  But I'm a Cheerleader (1999)
1154    Best of the Best 3: No Turning Back (1995)
1144                          Six of a Kind (1934)
2434                       Appl

# Creating the training data

Here, the ratings are shuffled, and the user indices, movie indices, and rating values are extracted from them as separate arrays.

In [28]:
# Build the training set from a reproducibly shuffled copy of all ratings
shuffled_ratings = ratings.sample(frac=1., random_state=42)

# Model inputs (user / movie embedding indices) and target (rating),
# taken from the shuffled frame so that the three arrays stay row-aligned
Users = shuffled_ratings['user_emb_id'].values
Movies = shuffled_ratings['movie_emb_id'].values
Ratings = shuffled_ratings['rating'].values

# Report each array together with its shape
print('Users:', Users, ', shape =', Users.shape)
print('Movies:', Movies, ', shape =', Movies.shape)
print('Ratings:', Ratings, ', shape =', Ratings.shape)

Users: [5411 5439  367 ...  853 4032  785] , shape = (1000209,)
Movies: [2682  903 3716 ... 3101 3478 1390] , shape = (1000209,)
Ratings: [2 5 4 ... 3 5 4] , shape = (1000209,)


`CFModel` is a class that packages the neural network.

It takes the number of users, the number of items, and `k_factors` (the number of latent factors), which are needed to build the latent-factor matrix.

In [7]:
# Import Keras libraries
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
# Import CF Model Architecture
from CFModel import CFModel

Using TensorFlow backend.


In [8]:
K_FACTORS = 100 # Dimensionality of the latent embeddings for movies and users
TEST_USER = 2500 # user_id of the test user whose predictions are inspected (chosen arbitrarily)

In [9]:
# Define the model, sized by the largest user ID, the largest movie ID,
# and the number of latent factors
model = CFModel(max_userid, max_movieid, K_FACTORS)
# NOTE: the model is compiled (MSE loss, AdaMax optimizer) in a separate cell
# further down, so compilation is intentionally not repeated here.

In [10]:
# Display the model object to confirm it was constructed
model

<CFModel.CFModel at 0x111c02668>

In [11]:
# Compile the model using MSE as the loss function and the AdaMax optimizer
model.compile(loss='mse', optimizer='adamax')

In [14]:
# Both callbacks watch the validation loss: training stops early once it stops
# improving, and the weights are written to disk each time it improves
early_stopping = EarlyStopping('val_loss', patience=2)
checkpoint = ModelCheckpoint('weights.h5', save_best_only=True)
callbacks = [early_stopping, checkpoint]

# Train for at most 30 epochs, holding out 10% of the samples for validation.
# NOTE(review): `nb_epoch` is the legacy spelling of the epochs argument; newer
# Keras releases call it `epochs` -- confirm against the installed version.
history = model.fit(
    [Users, Movies],
    Ratings,
    nb_epoch=30,
    validation_split=.1,
    verbose=2,
    callbacks=callbacks,
)

Train on 900188 samples, validate on 100021 samples
Epoch 1/30
343s - loss: 1.0798 - val_loss: 1.0867
Epoch 2/30
344s - loss: 1.0799 - val_loss: 1.0871
Epoch 3/30
344s - loss: 1.0796 - val_loss: 1.0876
Epoch 4/30
343s - loss: 1.0795 - val_loss: 1.0864
Epoch 5/30
344s - loss: 1.0793 - val_loss: 1.0866
Epoch 6/30
344s - loss: 1.0790 - val_loss: 1.0857
Epoch 7/30
343s - loss: 1.0790 - val_loss: 1.0866
Epoch 8/30
344s - loss: 1.0791 - val_loss: 1.0858
Epoch 9/30
341s - loss: 1.0789 - val_loss: 1.0854
Epoch 10/30
345s - loss: 1.0789 - val_loss: 1.0868
Epoch 11/30
343s - loss: 1.0789 - val_loss: 1.0866
Epoch 12/30
345s - loss: 1.0788 - val_loss: 1.0871


In [15]:
# Display the History object returned by model.fit
history

<keras.callbacks.History at 0x110b38860>

In [17]:
# Report the lowest validation loss as an RMSE, together with the (1-based)
# epoch at which it occurred; ties resolve to the earliest epoch
val_losses = history.history['val_loss']
idx = min(range(len(val_losses)), key=val_losses.__getitem__)
min_val_loss = val_losses[idx]
best_rmse = math.sqrt(min_val_loss)
print('Minimum RMSE at epoch', format(idx + 1, 'd'), '=', format(best_rmse, '.4f'))

Minimum RMSE at epoch 9 = 1.0418


In [19]:
# Build a fresh model with the same constructor arguments as the trained one
trained_model = CFModel(max_userid, max_movieid, K_FACTORS)
# Load the weights from 'weights.h5', the file the ModelCheckpoint callback writes to
trained_model.load_weights('weights.h5')

In [23]:
# Show the demographic record of the test user (TEST_USER is set alongside K_FACTORS)
users[users['user_id'] == TEST_USER]

Unnamed: 0,user_id,gender,zipcode,age_desc,occ_desc
2499,2500,M,29672,56+,academic/educator


In [21]:
def predict_rating(user_id, movie_id):
    """Predict the rating that user `user_id` would give movie `movie_id`.

    Both arguments are the raw IDs from the data files. They are shifted
    down by one before being handed to the model, matching the
    `user_emb_id` / `movie_emb_id` columns of the ratings table.

    Returns whatever `trained_model.rate` returns (presumably a scalar
    predicted rating -- verify against CFModel).
    """
    user_index = user_id - 1
    movie_index = movie_id - 1
    return trained_model.rate(user_index, movie_index)

In [24]:
# Actual ratings given by the test user. `.copy()` makes this an independent
# frame, so adding a column below does not write into a slice of `ratings`
# (which would raise SettingWithCopyWarning).
user_ratings = ratings.loc[ratings['user_id'] == TEST_USER,
                           ['user_id', 'movie_id', 'rating']].copy()
# Model prediction for every movie the test user has rated
user_ratings['prediction'] = user_ratings['movie_id'].map(
    lambda movie_id: predict_rating(TEST_USER, movie_id))
# Show the user's highest-rated movies next to the predictions and movie details
user_ratings.sort_values(by='rating',
                         ascending=False).merge(movies,
                                                on='movie_id',
                                                how='inner',
                                                suffixes=['_u', '_m']).head(20)

Unnamed: 0,user_id,movie_id,rating,prediction,title,genres
0,2500,380,5,3.671851,True Lies (1994),Action|Adventure|Comedy|Romance
1,2500,3753,5,3.73586,"Patriot, The (2000)",Action|Drama|War
2,2500,349,5,3.739246,Clear and Present Danger (1994),Action|Adventure|Thriller
3,2500,1610,5,3.630476,"Hunt for Red October, The (1990)",Action|Thriller
4,2500,920,5,3.642075,Gone with the Wind (1939),Drama|Romance|War
5,2500,1294,5,3.653224,M*A*S*H (1970),Comedy|War
6,2500,204,5,3.366595,Under Siege 2: Dark Territory (1995),Action
7,2500,457,5,3.654633,"Fugitive, The (1993)",Action|Thriller
8,2500,2028,5,3.54983,Saving Private Ryan (1998),Action|Drama|War
9,2500,480,4,3.661703,Jurassic Park (1993),Action|Adventure|Sci-Fi


In [27]:
# Candidate movies: every movie present in `ratings` that the test user has
# not rated, one row per movie
is_unrated = ~ratings['movie_id'].isin(user_ratings['movie_id'])
recommendations = ratings.loc[is_unrated, ['movie_id']].drop_duplicates()
# Predicted rating of the test user for each candidate
recommendations['prediction'] = recommendations['movie_id'].map(
    lambda movie_id: predict_rating(TEST_USER, movie_id))
# Top 20 candidates by predicted rating, with movie details attached
recommendations.sort_values(by='prediction',
                          ascending=False).merge(movies,
                                                 on='movie_id',
                                                 how='inner',
                                                 suffixes=['_u', '_m']).head(20)

Unnamed: 0,movie_id,prediction,title,genres
0,3687,4.046495,Light Years (1988),Sci-Fi
1,2264,3.996388,We're No Angels (1989),Drama
2,404,3.98685,Brother Minister: The Assassination of Malcolm...,Documentary
3,2847,3.985195,Only Angels Have Wings (1939),Drama
4,1561,3.976489,Wedding Bell Blues (1996),Comedy
5,1164,3.973804,Two or Three Things I Know About Her (1966),Drama
6,3131,3.968688,Broadway Damage (1997),Comedy
7,2197,3.958971,Firelight (1997),Drama
8,2480,3.953421,Dry Cleaning (Nettoyage à sec) (1997),Drama
9,2945,3.940659,Mike's Murder (1984),Mystery
