# Collaborative Filtering
Uses Deep learning to predict movie ratings based on user and movie embeddings, and bias.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import nltk
from nltk.corpus import stopwords
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras import losses
from tensorflow.keras import regularizers
from tensorflow.keras import metrics
from tensorflow.keras.utils import plot_model

In [2]:
# Get the data

# Read movies data as csv
movies = pd.read_csv('C:/Users/Ehi/Downloads/data/movie_lens/ml-latest-small/movies.csv')

# Read Ratings data as csv
ratings = pd.read_csv('C:/Users/Ehi/Downloads/data/movie_lens/ml-latest-small/ratings.csv')

### Pre-processing

In [3]:
# k variable holds the number of top "k" users and movies
k = 15

# count the number of ratings by each user
g = ratings.groupby('userId')['rating'].count()
# select the top "k" users
top_users = g.sort_values(ascending=False)[:k]

# groups the ratings dataframe by movieId
g = ratings.groupby('movieId')['rating'].count()

# Selects the top "k" movies with the highest number of ratings
top_movies = g.sort_values(ascending=False)[:k]

# Join the ratings dataframe with the top_users and top_movies dataframe
top_r = ratings.join(top_users, rsuffix='_r', how='inner', on='userId')
top_r = top_r.join(top_movies, rsuffix='_r', how='inner', on='movieId')

In [4]:
top_r.head()

Unnamed: 0,userId,movieId,rating,timestamp,rating_r,rating_r.1
10360,68,1,2.5,1158531426,1260,215
26092,182,1,4.0,1063289621,977,215
36374,249,1,4.0,1347317775,1046,215
39229,274,1,4.0,1171410158,1346,215
42114,288,1,4.5,1054568869,1055,215


In [5]:
# Encode the relevant data

# Encode the userId column
user_enc = LabelEncoder()
ratings['user'] = user_enc.fit_transform(ratings.userId.values)
n_users = ratings['user'].nunique()

# Encode the movieId column
item_enc = LabelEncoder()
ratings['movie'] = user_enc.fit_transform(ratings.movieId.values)
n_movies = ratings['movie'].nunique()

In [6]:
# convert the rating column to float
ratings['rating'] = ratings['rating'].values.astype(np.float32)

In [7]:
# Find the minimum and maximum rating
min_rating = min(ratings['rating'])
max_rating = max(ratings['rating'])

### Deep learning model

In [8]:
# Number of latent factors
emb_sz = 50

In [9]:
# Define user embeddings

# Input layer
user = layers.Input(shape=(1,))

# Embedding layer
user_emb = layers.Embedding(n_users, emb_sz, embeddings_regularizer=regularizers.l2(1e-6))(user)

# Reshape layer to flatten the embedding vector.
user_emb = layers.Reshape((emb_sz,))(user_emb)

In [10]:
# Define user bias

# Embedding layer
user_bias = layers.Embedding(n_users, 1, embeddings_regularizer=regularizers.l2(1e-6))(user)

# Reshape layer
user_bias = layers.Reshape((1,))(user_bias)

In [11]:
# Define movie embeddings

# Input layer
movie = layers.Input(shape=(1,))

# Embedding layer
movie_emb = layers.Embedding(n_movies, emb_sz, embeddings_regularizer=regularizers.l2(1e-6))(movie)

# Reshape layer
movie_emb = layers.Reshape((emb_sz,))(movie_emb)

In [12]:
# Define movie bias

# Embedding layer
movie_bias = layers.Embedding(n_movies, 1, embeddings_regularizer=regularizers.l2(1e-6))(movie)

# Reshape layer
movie_bias = layers.Reshape((1,))(movie_bias)

In [13]:
# Get predicted ratings
rating = layers.Concatenate()([user_emb, movie_emb])

In [14]:
# Add biases to the ratings

# Adds the user and movie embeddig to the dot product of the embeddings
rating = layers.Add()([rating, user_bias, movie_bias])

# Activation function
rating = layers.Dense(10, activation='relu')(rating)

# Applies dropout regularization to the ratings to reduce overfitting
rating = layers.Dropout(0.5)(rating)

# Produces a rating between 0 and 1
rating = layers.Dense(1, activation='sigmoid')(rating)

# Scales the predicted ratings to a range of 1 - 5
rating = layers.Lambda(lambda x:x*(max_rating - min_rating) + min_rating)(rating)

### Train and evaluate the model

In [15]:
# Model
model = models.Model([user, movie], rating)

# Compile the model
model.compile(loss='mse',  metrics=metrics.RootMeanSquaredError(),
              optimizer=optimizers.Adam(lr=0.001))



In [16]:
# split data into a trai and test dataset.

X = ratings[['user', 'movie']].values
y = ratings['rating'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(90752, 2) (90752,)
(10084, 2) (10084,)


In [17]:
# Train the model

model.fit(x=[X_train[:,0], X_train[:,1]], y=y_train,
          batch_size=64, epochs=5, verbose=1,
          validation_data=([X_test[:,0], X_test[:,1]], y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2666c48e2e0>

In conclusion, we can say that our model predicts user ratings with 88% accuracy, using RMSE evaluation metric.