# MovieLens Recommendation system

In [1]:
# Import libraries
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

In [2]:
# Import dataset
movies = pd.read_csv('data/ml-latest-small/movies.csv', sep=',', engine='python', encoding='latin-1')
ratings = pd.read_csv('data/ml-latest-small/ratings.csv', sep=',', engine='python', encoding='latin-1')

In [3]:
# Merge ratigs with the corresponding movies
df = pd.merge(ratings, movies, how='left', on=['movieId']).sort_values(by=['timestamp'], ascending=True)

In [4]:
# Drop columns not needed
df = df.drop(columns=['title', 'genres'])

In [5]:
# Prepare training and test set
training_set = df[:int(len(df)*0.8)]
test_set = df[int(len(df)*0.8):]

In [6]:
# Convert to numpy array
training_set = np.array(training_set, dtype='int32')
test_set = np.array(test_set, dtype='int32')

In [7]:
# Get number of users and movies
unique_users = np.unique(np.concatenate([training_set[:, 0], test_set[:, 0]]))
nb_users = len(unique_users)

unqiue_movies = np.unique(np.concatenate([training_set[:, 1], test_set[:, 1]]))
nb_movies = len(unqiue_movies)

In [8]:
# Create pivot table
def convert(data):
    new_data = []
    for id_user in unique_users:
        id_movies = data[:, 1][data[:, 0] == id_user]
        id_ratings = data[:, 2][data[:, 0] == id_user]
        ratings = np.zeros(nb_movies)
        ratings[id_movies] = id_ratings
        new_data.append(ratings)
        
    return new_data

In [9]:
def convert(data, unique_movies):  
    # Create a dictionary for fast movie ID lookup
    movie_id_to_index = {movie_id: index for index, movie_id in enumerate(unique_movies)}

    new_data = []
    for id_user in np.unique(data[:, 0]):  # Ensure unique users
        id_movies = data[:, 1][data[:, 0] == id_user]
        id_ratings = data[:, 2][data[:, 0] == id_user]

        ratings = np.zeros(nb_movies)  # Use length of unique movies
        for movie_id, rating in zip(id_movies, id_ratings):
            index = movie_id_to_index.get(movie_id)  # Find index using dictionary
            if index is not None:  # Check if movie ID exists in unique_movies
                ratings[index] = rating

        new_data.append(ratings)

    return new_data

In [10]:
# Convert data to matrix
training_set = np.array(convert(training_set, unqiue_movies))
test_set = np.array(convert(test_set, unqiue_movies))

In [11]:
# Convert to torch tensors
training_set = torch.FloatTensor(training_set)
test_set = torch.FloatTensor(test_set)

In [12]:
training_set

tensor([[4., 0., 4.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [4., 0., 0.,  ..., 0., 0., 0.],
        [2., 2., 2.,  ..., 0., 0., 0.],
        [3., 0., 0.,  ..., 0., 0., 0.]])

In [13]:
# Convert ratings to binary (liked or not liked)
training_set[training_set == 0] = -1
training_set[training_set == 1] = 0
training_set[training_set == 2] = 0
training_set[training_set >= 3] = 1

test_set[test_set == 0] = -1
test_set[test_set == 1] = 0
test_set[test_set == 2] = 0
test_set[test_set >= 3] = 1

In [14]:
training_set

tensor([[ 1., -1.,  1.,  ..., -1., -1., -1.],
        [-1., -1., -1.,  ..., -1., -1., -1.],
        [-1., -1., -1.,  ..., -1., -1., -1.],
        ...,
        [ 1., -1., -1.,  ..., -1., -1., -1.],
        [ 0.,  0.,  0.,  ..., -1., -1., -1.],
        [ 1., -1., -1.,  ..., -1., -1., -1.]])

In [15]:
# Creating the architecture of the neural network
class RBM():
    def __init__(self, nv, nh):
        self.W = torch.randn(nh, nv) * 0.1  # Weight matrix of size (nh, nv)
        self.a = torch.zeros(1, nh)         # Bias for hidden units
        self.b = torch.zeros(1, nv)         # Bias for visible units
        
    def sample_h(self, x):
        wx = torch.mm(x, self.W.t())
        activation = wx + self.a.expand_as(wx)
        p_h_given_v = torch.sigmoid(activation)
        return p_h_given_v, torch.bernoulli(p_h_given_v)
    
    def sample_v(self, y):
        wy = torch.mm(y, self.W)
        activation = wy + self.b.expand_as(wy)
        p_v_given_h = torch.sigmoid(activation)
        return p_v_given_h, torch.bernoulli(p_v_given_h)
    
    def train(self, v0, vk, ph0, phk, lr=0.01):
        self.W += lr * (torch.mm(v0.t(), ph0) - torch.mm(vk.t(), phk)).t()
        self.b += lr * torch.sum((v0 - vk), dim=0)
        self.a += lr * torch.sum((ph0 - phk), dim=0)

In [16]:
# Set the parameters for the model
nv = nb_movies
nh = 128
batch_size = 32
rbm =RBM(nv, nh)

In [19]:
training_set[0:32]

tensor([[ 1., -1.,  1.,  ..., -1., -1., -1.],
        [-1., -1., -1.,  ..., -1., -1., -1.],
        [-1., -1., -1.,  ..., -1., -1., -1.],
        ...,
        [ 1., -1., -1.,  ..., -1., -1., -1.],
        [-1., -1., -1.,  ..., -1., -1., -1.],
        [-1., -1., -1.,  ..., -1., -1., -1.]])

In [17]:
# Training the model
nb_epochs = 20

for epoch in range(1, nb_epochs + 1):
    train_loss = 0
    s = 0.0
    
    for i in range(0, len(unique_users), batch_size):
        vk = training_set[i:i + batch_size]
        v0 = training_set[i:i + batch_size]
        
        if vk.shape[0] != batch_size:
            continue
        
        ph0, _ = rbm.sample_h(v0)
        
        for k in range(10):
            _, hk = rbm.sample_h(vk)
            _, vk = rbm.sample_v(hk)
            vk[v0 < 0] = v0[v0 < 0]
        
        phk, _ = rbm.sample_h(vk)
        rbm.train(v0, vk, ph0, phk)
        train_loss += torch.mean(torch.abs(v0[v0 >= 0] - vk[v0 >= 0]))
        
        s += 1.0
        
    print('epoch:', str(epoch) + '  ' + 'loss:', str(train_loss/s))

epoch: 1  loss: tensor(0.3633)
epoch: 2  loss: tensor(0.2952)
epoch: 3  loss: tensor(0.2807)
epoch: 4  loss: tensor(0.2730)
epoch: 5  loss: tensor(0.2685)
epoch: 6  loss: tensor(0.2668)
epoch: 7  loss: tensor(0.2665)
epoch: 8  loss: tensor(0.2540)
epoch: 9  loss: tensor(0.2521)
epoch: 10  loss: tensor(0.2514)
epoch: 11  loss: tensor(0.2513)
epoch: 12  loss: tensor(0.2506)
epoch: 13  loss: tensor(0.2506)
epoch: 14  loss: tensor(0.2473)
epoch: 15  loss: tensor(0.2433)
epoch: 16  loss: tensor(0.2442)
epoch: 17  loss: tensor(0.2439)
epoch: 18  loss: tensor(0.2388)
epoch: 19  loss: tensor(0.2327)
epoch: 20  loss: tensor(0.2319)


In [18]:
# Evaluating the model on the test set
test_loss = 0
s = 0.0

for i in range(len(unique_users)):
    v = training_set[i:i + 1]
    vt = test_set[i:i + 1]
    
    if (len(vt[vt >= 0]) > 0):
        _, h = rbm.sample_h(v)
        _, v = rbm.sample_v(h)
        test_loss += torch.mean(torch.abs(vt[vt >= 0] - v[vt >= 0]))
        s += 1.0
    

print('loss:', str(test_loss/s))

loss: tensor(0.2979)
