# MovieLens Recommendation system

In [1]:
# Import libraries
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

In [2]:
# Import dataset
movies = pd.read_csv('data/ml-latest-small/movies.csv', sep=',', engine='python', encoding='latin-1')
ratings = pd.read_csv('data/ml-latest-small/ratings.csv', sep=',', engine='python', encoding='latin-1')

In [3]:
# Merge ratigs with the corresponding movies
df = pd.merge(ratings, movies, how='left', on=['movieId']).sort_values(by=['timestamp'], ascending=True)

In [4]:
# Drop columns not needed
df = df.drop(columns=['title', 'genres'])

In [5]:
# Prepare training and test set
training_set = df[:int(len(df)*0.8)]
test_set = df[int(len(df)*0.8):]

In [6]:
# Convert to numpy array
training_set = np.array(training_set, dtype='int32')
test_set = np.array(test_set, dtype='int32')

In [7]:
# Get number of users and movies
unique_users = np.unique(np.concatenate([training_set[:, 0], test_set[:, 0]]))
nb_users = len(unique_users)

unqiue_movies = np.unique(np.concatenate([training_set[:, 1], test_set[:, 1]]))
nb_movies = len(unqiue_movies)

In [8]:
# Create pivot table
def convert(data):
    new_data = []
    for id_user in unique_users:
        id_movies = data[:, 1][data[:, 0] == id_user]
        id_ratings = data[:, 2][data[:, 0] == id_user]
        ratings = np.zeros(nb_movies)
        ratings[id_movies] = id_ratings
        new_data.append(ratings)
        
    return new_data

In [9]:
def convert(data, unique_movies):  
    # Create a dictionary for fast movie ID lookup
    movie_id_to_index = {movie_id: index for index, movie_id in enumerate(unique_movies)}

    new_data = []
    for id_user in np.unique(data[:, 0]):  # Ensure unique users
        id_movies = data[:, 1][data[:, 0] == id_user]
        id_ratings = data[:, 2][data[:, 0] == id_user]

        ratings = np.zeros(nb_movies)  # Use length of unique movies
        for movie_id, rating in zip(id_movies, id_ratings):
            index = movie_id_to_index.get(movie_id)  # Find index using dictionary
            if index is not None:  # Check if movie ID exists in unique_movies
                ratings[index] = rating

        new_data.append(ratings)

    return new_data

In [10]:
# Convert data to matrix
training_set = np.array(convert(training_set, unqiue_movies))
test_set = np.array(convert(test_set, unqiue_movies))

In [11]:
# Convert to torch tensors
training_set = torch.FloatTensor(training_set)
test_set = torch.FloatTensor(test_set)

In [12]:
# Creating the architecture of the neural network
class SAE(nn.Module): # Stacked AutoEncoder
    def __init__(self):
        super(SAE, self).__init__()
        
        self.fc1 = nn.Linear(nb_movies, 50)
        self.fc2 = nn.Linear(50, 20)
        self.fc3 = nn.Linear(20, 50)
        self.fc4 = nn.Linear(50, nb_movies)
        self.activation = nn.Sigmoid()
        
    def forward(self, x):
        x = self.activation(self.fc1(x))
        x = self.activation(self.fc2(x))
        x = self.activation(self.fc3(x))
        x = self.fc4(x)
        
        return x

In [13]:
# Set the parameters for the model
sae =SAE()
criterion = nn.MSELoss()
optimizer = optim.RMSprop(sae.parameters(), lr=0.001)

In [14]:
# Training the model
nb_epochs = 50
batch_size = 32

for epoch in range(1, nb_epochs + 1):
    train_loss = 0
    s = 0.0
    
    for i in range(0, len(unique_users[:522]), batch_size):
        input = Variable(training_set[i:i + batch_size]).unsqueeze(0)
        target = input.clone()
        
        if torch.sum(target.data > 0) > 0:
            output = sae(input)
            target.require_grad = False
            output[target == 0] = 0
            loss = criterion(output, target)
            mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10)
            loss.backward()
            train_loss += np.sqrt(loss.item()*mean_corrector)
            s += 1.0
            optimizer.step()
            
    print('epoch:', str(epoch) + '  ' + 'loss:', str(train_loss/s))          

epoch: 1  loss: 0.39746998646864323
epoch: 2  loss: 0.26873080014134604
epoch: 3  loss: 0.22992592893011515
epoch: 4  loss: 0.2358830463482746
epoch: 5  loss: 0.223013292874028
epoch: 6  loss: 0.19740250247201127
epoch: 7  loss: 0.2077238646662665
epoch: 8  loss: 0.1907464091764003
epoch: 9  loss: 0.18692997141999904
epoch: 10  loss: 0.19469668984028018
epoch: 11  loss: 0.18653370335591765
epoch: 12  loss: 0.18045245886976263
epoch: 13  loss: 0.17580514243188106
epoch: 14  loss: 0.1698338919882279
epoch: 15  loss: 0.17436662791594357
epoch: 16  loss: 0.1784528402658893
epoch: 17  loss: 0.17553224647834442
epoch: 18  loss: 0.17147097168414321
epoch: 19  loss: 0.16724413001901312
epoch: 20  loss: 0.16487642131976282
epoch: 21  loss: 0.16626018375470544
epoch: 22  loss: 0.1664031023642253
epoch: 23  loss: 0.1673163682089325
epoch: 24  loss: 0.16637239011982383
epoch: 25  loss: 0.16514152058108822
epoch: 26  loss: 0.1640755747826646
epoch: 27  loss: 0.16295949310788296
epoch: 28  loss: 0.1

In [15]:
# Evaluating the model on the test set
test_loss = 0
s = 0.0

for i in range(0, len(unique_users[:522])):
    input = Variable(training_set[i]).unsqueeze(0)
    target = input.clone()
    
    if torch.sum(target.data > 0) > 0:
        output = sae(input)
        target.require_grad = False
        output[target == 0] = 0
        loss = criterion(output, target)
        mean_corrector = nb_movies/float(torch.sum(target.data > 0) + 1e-10)
        test_loss += np.sqrt(loss.item()*mean_corrector)
        s += 1.0
            

print('test loss:', str(test_loss/s))

test loss: 0.9047637107726776
