## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

# Analyze the Datasets

In [2]:
movies = pd.read_csv('../Datasets/Movies/ml-1m/movies.dat', sep='::', header=None, engine='python', encoding='latin-1')
movies.columns = ["Id", "Title", "Category"]
movies.head()

Unnamed: 0,Id,Title,Category
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
users = pd.read_csv('../Datasets/Movies/ml-1m/users.dat', sep='::', header=None, engine='python', encoding='latin-1')
users.columns = ["Id", "Gender", "Age", "Job", "Code"]
users.head()

Unnamed: 0,Id,Gender,Age,Job,Code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [4]:
ratings = pd.read_csv('../Datasets/Movies/ml-1m/ratings.dat', sep='::', header=None, engine='python', encoding='latin-1')
ratings.columns = ["UserId", "MovieId", "Rating", "DateTime"]
ratings.head()

Unnamed: 0,UserId,MovieId,Rating,DateTime
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


# Data Preprocessing

## Prepare Training and Test Set

In [5]:
training_set_df = pd.read_csv('../Datasets/Movies/ml-100k/u1.base', delimiter='\t')
training_set_df.columns = ["UserId", "MovieId", "Rating", "TimeStamp"]
training_set_df.head()

Unnamed: 0,UserId,MovieId,Rating,TimeStamp
0,1,2,3,876893171
1,1,3,4,878542960
2,1,4,3,876893119
3,1,5,3,889751712
4,1,7,4,875071561


In [6]:
training_set = np.array(training_set_df, dtype='int')

In [7]:
test_set_df = pd.read_csv('../Datasets/Movies/ml-100k/u1.test', delimiter='\t')
test_set_df.columns = ["UserId", "MovieId", "Rating", "TimeStamp"]
test_set_df.head()

Unnamed: 0,UserId,MovieId,Rating,TimeStamp
0,1,10,3,875693118
1,1,12,5,878542960
2,1,14,5,874965706
3,1,17,3,875073198
4,1,20,4,887431883


In [8]:
test_set = np.array(test_set_df, dtype='int')

## Get total number of Users and Movies

In [9]:
n_total_users = max(max(training_set_df["UserId"].values), max(test_set_df["UserId"].values))
n_total_movies = max(max(training_set_df["MovieId"].values), max(test_set_df["MovieId"].values))
print("total users:", n_total_users, ",total movies:", n_total_movies) 

total users: 943 ,total movies: 1682


## Convert Data into Matrix with users in rows and movies in columns

In [10]:
def convert_to_matrix(data, total_users, total_movies):
    ratings_data = np.zeros(shape=(total_users, total_movies))
    for user_id, movie_id, rating, timestamp in data:
        ratings_data[user_id-1][movie_id-1] = rating
    return ratings_data

In [11]:
training_set = convert_to_matrix(training_set, n_total_users, n_total_movies)
test_set = convert_to_matrix(test_set, n_total_users, n_total_movies)

## Convert Data into Torch Tensors

In [12]:
training_set = torch.FloatTensor(training_set)
test_set = torch.FloatTensor(test_set)

# Build Auto Encoder

In [13]:
class StackedAutoEncoder(nn.Module):
    def __init__(self):
        super(StackedAutoEncoder, self).__init__()
        self.full_connection_1 = nn.Linear(n_total_movies, 20)
        self.full_connection_2 = nn.Linear(20, 10)
        self.full_connection_3 = nn.Linear(10, 20)
        self.full_connection_4 = nn.Linear(20, n_total_movies)
        self.activation = nn.Sigmoid()
        
    def forward(self, x):
        x = self.activation(self.full_connection_1(x))
        x = self.activation(self.full_connection_2(x))
        x = self.activation(self.full_connection_3(x))
        x = self.full_connection_4(x)
        return x

# Train Auto Encoder

In [14]:
stacked_auto_encoder = StackedAutoEncoder()
criterion = nn.MSELoss()
optimizer = optim.RMSprop(stacked_auto_encoder.parameters(), lr = 0.01, weight_decay=0.5)

In [15]:
nb_epochs = 200
for epoch in range(nb_epochs):
    train_loss = 0
    counter = 0.
    for user_id in range(n_total_users):
        movie_ratings_for_user = Variable(training_set[user_id]).unsqueeze(0) # batch of single input vector
        target = movie_ratings_for_user.clone()
        if torch.sum(target.data > 0) > 0:
            sample_output = stacked_auto_encoder(movie_ratings_for_user)
            target.require_grad = False # do not compute gradient
            sample_output[target == 0] = 0 # make sure unrated movies do not impact learning and computation
            loss = criterion(sample_output, target)
            mean_corrector = n_total_movies/float(torch.sum(target.data > 0) + 1e-10)
            loss.backward()
            train_loss += np.sqrt(loss.data*mean_corrector)
            counter += 1.
            optimizer.step()
    print("Epoch:", epoch, " Loss:", train_loss/counter)

Epoch: 0  Loss: tensor(1.7712)
Epoch: 1  Loss: tensor(1.0967)
Epoch: 2  Loss: tensor(1.0533)
Epoch: 3  Loss: tensor(1.0385)
Epoch: 4  Loss: tensor(1.0307)
Epoch: 5  Loss: tensor(1.0266)
Epoch: 6  Loss: tensor(1.0236)
Epoch: 7  Loss: tensor(1.0220)
Epoch: 8  Loss: tensor(1.0208)
Epoch: 9  Loss: tensor(1.0197)
Epoch: 10  Loss: tensor(1.0186)
Epoch: 11  Loss: tensor(1.0183)
Epoch: 12  Loss: tensor(1.0178)
Epoch: 13  Loss: tensor(1.0175)
Epoch: 14  Loss: tensor(1.0172)
Epoch: 15  Loss: tensor(1.0170)
Epoch: 16  Loss: tensor(1.0167)
Epoch: 17  Loss: tensor(1.0165)
Epoch: 18  Loss: tensor(1.0164)
Epoch: 19  Loss: tensor(1.0163)
Epoch: 20  Loss: tensor(1.0158)
Epoch: 21  Loss: tensor(1.0158)
Epoch: 22  Loss: tensor(1.0158)
Epoch: 23  Loss: tensor(1.0159)
Epoch: 24  Loss: tensor(1.0158)
Epoch: 25  Loss: tensor(1.0156)
Epoch: 26  Loss: tensor(1.0152)
Epoch: 27  Loss: tensor(1.0154)
Epoch: 28  Loss: tensor(1.0129)
Epoch: 29  Loss: tensor(1.0115)
Epoch: 30  Loss: tensor(1.0108)
Epoch: 31  Loss: t

# Test Auto Encoder

In [16]:
test_loss = 0
counter = 0
for user_id in range(n_total_users):
    movie_ratings_for_user = Variable(training_set[user_id]).unsqueeze(0) # batch of single input vector
    target = Variable(test_set[user_id]).unsqueeze(0)
    if torch.sum(target.data > 0) > 0:
        sample_output = stacked_auto_encoder(movie_ratings_for_user)
        target.require_grad = False # do not compute gradient
        sample_output[target == 0] = 0 # make sure unrated movies do not impact learning and computation
        loss = criterion(sample_output, target)
        mean_corrector = n_total_movies/float(torch.sum(target.data > 0) + 1e-10)
        test_loss += np.sqrt(loss.data*mean_corrector)
        counter += 1.
print("Test loss:", test_loss/counter)

Test loss: tensor(0.9468)
