Import packages

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

### review source data

In [2]:
##import dataset
movies = pd.read_csv('ml-1m/movies.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')
users = pd.read_csv('ml-1m/users.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')
ratings = pd.read_csv('ml-1m/ratings.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')

In [3]:
movies

Unnamed: 0,0,1,2
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [4]:
users

Unnamed: 0,0,1,2,3,4
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
5,6,F,50,9,55117
6,7,M,35,1,06810
7,8,M,25,12,11413
8,9,M,25,17,61614
9,10,F,35,1,95370


import train and test sets

In [5]:
ratings

Unnamed: 0,0,1,2,3
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
5,1,1197,3,978302268
6,1,1287,5,978302039
7,1,2804,5,978300719
8,1,594,4,978302268
9,1,919,4,978301368


In [6]:
##create training and test set data
training_set = pd.read_csv('ml-100k/u1.base', delimiter = '\t')
##convert it to array
training_set = np.array(training_set, dtype = 'int')

test_set = pd.read_csv('ml-100k/u1.test', delimiter = '\t')
##convert it to array
test_set = np.array(test_set, dtype = 'int')

In [7]:
training_set

array([[        1,         2,         3, 876893171],
       [        1,         3,         4, 878542960],
       [        1,         4,         3, 876893119],
       ...,
       [      943,      1188,         3, 888640250],
       [      943,      1228,         3, 888640275],
       [      943,      1330,         3, 888692465]])

In [8]:
test_set

array([[        1,        10,         3, 875693118],
       [        1,        12,         5, 878542960],
       [        1,        14,         5, 874965706],
       ...,
       [      459,       934,         3, 879563639],
       [      460,        10,         3, 882912371],
       [      462,       682,         5, 886365231]])

## Data structure creation

get movie and user total count

In [9]:
#take max users id in train and test data
nb_users = int(max(max(training_set[:,0]), max(test_set[:,0])))
nb_movies = int(max(max(training_set[:,1]), max(test_set[:,1])))

In [10]:
nb_users, nb_movies

(943, 1682)

In [11]:
def convert(data):
    new_data = []
    for id_users in range(1, nb_users + 1):
        ##id of movies that is rated by current users
        id_movies = data[:,1][data[:,0] == id_users]
        
        ##rate of movies that is given by current user
        id_ratings = data[:,2][data[:,0] == id_users]
        
        #inialize ratings for all movies
        #set 0 for movies that are not rated by current users
        ratings = np.zeros(nb_movies)
        #movie id starts from 1, 1st movie will be 1st element in rating with index as 0
        ratings[id_movies - 1] = id_ratings
        new_data.append(list(ratings))
    return new_data


In [12]:
training_set = convert(training_set)
test_set = convert(test_set)

convert the data into Torch tensor

In [13]:
training_set = torch.FloatTensor(training_set)
test_set = torch.FloatTensor(test_set)

In [14]:
training_set

tensor([[0., 3., 4.,  ..., 0., 0., 0.],
        [4., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [5., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 5., 0.,  ..., 0., 0., 0.]])

In [15]:
test_set

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

## SAE architecture creation

In [16]:
class SAE(nn.Module):
    def __init__(self, ):
        #allow to inhert all classes and methods of parent class
        super(SAE, self).__init__()
        #num of features from input: num of movies, 20 nodes in first hidden layer
        self.fc1 = nn.Linear(nb_movies, 20)
        self.fc2 = nn.Linear(20, 10)
        #start to decoding
        self.fc3 = nn.Linear(10, 20)
        self.fc4 = nn.Linear(20, nb_movies)
        self.activation = nn.Sigmoid()
    #input vector (movies rating) for a specific users
    def forward(self, x):
        #apply activaton fuc on first encoding layer
        #return first encoded vector
        x = self.activation(self.fc1(x))
        x = self.activation(self.fc2(x))
        x = self.activation(self.fc3(x))
        #no need to activate the vector
        #output the reconstrctured vector
        x = self.fc4(x)
        return x

## model training

In [17]:
sae = SAE()
#Create loss fucn object
criterion = nn.MSELoss()
#create optimizer object
#parameters of all auto-encoders defined in the class
optimizer = optim.RMSprop(sae.parameters(), lr = 0.01, weight_decay = 0.5)

In [18]:
# #train the SAE using pytorch only codes

#loop all epochs
nb_epoch = 200
for epoch in range(1, nb_epoch + 1):
    train_loss = 0
    #exclude users who did not rate any movies
    #define a float
    s = 0.
    #loop through each users
    for id_user in range(nb_users):
        #get all rating for current user from training_set
        #nn does not take single dimension vector, so add a batch dimension
        #a batch of sinlge inptu vector, update weigths after each vector
        input = Variable(training_set[id_user]).unsqueeze(0)
        #create target by copying input
        target = input.clone()
        #only look at users who rated at least 1 movie
        if torch.sum(target.data > 0) > 0:
            #get output from the network, a vector of predicted value
            output = sae(input)
            #do not compute gradient with respect to target
            target.require_grad = False
            #don't account the output whose initial input is 0
            output[target == 0] = 0
            loss = criterion(output, target)
            #make demonitor is not zero, to add a small number
            mean_corrector = nb_movies / float(torch.sum(target.data>0) + 1e-10)
            #backward method to determine which direction 
            loss.backward()
            #access the data of loss object .data[0]
            #adjust the loss to compute relevant mean for all movies for current user
            train_loss += np.sqrt(loss.data * mean_corrector)
            s += 1.
            #apply optimizer to update weights, decides the amount of weight udpates
            optimizer.step()
    print('epoch: '+str(epoch) + ' loss: ' + str(train_loss/s))


epoch: 1 loss: tensor(1.7716)
epoch: 2 loss: tensor(1.0966)
epoch: 3 loss: tensor(1.0534)
epoch: 4 loss: tensor(1.0385)
epoch: 5 loss: tensor(1.0308)
epoch: 6 loss: tensor(1.0266)
epoch: 7 loss: tensor(1.0238)
epoch: 8 loss: tensor(1.0221)
epoch: 9 loss: tensor(1.0207)
epoch: 10 loss: tensor(1.0198)
epoch: 11 loss: tensor(1.0190)
epoch: 12 loss: tensor(1.0186)
epoch: 13 loss: tensor(1.0179)
epoch: 14 loss: tensor(1.0174)
epoch: 15 loss: tensor(1.0174)
epoch: 16 loss: tensor(1.0170)
epoch: 17 loss: tensor(1.0168)
epoch: 18 loss: tensor(1.0163)
epoch: 19 loss: tensor(1.0167)
epoch: 20 loss: tensor(1.0161)
epoch: 21 loss: tensor(1.0162)
epoch: 22 loss: tensor(1.0160)
epoch: 23 loss: tensor(1.0159)
epoch: 24 loss: tensor(1.0158)
epoch: 25 loss: tensor(1.0155)
epoch: 26 loss: tensor(1.0158)
epoch: 27 loss: tensor(1.0156)
epoch: 28 loss: tensor(1.0150)
epoch: 29 loss: tensor(1.0127)
epoch: 30 loss: tensor(1.0114)
epoch: 31 loss: tensor(1.0103)
epoch: 32 loss: tensor(1.0074)
epoch: 33 loss: t

## test RBM   

In [19]:
#loop through each users
test_loss = 0
for id_user in range(nb_users):
    #keep using training set
    input = Variable(training_set[id_user]).unsqueeze(0)
    #create target by copying input
    target = Variable(test_set[id_user]).unsqueeze(0)
    #only look at users who rated at least 1 movie
    if torch.sum(target.data) > 0:
        #get output from the network, a vector of predicted value
        output = sae(input)
        #do not compute gradient with respect to target
        target.require_grad = False
        #don't account the output whose initial input is 0
        output[target == 0] = 0
        loss = criterion(output, target)
        #make demonitor is not zero, to add a small number
        mean_corrector = nb_movies / float(torch.sum(target.data>0) + 1e-10)
        
        #access the data of loss object .data[0]
        #adjust the loss to compute relevant mean for all movies for current user
        test_loss += np.sqrt(loss.data * mean_corrector)
        s += 1.
print('loss: ' + str(train_loss/s))

loss: tensor(0.6155)


### below print out real and predicted ratings for a user

In [20]:
user_id = 0
movie_title = movies.iloc[:nb_movies, 1:2]
user_rating = training_set.data.numpy()[user_id, :].reshape(-1,1)
user_target = test_set.data.numpy()[user_id, :].reshape(-1,1)
  
user_input = Variable(training_set[user_id]).unsqueeze(0)
predicted = sae(user_input)
predicted = predicted.data.numpy().reshape(-1,1)
  
result_array = np.hstack([movie_title, user_target, predicted])
result_array = result_array[result_array[:, 1] > 0]
result_df = pd.DataFrame(data=result_array, columns=['Movie', 'Target Rating', 'Predicted'])


In [21]:
result_df

Unnamed: 0,Movie,Target Rating,Predicted
0,GoldenEye (1995),3,4.06925
1,Dracula: Dead and Loving It (1995),5,4.63941
2,Nixon (1995),5,3.90525
3,Sense and Sensibility (1995),3,3.56541
4,Money Train (1995),4,3.52291
5,Assassins (1995),4,4.07243
6,Powder (1995),3,3.80682
7,Now and Then (1995),2,3.50572
8,Dangerous Minds (1995),3,4.01029
9,Wings of Courage (1995),4,3.70793
