In [1]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn                 # the torch module to implement the Neural Networks
import torch.nn.parallel              # for parallel computations
import torch.optim as optim           # for optimizers
import torch.utils.data               # tools
from torch.autograd import Variable   # for Stochastic Gradient Descent
from rbm_model import RBM

In [2]:
movies = pd.read_csv('data/ml-1m/movies.dat', sep = '::', 
                    header = None, engine = 'python', encoding = 'latin-1')
movies.head()

Unnamed: 0,0,1,2
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
users = pd.read_csv('data/ml-1m/users.dat', sep = '::', 
                    header = None, engine = 'python', encoding = 'latin-1')
users.head()

Unnamed: 0,0,1,2,3,4
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [4]:
ratings = pd.read_csv('data/ml-1m/ratings.dat', sep = '::', 
                    header = None, engine = 'python', encoding = 'latin-1')
ratings.head()

Unnamed: 0,0,1,2,3
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [5]:
train_set = pd.read_csv('data/ml=100k/u1.base', delimiter = '\t') 
train_set.head()

Unnamed: 0,1,1.1,5,874965758
0,1,2,3,876893171
1,1,3,4,878542960
2,1,4,3,876893119
3,1,5,3,889751712
4,1,7,4,875071561


In [6]:
train_set = np.array(train_set, dtype = 'int')
train_set.dtype

dtype('int64')

In [7]:
test_set = pd.read_csv('data/ml=100k/u1.test', delimiter = '\t') 
test_set = np.array(test_set, dtype = 'int')
test_set.dtype

dtype('int64')

In [8]:
print('There are', train_set[:,0].max(), 'Users in Training Set')
print('There are', train_set[:,1].max(), 'Movies in Training Set')
print('There are', test_set[:,0].max(), 'Users in Testing Set')
print('There are', test_set[:,1].max(), 'Movies in Testing Set')

There are 943 Users in Training Set
There are 1682 Movies in Training Set
There are 462 Users in Testing Set
There are 1591 Movies in Testing Set


In [9]:
t_users = int(max(max(train_set[:,0]), max(test_set[:,0])))
t_movies = int(max(max(train_set[:,1]), max(test_set[:,1])))

print(f'The total number of Users are {t_users} and total number of Movies are {t_movies}')

The total number of Users are 943 and total number of Movies are 1682


In [10]:
def convert(dataset):
    new_data = []                               
    for user_id in range(1, (t_users + 1)):       
        movie_ids = dataset[:,1][dataset[:,0]==user_id]
        rating_ids = dataset[:,2][dataset[:,0]==user_id]
        ratings = np.zeros(t_movies)
        ratings[movie_ids - 1] = rating_ids 
        new_data.append(list(ratings))
    
    return new_data

In [11]:
# applying the function above to training and test set
train_set = convert(train_set)
test_set = convert(test_set)

In [12]:
train_set = torch.FloatTensor(train_set)
test_set = torch.FloatTensor(test_set)

In [13]:
# first we replace all the zeros in train set by -1
# coz all the zeros are the non-existing ratings for a movie by a user
# now the new ratings are going to be 0(liked) and 1(disliked), hence the orignal zeros must now have the new value as -1
# thus, -1 will mean there wasn't a rating for a particular movie by a particular user

train_set[train_set == 0] = -1            # movies not rated will be represented by -1

# now we will change orignal ratings of 1 and 2 to 0, i.e. if the movie is rated 1 or 2, means that user disliked the movie
# also the 'or' logic cannot be used with tensors thus we do the operation for 1 and 2 seperately
train_set[train_set == 1] = 0
train_set[train_set == 2] = 0

# the original ratings greater than 3 will be considered as the user like the movie, hence we represent them by 1
train_set[train_set >= 3] = 1



# doing the same for test set
test_set[test_set == 0] = -1
test_set[test_set == 1] = 0
test_set[test_set == 2] = 0
test_set[test_set >= 3] = 1

In [14]:
nv = len(train_set[0])      # no. of visible nodes
nh = 100                    # the features to be detected by RBM, hence can define any relevant number
batch_size = 100
print(nv, nh)
rbm = RBM(nv, nh)

1682 100


In [15]:
nb_epoch = 10       # 10 because as we have a binary outcome and less data, the model will converge quickly

# creating a for loop to iterate through these epochs and in each epoch all observations go in the network 
# and then updating the weights after observations of each batch that passed through the network
# and then we get our final visible nodes with new ratings for the movies that were not orignally rated
for epoch in range(1, nb_epoch+1):
    train_loss = 0                      
    s = 0.                          
    
    for id_user in range(0, t_users - batch_size, batch_size):
        vk = train_set[id_user:id_user+batch_size]
        v0 = train_set[id_user:id_user+batch_size]
        ph0,_ = rbm.sample_h(v0)
        
        for k in range(10):
            _,hk = rbm.sample_h(vk)
            _,vk = rbm.sample_v(hk)
            vk[v0<0] = v0[v0<0] 
    
        phk,_ = rbm.sample_h(vk)
        rbm.train(v0, vk, ph0, phk)
        
        train_loss += torch.mean(torch.abs(v0[v0 >= 0] - vk[v0 >= 0]))
        s += 1.
        
    print('epoch: ' + str(epoch) + ' loss: ' + str(train_loss/s) )

epoch: 1 loss: tensor(0.3404)
epoch: 2 loss: tensor(0.2521)
epoch: 3 loss: tensor(0.2500)
epoch: 4 loss: tensor(0.2437)
epoch: 5 loss: tensor(0.2474)
epoch: 6 loss: tensor(0.2481)
epoch: 7 loss: tensor(0.2499)
epoch: 8 loss: tensor(0.2453)
epoch: 9 loss: tensor(0.2502)
epoch: 10 loss: tensor(0.2453)


In [16]:
model_path = "weight/rbm_weight_model.pth"
# torch.save(rbm, model_path)
model_scripted = torch.jit.script(rbm) # Export to TorchScript
model_scripted.save(model_path) # Save

In [17]:
# for param_tensor in rbm.state_dict():
#     print(param_tensor, "\t", rbm.state_dict()[param_tensor].size())

In [18]:
rbm_new = torch.jit.load(model_path)

In [19]:
test_loss = 0                      
s = 0.                          

for id_user in range(t_users):           
    v = train_set[id_user:id_user+1]      
    vt = test_set[id_user:id_user+1]      
    
    if len(vt[vt>=0]) > 0:           
        _,h = rbm_new.sample_h(v)
        _,v = rbm_new.sample_v(h)
        # print(h, v)
        test_loss += torch.mean(torch.abs(vt[vt >= 0] - v[vt >= 0]))
        s += 1.

    # print(id_user, v, vt)
print('test_loss: ' + str(test_loss/s) )

test_loss: tensor(0.2566)


In [20]:
movies = pd.read_csv('data/ml=100k/u.item', sep = '|', engine = 'python', encoding = 'latin-1', header = None)
movie_title = movies.iloc[:t_movies, 1:2]
movie_title = pd.DataFrame.transpose(movie_title)
movie_title

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
1,Toy Story (1995),GoldenEye (1995),Four Rooms (1995),Get Shorty (1995),Copycat (1995),Shanghai Triad (Yao a yao yao dao waipo qiao) ...,Twelve Monkeys (1995),Babe (1995),Dead Man Walking (1995),Richard III (1995),...,Mirage (1995),Mamma Roma (1962),"Sunchaser, The (1996)","War at Home, The (1996)",Sweet Nothing (1995),Mat' i syn (1997),B. Monkey (1998),Sliding Doors (1998),You So Crazy (1994),Scream of Stone (Schrei aus Stein) (1991)


In [21]:
user_id = 150
user_input = Variable(test_set[user_id - 1]).unsqueeze(0)
user_input

tensor([[ 1., -1., -1.,  ..., -1., -1., -1.]])

In [22]:
output = rbm_new.predict(user_input)
output = output.data.numpy()
output

array([[1., 1., 1., ..., 0., 1., 0.]], dtype=float32)

In [23]:
input_output = np.vstack([movie_title, user_input, output])
input_output = pd.DataFrame(input_output)
input_output = pd.DataFrame.transpose(input_output)
input_output = input_output[input_output.iloc[:, 2] == 1]
input_output = input_output[input_output.iloc[:, 1] == -1]
input_output = input_output.iloc[:, 0:1]
input_output.columns = ['Recommended Movies']
input_output

Unnamed: 0,Recommended Movies
1,GoldenEye (1995)
2,Four Rooms (1995)
3,Get Shorty (1995)
4,Copycat (1995)
6,Twelve Monkeys (1995)
...,...
1673,Mamma Roma (1962)
1674,"Sunchaser, The (1996)"
1676,Sweet Nothing (1995)
1678,B. Monkey (1998)
