## Importing the libraries

In [23]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

## Reading the dataset

In [24]:
movies=pd.read_csv('ml-1m/movies.dat',sep='::',header=None,engine='python',encoding='latin-1')
users=pd.read_csv('ml-1m/users.dat',sep='::',header=None,engine='python',encoding='latin-1')
ratings=pd.read_csv('ml-1m/ratings.dat',sep='::',header=None,engine='python',encoding='latin-1')
display(movies)
display(users)
display(ratings)

Unnamed: 0,0,1,2
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


Unnamed: 0,0,1,2,3,4
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


Unnamed: 0,0,1,2,3
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


## Preparing the training set and the test set

In [25]:
training_set=pd.read_csv('ml-100k/u1.base',delimiter='\t')
training_set=np.array(training_set,dtype='int')
test_set=pd.read_csv('ml-100k/u1.test',delimiter='\t')
test_set=np.array(test_set,dtype='int')
display(pd.DataFrame(training_set))
display(pd.DataFrame(test_set))

Unnamed: 0,0,1,2,3
0,1,2,3,876893171
1,1,3,4,878542960
2,1,4,3,876893119
3,1,5,3,889751712
4,1,7,4,875071561
...,...,...,...,...
79994,943,1067,2,875501756
79995,943,1074,4,888640250
79996,943,1188,3,888640250
79997,943,1228,3,888640275


Unnamed: 0,0,1,2,3
0,1,10,3,875693118
1,1,12,5,878542960
2,1,14,5,874965706
3,1,17,3,875073198
4,1,20,4,887431883
...,...,...,...,...
19994,458,648,4,886395899
19995,458,1101,4,886397931
19996,459,934,3,879563639
19997,460,10,3,882912371


## getting the number of users and movies 

In [26]:
nb_users=int(max(max(training_set[:,0]),max(test_set[:,0])));
nb_movies=int(max(max(training_set[:,1]),max(test_set[:,1])))
print(nb_users,nb_movies)

943 1682


## Converting the data to users in lines and movies in column

In [27]:
def convert(data):
    # we wil create a list of lists cause we are gonna use torch afterwards 
    new_data=[]
    for id_users in range(1,nb_users+1): # the last user is excluded so we add one 
        id_movies=data[:,1][data[:,0]==id_users] # based on condition
        id_ratings=data[:,2][data[:,0]==id_users] 
        ratings=np.zeros(nb_movies)
        ratings[id_movies-1]=id_ratings
        new_data.append(list(ratings))
    return new_data

training_set=convert(training_set)
test_set=convert(test_set)   
display(pd.DataFrame(training_set))
display(pd.DataFrame(test_set))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
0,0.0,3.0,4.0,3.0,3.0,0.0,4.0,1.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
938,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
939,0.0,0.0,0.0,2.0,0.0,0.0,4.0,5.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,5.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
938,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Converting the data into torch tensors 

In [28]:
training_set=torch.FloatTensor(training_set)
test_set=torch.FloatTensor(test_set)

## Converting the ratings into binary ratings 1 (liked) 0 (NOt liked)

In [29]:
training_set[training_set==0]=-1  # all zeros are unrated movies
training_set[training_set==1]=0    # or operation is different for torch tensors
training_set[training_set==2]=0
training_set[training_set>=3]=1
test_set[test_set==0]=-1  # all zeros are unrated movies
test_set[test_set==1]=0    # or operation is different for torch tensors
test_set[test_set==2]=0
test_set[test_set>=3]=1

## Creating the architecture of the neural network

In [30]:
class RBM():
     def __init__(self,nv,nh,):# number of visible,number hidden nodes
         self.W = torch.randn(nh,nv)      # to intialize a variable belongs to the object
         # the weights of szie nh , nv
         self.a = torch.randn(1,nh) # one bias for each hidden node additional dim for the batch
                           # batch,bias
         self.b = torch.randn(1,nv) # the bias for the visible nodes
     
     def sample_h(self,x): # the visible neurenos x 
         wx=torch.mm(x,self.W.t()) # product of two tensors x and w transpose
         activation=wx+self.a.expand_as(wx)
         p_h_given_v=torch.sigmoid(activation)
         return p_h_given_v,torch.bernoulli(p_h_given_v)
         
     def sample_v(self,y): # the hidden neurenos y 
         wy=torch.mm(y,self.W) # product of two tensors x and w transpose
         activation=wy+self.b.expand_as(wy) # bias of the visible nodes
         p_v_given_h=torch.sigmoid(activation)
         return p_v_given_h,torch.bernoulli(p_v_given_h)
                   
     def train(self,v0,vk,ph0,phk):
         self.W += (torch.mm(v0.t(), ph0) - torch.mm(vk.t(), phk)).t()
         self.b+=torch.sum((v0-vk),0)
         self.a+=torch.sum((ph0-phk),0)
         

nv=len(training_set[0])
nh=100 
batch_size=100
rbm=RBM(nv,nh)         

## Training the RBM

In [31]:
nb_epoch=10
for epoch in range(1,nb_epoch+1):
    train_loss=0
    s=0. # float type cause of the dot
    for id_user in range(0,nb_users-batch_size,batch_size): # go from 0 to 99 from 100 to 199 the last parameter is the looping step
        vk=training_set[id_user:id_user+batch_size]
        v0=training_set[id_user:id_user+batch_size]
        ph0,_=rbm.sample_h(v0)  # ph0,_ is to get the first return only not two elments               
        
        for k in range(10): # gibbs sampling 
            _,hk=rbm.sample_h(vk)
            _,vk=rbm.sample_v(hk)
            vk[v0<0]=v0[v0<0]      # the unrated movies (-1) should remain like this 
        
        phk,_=rbm.sample_h(vk)
        rbm.train(v0,vk,ph0,phk)
        train_loss+=torch.mean(torch.abs(v0[v0>=0]-vk[v0>=0]))
        s+=1.
    print('epoch: '+str(epoch)+' loss: '+str(train_loss/s))    

epoch: 1 loss: tensor(0.3365)
epoch: 2 loss: tensor(0.2517)
epoch: 3 loss: tensor(0.2443)
epoch: 4 loss: tensor(0.2522)
epoch: 5 loss: tensor(0.2454)
epoch: 6 loss: tensor(0.2487)
epoch: 7 loss: tensor(0.2456)
epoch: 8 loss: tensor(0.2482)
epoch: 9 loss: tensor(0.2483)
epoch: 10 loss: tensor(0.2486)


## Testing the RBM

In [32]:
test_loss=0
s=0. # float type cause of the dot
for id_user in range(nb_users): # go from 0 to 99 from 100 to 199 the last parameter is the looping step
    v=training_set[id_user:id_user+1]
    vt=test_set[id_user:id_user+1]
   
    if len(vt[vt>=0])>0:
        _,h=rbm.sample_h(v)
        _,v=rbm.sample_v(h)
        test_loss+=torch.mean(torch.abs(vt[vt>=0]-v[vt>=0]))
        s+=1.
print('test loss: '+str(test_loss/s))

test loss: tensor(0.2454)
