Links
- http://setosa.io/ev/markov-chains/
- https://www.analyticsvidhya.com/blog/2014/07/markov-chain-simplified/
- https://deeplearning4j.org/restrictedboltzmannmachine

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel as parallel
import torch.utils.data
from torch.autograd import Variable


In [3]:
# Importing the movies.dat dataset, user.dat dataset, rating.dat dataset
movies_df = pd.read_csv("../archive/ml-1m/movies.dat", 
                           sep="::", header=None, engine="python", encoding="latin-1")

users_df = pd.read_csv("../archive/ml-1m/users.dat", 
                           sep="::", header=None, engine="python", encoding="latin-1")

ratings_df = pd.read_csv("../archive/ml-1m/ratings.dat", 
                           sep="::", header=None, engine="python", encoding="latin-1")

In [4]:
# Preparing the training set and testing set
training_set = pd.read_csv("../archive/ml-100k/u1.base", delimiter="\t")
training_set_array = np.array(training_set, dtype='int')

testing_set = pd.read_csv("../archive/ml-100k/u1.test", delimiter="\t")
testing_set_array = np.array(testing_set, dtype='int')

In [5]:
training_set.head(5)

Unnamed: 0,1,1.1,5,874965758
0,1,2,3,876893171
1,1,3,4,878542960
2,1,4,3,876893119
3,1,5,3,889751712
4,1,7,4,875071561


In [6]:
testing_set.head(5)

Unnamed: 0,1,6,5,887431973
0,1,10,3,875693118
1,1,12,5,878542960
2,1,14,5,874965706
3,1,17,3,875073198
4,1,20,4,887431883


In [7]:
# We will have to make the training set into a matrix where 
# the rows are the user, columns are the movies, and the values inside are the ratings

# Getting the total number of users and movies
# Column 1 is the user, Column 2 is the movies
nb_users = int(max(max(training_set_array[:, 0]), max(testing_set_array[:, 0])))
nb_movies = int(max(max(training_set_array[:, 1]), max(testing_set_array[:, 1])))

In [8]:
# Converting the data into an array with users in rows and movies in columns
# We will have to create a list with the sublist being every user with info. on their ratings for each movies

def convert(data):
    new_data = []
    for id_users in range(1, nb_users+1):
        id_movies = data[:, 1][data[:,0] == id_users]
        id_ratings = data[:, 2][data[:,0] == id_users]
        ratings = np.zeros(nb_movies) # initiliazing an array of all the ratings that could happen
        ratings[id_movies-1] = id_ratings
        new_data.append(list(ratings))
    return new_data       

In [9]:
# Coverting the arrays into a matrixs where the rows are the users, columns are the movies
# and the values are the ratings

training_set_array = convert(training_set_array)
testing_set_array = convert(testing_set_array)

In [10]:
# Example: 1st row with the first 10 movies

training_set_array[0][0:10]

[0.0, 3.0, 4.0, 3.0, 3.0, 0.0, 4.0, 1.0, 5.0, 0.0]

In [11]:
# Torches are only one type of datatype in a multi-dimensions
# Torches are a better way to create calculation 

training_set_array = torch.FloatTensor(training_set_array)
testing_set_array = torch.FloatTensor(testing_set_array)

In [12]:
training_set_array


    0     3     4  ...      0     0     0
    4     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
       ...          ⋱          ...       
    5     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
    0     5     0  ...      0     0     0
[torch.FloatTensor of size 943x1682]

In [13]:
# The original dataset is binary (the user either liked the movie or did not)
# As of now, the tensor and arrays are rated from 1 through 5 
# Need to convert the ratings to binary (0 or 1)
# We have to convert the 0's in the array to -1, since the 0's meant that the user did not rate the movie
# However, the 0 will no longer mean that the 0 is non-existenting, rather, that the user did not like the movie
# The OR condition does not work for the torch

training_set_array[training_set_array==0] = -1
training_set_array[training_set_array==1] = 0
training_set_array[training_set_array==2] = 0
training_set_array[training_set_array>=3] = 1


# Following the same procedure for the testing set
testing_set_array[testing_set_array==0] = -1
testing_set_array[testing_set_array==1] = 0
testing_set_array[testing_set_array==2] = 0
testing_set_array[testing_set_array>=3] = 1

In [14]:
# Creating a class for the Boltmann model
# We must initialize the biased and weights randomly
# One biases for each hidden node (we must create the 1 to make it into 2-dimensions)

class RBM():
    def __init__(self, num_hidden, num_vis):
        
        self.weights = torch.randn(num_hidden, num_vis)
        self.h_biases = torch.randn(1, num_hidden)
        self.v_biases = torch.randn(1, num_vis)
        
    # Intuition: prob_hidd_given_vis means the probability that the node is activated
    # the hidden node is that the person liked the movie given some detection from the model 
    def sample_h(self, inputs):
        weight_inputs = torch.mm(inputs, self.weights.t()) # mm is multiplication for torch
        activation = weight_inputs + self.h_biases.expand_as(weight_inputs) # applying the hidden biases to the function
        prob_hidd_given_vis = torch.sigmoid(activation)
        return prob_hidd_given_vis, torch.bernoulli(prob_hidd_given_vis)
        
    def sample_v(self, outputs):
        weight_outputs = torch.mm(outputs, self.weights) # mm is multiplication for torch
        activation = weight_outputs + self.v_biases.expand_as(weight_outputs) # applying the hidden biases to the function
        prob_vis_given_hidden = torch.sigmoid(activation)
        return prob_vis_given_hidden, torch.bernoulli(prob_vis_given_hidden)
    
    def train(self, v0, vk, ph0, phk):
        self.weights += torch.mm(v0.t(), ph0) - torch.mm(vk.t(), phk)
        self.v_biases += torch.sum((v0-vk), 0)
        self.h_biases += torch.sum((ph0-phk), 0)
        
    
        

In [15]:
num_vis = len(training_set_array[0])    
num_hidden = 100
batch_size = 100 # This will be changed (to tune the model)
rbm = RBM(num_hidden, num_vis)


In [16]:
# Training the RBM model
nb_epochs = 10
for epoch in range(1, nb_epochs+1):
    train_loss = 0
    counter = 0.0
    for id_user in range(0, nb_users-batch_size, batch_size):
        vk = training_set_array[id_user:id_user+batch_size]
        v0 = training_set_array[id_user:id_user+batch_size]
        ph0, _ = rbm.sample_h(v0)
        for k in range(10):
            _, hk = rbm.sample_h(vk)
            _, vk = rbm.sample_v(hk)
            vk[v0<0] = v0[v0<0]
        phk, _ = rbm.sample_h(vk)
        rbm.train(v0, vk, ph0, phk)
        train_loss += torch.mean(torch.abs(v0[v0>=0]-vk[v0>=0])) # prediction is vk, only need the variables that are positive
        counter += 1.
    print('epoch: ' + str(epoch) + ' loss: ' + str(train_loss/counter))

  return self.add_(other)


epoch: 1 loss: 0.292782468383304
epoch: 2 loss: 0.2506081905793428
epoch: 3 loss: 0.2510089601145487
epoch: 4 loss: 0.2509116497231665
epoch: 5 loss: 0.2475947649276612
epoch: 6 loss: 0.249144544341306
epoch: 7 loss: 0.2494170116171819
epoch: 8 loss: 0.25170255367605454
epoch: 9 loss: 0.24816046512647277
epoch: 10 loss: 0.250907155826084


In [47]:
# The loss is .25, meaning that we have a .75 correct prediction

In [18]:
# Using the model, trying to predict results for the testing set
# Training the RBM model

# Quick explanation: for every user in the testing set, we capture their visible node
# then, we check their rating for the movies they rated
# We use then sampled them for the hidden nodes
# And also, use those hidden nodes to sample the visible nodes


test_loss = 0
counter = 0.0
for id_user in range(nb_users): 
    v = training_set_array[id_user:id_user+1] # We need to use the training set to make the predictions
    vt = testing_set_array[id_user:id_user+1] # the target variable, original rating of the testing set
    if len(vt[vt>=0] > 0): # All the rating that are exisiting (not less than 0)
        _, h = rbm.sample_h(v)
        _, v = rbm.sample_v(h)

        test_loss += torch.mean(torch.abs(vt[vt>=0]-v[vt>=0])) # prediction is vk, only need the variables that are positive
        counter += 1.
print('loss: ' + str(test_loss/counter))


loss: 0.24028379609841521
