In [1]:
from utils import *

# Restricted Bolzmann machines

$\textbf{v,h}\in \{0,1\}^{m+n}$
  
&nbsp; $m$ : number of visible node  
&nbsp; $n$ : number of hidden nodde  
$\textbf{v}$ : visilbe nodes $(v_1,v_2,\cdots,v_m)^T$ ($m \times 1 $ matrix)  
$\textbf{j}$ : hidden nodes $(h_1,h_2,\cdots,h_n)^T$  ($n \times 1 $ matrix)
    
$w_{i,j}, b_j, c_i \in \mathbb{R}$   
$W : n \times m $ matrix  
$b : m \times 1 $ matrix  
$c : n \times 1 $ matrix    
$$E(\textbf{v},\textbf{h})= -\sum_{i=1}^{n}\sum_{j=1}^{m}w_{ij}h_iv_j-\sum_{i=1}^nc_ih_i-\sum_{j=1}^{m}b_jv_j$$
$$
= -\mathbf{w}^TW\mathbf{v}-c^T\mathbf{h}-b^T\mathbf{v}
$$


$$p(\mathbf{v,h}) = \frac{e^{-E(\mathbf{v,h})}}{\sum_{\mathbf{v,h}}e^{-E(\mathbf{v,h})}}$$

# Likelihood
$$\mathcal{L}(\theta|v)=p(\mathbf{v}|\theta)
=\frac{\sum_{\mathbf{h}}e^{-E(\mathbf{v,h})}}{\sum_{\mathbf{v,h}}e^{-E(\mathbf{v,h})}}$$

$$\ln\mathcal{L}(\theta|v)=\ln p(\mathbf{v}|\theta)
=\ln{\sum_{\mathbf{h}}e^{-E(\mathbf{v,h})}}
-\ln{\sum_{\mathbf{v,h}}e^{-E(\mathbf{v,h})}}$$

$$\frac{\partial\ln\mathcal{L}(\theta|v)}{\partial \theta}
=\frac{\partial}{\partial \theta} \Big{(} \ln{\sum_{\mathbf{h}}e^{-E(\mathbf{v,h})}} \Big{)}
-\frac{\partial}{\partial \theta} \Big{(} \ln{\sum_{\mathbf{v,h}}e^{-E(\mathbf{v,h})}} \Big{)}$$

$$=-\frac{1}{\sum_{\mathbf{h}}e^{-E(\mathbf{v,h})}}
\sum_{\mathbf{h}}e^{-E(\mathbf{v,h})}\frac{\partial E(\mathbf{v,h})}{\partial \theta}
+\frac{1}{\sum_{\mathbf{v,h}}e^{-E(\mathbf{v,h})}}
\sum_{\mathbf{v,h}}e^{-E(\mathbf{v,h})}\frac{\partial E(\mathbf{v,h})}{\partial \theta}
$$

$$=-\sum_{\mathbf{h}}p(\mathbf{h}|\mathbf{v})\frac{\partial E(\mathbf{v,h})}{\partial \theta}
+
\sum_{\mathbf{v,h}}p(\mathbf{v},\mathbf{h})\frac{\partial E(\mathbf{v,h})}{\partial \theta}
$$
### note
$$p(\mathbf{h}|\mathbf{v})=\frac{p(\mathbf{v,h})}{p(\mathbf{v})}
=\frac{\frac{1}{Z}e^{-E(\mathbf{v,h})}}{\frac{1}{Z}\sum_{\mathbf{h}}e^{-E(\mathbf{v,h})}}
(Z \textsf{ is nomalizing constant})$$

$$p(\mathbf{h}|\mathbf{v})=\prod_{i=1}^n p(h_i|\mathbf{v})$$

$$p(H_i =1 | \mathbf{v}) = sigmoid(\sum_{j=1}^m w_{ij}v_j + c_i)$$
$$p(V_j =1 | \mathbf{h}) = sigmoid(\sum_{j=1}^n w_{ij}v_j + b_j)$$ 

### Gradient of Liklihood
$\theta = w_{ij},b_{i},c_{i}$
$$\frac{\partial\ln\mathcal{L}(\theta|v)}{\partial w_{ij}}=-\sum_{\mathbf{h}}p(\mathbf{h}|\mathbf{v})\frac{\partial E(\mathbf{v,h})}{\partial w_{ij}}
+
\sum_{\mathbf{v,h}}p(\mathbf{v},\mathbf{h})\frac{\partial E(\mathbf{v,h})}{\partial w_{ij}}
$$

$$=\sum_{\mathbf{h}}p(\mathbf{h}|\mathbf{v})h_iv_j
+\sum_{\mathbf{v,h}}p(\mathbf{v},\mathbf{h})h_iv_j
$$

In [2]:
class RBM:
    def __init__(self, data_num = 2, m=10, n=5, k = 100, learning_rate = 1):
        '''
            m : number of visible nodes
            n : number of hidden nodes
        '''
        self.visible_node = m
        self.hidden_node = n
        self.k = k
        self.data_num = data_num
        self.learning_rate= learning_rate
        
        self.W = tf.Variable(initialize_variable([self.hidden_node, self.visible_node], Type = 'uniform'), name = 'weights')
        self.b = tf.Variable(initialize_variable([self.visible_node], Type = 'uniform'), name = 'visible_biases')
        self.c = tf.Variable(initialize_variable([self.hidden_node], Type = 'uniform'), name = 'hidden_biases')
        
        self.visible = tf.placeholder(tf.float32, [self.data_num, self.visible_node])
        visible = iteration(self.visible, weights= self.W, hidden_biases=self.c, visible_biases=self.b, k=self.k)

        
        self.grad_W = tf.matmul(tf.transpose(linear(self.visible, weights = tf.transpose(self.W, [1,0]), biases = self.c), [1,0])
                                ,self.visible)-\
                    tf.matmul(tf.transpose(linear(visible, weights = tf.transpose(self.W, [1,0]), biases = self.c), [1,0])
                                ,visible)
        self.grad_b = tf.reduce_sum(self.visible-visible,[0])
        self.grad_c = tf.reduce_sum(linear(self.visible, weights = tf.transpose(self.W, [1,0]), biases = self.c)
                        -linear(visible, weights = tf.transpose(self.W, [1,0]), biases = self.c),[0])
        print(self.grad_W)
        print(self.grad_c)
        print(self.grad_b)
        self.update_W = tf.assign(self.W, self.W + learning_rate*self.grad_W)
        self.update_b = tf.assign(self.b, self.b + learning_rate*self.grad_b)
        self.update_c = tf.assign(self.c, self.c + learning_rate*self.grad_c)
        self.update = [self.update_W, self.update_b, self.update_c]
        
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
    
    def visible_to_hidden(self, visible):
        return self.sess.run(tf.floor(linear(visible, weights = tf.transpose(self.W, [1,0]), biases = self.c)+0.5))    
    
    def train(self, train_data, train_steps = 100):
        self.sess.run(self.update, feed_dict = {self.visible : train_data})

    def print_tensors(self):
        print_tensor(self.sess, self.W)
        print_tensor(self.sess, self.b)
        print_tensor(self.sess, self.c)
    
    def get_free_energy(self, v, h):
        '''
            input
                v : 1D tensor m
                h : 1D tensor n
            return
                free energy
        '''
        if h.get_shape().ndims!=1 or v.get_shape().ndims!=1:
            raise ValueError("Dimension should be 1 but dimension h : {} and v : {}"
                                .format(h.get_shape().ndims, v.get_shape().ndims))
            
        if h.get_shape()[0]!=self.W.get_shape()[0] or v.get_shape()[0]!=self.W.get_shape()[1]:
            raise ValueError("Size note matches with variables")
        
        E1 = tf.matmul(tf.reshape(h, [1, -1]),self.W)
        E1 = tf.matmul(E1,tf.reshape(v, [-1, 1]))
        E1 = tf.reshape(E1, [1])
        E2 = tf.reshape(tf.matmul(tf.reshape(self.b, [1, -1]), tf.reshape(v,[-1, 1])), [1])
        E3 = tf.reshape(tf.matmul(tf.reshape(self.c, [1, -1]), tf.reshape(h,[-1, 1])), [1])
        energy = -E1-E2-E3
        print_tensor(self.sess, -E1-E2-E3)
        return energy


In [3]:
r = RBM(data_num =  2, m = 3, n = 2)
r.print_tensors()
a = [[1,0,1],[0,1,0]]
a = np.array(a,dtype = np.float32)
r.train(a, train_steps= 10000000)
r.print_tensors()
print(r.visible_to_hidden(a))

Tensor("sub_200:0", shape=(2, 3), dtype=float32)
Tensor("Sum_1:0", shape=(2,), dtype=float32)
Tensor("Sum:0", shape=(3,), dtype=float32)
weights:0
(2, 3)
[[-0.16224408  0.65379262  0.49336648]
 [-0.88399434  0.72778988  0.14434838]]
visible_biases:0
(3,)
[-0.14806104 -0.14889407 -0.93704939]
hidden_biases:0
(2,)
[-0.58097386 -0.61491609]
weights:0
(2, 3)
[[ 0.27561599  1.1719892   0.93122655]
 [-0.67886877  1.25597835  0.34947392]]
visible_biases:0
(3,)
[ 0.85193896  0.85110593  0.06295061]
hidden_biases:0
(2,)
[-0.34233421 -0.58347917]
[[ 1.  0.]
 [ 1.  1.]]
