# Mirko Michele D'Angelo - Assignment 3

First we load the data of the MNIST dataset, both training and test sets.

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import idx2numpy
#load training images
tr_images=idx2numpy.convert_from_file('./dataset/train-images.idx3-ubyte')
tr_labels=idx2numpy.convert_from_file('./dataset/train-labels.idx1-ubyte')
#load test images
ts_images=idx2numpy.convert_from_file('./dataset/t10k-images.idx3-ubyte')
ts_labels=idx2numpy.convert_from_file('./dataset/t10k-labels.idx1-ubyte')

# Implementation
First we implement our RBM along the CD-1 training algorithm:
- The __ __init__ __ constructor wll just initialize the values used for the biases and weights, in particulare the weights are initialized according to ... with a gaussian distribution $N(0,0.01)$ the same goes for the hidden biases.
- __sample_hidden__ and __sample_visible__ implement for the corrisponding operation of sampling $h$ given $v$ and sampling $v$ given $h$.
- the __train__ method implements the actual training using the CD-1 algorithm with minibatch and MSE as a loss function to monitor the reconstruction error.
- the __encode__ method allows us to get the hidden activations and use them to encode data
- also a simple sigmoid implementation and a utility sampling method are used to implement the other methods in the class.

In [2]:
class RBM:
    def __init__(self,visible_size,hidden_size):

        self.visible_bias= np.zeros(visible_size,dtype='float64')
        self.hidden_bias= np.zeros(hidden_size,dtype='float64')

        self.weights=np.random.normal(scale=0.01,size=(visible_size,hidden_size))
        print(f"buildinig a RBM with {visible_size} visible units and {hidden_size} hidden units")
    def _sigmoid(self,x):
        return 1/(1+np.exp(-x))
    def _sample(self,prob):
        return (prob > np.random.rand(*prob.shape)).astype(np.float64)
    def sample_hidden(self,v):
        ha_prob= self._sigmoid(v@self.weights+self.hidden_bias)
        ha_states= self._sample(ha_prob)
        return ha_prob,ha_states
    def sample_visible(self,h):
        recon_prob= self._sigmoid(h@self.weights.T+self.visible_bias)
        recon_act= self._sample(recon_prob)
        return recon_prob,recon_act

    def train(self,values,eta=0.01,epochs=100,batch_size=64):
        print(f"training over {values.shape[0]} samples with {values.shape[1]} features \nepochs={epochs}\t batch size={batch_size}\t learning rate={eta}")
        for e in range(epochs):
            for i in range(0,values.shape[0],batch_size):
                # clamp data as input
                #clamped_data= self._sample(values[i:i+batch_size])
                clamped_data= values[i:i+batch_size]
                #sample h given v
                ha_prob,ha_states=self.sample_hidden(clamped_data)
                #calculate wake part
                wake=clamped_data.T@ha_prob
                #sample v given h
                recon_prob,recon_act=self.sample_visible(ha_states)
                active_prob=self._sigmoid(recon_act@self.weights+ self.hidden_bias)
                #calculate dream part
                dream=recon_act.T@active_prob
                delta_w=(wake-dream)/batch_size
                delta_bh = (np.mean(ha_prob-active_prob, axis=0))
                delta_bv = (np.mean(clamped_data-recon_act, axis=0))

                self.weights+=eta*delta_w
                self.hidden_bias+=eta*delta_bh
                self.visible_bias+=eta*delta_bv
            clamped_data= self._sample(values)
            ha_prob,ha_states=self.sample_hidden(clamped_data)
            recon_prob,recon_act=self.sample_visible(ha_states)
            print(f"epoch no.{e+1} reconstruction error: {np.mean((clamped_data-recon_act)**2)}")
    def encode(self,data):
        #sample h given v
        _,ha_states=self.sample_hidden(data)
        print(f"{ha_states.shape[0]} samples encoded with {ha_states.shape[1]} hidden units")
        return ha_states

## RBM training
Now we train the RBM, first the data from mnist dataset is flattened from a $(28 \times 28)$ matrix of integers between 0 and 255 to an array of 768 integers.
After the flattening binarization is applied with a threshold of 127 to get values that are either 0 or 1.

The reason for the flattening is just to be able to feed it into the rbm while the binarization is useful since it allows the contrastive divergence algorithm to properly work.

In [3]:
def flatten_and_binarize(images,threshold=127):
    return (images.reshape((-1,28*28))>threshold).astype(np.float64)

In [4]:
rbm=RBM(28*28,50)

training=flatten_and_binarize(tr_images)
rbm.train(training,
          eta=0.2,
          epochs=10,
          batch_size=64
          )

buildinig a RBM with 784 visible units and 50 hidden units
training over 60000 samples with 784 features 
epochs=10	 batch size=64	 learning rate=0.2
epoch no.1 reconstruction error: 0.09091751700680271
epoch no.2 reconstruction error: 0.08384447278911565
epoch no.3 reconstruction error: 0.08105393282312925
epoch no.4 reconstruction error: 0.07953365221088435
epoch no.5 reconstruction error: 0.07842778486394558
epoch no.6 reconstruction error: 0.07743263180272109
epoch no.7 reconstruction error: 0.07674034863945578
epoch no.8 reconstruction error: 0.07583216411564626
epoch no.9 reconstruction error: 0.07530625
epoch no.10 reconstruction error: 0.07453975340136054


The same treatment also goes for the test set.

In [5]:
test=flatten_and_binarize(ts_images)


## getting binary activations
Using the encode method i can encode both training and test set of images using the hidden states activations.

## classifier performances


In [6]:
h_train=rbm.encode(training)
h_test=rbm.encode(test)

60000 samples encoded with 50 hidden units
10000 samples encoded with 50 hidden units


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
#raw training with values
mlp=LogisticRegression(max_iter=100).fit(training,tr_labels)
pred=mlp.predict(test)
print(confusion_matrix(ts_labels,pred))
print(f"raw {accuracy_score(ts_labels,pred)}")
# training on encoded values
mlp=LogisticRegression(max_iter=100).fit(h_train,tr_labels)
pred=mlp.predict(h_test)
print(confusion_matrix(ts_labels,pred))
print(f"encoded {accuracy_score(ts_labels,pred)}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[[ 955    0    0    2    0   10    5    6    2    0]
 [   0 1110    4    1    0    4    4    0   12    0]
 [   7    9  919   18   11    2   14    8   41    3]
 [   4    1   16  922    2   25    2   12   18    8]
 [   1    1    7    2  904    1   13    4   10   39]
 [  14    3    5   38   14  761   10    7   32    8]
 [   9    4    6    0    7   20  905    2    5    0]
 [   2    9   23    9    6    2    0  938    3   36]
 [   9   12    6   27    9   32   10   10  848   11]
 [   8    7    1   13   32    7    1   26    6  908]]
raw 0.917
[[ 933    0   11    0    2   11    9    0   13    1]
 [   0 1099    7    5    1    3    5    1   14    0]
 [  16    3  899   24   15    4   19   13   35    4]
 [   5    1   30  868    0   45    4   13   31   13]
 [   1    1   13    2  851    6   16    7   16   69]
 [  13    6    7   64   16  720   17    6   34    9]
 [  19    3    9    3   13   14  886    1    9    1]
 [   2    9   28    5   18    3    1  904    5   53]
 [  16    7    8   26   13   27   1

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
from sklearn.svm import LinearSVC
#raw training with values
mlp=LinearSVC().fit(training,tr_labels)
pred=mlp.predict(test)
print(confusion_matrix(ts_labels,pred))
print(f"raw {accuracy_score(ts_labels,pred)}")
# training on encoded values
mlp=LinearSVC().fit(h_train,tr_labels)
pred=mlp.predict(h_test)
print(confusion_matrix(ts_labels,pred))
print(f"encoded {accuracy_score(ts_labels,pred)}")



[[ 957    0    1    1    1    8    6    3    3    0]
 [   0 1109    5    1    0    3    4    0   13    0]
 [  13    9  900   19   11    3   17   11   45    4]
 [   6    2   21  909    4   25    5   14   17    7]
 [   2    2    4    2  910    2   10    3    7   40]
 [  11    3    3   38   12  765   15    8   29    8]
 [  12    4    4    3    6   20  905    0    4    0]
 [   2   13   24    7    6    4    0  939    1   32]
 [  10   14   10   21   13   32   13   12  835   14]
 [   7    8    0   12   42   10    1   32   16  881]]
raw 0.911
[[ 939    0    8    4    3    4    9    0   12    1]
 [   0 1109    7    3    0    1    3    0   12    0]
 [  21    5  882   22   14    6   18   21   39    4]
 [   9    1   36  853    0   42    4   17   31   17]
 [   3    2   14    3  834    6   14    6   16   84]
 [  19   10    9   78   18  683   18    9   38   10]
 [  22    4    7    2   10   13  886    2   10    2]
 [   2   13   28    2   20    4    1  897    7   54]
 [  16    9    8   25   14   22   1