# Lab5

In [91]:
import sklearn.datasets

def load_iris():
    """Load the Iris dataset as a samples-by-column matrix plus its labels.

    Returns:
        D: the 'data' array of scikit-learn's Iris bunch, transposed so that
           each column is one sample.
        L: the 'target' array of the same bunch (one class label per sample).
    """
    # Load the dataset once and reuse the result for both fields; the
    # previous version invoked the loader twice.
    iris = sklearn.datasets.load_iris()
    D, L = iris['data'].T, iris['target']
    return D, L


In [92]:
import numpy

def split_db_2to1(D, L, seed=0):
    """Randomly split a dataset into a training part (2/3) and a test part (1/3).

    The samples are the columns of D and L holds one label per column. The
    global NumPy random generator is re-seeded with `seed`, so the same seed
    always produces the same split.

    Returns:
        ((DTR, LTR), (DTE, LTE)): training data/labels and test data/labels.
    """
    numSamples = D.shape[1]
    numTrain = int(numSamples*2.0/3.0)
    # Seeding the global generator makes the shuffle reproducible.
    numpy.random.seed(seed)
    shuffled = numpy.random.permutation(numSamples)
    trainIdx, testIdx = shuffled[:numTrain], shuffled[numTrain:]
    trainSet = (D[:, trainIdx], L[trainIdx])
    testSet = (D[:, testIdx], L[testIdx])
    return trainSet, testSet


In [93]:
# Load the Iris data (samples as columns of D, labels in L) and split it into
# a training part and an evaluation part using the default seed.
D, L = load_iris()
(DTR, LTR), (DTE, LTE) = split_db_2to1(D, L)

In [94]:
def vcol(vector, shape0=None):
    """Reshape an array into a column vector of shape (shape0, 1).

    Args:
        vector: NumPy array to reshape.
        shape0: number of rows of the result. Optional: when omitted it
            defaults to vector.size, the only row count compatible with a
            single-column reshape, so callers no longer have to pass it.

    Returns:
        The reshaped array, as returned by vector.reshape.
    """
    if shape0 is None:
        shape0 = vector.size
    return vector.reshape(shape0, 1)


def vrow(vector, shape1=None):
    """Reshape an array into a row vector of shape (1, shape1).

    Args:
        vector: NumPy array to reshape.
        shape1: number of columns of the result. Optional: when omitted it
            defaults to vector.size, the only column count compatible with a
            single-row reshape, so callers no longer have to pass it.

    Returns:
        The reshaped array, as returned by vector.reshape.
    """
    if shape1 is None:
        shape1 = vector.size
    return vector.reshape(1, shape1)

In [95]:
def computeMLestimates(D, L):
    """Maximum-likelihood Gaussian parameters for classes 0, 1 and 2.

    D holds one sample per column and L the label of each column. For every
    class the mean is returned as a column vector and the covariance is
    normalised by the class sample count (1/n).

    Returns:
        (mu0, sigma0), (mu1, sigma1), (mu2, sigma2)
    """
    estimates = []
    for label in (0, 1, 2):
        # Columns of D that belong to the current class
        classSamples = D[:, L == label]
        classSize = classSamples.shape[1]
        # Class mean over the columns, reshaped to a column vector
        mu = classSamples.mean(axis=1)
        mu = vcol(mu, mu.size)
        # Center the class samples (broadcasting) and build the covariance
        centered = classSamples-mu
        sigma = (1/classSize)*(numpy.dot(centered, centered.T))
        estimates.append((mu, sigma))
    return tuple(estimates)

In [119]:
# Scratch check: number of training samples whose label is 0.
n0 = DTR[:, LTR == 0].shape[1]
n0


31

In [96]:
# Estimate one (mean, covariance) pair per class from the training split.
(mu0, sigma0), (mu1, sigma1), (mu2, sigma2) = computeMLestimates(DTR, LTR)

In [97]:
def computeScoreMatrix(D, mu0, sigma0, mu1, sigma1, mu2, sigma2, callback):
    """Stack the per-class scores of D into a single array.

    `callback(D, mu, sigma)` is evaluated once for each of the three
    (mean, covariance) pairs, in class order 0, 1, 2, and the three results
    become the entries along the first axis of the returned NumPy array.
    """
    classParameters = ((mu0, sigma0), (mu1, sigma1), (mu2, sigma2))
    return numpy.array([callback(D, mu, sigma) for mu, sigma in classParameters])

In [98]:
def logpdf_GAU_ND(x, mu, sigma):
    """Log-density of a multivariate Gaussian, evaluated column by column.

    Args:
        x: 2-D array; x.shape[0] is used as the dimensionality and one value
           is produced per column.
        mu: mean, in a shape that broadcasts against x (e.g. a column vector).
        sigma: covariance matrix.

    Returns:
        1-D array with the log-density of every column of x.
    """
    centered = x-mu
    precision = numpy.linalg.inv(sigma)
    # slogdet returns (sign, log|det|); only the log-determinant is needed.
    logDet = numpy.linalg.slogdet(sigma)[1]
    # Quadratic form of each column: the product with the precision matrix is
    # multiplied element-wise by the centered data and summed over the rows.
    quadForm = ((numpy.dot(centered.T, precision)).T*centered).sum(axis=0)
    return -(x.shape[0]/2)*numpy.log(2*numpy.pi)-(1/2)*logDet-(1/2)*quadForm

In [101]:
# 1) Compute, for each test sample, the MVG log-density under each class.
# As in lab 04, the class-conditional log-densities are stored in a score
# matrix logS: logS[i, j] is the class-conditional log-density of test
# sample j given class i.
logS = computeScoreMatrix(DTE, mu0, sigma0, mu1,
                          sigma1, mu2, sigma2, logpdf_GAU_ND)
logS.shape

(3, 50)

In [40]:
# 2) Compute the matrix of joint log-densities logSJoint for samples and
# classes by combining the score matrix with the prior information.
# The three classes are assumed to have the same prior probability
# P(c) = 1/3. logSJoint is obtained by adding to each row of logS the
# logarithm of the prior probability of the corresponding class; the
# log-priors are shaped as a 3x1 column so the sum broadcasts over samples.
priorLogProbabilities = vcol(numpy.array([numpy.log(1/3), numpy.log(1/3), numpy.log(1/3)]), 3)
logSJoint = logS + priorLogProbabilities

In [114]:
# Sanity check: shape of the log-prior column vector.
priorLogProbabilities.shape

(3, 1)

In [62]:
# `import scipy` alone does not guarantee that the `special` subpackage is
# loaded on every SciPy version, so it is imported explicitly; the `sc` alias
# is kept for the rest of the notebook.
import scipy as sc
import scipy.special
# Marginal log-density of each sample: log-sum-exp of the joint log-densities
# over the classes (axis 0). It is stored as a 1 x N row vector so that it
# broadcasts against logSJoint; N is read from logSJoint instead of being
# hard-coded to the current test-set size.
marginalLogDensities = vrow(sc.special.logsumexp(logSJoint, axis=0), logSJoint.shape[1])

In [63]:
# Now we can compute the array of class log-posterior probabilities logSPost:
# each sample's marginal log-density is subtracted from its joint
# log-densities (the row vector broadcasts over the class rows).
logSPost = logSJoint - marginalLogDensities

In [64]:
# The predicted label of a sample is the class with the maximum posterior
# probability, i.e. the row index of the largest entry in that sample's
# column of logSPost (one row per class, one column per sample).
# argmax with axis=0 returns, for every column, the index (here 0, 1 or 2)
# of the maximum value along the class axis.
predictedLabels = logSPost.argmax(axis=0)
print("Predicted labels: ", predictedLabels)

Predicted labels:  [0 0 1 2 2 0 0 0 1 1 0 0 1 0 2 1 2 1 0 2 0 2 0 0 2 0 2 1 1 1 2 2 2 1 0 1 2
 2 0 1 1 2 1 0 0 0 2 1 2 0]


In [75]:
# Comparing predicted and true labels yields a boolean array; summing it
# counts the True entries, i.e. the number of correct predictions.
numberOfCorrectPredictions = numpy.sum(predictedLabels == LTE)
# Express accuracy and error rate as percentages of the evaluation samples.
accuracy = numberOfCorrectPredictions/LTE.size*100
errorRate = 100-accuracy

In [83]:
# Report the evaluation metrics (both are percentages). Plain strings are
# used because the labels contain no placeholders to format.
print("accuracy is:", accuracy)
print("errorRate is:", errorRate)


accuracy is: 96.0
errorRate is: 4.0


In [82]:
# Test with the solution: load the reference array shipped with the lab.
# NOTE(review): the loaded array is never compared with anything in the
# visible cells, and the variable name says "marginal" while the file name
# says "logPosterior" - presumably it should be checked against logSPost;
# confirm and add the comparison.
solutionLogMariginal = numpy.load('./solution/logPosterior_MVG.npy')


In [85]:
# Number of distinct class labels in the full dataset.
numpy.unique(L).shape[0]

3

In [113]:
x = 3
# Scratch experiment: fill a 3x10 float array so that every row holds the
# integers 1..10. The 1-D range is broadcast across all rows in a single
# assignment instead of being copied row by row.
my_array = numpy.zeros((3, 10))
my_array[:] = numpy.arange(1, 11)
my_array


array([[ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.],
       [ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.],
       [ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.]])