In [1]:
import numpy as np
from scipy.io import loadmat
from scipy.optimize import minimize
# np.set_printoptions(threshold=np.inf)

In [2]:
def preprocess():
    """ 
     Input:
     Although this function doesn't have any input, you are required to load
     the MNIST data set from file 'mnist_all.mat'.

     Output:
     train_data: matrix of training set. Each row of train_data contains 
       feature vector of a image
     train_label: vector of label corresponding to each image in the training
       set
     validation_data: matrix of training set. Each row of validation_data 
       contains feature vector of a image
     validation_label: vector of label corresponding to each image in the 
       training set
     test_data: matrix of training set. Each row of test_data contains 
       feature vector of a image
     test_label: vector of label corresponding to each image in the testing
       set
    """

    mat = loadmat('mnist_all.mat')  # loads the MAT object as a Dictionary

    n_feature = mat.get("train1").shape[1]
    n_sample = 0
    for i in range(10):
        n_sample = n_sample + mat.get("train" + str(i)).shape[0]
    n_validation = 1000
    n_train = n_sample - 10 * n_validation

    # Construct validation data
    validation_data = np.zeros((10 * n_validation, n_feature))
    for i in range(10):
        validation_data[i * n_validation:(i + 1) * n_validation, :] = mat.get("train" + str(i))[0:n_validation, :]

    # Construct validation label
    validation_label = np.ones((10 * n_validation, 1))
    for i in range(10):
        validation_label[i * n_validation:(i + 1) * n_validation, :] = i * np.ones((n_validation, 1))

    # Construct training data and label
    train_data = np.zeros((n_train, n_feature))
    train_label = np.zeros((n_train, 1))
    temp = 0
    for i in range(10):
        size_i = mat.get("train" + str(i)).shape[0]
        train_data[temp:temp + size_i - n_validation, :] = mat.get("train" + str(i))[n_validation:size_i, :]
        train_label[temp:temp + size_i - n_validation, :] = i * np.ones((size_i - n_validation, 1))
        temp = temp + size_i - n_validation

    # Construct test data and label
    n_test = 0
    for i in range(10):
        n_test = n_test + mat.get("test" + str(i)).shape[0]
    test_data = np.zeros((n_test, n_feature))
    test_label = np.zeros((n_test, 1))
    temp = 0
    for i in range(10):
        size_i = mat.get("test" + str(i)).shape[0]
        test_data[temp:temp + size_i, :] = mat.get("test" + str(i))
        test_label[temp:temp + size_i, :] = i * np.ones((size_i, 1))
        temp = temp + size_i

    # Delete features which don't provide any useful information for classifiers
    sigma = np.std(train_data, axis=0)
    index = np.array([])
    for i in range(n_feature):
        if (sigma[i] > 0.001):
            index = np.append(index, [i])
    train_data = train_data[:, index.astype(int)]
    validation_data = validation_data[:, index.astype(int)]
    test_data = test_data[:, index.astype(int)]

    # Scale data to 0 and 1
    train_data /= 255.0
    validation_data /= 255.0
    test_data /= 255.0

    return train_data, train_label, validation_data, validation_label, test_data, test_label


In [3]:
def one_of_k(labels,k):
    # inputs : labels : the label vector that needs one of k encoding. dimension : N * 1 
    #          k : in our case k = 10
    
    N = labels.shape[0]

    # create an array of size N * k with all zeros
    result = np.zeros( (N , k) )
    
    # forcing labels to be integer:
    int_labels = labels.astype(int)
    
    row_index = 0
    for index in int_labels:
        result[row_index,index] = 1
        row_index = row_index + 1
    return result

In [4]:
def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))


In [5]:
# FOR EXTRA CREDIT ONLY
def mlrObjFunction(params, *args):
    """
    mlrObjFunction computes multi-class Logistic Regression error function and
    its gradient.

    Input:
        initialWeights: the weight vector of size (D + 1) x k
        train_data: the data matrix of size N x D
        labeli: the label vector of size N x 1 where each entry can be either 0 or 1
                representing the label of corresponding feature vector

    Output:
        error: the scalar value of error function of multi-class logistic regression
        error_grad: the vector of size (D+1) x 10 representing the gradient of
                    error function
    """
    # added by : Zulkar
    train_data, Y = args
    
    n_data = train_data.shape[0]
    n_feature = train_data.shape[1]
    error = 0
    error_grad = np.zeros((n_feature + 1, n_class))

    ##################
    # YOUR CODE HERE #
    ##################
    # HINT: Do not forget to add the bias term to your input data
    
    # args gives access to all of these variables - train_data, n_train, n_class
    # args = (train_data, Y)
    # Y = ((n_train, n_class))
    
    
    # code var name  = variable shape
    
    # initialWeights           = (D + 1) x 1   (716,1)
    # initialWeights_transpose =  1 x (D + 1)  (1,716)
    # train_data               =  N x D        (50000,715)
    # train_data_bias          =  N x (D + 1)  (50000,716)
    # w_dot_x                  =               (50000, 1)
    # sigma_w_dot_x            =               (50000, 1)
    # posterior_probability    =               (50000, 1)
    # labeli                   =  N x 1
    # error                    =  1 x 1
    # error_grad               = (D + 1) x 10
    
    # N = 50000
    # K = 
    
    # n_train                  = 50000
    # n_class                  = 10
    
        
# target vector yn
# feature vector xn
# class Ck
# element k, which equals one
# Y is an N × K matrix (obtained using 1-of-K encoding) of target variables with elements ynk
    
#     Please refer to your first assignment. This is same as the 
#     1-of-K encoding that you would have done there. ynk is the value 
#     for the Y[n][k] entry in the matrix.
    
    # added by zulkar:
    
    W = params.reshape((n_feature+1,10))
    print("W:",W)
    # Formula 5 : Posterior_Probabilities
    #print("W: ", W)
    
    # added by: zulkar
    # I think bias should be added at the beginning of the array.
    # train_data_bias = np.insert(train_data, 715, 1, axis = 1)               #(50000,716)
    train_data_bias = np.insert(train_data, 0, 1, axis = 1)               #(50000,716)
    
    #print("train_data_bias: ", train_data_bias)
    w_dot_x = np.dot(train_data_bias,W) #wTx                 #(50000,10)
    #print("w_dot_x: ", w_dot_x)
    
    # I think we have to use exponential function 
    #sigmoid_w_dot_x = sigmoid(w_dot_x) #exp(wTx)                            #(50000,10)
    exp_w_dot_x = np.exp(w_dot_x) #exp(wTx)                            #(50000,10)
    #print("sigmoid_w_dot_x: ", sigmoid_w_dot_x)
    print("exp_w_dot_x: ", exp_w_dot_x)
    

    #sum_sigmoid_w_dot_x = np.sum(sigmoid_w_dot_x,axis = 1) #sum(exp(wTx))   #(50000,1)
    sum_exp_w_dot_x = np.sum(exp_w_dot_x,axis = 1) #sum(exp(wTx))   #(50000,1)
    
    # print("sum_sigmoid_w_dot_x: ", sum_sigmoid_w_dot_x)
    print("sum_exp_w_dot_x: ", sum_exp_w_dot_x)
    
    #inv_sum_sigmoid_w_dot_x = 1.0 / sum_sigmoid_w_dot_x                     #(50000,1)
    inv_sum_exp_w_dot_x = 1.0 / sum_exp_w_dot_x                     #(50000,1)
    #print("inv_sum_sigmoid_w_dot_x: ", inv_sum_sigmoid_w_dot_x)
    print("inv_sum_exp_w_dot_x: ", inv_sum_exp_w_dot_x)
    
    #posterior_probability = np.zeros((sigmoid_w_dot_x.shape[0], sigmoid_w_dot_x.shape[1]))
    theta_nk = np.zeros((exp_w_dot_x.shape[0], exp_w_dot_x.shape[1]))
    
    #for i in range(sigmoid_w_dot_x.shape[0]):                               #50000
    for i in range(exp_w_dot_x.shape[0]):                                    #50000
        #for k in range(sigmoid_w_dot_x.shape[1]):                           #10
        for k in range(exp_w_dot_x.shape[1]):                                #10
            #posterior_probability[i][k] = sigmoid_w_dot_x[i][k] * inv_sum_sigmoid_w_dot_x[i] #(50000,10)
            theta_nk[i][k] = exp_w_dot_x[i][k] * inv_sum_exp_w_dot_x[i] #(50000,10)
    #print("posterior_probability:", posterior_probability)        
    print("posterior_probability: theta_nk ", theta_nk)
    
    # commented by : zulkar
    # I think we don't need to compute likelihood 
    # because we can directly compute error with log-likelihood from equation (7)
    """
    # Formula 6 : likelihood
    y_nk = one_of_k(posterior_probability,n_class)            #(50000,10)
    print("y_nk: ", y_nk)
    power_pp_ynk = np.power(posterior_probability,y_nk)       #(50000,10)
    print("power_pp_ynk: ", power_pp_ynk)
    k_product = np.prod(power_pp_ynk,axis = 1)                #(50000,1)
    print("k_product: ", k_product)
    likelihood = np.prod(k_product,axis = 0)                  #scalar
    print("likelihood: ", likelihood)
    
    
    Formula 7 : log likelihood (error)
    error = -1.0 * np.log(likelihood)                         #scalar
    print("error: ", error)
    """
    
    # added by : Zulkar
    # Formula 7 : log likelihood (error)
    Y_nk = one_of_k(Y,n_class)  # dim: N * k
    # theta_nk : dim : N * k
    ln_theta_nk = np.log(theta_nk)
    product_Y_nk_theta_nk = Y_nk * ln_theta_nk
    sumK_product_Y_nk_theta_nk = np.sum(product_Y_nk_theta_nk, axis=1)  #dim: N * 1
    sumN_sumK_product_Y_nk_theta_nk = np.sum(sumK_product_Y_nk_theta_nk, axis=0)
    error = (-1.0/n_data) * sumN_sumK_product_Y_nk_theta_nk
    print("error",error)
    
    
    # Formula 8 : gradient of error function
    
    #difference = posterior_probability - y_nk;              #(50000,10)
    difference = theta_nk - Y_nk;              #(50000,10)
    transpose_train_data_bias = np.transpose(train_data_bias)         #(10,50000)
    product = np.dot(transpose_train_data_bias,difference)  #(716,10)
    # dot product takes care of sum 
    #error_grad_temp1 = np.sum(product) #scalar
    error_grad_temp2 = (1.0/n_data) * product
    error_grad = error_grad_temp2.flatten()
    print("error grad:",error_grad.shape)
    # Formula 9 : (not needed or used in the assignment)
    
    

#     print(gradient_error_function.shape)
#     print(gradient_error_function)

    return error, error_grad

In [6]:
"""
Script for Logistic Regression
"""
train_data, train_label, validation_data, validation_label, test_data, test_label = preprocess()

# number of classes
n_class = 10

# number of training samples
n_train = train_data.shape[0]

# number of features
n_feature = train_data.shape[1]

Y = np.zeros((n_train, n_class))
for i in range(n_class):
    Y[:, i] = (train_label == i).astype(int).ravel()

# FOR EXTRA CREDIT ONLY
W_b = np.zeros((n_feature + 1, n_class))
initialWeights_b = np.zeros((n_feature + 1, n_class))
opts_b = {'maxiter': 100}

args_b = (train_data, Y)
nn_params = minimize(mlrObjFunction, initialWeights_b, jac=True, args=args_b, method='CG', options=opts_b)
W_b = nn_params.x.reshape((n_feature + 1, n_class))





W: [[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]
exp_w_dot_x:  (50000, 10)
inv_sum_exp_w_dot_x:  [ 0.1  0.1  0.1 ...,  0.1  0.1  0.1]
posterior_probability: theta_nk  [[ 0.1  0.1  0.1 ...,  0.1  0.1  0.1]
 [ 0.1  0.1  0.1 ...,  0.1  0.1  0.1]
 [ 0.1  0.1  0.1 ...,  0.1  0.1  0.1]
 ..., 
 [ 0.1  0.1  0.1 ...,  0.1  0.1  0.1]
 [ 0.1  0.1  0.1 ...,  0.1  0.1  0.1]
 [ 0.1  0.1  0.1 ...,  0.1  0.1  0.1]]
error 4.60517018599
error grad: (7160,)
W: [[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]
exp_w_dot_x:  (50000, 10)
inv_sum_exp_w_dot_x:  [ 0.1  0.1  0.1 ...,  0.1  0.1  0.1]
posterior_probability: theta_nk  [[ 0.1  0.1  0.1 ...,  0.1  0.1  0.1]
 [ 0.1  0.1  0.1 ...,  0.1  0.1  0.1]
 [ 0.1



exp_w_dot_x:  (50000, 10)
inv_sum_exp_w_dot_x:  [  1.50683649e-230   0.00000000e+000   2.66963937e-267 ...,
   6.68857149e-264   1.46404772e-229   2.24598894e-208]
posterior_probability: theta_nk  [[  5.00000000e-001   5.00000000e-001   4.73961947e-256 ...,
    4.73961947e-256   4.73961947e-256   4.73961947e-256]
 [              nan               nan   0.00000000e+000 ...,
    0.00000000e+000   0.00000000e+000   0.00000000e+000]
 [  5.00000000e-001   5.00000000e-001   6.92815077e-297 ...,
    6.92815077e-297   6.92815077e-297   6.92815077e-297]
 ..., 
 [  5.00000000e-001   5.00000000e-001   4.14144145e-293 ...,
    4.14144145e-293   4.14144145e-293   4.14144145e-293]
 [  5.00000000e-001   5.00000000e-001   5.92861969e-255 ...,
    5.92861969e-255   5.92861969e-255   5.92861969e-255]
 [  5.00000000e-001   5.00000000e-001   2.05489469e-231 ...,
    2.05489469e-231   2.05489469e-231   2.05489469e-231]]
error nan




error grad: (7160,)
W: [[  4.50000000e+01   4.50000000e+01  -5.00000000e+00 ...,  -5.00000000e+00
   -5.00000000e+00  -5.00000000e+00]
 [  4.44705882e-04   4.44705882e-04  -4.94117647e-05 ...,  -4.94117647e-05
   -4.94117647e-05  -4.94117647e-05]
 [  1.65882353e-03   1.65882353e-03  -1.84313725e-04 ...,  -1.84313725e-04
   -1.84313725e-04  -1.84313725e-04]
 ..., 
 [  2.86588235e-03   2.86588235e-03  -3.18431373e-04 ...,  -3.18431373e-04
   -3.18431373e-04  -3.18431373e-04]
 [  2.01176471e-03   2.01176471e-03  -2.23529412e-04 ...,  -2.23529412e-04
   -2.23529412e-04  -2.23529412e-04]
 [  4.23529412e-04   4.23529412e-04  -4.70588235e-05 ...,  -4.70588235e-05
   -4.70588235e-05  -4.70588235e-05]]
exp_w_dot_x:  (50000, 10)
inv_sum_exp_w_dot_x:  [ 0.  0.  0. ...,  0.  0.  0.]
posterior_probability: theta_nk  [[ nan  nan   0. ...,   0.   0.   0.]
 [ nan  nan   0. ...,   0.   0.   0.]
 [ nan  nan   0. ...,   0.   0.   0.]
 ..., 
 [ nan  nan   0. ...,   0.   0.   0.]
 [ nan  nan   0. ...,   0.

In [None]:


def mlrPredict(W, data):
    """
     mlrObjFunction predicts the label of data given the data and parameter W
     of Logistic Regression

     Input:
         W: the matrix of weight of size (D + 1) x 10. Each column is the weight
         vector of a Logistic Regression classifier.
         X: the data matrix of size N x D

     Output:
         label: vector of size N x 1 representing the predicted label of
         corresponding feature vector given in data matrix

    """
    label = np.zeros((data.shape[0], 1))   #(50000, 1) 
    
    # data - train_data or validation_data or test_data = N x D
    
    #add bias to data
    #multiply W x data
    #take the highest value of the 10 for each of the 50000 entries and return that index

    # W        (716, 10)
    # data     (50000, 715)
    N = data.shape[0]
    # added by : Zulkar
    # bias should be added in the beginning of the vectors
    # data_bias = np.insert(data, 715, 1, axis = 1)    #(50000, 716)
    data_bias = np.insert(data, 0, 1, axis = 1)    #(50000, 716)
    dot_product = np.dot(data_bias,W)                      #(50000, 10)
     
    # commented by : zulkar 
    """
    for i in range(dot_product.shape[0]):        #50000
        max_value = 0.0
        max_index = 0.0
        
        for k in range(dot_product.shape[1]):    #10
            if dot_product[i][k] > max_value:
                max_value = dot_product[i][k]
                max_index = k
                
        label[i] = max_index
    """
    label_temp = np.argmax(dot_product, axis = 1)
    label = np.reshape(label_temp, (N,1) )
    
#     print(data_bias.shape)
#     print(data_bias)

    return label

In [None]:
# Find the accuracy on Training Dataset
predicted_label_b = mlrPredict(W_b, train_data)
print('\n Training set Accuracy:' + str(100 * np.mean((predicted_label_b == train_label).astype(float))) + '%')

# Find the accuracy on Validation Dataset
predicted_label_b = mlrPredict(W_b, validation_data)
print('\n Validation set Accuracy:' + str(100 * np.mean((predicted_label_b == validation_label).astype(float))) + '%')

# Find the accuracy on Testing Dataset
predicted_label_b = mlrPredict(W_b, test_data)
print('\n Testing set Accuracy:' + str(100 * np.mean((predicted_label_b == test_label).astype(float))) + '%')



In [None]:
a = np.array([[1,2],[3,4],[5,6]])
print(a.flatten())
c = np.sum(a, axis=1)
print(c)