In [1]:
import numpy as np
from scipy.io import loadmat
from scipy.optimize import minimize

In [2]:
def preprocess(mat_file='mnist_all.mat', n_validation=1000):
    """ 
     Load the digit data set from a MAT file and split it into training,
     validation and test sets.

     Input:
     mat_file: path of the MAT file to load (default 'mnist_all.mat'). The
       file must contain the arrays 'train0'..'train9' and 'test0'..'test9',
       each holding one feature vector per row.
     n_validation: number of leading rows of every 'trainN' array that are
       held out as validation data (default 1000). The remaining rows of
       each 'trainN' array form the training set.

     Output:
     train_data: matrix of training set. Each row of train_data contains 
       feature vector of a image
     train_label: column vector (N x 1) of labels corresponding to each
       image in the training set
     validation_data: matrix of validation set. Each row of validation_data 
       contains feature vector of a image
     validation_label: column vector of labels corresponding to each image
       in the validation set
     test_data: matrix of test set. Each row of test_data contains 
       feature vector of a image
     test_label: column vector of labels corresponding to each image in the
       testing set

     All data matrices are float arrays divided by 255.0, restricted to the
     feature columns whose standard deviation over the training set exceeds
     0.001. Rows are grouped by class (0 first, 9 last); nothing is shuffled.

     Raises:
     ValueError: if some 'trainN' array has fewer than n_validation rows.
    """
    n_class = 10
    mat = loadmat(mat_file)  # loads the MAT object as a Dictionary

    train_sets = [mat["train" + str(i)] for i in range(n_class)]
    test_sets = [mat["test" + str(i)] for i in range(n_class)]

    # Fail early with a clear message instead of a shape error further down.
    for i, rows in enumerate(train_sets):
        if rows.shape[0] < n_validation:
            raise ValueError(
                "train" + str(i) + " has only " + str(rows.shape[0]) +
                " rows, fewer than n_validation=" + str(n_validation))

    class_ids = np.arange(n_class, dtype=float)

    # Validation set: the first n_validation rows of every class.
    validation_data = np.vstack([rows[:n_validation] for rows in train_sets]).astype(float)
    validation_label = np.repeat(class_ids, n_validation).reshape(-1, 1)

    # Training set: whatever is left of every class after the validation rows.
    train_data = np.vstack([rows[n_validation:] for rows in train_sets]).astype(float)
    train_sizes = [rows.shape[0] - n_validation for rows in train_sets]
    train_label = np.repeat(class_ids, train_sizes).reshape(-1, 1)

    # Test set: all rows of every 'testN' array.
    test_data = np.vstack(test_sets).astype(float)
    test_sizes = [rows.shape[0] for rows in test_sets]
    test_label = np.repeat(class_ids, test_sizes).reshape(-1, 1)

    # Delete features which don't provide any useful information for
    # classifiers: columns that are (almost) constant over the training set.
    # The threshold is applied before scaling, i.e. on the raw pixel values.
    sigma = np.std(train_data, axis=0)
    index = np.flatnonzero(sigma > 0.001)

    # Keep the informative columns and scale data to 0 and 1
    train_data = train_data[:, index] / 255.0
    validation_data = validation_data[:, index] / 255.0
    test_data = test_data[:, index] / 255.0

    return train_data, train_label, validation_data, validation_label, test_data, test_label


In [3]:
def sigmoid(z):
    """
    Element-wise logistic function 1 / (1 + exp(-z)).

    Input:
        z: scalar or array of real numbers

    Output:
        value(s) in [0, 1] with the same shape as z
    """
    # exp(-log(1 + exp(-z))) equals 1 / (1 + exp(-z)), but logaddexp never
    # has to represent exp(-z) itself, so very negative z does not overflow.
    return np.exp(-np.logaddexp(0, np.negative(z)))


In [4]:
def blrObjFunction(initialWeights, *args):
    """
    blrObjFunction computes 2-class Logistic Regression error function and
    its gradient.

    Input:
        initialWeights: the weight vector (w_k) with D + 1 entries; any shape
                        that can be reshaped to (D + 1) x 1 is accepted. The
                        first entry is the bias weight.
        train_data: the data matrix of size N x D
        labeli: the label vector (y_k) with N entries (N x 1 or flat) where
                each entry can be either 0 or 1 representing the label of
                corresponding feature vector

    Output: 
        error: the scalar value of error function of 2-class logistic
               regression (mean negative log-likelihood over the N rows)
        error_grad: flat vector with D + 1 entries representing the gradient
                    of error function with respect to the weights
    """
    train_data, labeli = args

    n_data, n_features = train_data.shape

    # Add the bias term at the beginning of the feature vector instead of
    # the end, so column 0 is all ones.
    train_data_with_bias = np.hstack((np.ones((n_data, 1)), train_data))  # dim : N * (D+1)

    # Use the local feature count (not a notebook-level global) so the
    # function works on its own. Forcing column shapes keeps the products
    # below element-wise even when flat vectors are passed in.
    W = np.asarray(initialWeights).reshape((n_features + 1, 1))  # dim : (D+1) * 1
    y_n = np.asarray(labeli).reshape((n_data, 1))                 # dim : N * 1

    activation = np.dot(train_data_with_bias, W)  # dim : N * 1

    # With theta_n = sigma(w.T x_n):
    #   ln(theta_n)     = -ln(1 + exp(-a))
    #   ln(1 - theta_n) = -ln(1 + exp(a))
    # Computing the logs through logaddexp keeps them finite when theta_n
    # saturates to 0 or 1, where log(sigmoid(a)) would give -inf / nan.
    ln_theta_n = -np.logaddexp(0, -activation)            # dim : N * 1
    ln_one_minus_theta_n = -np.logaddexp(0, activation)   # dim : N * 1
    theta_n = np.exp(ln_theta_n)                          # dim : N * 1

    # E(w) = -(1/N) * sum_n [ y_n ln(theta_n) + (1 - y_n) ln(1 - theta_n) ]
    log_likelihood = np.sum(y_n * ln_theta_n + (1 - y_n) * ln_one_minus_theta_n)
    error = (-1.0 / n_data) * log_likelihood  # scalar

    # grad E(w) = (1/N) * sum_n (theta_n - y_n) x_n   -> (D+1)*N . N*1
    error_grad = (np.dot(train_data_with_bias.T, theta_n - y_n) / n_data).flatten()

    return error, error_grad

In [None]:
"""
Script for Logistic Regression
"""
train_data, train_label, validation_data, validation_label, test_data, test_label = preprocess()

# number of classes
n_class = 10

# number of training samples
n_train = train_data.shape[0]

# number of features
n_feature = train_data.shape[1]

# One-vs-all targets: column i of Y is 1 where the label equals i, else 0.
Y = np.zeros((n_train, n_class))
for i in range(n_class):
    Y[:, i] = (train_label == i).astype(int).ravel()

# Logistic Regression: one binary classifier per class, each fitted by
# minimizing blrObjFunction with scipy's 'CG' method.
W = np.zeros((n_feature + 1, n_class))
initialWeights = np.zeros((n_feature + 1, 1))
opts = {'maxiter': 100}
for i in range(n_class):
    labeli = Y[:, i].reshape(n_train, 1)
    args = (train_data, labeli)
    # minimize() expects a one-dimensional x0, so hand it a flat view of the
    # (D+1) x 1 start vector; blrObjFunction reshapes it again internally.
    nn_params = minimize(blrObjFunction, initialWeights.ravel(), jac=True, args=args, method='CG', options=opts)
    W[:, i] = nn_params.x.reshape((n_feature + 1,))

In [16]:
import pickle
print("W:")
print(W)
# Persist the learned weight matrix. The with-block closes the file even if
# pickling fails (the previous code called close() on an undefined name `f`).
with open("blrW.pickle", 'wb') as Wfile:
    pickle.dump(W, Wfile)

W:
[[ -4.40467828e+00  -6.39131538e-01  -2.96596419e+00 ...,  -9.23410877e-01
   -1.15154127e-01  -4.15241062e+00]
 [ -3.41864545e-05  -1.60701344e-05  -1.11852755e-03 ...,  -1.73259062e-05
   -1.41197296e-06  -4.22150600e-03]
 [ -5.07780459e-04  -5.90879950e-05   8.91514625e-04 ...,  -7.26174889e-05
   -5.26688325e-06  -9.31704968e-03]
 ..., 
 [ -5.61862897e-02  -1.53383276e-04  -7.93344221e-03 ...,   5.89974684e-02
   -9.09938128e-06   6.53657208e-02]
 [ -1.88508187e-01  -1.09719176e-04  -1.43084582e-03 ...,   4.55799235e-01
   -6.38749671e-06  -1.26557049e-03]
 [ -4.36097800e-02  -2.39600344e-05  -3.23087827e-04 ...,   1.07248735e-01
   -1.34473615e-06  -8.78214281e-04]]


In [21]:
def blrPredict(W, data):
    """
     blrPredict predicts the label of data given the data and parameter W 
     of Logistic Regression
     
     Input:
         W: the matrix of weight of size (D + 1) x K (K = 10 in this
         notebook). Each column is the weight vector of a Logistic Regression
         classifier; row 0 holds the bias weights.
         data: the data matrix of size N x D
         
     Output: 
         label: integer vector of size N x 1 representing the predicted label
         of corresponding feature vector given in data matrix (the index of
         the classifier with the largest score)

    """
    N = data.shape[0]

    # Add the bias term at the beginning of the feature vector instead of
    # the end, matching the layout of W.
    data_with_bias = np.hstack((np.ones((N, 1)), data))  # dim : N * (D+1)

    # N * (D+1) . (D+1) * K = N * K linear scores w_k.T x_n
    wT_x = np.dot(data_with_bias, W)

    # The sigmoid is strictly increasing, so the class with the largest
    # linear score is also the one with the largest posterior. Skipping the
    # sigmoid avoids ties when several scores saturate to exactly 1.0.
    label = np.argmax(wT_x, axis=1).reshape((N, 1))

    return label

In [22]:
# Find the accuracy on the Training, Validation and Testing Datasets, in that
# order. Accuracy is the percentage of rows whose predicted label equals the
# true label. predicted_label ends up holding the test-set predictions.
for message, split_data, split_label in (
        ('\n Training set Accuracy:', train_data, train_label),
        ('\n Validation set Accuracy:', validation_data, validation_label),
        ('\n Testing set Accuracy:', test_data, test_label)):
    predicted_label = blrPredict(W, split_data)
    accuracy = 100 * np.mean((predicted_label == split_label).astype(float))
    print(message + str(accuracy) + '%')


 Training set Accuracy:86.222%

 Validation set Accuracy:85.36%

 Testing set Accuracy:85.3%


In [23]:
# Sanity check: shapes of the three feature matrices, then of the training
# label vector.
for array in (train_data, validation_data, test_data, train_label):
    print(array.shape)

(50000, 715)
(10000, 715)
(10000, 715)
(50000, 1)


In [37]:
def one_of_k(labels,k):
    """
    Build the 1-of-k (one-hot) encoding of a label vector.

    Input:
        labels: the label vector that needs one of k encoding, N x 1 or flat
                with N entries. Values are converted with astype(int), so
                they should already be whole numbers in [0, k).
        k: number of classes (in our case k = 10)

    Output:
        result: float matrix of size N x k, all zeros except for a 1 in
                column labels[n] of row n

    Raises:
        IndexError: if a label is not a valid column index for k columns
    """
    # forcing labels to be integer, and to be a column so that every row of
    # labels lines up with one row of the result
    int_labels = np.asarray(labels).astype(int)
    if int_labels.ndim == 1:
        int_labels = int_labels[:, np.newaxis]

    N = int_labels.shape[0]

    # create an array of size N * k with all zeros
    result = np.zeros((N, k))

    # One indexed assignment instead of a Python loop over the rows:
    # row n gets a 1 in the column(s) listed in int_labels[n].
    row_index = np.arange(N)[:, np.newaxis]
    result[row_index, int_labels] = 1
    return result

In [39]:
# Encode the training labels as 1-of-k rows and show rows 49000..49099 of
# the encoding as a spot check.
k = 10
result = one_of_k(train_label, k)
print("result:")
print(result[49000:49100])

result:
[[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.]
 [ 

In [None]:
# Scratch check of the "bias column first" layout on a 3 x 2 toy matrix.
# NOTE: this rebinds the notebook-level name train_data to a small list.
n_data, n_features = 3, 2
train_data = [[1, 2], [3, 4], [5, 6]]

# Column 0 is the bias column of ones, the features follow it.
train_data_with_bias = np.hstack((np.ones((n_data, 1)), train_data))

# Subtracting 1 turns the bias column into zeros and shifts every feature.
train_data_with_bias = train_data_with_bias - 1
print(train_data_with_bias)

In [None]:
# Scratch check of np.argmax along rows on a 3 x 2 toy matrix.
# NOTE: this rebinds the notebook-level names train_data and train_label.
train_data = np.array([[1, 2], [3, 4], [5, 6]])
train_label_temp = np.array([1, 2, 1])
train_label = train_label_temp.reshape((train_label_temp.shape[0], 1))  # column vector
print(train_data.shape)
print(train_label.shape)

# Index of the largest entry in every row.
ccc = train_data.argmax(axis=1)
print(ccc)

In [None]:
"""
testing : with small dataset 
Script for Logistic Regression
"""
#train_data, train_label, validation_data, validation_label, test_data, test_label = preprocess()



    
# Five toy samples with two features each, labelled 1 or 2.
# NOTE: this rebinds the notebook-level training variables.
train_data = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])
train_label_temp = np.array([1, 2, 1, 1, 2])
train_label = train_label_temp.reshape((train_label_temp.shape[0], 1))

# number of classes
n_class = 2

# number of training samples and number of features
n_train, n_feature = train_data.shape
print ("n_train:")
print (n_train)

# Column i of Y flags the rows whose label equals i, for i = 0 .. n_class - 1.
Y = np.zeros((n_train, n_class))
for i in range(n_class):
    Y[:, i] = np.ravel(train_label == i)

# Same setup as the full training script, but the objective is evaluated
# only once, at the all-zero weight vector, for the "label == 1" classifier.
W = np.zeros((n_feature + 1, n_class))
initialWeights = np.zeros((n_feature + 1, 1))
opts = {'maxiter': 100}

labeli = Y[:, 1].reshape(n_train, 1)
args = (train_data, labeli)
blrObjFunction(initialWeights, *args)