In [None]:
'''
The dataset consists of 30,000 training examples, each an image of
size 28 * 28. Each image is flattened into a row of pixels, giving
a matrix of dimension 30,000 * 784.

Split the data set in an 80:20 ratio for training and validation
respectively.

'''

import h5py
import numpy as np
import matplotlib.pyplot as pl
import datetime
from collections import Counter


# Read every dataset fully into memory so that the arrays stay usable
# after the HDF5 files have been closed.

# Training set: images first, then the matching labels.
with h5py.File('../Input/images_training.h5', 'r') as trainImagesFile:
    trainingData = np.array(trainImagesFile['datatrain'])

with h5py.File('../Input/labels_training.h5', 'r') as trainLabelsFile:
    trainingLabel = np.array(trainLabelsFile['labeltrain'])

# Test set: the complete arrays are kept in data_test / label_test, while
# testingData / testingLabel hold only the first 5000 entries.
with h5py.File('../Input/images_testing.h5', 'r') as testImagesFile:
    data_test = np.array(testImagesFile['datatest'])
testingData = data_test[:5000]

with h5py.File('../Input/labels_testing.h5', 'r') as testLabelsFile:
    label_test = np.array(testLabelsFile['labeltest'])
testingLabel = label_test[:5000]
        

In [None]:
'''KNN CLASSIFIER'''

'''
Categorize the data into different classes.
The similarity measure is the Euclidean distance, which is
defined as the square root of the sum of squared differences
between the coordinates of two data points.

'''
class KNN_Classifier():
    """k-nearest-neighbours classifier based on the Euclidean distance.

    The classifier only stores the training samples; all computation
    happens when predictionFunction() is called.
    """

    def __init__(self):
        # Filled in by getData().
        self.dataTrain = None
        self.labelTrain = None

    # Get training data and labels
    def getData(self, trainingData, trainingLabel):
        """Store the training samples and their labels.

        trainingData  -- 2-D array, one training sample per row.
        trainingLabel -- one label per training sample.
        """
        self.dataTrain = trainingData
        self.labelTrain = trainingLabel

    # Function to predict classes
    def predictionFunction(self, trainingData, totNeighbours = 5):
        """Predict a label for every row of trainingData.

        trainingData  -- 2-D array of samples to classify, one per row.
        totNeighbours -- number of nearest training samples that vote.

        Returns a 1-D float array holding, for each sample, the most
        common label among its totNeighbours closest training samples.
        Raises ValueError if totNeighbours is smaller than 1.
        """
        if totNeighbours < 1:
            raise ValueError('totNeighbours must be at least 1')

        totalDistance = self.calculateDistance(trainingData)
        # Flatten so that labels stored as a column vector work as well.
        labels = np.asarray(self.labelTrain).ravel()
        predictedLabel = np.zeros(totalDistance.shape[0])

        # Rows are sorted one at a time so that only one index vector
        # (not a full index matrix) is alive at any moment.
        for i, distances in enumerate(totalDistance):
            closestNeighbours = labels[np.argsort(distances)[:totNeighbours]]
            predictedLabel[i] = Counter(closestNeighbours.tolist()).most_common(1)[0][0]
        return predictedLabel

    # Calculate euclidean distance
    def calculateDistance(self, trainingData):
        """Return the Euclidean distance between every query/training pair.

        trainingData -- 2-D array of query samples, one per row.

        Returns a 2-D ndarray in which entry (i, j) is the distance
        between query sample i and stored training sample j.
        """
        # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 * a.b, evaluated for all pairs.
        dotProduct = np.dot(trainingData, self.dataTrain.T)
        sumSquareTrain = np.square(trainingData).sum(axis = 1)
        sumSquareSelfTrain = np.square(self.dataTrain).sum(axis = 1)

        squaredDistance = (sumSquareTrain[:, np.newaxis]
                           + sumSquareSelfTrain[np.newaxis, :]
                           - 2 * dotProduct)

        # Rounding can push the squared distance of (nearly) identical
        # points slightly below zero; clamp it so sqrt never yields NaN.
        return np.sqrt(np.maximum(squaredDistance, 0))


In [None]:
'''LOGISTIC REGRESSION'''

'''
Categorize the data into different classes.

'''
import matplotlib.pyplot as pl

# Computes probability of samples belonging to each class
def softmaxFunction(a):
    """Return the row-wise softmax of the 2-D score matrix a.

    a -- 2-D array, one row of class scores per sample.

    Every row of the result is non-negative and sums to 1.
    """
    # Subtracting the row maximum leaves the result unchanged but keeps
    # np.exp from overflowing to inf (which would produce NaN rows).
    shifted = a - np.max(a, axis = 1, keepdims = True)
    expScores = np.exp(shifted)
    return expScores / expScores.sum(axis = 1, keepdims = True)

# Gradient of the mean cross-entropy loss with respect to the weights
def softmaxGradientFunction(trainingData, trainingLabel, randomData):
    """Return the gradient of the softmax loss for one batch.

    trainingData  -- 2-D array of samples, one per row.
    trainingLabel -- integer class index of every sample.
    randomData    -- weight matrix; trainingData.dot(randomData) gives
                     the class scores.

    The result has the same shape as randomData.
    """
    sampleCount = trainingData.shape[0]
    residual = softmaxFunction(trainingData.dot(randomData))
    # Turn probabilities into (probability - one_hot(label)) by lowering
    # the entry of the true class of every sample by one.
    rows = range(sampleCount)
    residual[rows, trainingLabel] -= 1
    return trainingData.T.dot(residual) / sampleCount

# Check the performance of model by calculating error
def softmaxLossFunction(trainingData, trainingLabel, randomData):
    """Return the mean cross-entropy loss of the softmax model.

    trainingData  -- 2-D array of samples, one per row.
    trainingLabel -- integer class index of every sample.
    randomData    -- weight matrix; trainingData.dot(randomData) gives
                     the class scores.

    The loss is the mean of -log(softmax(scores)[i, label_i]).
    """
    scores = trainingData.dot(randomData)
    # Log-softmax through the log-sum-exp trick: shifting by the row
    # maximum avoids overflow in np.exp, and staying in log space avoids
    # log(0) when a probability underflows.
    scores = scores - scores.max(axis = 1, keepdims = True)
    logProbabilities = scores - np.log(np.exp(scores).sum(axis = 1, keepdims = True))
    rows = range(scores.shape[0])
    return(-np.mean(logProbabilities[rows, trainingLabel]))

'''
Tune the parameters such as learning rate, epoch, batch size etc.
to properly fit the learning curve.

'''
def softmaxFitFunction(trainingData, trainingLabel, randomData, learningRate = 0.01, no_of_epochs = 100, tolerance = 1e-5, batchsize = 10):
    """Fit the softmax weights with mini-batch gradient descent.

    trainingData  -- 2-D array of samples, one per row.
    trainingLabel -- class index of every sample.
    randomData    -- initial weight matrix; it is updated in place.
    learningRate  -- step size applied to every batch gradient.
    no_of_epochs  -- maximum number of passes over the data.
    tolerance     -- stop early once the norm of the per-epoch weight
                     change, divided by the number of weights, drops
                     below this value.
    batchsize     -- number of samples per gradient step.

    Returns (randomData, lossHistory); lossHistory holds the loss before
    training followed by the loss after every completed epoch.
    """
    previousWeights = randomData.copy()
    lossHistory = [softmaxLossFunction(trainingData, trainingLabel, randomData)]
    sampleCount = trainingData.shape[0]
    batchCount = int(np.ceil(float(sampleCount) / batchsize))

    for _ in range(no_of_epochs):
        # Visit the samples in a fresh random order every epoch.
        shuffled = np.random.permutation(sampleCount)

        for batchNumber in range(batchCount):
            start = batchsize * batchNumber
            stop = min(start + batchsize, sampleCount)
            chosen = shuffled[start:stop]
            batchGradient = softmaxGradientFunction(trainingData[chosen], trainingLabel[chosen], randomData)
            randomData -= learningRate * batchGradient

        lossHistory.append(softmaxLossFunction(trainingData, trainingLabel, randomData))

        # Stop as soon as an entire epoch barely moved the weights.
        scaledChange = (randomData - previousWeights) / randomData.size
        if np.linalg.norm(scaledChange) < tolerance:
            break

        previousWeights = randomData.copy()

    return(randomData, lossHistory)

# Predict classes
def predictionFunction(randomData, trainingData):
    """Return the predicted class index of every row of trainingData.

    randomData   -- weight matrix of the softmax model.
    trainingData -- 2-D array of samples to classify, one per row.

    Softmax never changes which score in a row is the largest, so the
    class is taken directly from the raw scores. Skipping the
    exponential also keeps very large scores from overflowing.
    """
    return(np.argmax(trainingData.dot(randomData), axis = 1))

In [None]:
# Calculate accuracy of the model
# Accuracy = (Number of correct classifications / Total number of test examples used) * 100

def calculateAccuracy(testingLabel, predictedLabel):
    """Return the percentage of predictions that match the true labels.

    testingLabel   -- true labels (any array-like).
    predictedLabel -- predicted labels (any array-like).

    Both inputs are flattened before comparison, so a column vector of
    labels can be compared with a flat vector of predictions.
    Returns a float between 0 and 100, or NaN when there are no labels.
    Raises ValueError if the two inputs differ in length.
    """
    expected = np.asarray(testingLabel).ravel()
    predicted = np.asarray(predictedLabel).ravel()

    # Without this check NumPy would broadcast or fail in obscure ways.
    if expected.shape != predicted.shape:
        raise ValueError('testingLabel has %d entries but predictedLabel has %d'
                         % (expected.size, predicted.size))
    if expected.size == 0:
        return float('nan')

    return float(np.mean(predicted == expected) * 100)

In [None]:
# Choose your classifier, run it on the test set and report accuracy and
# elapsed time.

# Surrounding whitespace is stripped so that an answer such as "1 " is accepted.
selectClassifier = input('1) K-NEAREST NEIGHBOURS \n2) LOGISTIC REGRESSION\n\nPlease enter your choice and press enter: \n' + '\n').strip()

if(selectClassifier == '1'):
    print('KNN CLASSIFIER:')

    # The classifier builds a (batch x training samples) distance matrix,
    # so the test set is processed in chunks to bound memory use.
    batchSize = 2000
    classifierModel = KNN_Classifier()
    classifierModel.getData(trainingData, trainingLabel)

    # Calculate start time
    startTime = datetime.datetime.now()

    # Predict labels chunk by chunk. Stepping by batchSize up to the full
    # length makes the last chunk shorter instead of skipping it, so every
    # test sample receives a prediction.
    batchPredictions = []
    for start in range(0, len(testingData), batchSize):
        batchPredictions.append(classifierModel.predictionFunction(testingData[start:start + batchSize]))
    predictedLabel = np.concatenate(batchPredictions) if batchPredictions else np.zeros(0)

    # Calculate end time
    endTime = datetime.datetime.now()

    resultantAccuracy = calculateAccuracy(testingLabel, predictedLabel)
    print('The accuracy of the model with KNN is: %.2f' % resultantAccuracy, '%')

    # Create the output file containing the predicted labels
    with h5py.File('../Output/predicted_labels.h5','w') as H:
        H.create_dataset('output', data = predictedLabel)

    print('Output file containing predicted labels is created by name "predicted_labels.h5" for KNN Classifier')
    # total_seconds() covers the whole duration; .seconds would drop whole days.
    print("Total time taken by KNN classifier is", int((endTime - startTime).total_seconds()), 'seconds.')

elif(selectClassifier == '2'):
    print('LOGISTIC REGRESSION:')

    # Random initial weights: one row per input feature, one column for
    # each of the 10 classes.
    randomData = np.random.randn(trainingData.shape[1], 10)

    # Calculate start time
    startTime = datetime.datetime.now()

    # Train the model; only the training itself is timed
    resultantData, lossHistory = softmaxFitFunction(trainingData, trainingLabel, randomData)

    # Calculate end time
    endTime = datetime.datetime.now()

    predictedLabel = predictionFunction(resultantData, testingData)
    resultantAccuracy = calculateAccuracy(testingLabel, predictedLabel)

    print('The accuracy of the model with Logistic Regression is: %.2f' % resultantAccuracy, '%')

    print("Total time taken by Logistic Regression classifier model is", int((endTime - startTime).total_seconds()), 'seconds.')

else:
    print('Invalid Choice')