<a href="https://colab.research.google.com/github/nissi31/fmml-projects-and-labs-Nissi/blob/main/FMML_Lab6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
from keras.datasets import mnist

In [None]:
def NN1(traindata, trainlabel, query):
    """
    Predict the label of a single query point with the 1-nearest-neighbour rule.

    traindata: numpy array of shape (n,d) holding n training samples with d features each
    trainlabel: numpy array of shape (n,) holding the label of every training sample
    query: numpy array of shape (d,) holding the features of the point to classify

    returns: the label of the training sample whose squared Euclidean distance to the
    query is smallest (np.argmin picks the first such sample when several are tied)
    """
    # Broadcasting subtracts the query from every training row in one step.
    offsets = traindata - query
    # The square root is skipped: it is monotonic, so it cannot change which row is closest.
    squared_distance = (offsets * offsets).sum(axis=1)
    nearest = np.argmin(squared_distance)
    return trainlabel[nearest]


def NN(traindata, trainlabel, testdata):
    """
    Classify every test sample with the 1-nearest-neighbour rule.

    traindata: numpy array of shape (n,d) holding n training samples with d features each
    trainlabel: numpy array of shape (n,) holding the label of every training sample
    testdata: numpy array of shape (m,d) holding the m samples to classify

    returns: numpy array with one predicted label per test sample, in the same order as
    the rows of testdata; each prediction is produced by NN1
    """
    predictions = []
    # Iterating over a 2-D array yields its rows, i.e. one test sample at a time.
    for sample in testdata:
        predictions.append(NN1(traindata, trainlabel, sample))
    return np.array(predictions)


def Accuracy(gtlabel, predlabel):
    """
    Compute the fraction of predictions that match the ground truth.

    gtlabel: numpy array of shape (n,) with the ground-truth labels
    predlabel: numpy array of shape (n,) with the predicted labels

    returns: number of positions where both arrays agree divided by n

    raises: AssertionError when the two inputs have different lengths
    """
    total = len(gtlabel)
    assert total == len(
        predlabel
    ), "Length of the ground-truth labels and predicted labels should be the same"
    # Element-wise comparison gives a boolean array; summing it counts the hits.
    hits = gtlabel == predlabel
    return hits.sum() / total


def cumArray(img):
    """
    This function takes in an image and marks, column by column, every pixel whose
    running top-to-bottom sum is positive

    img: numpy array of shape (n,m) where n is the height of the image and m is the width of the image

    returns: a boolean numpy array of shape (n,m) whose entry (r,c) is True when
    img[0,c] + img[1,c] + ... + img[r,c] > 0. For an image without negative values
    this means there is a non-zero pixel at (r,c) or somewhere above it in the same column.
    The input is not modified.
    """
    # Accumulate along axis 0 (the rows) so that the result is correct for any
    # height/width combination, not only for square images.
    # np.cumsum also promotes small integer dtypes (e.g. uint8) to the platform
    # integer, so the running sum cannot wrap around back to zero.
    return np.cumsum(img, axis=0) > 0


def getHolePixels(img):
    """
    This function takes in an image and returns the background pixels that are enclosed by the shape

    img: numpy array of shape (n,m) where n is the height of the image and m is the width of the image

    returns: a boolean image of the same shape as the input that is True only at the hole
    pixels, i.e. pixels that belong to the mask returned by getHullPixels but are not
    foreground themselves (img > 0 is False there)
    """
    # getHullPixels yields the shape together with everything it encloses.
    filled = getHullPixels(img)
    # Take the shape itself away again so that only the enclosed background remains.
    foreground = img > 0
    return filled & ~foreground


def getHullPixels(img):
    """
    This function takes in an image and returns the shape with its enclosed regions filled in

    The image is swept with cumArray from four sides (as given, and rotated by 90, 180 and
    270 degrees). A pixel is kept only when all four sweeps mark it. This is an
    axis-aligned approximation of a hull, not a geometric convex hull.

    img: numpy array of shape (n,m) where n is the height of the image and m is the width of the image

    returns: a boolean image of the same shape as the input that is True where all four sweeps agree
    """
    # Sweep of the image in its original orientation.
    hull = cumArray(img)
    # Sweep the three rotated copies; rotating each result by the remaining quarter
    # turns brings it back to the original orientation before intersecting.
    for turns in (1, 2, 3):
        swept = cumArray(np.rot90(img, turns))
        hull = hull & np.rot90(swept, 4 - turns)
    return hull


def minus(a, b):
    """
    Difference of two binary images: keeps what is set in a but not set in b

    a: boolean numpy array
    b: boolean numpy array that can be combined element-wise with a

    returns: a & ~b
    """
    not_in_b = ~b
    return a & not_in_b


def getBoundaryPixels(img):
    """
    This function takes in an image and returns the foreground pixels that lie on the boundary of the shape

    A pixel is foreground when its value is > 0. A foreground pixel is a boundary pixel
    when at least one of its four direct neighbours (up, down, left, right) is background.
    Positions outside the image count as background, so foreground that touches the image
    border is always reported as boundary.

    img: array of shape (n,m) where n is the height of the image and m is the width of the image

    returns: a boolean numpy array of shape (n,m) that is True exactly at the boundary pixels
    """
    mask = np.asarray(img) > 0  # binarize the image
    # Surround the mask with a one-pixel ring of background. Shifting with np.roll
    # would instead wrap around and compare border pixels with the opposite edge.
    padded = np.pad(mask, 1, mode="constant", constant_values=False)
    # Each slice lines up one neighbour with the pixel it belongs to.
    above = padded[:-2, 1:-1]
    below = padded[2:, 1:-1]
    left = padded[1:-1, :-2]
    right = padded[1:-1, 2:]
    # A foreground pixel whose four neighbours are all foreground is interior.
    enclosed = above & below & left & right
    return mask & ~enclosed

In [None]:
# Load MNIST and rescale the pixel values by 1/255.
(train_X, train_y), (test_X, test_y) = mnist.load_data()
train_X = train_X / 255
test_X = test_X / 255

# Keep only the samples whose label is below nclasses, then take every 100th
# of the remaining samples so that the experiments run quickly.
nclasses = 4
# The images are filtered first because the mask is built from the unfiltered labels.
train_X = train_X[train_y < nclasses][::100].copy()
train_y = train_y[train_y < nclasses][::100].copy()
test_X = test_X[test_y < nclasses][::100].copy()
test_y = test_y[test_y < nclasses][::100].copy()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
[1m11490434/11490434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [None]:
# Compute one scalar descriptor per image: number of hole pixels, number of
# boundary pixels, number of hull pixels and the mean pixel value over 28 * 28 pixels.
train_hole = np.array([getHolePixels(image).sum() for image in train_X])
train_bound = np.array([getBoundaryPixels(image).sum() for image in train_X])
train_hull = np.array([getHullPixels(image).sum() for image in train_X])
train_sum = train_X.sum(axis=(1, 2)) / (28 * 28)

test_hole = np.array([getHolePixels(image).sum() for image in test_X])
test_bound = np.array([getBoundaryPixels(image).sum() for image in test_X])
test_hull = np.array([getHullPixels(image).sum() for image in test_X])
test_sum = test_X.sum(axis=(1, 2)) / (28 * 28)

In [None]:
# Function to test different feature combinations
def test_feature_combination(train_features, test_features, feature_names):
    """
    Evaluate the nearest-neighbour classifier on one combination of features and print the accuracy

    train_features: sequence of 1-D arrays, one per feature, each with one value per training sample
    test_features: sequence of 1-D arrays, one per feature, each with one value per test sample
    feature_names: names of the features, used only in the printed message

    The labels are read from the module-level train_y and test_y.

    returns: None; the accuracy is printed as a percentage with two decimals
    """
    # Turn the per-feature vectors into (samples, features) matrices.
    train_matrix = np.column_stack(train_features)
    test_matrix = np.column_stack(test_features)

    predicted = NN(train_matrix, train_y, test_matrix)
    accuracy = Accuracy(test_y, predicted)
    print(f"Accuracy with features {feature_names}: {accuracy*100:.2f}%")


In [None]:
# Every pair of features
test_feature_combination(
    train_features=[train_hole, train_bound],
    test_features=[test_hole, test_bound],
    feature_names=["hole", "boundary"],
)
test_feature_combination(
    train_features=[train_hole, train_hull],
    test_features=[test_hole, test_hull],
    feature_names=["hole", "hull"],
)
test_feature_combination(
    train_features=[train_hole, train_sum],
    test_features=[test_hole, test_sum],
    feature_names=["hole", "sum"],
)
test_feature_combination(
    train_features=[train_bound, train_hull],
    test_features=[test_bound, test_hull],
    feature_names=["boundary", "hull"],
)
test_feature_combination(
    train_features=[train_bound, train_sum],
    test_features=[test_bound, test_sum],
    feature_names=["boundary", "sum"],
)
test_feature_combination(
    train_features=[train_hull, train_sum],
    test_features=[test_hull, test_sum],
    feature_names=["hull", "sum"],
)

Accuracy with features ['hole', 'boundary']: 76.19%
Accuracy with features ['hole', 'hull']: 73.81%
Accuracy with features ['hole', 'sum']: 69.05%
Accuracy with features ['boundary', 'hull']: 69.05%
Accuracy with features ['boundary', 'sum']: 64.29%
Accuracy with features ['hull', 'sum']: 66.67%


In [None]:
# Every triple of features
test_feature_combination(
    train_features=[train_hole, train_bound, train_hull],
    test_features=[test_hole, test_bound, test_hull],
    feature_names=["hole", "boundary", "hull"],
)
test_feature_combination(
    train_features=[train_hole, train_bound, train_sum],
    test_features=[test_hole, test_bound, test_sum],
    feature_names=["hole", "boundary", "sum"],
)
test_feature_combination(
    train_features=[train_hole, train_hull, train_sum],
    test_features=[test_hole, test_hull, test_sum],
    feature_names=["hole", "hull", "sum"],
)
test_feature_combination(
    train_features=[train_bound, train_hull, train_sum],
    test_features=[test_bound, test_hull, test_sum],
    feature_names=["boundary", "hull", "sum"],
)

Accuracy with features ['hole', 'boundary', 'hull']: 71.43%
Accuracy with features ['hole', 'boundary', 'sum']: 76.19%
Accuracy with features ['hole', 'hull', 'sum']: 71.43%
Accuracy with features ['boundary', 'hull', 'sum']: 69.05%


In [None]:
# All four features together
test_feature_combination(
    train_features=[train_hole, train_bound, train_hull, train_sum],
    test_features=[test_hole, test_bound, test_hull, test_sum],
    feature_names=["hole", "boundary", "hull", "sum"],
)

Accuracy with features ['hole', 'boundary', 'hull', 'sum']: 71.43%
