<a href="https://colab.research.google.com/github/narendra-mds/CS5660/blob/main/CS5660_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import tarfile
import os
from sklearn.preprocessing import OneHotEncoder

In [2]:
current_directory = os.getcwd()

print("Current working directory:", current_directory)

Current working directory: /content


In [3]:
def load_fvecs_from_tar(tar_filename, fvecs_filename):
    with tarfile.open(tar_filename, 'r') as tar:
        # Extract the fvecs file from the tar archive
        fvecs_file = tar.extractfile(fvecs_filename)
        with fvecs_file as f:
          fv = np.frombuffer(f.read(), dtype=np.float32)
          if fv.size == 0:
            return np.zeros((0, 0))
        dim = fv.view(np.int32)[0]
        fv = fv.reshape(-1, 1 + dim)
        if not all(fv.view(np.int32)[:, 0] == dim):
          raise IOError("Non-uniform vector sizes in " + fvecs_file)
        fv = fv[:, 1:]
        fv = fv.copy()
    return fv

In [4]:
def load_ivecs_from_tar(tar_filename, ivecs_filename):
    with tarfile.open(tar_filename, 'r') as tar:
        # Extract the ivecs file from the tar archive
        ivecs_file = tar.extractfile(ivecs_filename)
        with ivecs_file as f:
          a = np.frombuffer(f.read(), dtype='int32')
    d = a[0]
    return a.reshape(-1, d + 1)[:, 1:].copy().reshape(-1)

## Read the Images and Labels

Note that the compressed .tgz file is uploaded to session data i.e. in \content

In [5]:
tar_file = 'groupFungus_k64_nclass10_nex10.tgz'
folder = 'example_data'
fungus10_train_images = os.path.join(folder, 'groupFungus_k64_nclass10_nex10_Xtrain.fvecs')
fungus10_train_labels = os.path.join(folder, 'groupFungus_k64_nclass10_nex10_Ltrain.ivecs')

In [6]:
fungus10_test_images = os.path.join(folder, 'groupFungus_k64_nclass10_nex10_Xtest.fvecs')
fungus10_test_labels = os.path.join(folder, 'groupFungus_k64_nclass10_nex10_Ltest.ivecs')

In [7]:
fungus10_train_images_features = load_fvecs_from_tar(tar_file, fungus10_train_images)
fungus10_test_images_features = load_fvecs_from_tar(tar_file, fungus10_test_images)

In [8]:
fungus10_train_images_features.shape, fungus10_test_images_features.shape

((100, 4096), (100, 4096))

In [9]:
fungus10_train_images_labels = load_ivecs_from_tar(tar_file, fungus10_train_labels)
fungus10_test_images_labels = load_ivecs_from_tar(tar_file, fungus10_test_labels)

In [10]:
fungus10_train_images_labels.shape, fungus10_test_images_labels.shape

((100,), (100,))

## Build a Linear model as described in the paper

First create a one hot encoded array for true values

In [21]:
# define one hot encoding
encoder = OneHotEncoder(sparse_output=False)
# transform data
y_train_true = encoder.fit_transform(fungus10_train_images_labels.reshape(100,1))
print(y_train_true[0])

[1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


Now build X matrix and Weight matrix based on dimensions of the data

In [32]:
X_train = fungus10_train_images_features

In [33]:
X_train.shape

(100, 4096)

Weight matrix would have:


*   rows = no. of features
*   columns = no. of classes

In [40]:
np.random.seed(5660)
_,m = fungus10_train_images_features.shape
n = len(set(fungus10_train_images_labels))
W = np.random.uniform(low=-1, high=1, size=(m, n))
b = np.random.uniform(low=-1, high=1, size=(1, n))

In [41]:
W.shape

(4096, 10)

Initial set of weights

In [42]:
print(W[:2])

[[ 0.44140127  0.4690655  -0.73259887 -0.05906308 -0.53820052  0.8147641
  -0.29529803 -0.04197371 -0.93145758  0.22377421]
 [ 0.92699897  0.97547102  0.64151833 -0.70370343 -0.62292231 -0.03680328
   0.6366819   0.89381421  0.26542717  0.27011698]]


In [43]:
print(b)

[[ 0.56534233  0.93943885  0.91182578 -0.06911842 -0.09695704  0.69054724
  -0.8612357  -0.32716141 -0.48437092 -0.82968288]]


We multiply X (100,4096) with W(4096,10) and add b


In [44]:
def net_input(X, W, b):
    return (X.dot(W) + b)

net_in = net_input(X_train, W, b)
print(f'net input shape:\n {net_in.shape}')
print(f'net input:\n {net_in[:2]}')

net input shape:
 (100, 10)
net input:
 [[ 0.6954171   0.77256549  0.18707491 -0.08701527  0.17142897  0.9085162
  -0.27257828 -0.1387958   0.57632682 -0.92504999]
 [ 0.30649441  0.82180241  0.71058519  0.39058976 -0.93494618  0.8043387
   0.41005151 -0.31011337  0.16902247 -1.34038849]]


In [46]:
def softmax(z):
    return (np.exp(z.T) / np.sum(np.exp(z), axis=1)).T

smax = softmax(net_in)
print(f'softmax:\n {smax[:2]}')

softmax:
 [[0.14559767 0.15727494 0.08757582 0.0665807  0.08621628 0.18017817
  0.05530436 0.06322085 0.12925108 0.02880013]
 [0.10093817 0.16898605 0.15119931 0.10979374 0.02916786 0.16606054
  0.11195145 0.05448363 0.08797355 0.01944569]]


To get class labels from Probabilities

In [47]:
def to_classlabel(z):
    return z.argmax(axis=1)

print(f'predicted class labels:  {to_classlabel(smax)}')

predicted class labels:  [5 1 2 1 1 4 2 2 0 1 1 2 2 2 5 0 0 1 0 0 1 2 1 2 5 2 2 5 1 5 0 0 5 1 1 5 1
 1 0 1 2 5 1 1 2 2 1 1 5 0 1 1 1 5 5 1 2 5 1 1 2 1 0 1 1 1 1 2 2 5 5 5 2 1
 2 1 4 1 1 5 1 2 5 5 2 3 1 8 1 2 5 0 5 1 1 5 1 2 1 5]


In [63]:
def logistic_loss(X,W,b,y_true):
  # print(X)
  # print(y_true)
  prob_scores = softmax(net_input(X, W, b))
  class_ = np.argmax(y_true)
  # print(prob_scores)
  # print(class_)
  numerator = np.exp(prob_scores[0][class_-1])
  # print(numerator)
  denominator = sum([np.exp(prob_scores[0][x]) for x in range(len(prob_scores[0]))])
  # print(denominator)
  return -np.log(numerator/denominator)

In [64]:
logistic_loss(X=X_train[0],W=W,b=b, y_true=y_train_true[0])

2.37491562302489

In [65]:
def loss(X, W, b, y_true):
  return np.mean([logistic_loss(X=X_train[i],W=W,b=b, y_true=y_train_true[i]) for i in range(len(X))])

In [66]:
loss(X_train, W, b, y_train_true)

2.3140300967771785