<a href="https://colab.research.google.com/github/martinpius/iml_exercise/blob/main/ML_Algorithms.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from timeit import default_timer as timer
t1 = timer()
try:
  from google.colab import drive
  drive.mount("/content/drive/", force_remount = True)
  import numpy as np
  import torch
  from sklearn.model_selection import train_test_split
  import matplotlib.pyplot as plt
  from sklearn.datasets import make_blobs
  print(f">>>> You are in CoLaB with numpy version: {np.__version__}")
except Exception as e:
  print(f">>>> {e}: {e}\n>>>> Please correct {type(e)} and reload your drive")

def mytimer(t: float = timer())->float:
  h = int(t / (60 * 60))
  m = int(t % (60 * 60) / 60)
  s = int(t % 60)
  return f"hrs: {h}, mins: {m:>02}, secs: {s:>05.2f}"

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f">>>> Available device: {device}")
!nvidia-smi
print(f"\n>>>> Time elapsed\t: {mytimer(timer() - t1)}")


Mounted at /content/drive/
>>>> You are in CoLaB with numpy version: 1.22.4
>>>> Available device: cpu
NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.


>>>> Time elapsed	: hrs: 0, mins: 00, secs: 31.00


In [3]:
# We implement various machine learning algorithms from scratch

In [14]:
class NearestNeighborClassifier():
  def __init__(self, k):
    self.k = k # k is the number of nearest neighbours
  
  def loader(self, xdata, target):
    '''''
    Loading the data
    '''''
    self.xdata = xdata
    self.target = target
  
  def knn_classifier(self, xtest, fast = None):
    """""
    Compute distances for the k-nearest neighbours and return predictions
    fast: Optional parameter to either use fast computing or double loop distance 
    """""
    if fast == True:
      distances = self.compute_fastproxy(xtest)
    else:
      distances = self.compute_proxy(xtest)
    ypreds = self.get_predictions(distances)
    return ypreds
  
  def compute_proxy(self, xtest):
    """""
    Use double loop: Inefficient computing option
    """""
    num_test = xtest.shape[0] # grab the number of exasamples in a test set
    num_train = self.xdata.shape[0] # grab number of examples in a train set
    distances = np.zeros((num_test, num_train)) # Place-holder matrix with row represent test example and cols associated proxies from train
    for i in range(num_test):
      for j in range(num_train):
        distances[i, j] = np.sqrt(np.sum((xtest[i,:] - self.xdata[j,:])**2)) # Compute the L2 distance
    return distances
  
  def compute_fastproxy(self, xtest):
    """""
    Use single loop : More efficient computing method:
    """""
    num_test = xtest.shape[0]
    num_train = self.xdata.shape[0]
    distances = np.zeros((num_test, num_train))
    for i in range(num_test):
      distances[i,:] = np.sqrt(np.sum((self.xdata - xtest[i,:])**2, axis = 1))
    return distances
  
  def get_predictions(self, distances):
    """""
    Use the L2 metric to obtain the most probable class for any new example
    """""
    pred_num = distances.shape[0] # grab total number of predictions
    ypreds = np.zeros(pred_num) # Place holder for the predictions
    for i in range(pred_num):
      pred_indices = np.argsort(distances[i,:]) # returns the indices of the distances of the pred array in ascending order
      knn = self.target[pred_indices[: self.k]].astype(int) # Slice the first k nearest neighbours based of the distances
      ypreds[i] = np.bincount(knn).argmax() # Fetch the most frequently prediction group/class
    return ypreds




In [15]:
if __name__ == "__main__":
  # Loading the data from my google drive
  X = np.loadtxt("/content/drive/MyDrive/Machine-Learning-Collection-master/ML/algorithms/knn/example_data/data.txt", delimiter = ",")
  y = np.loadtxt("/content/drive/MyDrive/Machine-Learning-Collection-master/ML/algorithms/knn/example_data/targets.txt")
  tic = timer()
  myknn = NearestNeighborClassifier(k = 3) # Instantiate with k = 3
  myknn.loader(X, y) # Loading the data into our classifier
  preds = myknn.knn_classifier(X, fast = True)
  acc = np.equal(preds, y).mean()
  print(f">>>> The accuracy score for k = 3 on the training set is: {acc:.2f} %")
  print(f">>>> Time elapsed: {mytimer(timer() - tic)}")


>>>> The accuracy score for k = 3 on the training set is: 0.93 %
>>>> Time elapsed: hrs: 0, mins: 00, secs: 00.00


In [24]:
class LogReg():
  def __init__(self, X, LR = 1e-2, iter = 10000):
    self.LR = LR
    self.iter = iter
    self.m, self.n = X.shape # Unpacking number of examples and features
  
  def trainer(self, X, y):
    # initialize the weights to zeros
    self.w = np.zeros((self.n, 1))
    self.b = 0
    # Iterate over the epochs
    for k in range(self.iter + 1):
      ypred = self.logits(np.dot(X, self.w) + self.b) # Compute the logits
      loss = -1/self.m * np.sum(y * np.log(ypred) + (1 - y)* np.log(1 - ypred))
      # compute the gradients of the loss wrt parameters
      dw = 1/self.m * np.dot(X.T, (ypred-y))
      db = 1/self.m * np.sum(ypred - y)
      # performs gradient descent to update the weights
      self.w -= dw * self.LR
      self.b -= db * self.LR
      # Print the loss at every 1000 epochs for monitoring the training
      if k % 1000 == 0:
        print(f">>>> The training loss at epoch {k} is: {loss:.4f}")
      # returning the weights to use later for the predictions
    return self.w, self.b 

  def predict(self, xtest):
    ypred = self.logits(np.dot(xtest, self.w) + self.b)
    pred_labels = ypred > 0.5 # same as np.round(np.sigmoid(ypred))
    return pred_labels
  
  def logits(self, z):
    sigm = 1 / (1 + np.exp(-z))
    return sigm

if __name__ == "__main__":
  tic = timer()
  print(f">>>> Loading the toy data from sklearn library\n")
  X, y = make_blobs(n_samples = 10000, n_features = 2, centers = 2) # Binary classfification problem
  y = y[:, np.newaxis] # Reshape the rank1 array into (batch, 1)
  print(f"\n>>>> Splitting the data into train-test")
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)
  print(f">>>> X_train shape: {X_train.shape}, y_train shape: {y_train.shape}\
  \n>>>> X_test shape: {X_test.shape}, y_test_shape: {y_test.shape}")
  print(f"\n>>>> Training the LogReg classifier:")
  lr = LogReg(X)
  lr.trainer(X_train, y_train)
  #print(f"\n>>>> Computing the predictions")
  pred_labs = lr.predict(X_test)
  print(f"\n>>>> Compute the accuracy of {lr.__class__.__name__} classifier")
  acc = np.equal(pred_labs, y_test).mean()
  print(f"\n>>>> The accuracy on the test data: {acc* 100:.2f} %")
  print(f"\n>>>> Time elapsed: {mytimer(timer() - tic)}")



>>>> Loading the toy data from sklearn library


>>>> Splitting the data into train-test
>>>> X_train shape: (7000, 2), y_train shape: (7000, 1)  
>>>> X_test shape: (3000, 2), y_test_shape: (3000, 1)

>>>> Training the LogReg classifier:
>>>> The training loss at epoch 0 is: 0.4852
>>>> The training loss at epoch 1000 is: 0.0813
>>>> The training loss at epoch 2000 is: 0.0632
>>>> The training loss at epoch 3000 is: 0.0561
>>>> The training loss at epoch 4000 is: 0.0522
>>>> The training loss at epoch 5000 is: 0.0496
>>>> The training loss at epoch 6000 is: 0.0478
>>>> The training loss at epoch 7000 is: 0.0464
>>>> The training loss at epoch 8000 is: 0.0453
>>>> The training loss at epoch 9000 is: 0.0444
>>>> The training loss at epoch 10000 is: 0.0436

>>>> Compute the accuracy of LogReg classifier

>>>> The accuracy on the test data: 97.47 %

>>>> Time elapsed: hrs: 0, mins: 00, secs: 08.00
