In [None]:
# Depends on:
import pandas as pd
import numpy as np
import math
import random
from collections import Counter

# Benchmark
import time

# ⚠️ WARNING: Disable this if you're running locally (not on Google Colab)
from google.colab import files

In [None]:
def loadDataset(link: str, sheetName: str) -> pd.DataFrame:
  """Read a single worksheet of an Excel workbook into a DataFrame.

  Args:
    link: Location of the workbook, passed straight to ``pd.read_excel``.
    sheetName: Name of the worksheet to read.

  Returns:
    Whatever ``pd.read_excel`` returns for the requested sheet.
  """
  worksheet = pd.read_excel(link, sheet_name=sheetName)
  return worksheet

In [None]:
def computeConfusionMatrix(actual, predicted):
  """Count the four confusion-matrix cells for binary labels.

  The two sequences are compared position by position over ``len(actual)``
  entries, with 1 as the positive class and 0 as the negative class. Every
  pair that is not (1, 1), (0, 1) or (1, 0) is counted as a true negative.

  Args:
    actual: Indexable sequence of ground-truth labels.
    predicted: Indexable sequence of predicted labels.

  Returns:
    Dict of integer counts keyed by 'tp', 'fp', 'fn' and 'tn'.
  """
  truePos = falsePos = falseNeg = trueNeg = 0

  for idx in range(len(actual)):
    truth = actual[idx]

    if truth == 1:
      # Actual positive: either a hit, a miss, or an unrecognised prediction.
      if predicted[idx] == 1:
        truePos += 1
      elif predicted[idx] == 0:
        falseNeg += 1
      else:
        trueNeg += 1
    elif truth == 0 and predicted[idx] == 1:
      falsePos += 1
    else:
      # Catch-all bucket, also used for labels outside {0, 1}.
      trueNeg += 1

  return {'tp': truePos, 'fp': falsePos, 'fn': falseNeg, 'tn': trueNeg}

In [None]:
def getPerfomanceMatrix(actual, predicted):
  """Compute accuracy, recall and precision (as percentages) for binary labels.

  Args:
    actual: Sequence of ground-truth labels.
    predicted: Sequence of predicted labels.

  Returns:
    Dict with float percentages under 'accuracy', 'recall' and 'precision'.
    A metric whose denominator is zero (no samples, no actual positives, or
    no predicted positives) is reported as 0.0 instead of raising
    ZeroDivisionError.
  """
  matrix = computeConfusionMatrix(actual, predicted)
  tp, fp, fn, tn = matrix['tp'], matrix['fp'], matrix['fn'], matrix['tn']

  def asPercentage(hits, total):
    # An empty denominator means the metric is undefined; report 0.0 so a
    # fold without positives does not abort the whole evaluation.
    if total == 0:
      return 0.0
    return (hits / total) * 100.0

  return {
      'accuracy': asPercentage(tp + tn, tp + fp + fn + tn),
      'recall': asPercentage(tp, tp + fn),
      'precision': asPercentage(tp, tp + fp)
  }

In [None]:
def computeEuclidean(trainRow: list, testRow: list) -> float:
  """Euclidean distance between two rows, ignoring the final position.

  Only indices ``0 .. len(trainRow) - 2`` are compared, so the last element
  of ``trainRow`` (the class label, in the way this notebook builds its rows)
  never contributes to the distance.

  Args:
    trainRow: Row whose length decides how many positions are compared.
    testRow: Row compared against ``trainRow`` at the same positions.

  Returns:
    Square root of the summed squared differences.
  """
  featureCount = len(trainRow) - 1
  squaredTotal = 0.0

  for idx in range(featureCount):
    gap = trainRow[idx] - testRow[idx]
    squaredTotal += gap ** 2

  return math.sqrt(squaredTotal)

In [None]:
def getNeighbors(trainSet, testRow, numOfNeighbor: int) -> list:
  """Return the ``numOfNeighbor`` training rows closest to ``testRow``.

  Distances come from ``computeEuclidean``. Rows are ordered by ascending
  distance with a stable sort, so equally distant rows keep their order
  from ``trainSet``.

  Args:
    trainSet: Iterable of training rows.
    testRow: Row to find neighbours for.
    numOfNeighbor: How many nearest rows to return.

  Returns:
    List of the nearest training rows, closest first.

  Raises:
    IndexError: If ``numOfNeighbor`` exceeds the number of training rows.
  """
  scored = [(candidate, computeEuclidean(candidate, testRow)) for candidate in trainSet]
  ranked = sorted(scored, key=lambda pair: pair[1])

  return [ranked[position][0] for position in range(numOfNeighbor)]

In [None]:
def neighborsVote(votes: list) -> int:
  """Majority vote over binary labels; ties go to class 1.

  A vote equal to 1 counts for class 1; every other value counts against it.

  Args:
    votes: Labels of the neighbouring rows.

  Returns:
    1 when the votes for class 1 are at least as many as the rest, else 0.
  """
  # Running balance: +1 for each vote for class 1, -1 for anything else.
  balance = 0
  for ballot in votes:
    balance += 1 if ballot == 1 else -1

  return 1 if balance >= 0 else 0

In [None]:
def predictClassification(trainSet, testRow, numOfNeighbor: int) -> int:
  """Predict the class of ``testRow`` from its nearest training rows.

  Args:
    trainSet: Training rows whose last element is the class label.
    testRow: Row to classify.
    numOfNeighbor: Number of nearest rows that take part in the vote.

  Returns:
    The label chosen by ``neighborsVote`` over the neighbours' labels.
  """
  nearestLabels = []
  for neighbor in getNeighbors(trainSet, testRow, numOfNeighbor):
    nearestLabels.append(neighbor[-1])

  return neighborsVote(nearestLabels)

In [None]:
def crossValidation(dataset, numOfFolds):
  """Randomly partition ``dataset`` into ``numOfFolds`` equally sized folds.

  Each fold holds ``int(len(dataset) / numOfFolds)`` rows drawn without
  replacement via the module-level ``random`` generator. Rows left over when
  the dataset does not divide evenly are not placed in any fold. The input
  itself is not modified; rows are drawn from a shallow copy.

  Args:
    dataset: Iterable of rows with a defined length.
    numOfFolds: Number of folds to build.

  Returns:
    List of folds, each a list of rows.
  """
  remaining = list(dataset)
  rowsPerFold = int(len(dataset) / numOfFolds)

  partitions = []
  for _ in range(numOfFolds):
    # Each pick pops the row from the pool, so no row lands in two folds.
    partitions.append([
      remaining.pop(random.randrange(len(remaining)))
      for _ in range(rowsPerFold)
    ])

  return partitions

In [None]:
def evaluate(dataset, algorithm, numOfFolds, *args):
  """Score ``algorithm`` with k-fold cross-validation and return accuracies.

  The dataset is split by ``crossValidation``. Each fold in turn is held out
  as the test set while the rows of all other folds form the training set.
  Test rows are copies with their last element (the label) replaced by None
  before being handed to ``algorithm``.

  Args:
    dataset: Rows whose last element is the class label.
    algorithm: Callable invoked as ``algorithm(trainSet, testSet, *args)``
      that returns one prediction per test row.
    numOfFolds: Number of folds to split the dataset into.
    *args: Extra positional arguments forwarded to ``algorithm``.

  Returns:
    List with one accuracy percentage per fold, in fold order.
  """
  folds = crossValidation(dataset, numOfFolds)
  scores = list()

  for heldOut, fold in enumerate(folds):
    # Pick the training folds by position rather than by value: removing by
    # equality deep-compares whole folds and can drop a different fold that
    # merely has equal contents. Flattening in one pass also avoids the
    # repeated list copying of sum(list_of_lists, []).
    trainSet = [
      row
      for position, other in enumerate(folds) if position != heldOut
      for row in other
    ]

    testSet = list()
    for row in fold:
      row_copy = list(row)
      # Hide the label so the algorithm cannot read it from the test rows.
      row_copy[-1] = None
      testSet.append(row_copy)

    predicted = algorithm(trainSet, testSet, *args)
    actual = [row[-1] for row in fold]
    scores.append(getPerfomanceMatrix(actual, predicted)['accuracy'])

  return scores

In [None]:
def knn(train, test, numOfNeighbor):
  """Classify every row of ``test`` with k-nearest-neighbours.

  Args:
    train: Training rows, passed unchanged to ``predictClassification``.
    test: Iterable of rows to classify.
    numOfNeighbor: Number of neighbours consulted for each row.

  Returns:
    List of predicted labels, one per row of ``test``, in the same order.
  """
  predictions = []
  for sample in test:
    predictions.append(predictClassification(train, sample, numOfNeighbor))

  return predictions

In [None]:
def splitTrainAndEvaluation(dataset):
  """Split ``dataset`` into a training part and a leading validation part.

  The first ``ceil(10%)`` rows become the validation set and the last
  ``floor(90%)`` rows become the training set.

  Args:
    dataset: Sliceable sequence of rows; each validation row must have a
      value at index 4, which is read as its label.

  Returns:
    Tuple ``(trainSet, validationSet, validationLabels)`` where
    ``validationSet`` holds copies of the validation rows and
    ``validationLabels`` holds the value at index 4 of each of them.
  """
  TRAIN_SET_PERCENTAGE = 0.9
  VALIDATION_SET_PERCENTAGE = 1 - TRAIN_SET_PERCENTAGE
  LABEL_COLUMN = 4

  DATASET_COUNT = len(dataset)

  TRAIN_SET_SIZE = math.floor(DATASET_COUNT * TRAIN_SET_PERCENTAGE)
  VALIDATION_SET_SIZE = math.ceil(DATASET_COUNT * VALIDATION_SET_PERCENTAGE)

  validationResult = dataset[0:VALIDATION_SET_SIZE]
  # Use an explicit start index: with a size of 0, dataset[-0:] would return
  # the whole dataset and leak the validation rows into the training set.
  trainSet = dataset[DATASET_COUNT - TRAIN_SET_SIZE:]

  # Copy the rows so callers can modify them without touching ``dataset``.
  validationSet = [list(row) for row in validationResult]

  return trainSet, validationSet, [row[LABEL_COLUMN] for row in validationResult]

In [None]:
def main():
  """Run the whole KNN workflow and report the results.

  Steps, in order:
    1. Load the 'train' sheet and split it into training and validation rows.
    2. Keep only the last four columns of each row (drops the ID column).
    3. Cross-validate KNN on the training rows and print per-fold accuracy.
    4. Predict the validation rows and print accuracy, recall and precision
       together with a per-row table.
    5. Load the 'test' sheet, predict a label for every row, print them and
       write them to 'test_results.xlsx'.
    6. Trigger a browser download of that file through Google Colab and
       print the elapsed wall-clock time.
  """
  print("Hello, mom!")

  start = time.time()

  DATASET_URL = "https://raw.githubusercontent.com/mrandika/CII2M3_INTRO-AI_Learning/main/traintest.xlsx"
  # Local alternative:
  #DATASET_URL = "/content/traintest.xlsx"
  N_FOLDS = 3
  K_COUNT = 4

  # Plain dataset
  rawDataset = loadDataset(DATASET_URL, 'train').to_numpy()
  dataset = [[value for value in row] for row in rawDataset]

  # Get train and validation set
  trainSet_dirty, validationSet_dirty, validationResult = splitTrainAndEvaluation(dataset)

  # Cleansing, Remove ID column
  trainSet = [row[-4:] for row in trainSet_dirty]
  validationSet = [row[-4:] for row in validationSet_dirty]

  # Learning Evaluation
  scores = evaluate(trainSet, knn, N_FOLDS, K_COUNT)

  print("\n--- Train Set, Model Evaluation ---")
  print("Data count:", len(trainSet))
  for i in range(N_FOLDS):
    print("Fold No.", i+1, "scores:", scores[i])
  print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

  # Validation: validationResult holds the true labels, validationResults
  # holds the labels predicted by KNN.
  validationResults = knn(trainSet, validationSet, K_COUNT)
  perfomanceMatrix = getPerfomanceMatrix(validationResult, validationResults)

  print("\n--- Validation Set ---")
  print("Data count:", len(validationSet))
  for key, value in perfomanceMatrix.items():
    print("{}: {}".format(key, value), sep='')

  # In this table "Actual" is the model's prediction and "Expected" is the
  # ground-truth label taken from the dataset.
  print("{:<5} {:<5} {:<5} {:<5} {:<7} {:<7}".format("ID", "x1", "x2", "x3", "Actual", "Expected"))
  for i in range(len(validationResults)):
    print("{:<5} {:<5} {:<5} {:<5} {:<7} {:<7}".format(validationSet_dirty[i][0], validationSet[i][0], validationSet[i][1], validationSet[i][2], validationResults[i], validationResult[i]))

  # Test Set
  testData = loadDataset(DATASET_URL, 'test')
  # Convert the frame once; the IDs are read from this array several times
  # below, and converting inside each loop iteration repeats the same work.
  testRows = testData.to_numpy()
  testSet = [[value for value in row[-4:]] for row in testRows]
  testResults = knn(trainSet, testSet, K_COUNT)

  # Store each prediction in the label slot of its (copied) test row.
  for i in range(len(testResults)):
    testSet[i][3] = testResults[i]

  print("\n--- Test Set ---")
  print("Data count:", len(testSet))

  print("{:<5} {:<5} {:<5} {:<5} {:<7}".format("ID", "x1", "x2", "x3", "Given Label"))
  for i in range(len(testResults)):
    print("{:<5} {:<5} {:<5} {:<5} {:<7}".format(testRows[i][0], testSet[i][0], testSet[i][1], testSet[i][2], testSet[i][3]))

  # Recap
  results = [[testRows[i][0], testSet[i][0], testSet[i][1], testSet[i][2], testSet[i][3]] for i in range(len(testSet))]

  resultDataFrame = pd.DataFrame(results, columns=["ID", "x1", "x2", "x3", "y"])
  resultDataFrame.to_excel('test_results.xlsx')

  # ⚠️ WARNING: Disable this if you're running locally (not on Google Colab)
  files.download('test_results.xlsx')

  print("\nElapsed:", time.time() - start)

In [None]:
# Entry point: run the full pipeline only when this code is executed directly
# (as a notebook cell or script), not when it is imported as a module.
if __name__ == "__main__":
  main()

Hello, mom!

--- Train Set, Model Evaluation ---
Data count: 200
Fold No. 1 scores: 75.75757575757575
Fold No. 2 scores: 81.81818181818183
Fold No. 3 scores: 72.72727272727273
Mean Accuracy: 76.768%

--- Validation Set ---
Data count: 30
accuracy: 63.33333333333333
recall: 76.19047619047619
precision: 72.72727272727273
ID    x1    x2    x3    Actual  Expected
1     60    64    0     1       1      
2     54    60    11    0       0      
3     65    62    22    1       0      
4     34    60    0     1       1      
5     38    69    21    1       0      
6     33    58    10    1       1      
7     63    61    0     1       1      
8     57    64    0     1       1      
9     46    58    3     1       1      
10    43    65    0     1       1      
11    60    59    17    0       0      
12    70    59    8     0       1      
13    69    60    0     0       1      
14    57    61    5     1       0      
15    67    61    0     0       1      
16    68    67    0     1       1     

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Elapsed: 0.3223278522491455
