# Custom K-Nearest Neighbours
Classifies patients comparing them to others

In [1]:
import pandas as pd 
import numpy as np
import math
import seaborn as sns
import matplotlib.pyplot as plt
from alive_progress import alive_bar
from IPython.display import clear_output


import threading, queue

%matplotlib widget

import common
from Data import JoinedData

cleanData = common.loadFile("CleanedData")
allData = JoinedData().data.drop(['LDELTOTAL', 'DIGITSCOR', 'PTEDUCAT', 'PTGENDER', 'AGE'], axis=1)  

**Removing cleanedData from allData**

In [2]:
for i, j in zip(cleanData.RID, cleanData.VISCODE):
    allData = allData[(allData.RID != i) & (allData.VISCODE != j)]
    
cleanData = cleanData.drop(['RID', 'VISCODE'], axis=1) # Fields not used
allData = allData[(allData.DX.notna()) & (allData.DX != '')] # Remove rows without diagnosis
allData = allData.drop([x for x in list(allData.columns) if x not in list(cleanData.columns)], axis=1) # Drops fields that don't exist in cleanData
allData.replace('', np.NaN, inplace=True)

In [3]:
def incompleteFitting(clean: pd.DataFrame, dirty: pd.DataFrame, k: int, missingColumns: int) -> list:

    # This algorithm is not optimized, runing from the end, we can exclude rows. A row that is missing 10 labels is also missing 9 to 0 labels.
    print(f"Calculating for missing columns: {missingColumns}")
    frequent = []
    for _, i in dirty.iterrows():
        labels = list(i[i.isna()].index) # Get nans from the current line

        if len(labels) >= missingColumns:
            continue

        row = i[(i.notna())].astype('float') # Drops nans from the current line
        currentCleanData = clean.drop(clean.filter(labels), axis=1).astype('float') # Drops nans from the complete dataset
        labels = list(i[i.notna()].index) # Get labels from the dataset in use
        labels.remove('DX')
        for n in labels:
            currentCleanData[n] = row[n] - currentCleanData[n]
        totals = sorted([list(x) for x in zip(currentCleanData.apply(lambda x: (x**2).sum(), axis=1), currentCleanData.DX.to_list())])
        best = [x[1] for x in totals[:k]] # gets DX from Totals

        best = max(set(best), key = best.count)

        frequent.append([best, row.DX, best == row.DX])
    return frequent

Since processing each interaction takes forever, the way was to create a DataFrame of nans and process and save each interaction. 
This way we can pick up where we left of and keep going from there.

In [4]:
try:
    common.loadFile("Accuracy")
except:
    accuracy = pd.DataFrame(np.nan, index=range(len(list(allData[allData.isna()].T.index))), columns=["Accuracy"])
    common.saveFile(accuracy, "Accuracy")

## Here we are going to use threads

In [5]:
# To prevent multiple writes on the file at the same time, we will use a semaphore
class incompleteCalc:
    def __init__(self, number_threads: int):
        self.KNNnumber = 5
        self.mutex = threading.Lock() # same as Semaphore started at 1
        self.stepComplete = 0 # Variable used for the progress bar
        self.number_threads = number_threads # Number of tasks in parallel
        self.queue = queue.Queue() # Queue to complete all tasks
        # ----- Initializing ----
        self.fillQueue()
        self.startThreads()
        
    
    def readFile(self) -> pd.DataFrame:
        return common.loadFile("Accuracy")

    def writeResult(self, res: float, index:int) -> None:
        """Saves the result to a file

        Args:
            res (float): result from the row
            index (int): index of the row
        """
        self.mutex.acquire()
        # Reading and wrinting the file in this case is a critical part of the code, mutex will prevent parallel writing

        accuracy = self.readFile()
        accuracy.Accuracy[index] = res
        common.saveFile(accuracy, "Accuracy")

        self.mutex.release()

    def getTasks(self) -> list:
        # Gets the indexes of all lines yet to be calculated
        # Since this class takes arround 72 hours to complete for 189 tasks, we are saving each step. 
        #This way we can cancel or stop the algorithm and pick up where we've left of
        self.mutex.acquire()
        file = self.readFile()
        self.mutex.release()
        return file[file.Accuracy.isna()].index.tolist() # Returns the indexes of NaNs


    def task(self) -> None:
        self.KNNnumber = 5 # Number of neighbours to count
        while not self.queue.empty(): # Keeps the task alive untill the queue runs out of tasks
            index = self.queue.get(block=True, timeout=0.05)
            res = incompleteFitting(cleanData, allData, self.KNNnumber, index)
            accuracy = [x[2] for x in res].count(True)/len(res) if len(res) != 0 else 0
            self.writeResult(accuracy, index)
            self.stepComplete += 1
            self.queue.task_done()


    def startThreads(self) -> None:
        # Creates workers for each work, bar is the thread for the progress bar, the others are the tasks to calculate for each missing columns
        bar = threading.Thread(target=self.updater)
        threads = [threading.Thread(target=self.task, daemon=True) for _ in range(self.number_threads)]
        bar.start()
        [thread.start() for thread in threads]
        [thread.join() for thread in threads]
        bar.join()

    def fillQueue(self) -> None:
        # Places all indexes to be calculated on the queue
        missing = self.getTasks()
        for i in missing:
            self.queue.put(i)
        print(f"Queue size: {self.queue.qsize()}")
        
    # A progress bar
    def updater(self) -> None:
        print("Alive Bar Starting...\n")
        current = 0
        with alive_bar(self.queue.qsize(), title="Processing: ", force_tty=True) as bar:
            while not self.queue.empty():
                if current < self.stepComplete:
                    current += 1
                    bar()
                time.sleep(10) # updates the bar each 10 secs - This task is not important so doesen't need to be always on the processor
    
    

In [6]:
acc = incompleteCalc(4).readFile()

Queue size: 0
Alive Bar Starting...

Processing:  |████████████████████████████████████████| 0 in 0.1s (0.00/s)                                                                                                                                          


In [7]:
acc

Unnamed: 0,Accuracy
0,0.000000
1,0.467459
2,0.474209
3,0.479954
4,0.481859
...,...
184,0.511346
185,0.511346
186,0.511346
187,0.511346


In [11]:
plt.figure(figsize = [20, 10])
sns.scatterplot(data=acc).set(title="Accuracy Plot", xlabel="Missing Columns", ylabel="Accuracy")

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

[Text(0.5, 1.0, 'Accuracy Plot'),
 Text(0.5, 0, 'Missing Columns'),
 Text(0, 0.5, 'Accuracy')]