# In [4]:
from collections import Counter
import math

class knntextclassifier:
    """k-nearest-neighbour text classifier over bag-of-words counts.

    Texts are lower-cased and split on whitespace, turned into word-count
    vectors, and compared with the squared Euclidean distance restricted to
    the vocabulary seen during training. A query receives the label that is
    most frequent among its ``k`` closest training texts.
    """

    def __init__(self, k=2):
        """Create an unfitted classifier.

        Args:
            k: Number of nearest neighbours that vote on each prediction.

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError("k must be a positive integer")
        self.k = k
        # (word-count vector, label) pairs stored by fit().
        self.train_data = []
        # Every distinct token seen in the training texts.
        self.vocab = set()

    def preprocess_text(self, text):
        """Return the lower-cased, whitespace-separated tokens of ``text``."""
        return text.lower().split()

    def vectorize(self, words):
        """Return a ``Counter`` mapping each token in ``words`` to its count."""
        return Counter(words)

    def build_vocab(self, train_x):
        """Add every token of every text in ``train_x`` to ``self.vocab``."""
        for text_input in train_x:
            self.vocab.update(self.preprocess_text(text_input))

    def fit(self, train_x, train_y):
        """Store the training set on the instance and rebuild the vocabulary.

        Args:
            train_x: Iterable of training texts.
            train_y: Iterable of labels, one per training text.

        Returns:
            A list of ``(word_count_vector, label)`` pairs, one per sample.

        Raises:
            ValueError: If ``train_x`` and ``train_y`` differ in length.
        """
        texts = list(train_x)
        labels = list(train_y)
        if len(texts) != len(labels):
            raise ValueError("train_x and train_y must have the same length")

        # Start from a fresh vocabulary so that re-fitting fully replaces the
        # previous model instead of mixing in words from older training sets.
        self.vocab = set()
        self.build_vocab(texts)
        self.train_data = [
            (self.vectorize(self.preprocess_text(text)), label)
            for text, label in zip(texts, labels)
        ]
        # Hand back a copy so callers cannot mutate the stored model.
        return list(self.train_data)

    def compute_distance(self, vec1, vec2):
        """Return the squared Euclidean distance between two count vectors.

        Only words in ``self.vocab`` contribute. The square root is omitted
        because it does not change how neighbours are ranked.
        """
        # A vocabulary word missing from both vectors contributes zero, so it
        # is enough to visit the vocabulary words that occur in either vector.
        relevant = (vec1.keys() | vec2.keys()) & self.vocab
        return sum(
            (vec1.get(word, 0) - vec2.get(word, 0)) ** 2 for word in relevant
        )

    def predict(self, test_x):
        """Predict a label for every text in ``test_x``.

        Args:
            test_x: Iterable of texts to classify.

        Returns:
            A list with one predicted label per input text.

        Raises:
            ValueError: If no training data has been stored by ``fit``.
        """
        if not self.train_data:
            raise ValueError(
                "fit() must be called with training data before predict()"
            )

        allres = []
        for test_input in test_x:
            query = self.vectorize(self.preprocess_text(test_input))
            scored = [
                (self.compute_distance(vector, query), label)
                for vector, label in self.train_data
            ]
            # Stable sort on distance only: equally distant samples keep
            # their training order and labels never need to be comparable.
            scored.sort(key=lambda pair: pair[0])
            # Slicing tolerates k larger than the number of training samples.
            votes = Counter(label for _, label in scored[:self.k])
            # On a tied vote the label of the nearest neighbour wins, since
            # max() returns the first maximal key in insertion order.
            allres.append(max(votes, key=votes.get))
        return allres





# Toy corpus: two "tech" documents and one "pet" document.
train_x = [
    'this is kk techology',
    'this is kk techology 2',
    'pet pet pet ',
]
train_y = ['tech', 'tech', 'pet']

# Unseen documents to classify.
test_x = ['kk techology ', 'pet pet pet']

# Train a 2-nearest-neighbour classifier, then label the test documents.
knn_instance = knntextclassifier(k=2)
knn_instance.fit(train_x, train_y)
knn_instance.predict(test_x)

# Out[4]: ['tech', 'pet']