In [0]:
import numpy as np
from collections import Counter

In [0]:
def preprocess(text):
  """
  builds the token-frequency profile of a text

  :param text: str, raw text; tokens are produced by splitting on whitespace
  :return: collections.Counter, token frequencies
  """
  tokens = text.split()
  profile = Counter(tokens)
  return profile

def make_counter(commons):
    """
    rebuilds a Counter from (token, count) pairs

    :param commons: iterable of indexable items holding the token at index 0
        and its count at index 1, e.g. the output of Counter.most_common
    :return: collections.Counter, each token mapped to its count
    """
    rebuilt = Counter()
    for pair in commons:
        # plain assignment: a repeated token keeps the count seen last
        rebuilt[pair[0]] = pair[1]
    return rebuilt

In [0]:
def distance(profile1, profile2):
  """
  calculates distance between 2 document profiles

  Every token found in either profile contributes the square of the
  difference between its two frequencies divided by their mean. A token
  missing from one profile counts as frequency 0 in that profile.

  :param profile1: collections.Counter, token frequencies in profile 1
  :param profile2: collections.Counter, token frequencies in profile 2
  :return: float, distance (0 when there is nothing to compare)
  """
  vocab = set(profile1.keys()).union(set(profile2.keys()))
  diffs = []
  for word in vocab:
    # .get keeps the "missing means 0" rule for plain dict profiles as well
    fp1 = profile1.get(word, 0)
    fp2 = profile2.get(word, 0)
    if fp1 == 0 and fp2 == 0:
      # a key stored with a zero count on both sides has no relative
      # difference to measure; dividing by the zero mean would raise
      continue
    mean_freq = (fp1 + fp2) / 2
    diffs.append(((fp1 - fp2) / mean_freq) ** 2)
  return np.sum(diffs)

In [0]:
def radius(di, u, A):
  """
  distance from a known document to an unknown one, scaled by the largest
  distance from that known document to any document in A

  :param di: collections.Counter, token frequencies in a known document i
  :param u: collections.Counter, token frequencies in an unknown document
  :param A: list of collections.Counter, token frequencies of all known documents
  :return: float, distance(di, u) divided by the maximum of distance(di, ai) over A
  """
  to_unknown = distance(di, u)
  to_known = [distance(di, known) for known in A]
  return to_unknown / np.max(to_known)

In [0]:
def radius_distance(u, A):
  """
  averages the radius between an unknown document and each known document

  :param u: collections.Counter, token frequencies in an unknown document
  :param A: list of collections.Counter, token frequencies of all known documents
  :return: float, mean of radius(di, u, A) over every di in A
  """
  return np.mean([radius(known, u, A) for known in A])

In [0]:
class SingleClassifier():
    """
    Assigns a document profile to the class whose known profiles are closest
    to it, as measured by radius_distance.
    """

    def __init__(self, profiles, y_true, p_length):
        """
        :param profiles: list of collections.Counter, token frequencies of the known documents
        :param y_true: sequence of class labels, aligned with profiles by position
        :param p_length: int, number of most frequent tokens kept in each profile
        """
        self.y_true = y_true
        self.p_length = p_length
        # truncate reads self.p_length, so it must be assigned before this call
        self.profiles = self.truncate(profiles)

    def truncate(self, profiles):
        """
        Cuts every profile down to its self.p_length most frequent tokens.

        :param profiles: list of collections.Counter, token frequencies
        :return: list of collections.Counter, truncated profiles in the same order
        """
        return [make_counter(profile.most_common(self.p_length)) for profile in profiles]

    def forward_one(self, x):
        """
        Scores one profile against every class and picks the closest class.

        For each class, the stored profiles of that class that are not equal
        to x are used as the known documents, so a stored profile is never
        compared with itself. Exclusion is by value: every stored profile
        equal to x is left out.

        NOTE(review): x is used as given and is not truncated to p_length,
        unlike the stored profiles - confirm callers pass comparable profiles.

        :param x: collections.Counter, token frequencies of the document to classify
        :return: tuple (label, class_dist); class_dist maps each class label to
            its radius_distance from x, label is the key with the smallest value
        """
        labels = np.array(self.y_true)
        class_dist = {}
        for c in np.unique(labels):
            # Select all members of this class except x.
            members = [
                profile
                for profile, label in zip(self.profiles, labels)
                if label == c and profile != x
            ]
            if members:
                class_dist[c] = radius_distance(x, members)
            else:
                # Nothing left to compare against. Averaging an empty list
                # would give NaN, and NaN never compares as larger, so min()
                # below could select this class. Score it as unreachable.
                class_dist[c] = float("inf")

        return min(class_dist, key=class_dist.get), class_dist

    def forward_all(self):
        """
        Classifies every stored profile.

        :return: list of forward_one results, one per stored profile, in order
        """
        return [self.forward_one(x) for x in self.profiles]

In [0]:
# Toy corpus: four documents of the first class, then four of the second.
# NOTE: 'velocaraptor' and 'veloceraptor' are spelled differently and are
# therefore counted as two distinct tokens.
texts = [
    'I am so sad I am so tired',
    'I am so useless and crying and tired so tired so tired',
    'I am so sad so sad so sad',
    'I am so sad and so tired',
    'Wow a velocaraptor',
    'She is no longer a veloceraptor',
    'I wish we could be a velocaraptor',
    'velocaraptor veloceraptor veloceraptor',
]

# Class labels, aligned with `texts` by position.
ys = ['sad'] * 4 + ['dino'] * 4

# One token-frequency profile per document.
profs = list(map(preprocess, texts))

In [135]:
# Keep at most the 10 most frequent tokens of every profile.
cl = SingleClassifier(profiles=profs, y_true=ys, p_length=10)
# Classify the first document of the corpus.
cl.forward_one(preprocess(texts[0]))

('sad', {'dino': 1.0155122655122655, 'sad': 0.5214161917552966})

In [136]:
# Classify every stored profile in turn; each result is a (label, class distances) tuple.
cl.forward_all()

[('sad', {'dino': 1.0155122655122655, 'sad': 0.5214161917552966}),
 ('dino', {'dino': 1.2467532467532467, 'sad': 2.2174809624122838}),
 ('sad', {'dino': 0.8961038961038962, 'sad': 0.7055166523861863}),
 ('sad', {'dino': 1.1298701298701297, 'sad': 0.4696971554169289}),
 ('dino', {'dino': 0.5367965367965367, 'sad': 1.9547890946953117}),
 ('dino', {'dino': 1.2037037037037037, 'sad': 2.6415204838078177}),
 ('dino', {'dino': 1.1913419913419914, 'sad': 2.418766139952783}),
 ('dino', {'dino': 0.5401635401635402, 'sad': 1.7258786316578099})]