<a href="https://colab.research.google.com/github/ovbystrova/Interference/blob/master/Class.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [Author Verification Using Common N-Gram Profiles of Text Documents](https://www.aclweb.org/anthology/C14-1038.pdf)

In [0]:
import numpy as np
from collections import Counter
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

## Distances

### Implement the distance formula from the article:
$$D(P_1,P_2)=\sum_{x\in(P_1 \cup P_2)}\left(\frac{fP_1(x)-fP_2(x)}{\frac{1}{2} (fP_1(x)+fP_2(x))}\right) ^2$$

where $P_1$ and $P_2$ are n-gram profiles of 2 documents,  and $fP_i(x)$ - length-normilized frequency of an n-gram $x$ in a profile $P_i$

In [0]:
def distance(profile1, profile2):
  """
  calculates distance between 2 document profiles

  :param profile1: collections.Counter, token frequencies in profile 1
  :param profile2: collections.Counter, token frequencies in profile 2
  :return: float, distance
  """
  vocab = set(profile1.keys()).union(set(profile2.keys()))
  diffs = []
  for word in vocab:
    fp1 = profile1[word]
    fp2 = profile2[word]
    diff = ((fp1 - fp2)/((fp1 + fp2)/2)) ** 2
    diffs.append(diff)
  return np.sum(diffs)

### Implement the radius distance form the article:
$$r(d_i, u, A) = \frac{D(d_i, u)}{D^{max}(d_i, A)}$$
where $D^{max}(d_i, A)$ is $max_{j=1}^{m=|A|}(d_i, A_j)$, or the maximal distance between a known-author document $d_i$ and all other documents from known-author collection $A$

In [0]:
def radius(di, u, A):
  """
  compares proximity between an unknown document and a known one
  :param di: collections.Counter, token frequencies in a known document i
  :param u: collections.Counter, token frequencies in an unknown document
  :param A: list of collections.Counter, token frequencies of all know documents
  :return: float, distance
  """
  return(distance(di, u)/np.max([distance(di, ai) for ai in A]))

In [0]:
def radius_distance(u, A):
  """
  compares proximity between an unknown document and all known ones
  :param u: collections.Counter, token frequencies in an unknown document
  :param A: list of collections.Counter, token frequencies of all know documents
  :return: float, distance
  """
  dists = []
  for di in A:
    dists.append(radius(di, u, A))
  return np.mean(dists)

## Compile a singe classifier

In [0]:
class SingleClassifier():
    """
    Classifies a texts as belonging to several classes given true classes

    :param profiles: list of collection.Counter, n-gram profiles
    :param y_true: list of str or int, true classes of each text from profiles
    :param p_length: int, length of truncated profile
    """
    def __init__(self, profiles, y_true, p_length):
        self.y_true = y_true
        self.p_length = p_length  
        self.profiles = self.truncate(profiles, p_length)
    
    @staticmethod
    def truncate(profiles, p_length):
        """
        truncates a profile to a given length specified by p_length

        :param profiles: list of collection.Counter, n-gram profiles
        :param p_length: int, length of truncated profile, 
                         if more than actual profile length, remains unchanged
        :return: list of collection.Counter, text n-gram profiles truncated to
                 the length of p_length
        """
        profiles = [profile.most_common(p_length) for profile in profiles]
        profiles = [make_counter(profile) for profile in profiles]
        return profiles
    
    def forward_one(self, x):  
        """
        predicts the class of a given n-gram profile x

        :param x: collection.Counter, n-gram profile to predict the class of
        :return y: str or int, predicted class
        :return class_dist: dict of {str or int: float}, proximity to each class
        """
        classes = np.unique(np.array(self.y_true))
        class_dist = {}
        for c in classes:
            y_ids = np.where(np.array(self.y_true)==c)  # Выбрать все мемберы этого класса кроме х
            y_c = np.array(self.profiles)[y_ids]
            
            if x in y_c:
                y_c = np.delete(y_c, np.argwhere(y_c==x))
            distance = radius_distance(x, y_c)  
            class_dist[c] = distance
        
        return min(class_dist, key=class_dist.get), class_dist

    def forward_all(self):
        """
        predicts the class of each profile form profiles

        :return: list of tuples (y, class_dict)
        :type y: str or int, predicted class
        :type class_dist: dict of {str or int: float}, proximity to each class
        """
        return [self.forward_one(x) for x in self.profiles]

### Test the single classifier

#### Preprocessing for simulated data

In [0]:
def preprocess(text):
  return(Counter(text.split()))

def make_counter(commons):
    return Counter({x[0]: x[1] for x in commons})

#### Simulate data

In [0]:
texts = ['I am so sad I am so tired', 'I am so useless and crying and tired so tired so tired', 
         'I am so sad so sad so sad', 'I am so sad and so tired',
         
         'Wow a velocaraptor', 'She is no longer a veloceraptor',
         'I wish we could be a velocaraptor', 'velocaraptor veloceraptor veloceraptor']

ys = ['sad', 'sad', 'sad', 'sad', 'dino', 'dino', 'dino', 'dino']

profs = [preprocess(text) for text in texts]

#### Test

In [8]:
cl = SingleClassifier(profs, ys, 10)
cl.forward_one(preprocess(texts[0]))

('sad', {'dino': 1.0155122655122655, 'sad': 0.5214161917552966})

In [9]:
cl.forward_all()

[('sad', {'dino': 1.0155122655122655, 'sad': 0.5214161917552966}),
 ('dino', {'dino': 1.2467532467532467, 'sad': 2.2174809624122838}),
 ('sad', {'dino': 0.8961038961038962, 'sad': 0.7055166523861863}),
 ('sad', {'dino': 1.1298701298701297, 'sad': 0.4696971554169289}),
 ('dino', {'dino': 0.5367965367965367, 'sad': 1.9547890946953117}),
 ('dino', {'dino': 1.2037037037037037, 'sad': 2.6415204838078177}),
 ('dino', {'dino': 1.1913419913419914, 'sad': 2.418766139952783}),
 ('dino', {'dino': 0.5401635401635402, 'sad': 1.7258786316578099})]

#### Compute performance

In [18]:
y_pred = [x[0] for x in cl.forward_all()]
y_pred

['sad', 'dino', 'sad', 'sad', 'dino', 'dino', 'dino', 'dino']

In [17]:
cl.y_true

['sad', 'sad', 'sad', 'sad', 'dino', 'dino', 'dino', 'dino']

accuracy

In [14]:
accuracy_score(cl.y_true, y_pred)

0.875

precision

In [20]:
precision_score(cl.y_true, y_pred, pos_label='dino')

0.8

In [21]:
precision_score(cl.y_true, y_pred, pos_label='sad')

1.0

recall

In [24]:
recall_score(cl.y_true, y_pred, pos_label='dino')

1.0

In [23]:
recall_score(cl.y_true, y_pred, pos_label='sad')

0.75

$f_1$score

In [26]:
f1_score(cl.y_true, y_pred, pos_label='sad')

0.8571428571428571

In [27]:
f1_score(cl.y_true, y_pred, pos_label='dino')

0.888888888888889

ROC AUC score

In [0]:
def softmax(scores, inverse=True):
  """
  normilizes class scores, making them correspond to probability, adding up to 1

  :param scores: list of int or float, scores
  :inverse: bool, if lower scores indicate higher probability,
            optional, default True
  :return: list of int or float, normilized probability-like scores
  """
  if inverse:
    scores = [-1*score for score in scores]
  normalizing_constant = np.sum(scores)
  normalized_scores = [score/normalizing_constant for score in scores]
  return normalized_scores

In [0]:
y_score = [[el[1] for el in sorted(x[1].items())] for x in cl.forward_all()]  # get scores from dict
y_score = [softmax(scores) for scores in y_score]  # normilize scores

In [60]:
y_score

[[0.6607414032255609, 0.3392585967744391],
 [0.359892885837406, 0.640107114162594],
 [0.5594982512859819, 0.4405017487140182],
 [0.7063598638599082, 0.29364013614009177],
 [0.2154437439403306, 0.7845562560596694],
 [0.3130386279200781, 0.6869613720799219],
 [0.3300017473201595, 0.6699982526798405],
 [0.23837311894746394, 0.7616268810525361]]

In [53]:
cl.y_true

['sad', 'sad', 'sad', 'sad', 'dino', 'dino', 'dino', 'dino']

In [58]:
y_true = [[1, 0] if label == 'sad' else [0, 1] for label in cl.y_true]
y_true

[[1, 0], [1, 0], [1, 0], [1, 0], [0, 1], [0, 1], [0, 1], [0, 1]]

In [59]:
roc_auc_score(y_true, y_score, multi_class='ovo')

1.0

In [61]:
roc_auc_score(y_true, y_score, multi_class='ovr')

1.0

### **TODO: Test on real data**

## Comile an ensemble classifier 

Создаем ансамбль-класс. Он состоит из многих сингл классифаеров. Они отличаются тем, что мы подаем в них разные профайлы (длина энграмм, тип энграмм). Кроме того, у них разные гиперпараметры. Мы создаем Singleclassifier на каждое возможное сочетание гиперпараметров. Пространство гиперпараметров уже определено. 

Options space:
- size of N-grams (n)
    - from 3 to 10 for characters
    - from 1 to 3 for words
- size of a profile Number of n-grams (L) 200, 500, 1000, 1500, 2000, 2500, 3000. 

Каждый классифаер предиктит лейбл. Мы берем по большинству. Для весов берем средний вес по каждому классу.  