<a href="https://colab.research.google.com/github/ovbystrova/Interference/blob/master/Distance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [Author Verification Using Common N-Gram Profiles of Text Documents](https://www.aclweb.org/anthology/C14-1038.pdf)


In [0]:
import numpy as np

from collections import Counter

In [0]:
def preprocess(text):
  """
  turns a raw text into a document profile

  :param text: str, document to profile; tokens are whitespace-separated
  :return: collections.Counter, token frequencies in the document
  """
  tokens = text.split()
  profile = Counter(tokens)
  return profile

In [0]:
# Group 1: short sentences about being sad / tired
text0 = 'I am so sad I am so tired'
text1 = 'I am so tired so tired so tired'
text2 = 'I am so sad so sad so sad'
text3 = 'I am so sad and so tired'

# Group 2: short sentences that all contain the token 'veloceraptor'
# (tokens are whitespace-split strings used as-is, so the spelling is
# part of the data and is left untouched)
text4 = 'I am a veloceraptor'
text5 = 'I am no loonger a veloceraptor'
text6 = 'I wish I could be a veloceraptor'
text7 = 'I am a huge veloceraptor'

In [0]:
def distance(profile1, profile2):
  """
  calculates distance between 2 document profiles

  The distance is the sum, over every token present in either profile, of
  the squared difference of the two frequencies relative to their mean:
  ((f1 - f2) / ((f1 + f2) / 2)) ** 2. A token missing from one profile has
  frequency 0 there and therefore contributes 4.

  :param profile1: collections.Counter (or any mapping token -> count), token frequencies in profile 1
  :param profile2: collections.Counter (or any mapping token -> count), token frequencies in profile 2
  :return: float, distance (0.0 when neither profile holds any token)
  """
  vocab = set(profile1) | set(profile2)
  diffs = []
  for word in vocab:
    # .get treats a missing token as frequency 0 and also works for plain dicts
    fp1 = profile1.get(word, 0)
    fp2 = profile2.get(word, 0)
    if fp1 == 0 and fp2 == 0:
      # a Counter may keep keys whose count is 0; such a token is absent
      # from both documents and would otherwise divide 0 by 0
      continue
    diff = ((fp1 - fp2) / ((fp1 + fp2) / 2)) ** 2
    diffs.append(diff)
  return np.sum(diffs)

In [24]:
# text1 and text2 agree on 'I', 'am' and 'so'; they differ only in 'tired'
# vs 'sad' (3 occurrences in one text, 0 in the other): 2 tokens * 2**2 = 8
distance(preprocess(text1), preprocess(text2))

8.0

In [0]:
def radius(di, u, A):
  """
  compares proximity between an unknown document and a known one

  The distance between the known and the unknown document is divided by the
  largest distance between the known document and any document in A.

  :param di: collections.Counter, token frequencies in a known document i
  :param u: collections.Counter, token frequencies in an unknown document
  :param A: iterable of collections.Counter, token frequencies of all known documents
  :return: float, distance(di, u) relative to the maximum of distance(di, ai) over A
  :raises ValueError: if A contains no documents
  """
  known_dists = [distance(di, ai) for ai in A]
  if not known_dists:
    # np.max on an empty list fails with an opaque "zero-size array" error
    raise ValueError("A must contain at least one known document")
  # NOTE: when di is at distance 0 from every document in A (e.g. A holds
  # only di itself) the denominator is 0 and the ratio is undefined
  return distance(di, u) / np.max(known_dists)

In [31]:
# text0 is the known document and text5 the unknown one; the known set
# contains text0 itself, which only adds a 0 to the candidates for the maximum
radius(preprocess(text0), preprocess(text5), [preprocess(text) for text in [text0, text1, text2, text3]])

4.775900073475386

In [0]:
def radius_distance(u, A):
  """
  averages the proximity of an unknown document to every known document

  :param u: collections.Counter, token frequencies in an unknown document
  :param A: list of collections.Counter, token frequencies of all known documents
  :return: float, mean of radius(di, u, A) over all di in A
  """
  per_document = [radius(di, u, A) for di in A]
  return np.mean(per_document)

Let us try dissimilar documents: the unknown document comes from a different group of example sentences than the known ones.

In [35]:
# unknown: text5 (a 'veloceraptor' sentence); known: the four sad/tired sentences
radius_distance(preprocess(text5), [preprocess(text) for text in [text0, text1, text2, text3]])

3.377381131905964

In [41]:
# unknown: text3 (a sad/tired sentence); known: three 'veloceraptor' sentences
radius_distance(preprocess(text3), [preprocess(text) for text in [text5, text6, text7]])

1.4444005270092228

Let us try similar documents: the unknown document comes from the same group of example sentences as the known ones.

In [37]:
# unknown: text0; known: the other three sad/tired sentences
radius_distance(preprocess(text0), [preprocess(text) for text in [text1, text2, text3]])

0.6181465308102864

In [39]:
# unknown: text4; known: the other three 'veloceraptor' sentences
radius_distance(preprocess(text4), [preprocess(text) for text in [text5, text6, text7]])

0.39855072463768115