# Edit Distance


In [1]:
import numpy as np
from spacy.language import Language
from spacy.tokenizer import Tokenizer
from spacy.vocab import Vocab
from random import randrange

# edit distance function

An optional step when building an edit distance function is to define a delta function that compares two characters and to pass it as a parameter to the edit distance function. For the most basic algorithm, the delta function returns 0 when the characters are equal (their distance is 0) and 1 otherwise.

In [2]:
def delta(a, b):
    """Unit-cost comparison of two symbols.

    Returns 0 when ``a`` and ``b`` compare equal and 1 otherwise, so every
    substitution of one symbol for a different one costs exactly one edit.
    """
    return 0 if a == b else 1

Then we create the edit distance function that compares two strings, T1 and T2.

In [3]:
def edit_distance(T1, T2, fdelta):
    """Fill the edit-distance table between two sequences.

    Parameters
    ----------
    T1, T2 : indexable sequences (e.g. strings or lists)
        Source and target sequences.
    fdelta : callable
        ``fdelta(a, b)`` returns the cost of replacing element ``a`` with
        element ``b``.

    Returns
    -------
    RT : numpy.ndarray, shape (len(T1) + 1, len(T2) + 1)
        ``RT[i, j]`` is the cost of turning ``T1[:i]`` into ``T2[:j]``;
        ``RT[-1, -1]`` is the edit distance.
    parents : numpy.ndarray, shape (len(T1) + 1, len(T2) + 1, 2)
        ``parents[i, j]`` holds the (row, column) of the cell from which
        ``RT[i, j]`` was reached; following it from the last cell leads back
        to ``(0, 0)``.
    """
    rows, cols = len(T1), len(T2)

    # first column: delete i leading elements of T1; first row: insert j elements of T2
    RT = np.zeros((rows + 1, cols + 1))
    RT[:, 0] = np.arange(rows + 1)
    RT[0, :] = np.arange(cols + 1)

    # border cells have a single possible predecessor:
    #   (i, 0) comes from (i - 1, 0)  and  (0, j) comes from (0, j - 1)
    # (0, 0) points at itself and terminates every backtrace
    parents = np.zeros((rows + 1, cols + 1, 2))
    parents[1:, 0, 0] = np.arange(rows)
    parents[0, 1:, 1] = np.arange(cols)

    # fill the interior using the already computed neighbours above, left and diagonal
    for i in range(rows):
        for j in range(cols):
            a = RT[i, j + 1] + 1  # from the cell above: T1[i] is deleted
            b = RT[i + 1, j] + 1  # from the cell to the left: T2[j] is inserted
            c = RT[i, j] + fdelta(T1[i], T2[j])  # diagonal: keep or substitute

            # ties are resolved in favour of the diagonal, then of deletion
            if c <= a and c <= b:
                RT[i + 1, j + 1] = c
                parents[i + 1, j + 1] = np.array([i, j])
            elif b < a:
                RT[i + 1, j + 1] = b
                parents[i + 1, j + 1] = np.array([i + 1, j])
            else:
                RT[i + 1, j + 1] = a
                parents[i + 1, j + 1] = np.array([i, j + 1])

    # both tables are returned so callers can reconstruct the edit path
    return RT, parents

# visualisation of edit distance function

To visualize the changes that have to be applied to one string to change it into the other, we need a visualisation function.
The function takes two strings and a delta function as input and computes the result table and the parents table with the previously implemented edit distance function.

In [4]:
def visualise(T1, T2, fdelta):
    """Print, step by step, how T1 is edited into T2 and return the distance.

    The tables are computed with ``edit_distance(T1, T2, fdelta)``. The source
    string is printed first, then one line per change / delete / insert
    operation showing the string after that operation. The return value is
    the bottom-right entry of the result table.
    """
    RT, parents = edit_distance(T1, T2, fdelta)

    # follow the parent pointers from the last cell back to (0, 0),
    # then flip the list so the path runs from the start of the strings
    path = []
    row, col = len(T1), len(T2)
    while (row, col) != (0, 0):
        path.append((row, col))
        parent = parents[row, col]
        row, col = int(parent[0]), int(parent[1])
    path.append((0, 0))
    path.reverse()

    # working copy of the source that is edited along the path
    current = T1
    print("source", current)

    # shift = (insertions - deletions) applied so far; it converts an index
    # into T1 into the matching index of the partially edited string
    shift = 0
    for (prev_row, prev_col), (row, col) in zip(path, path[1:]):
        moved_down = row == prev_row + 1
        moved_right = col == prev_col + 1

        if moved_down and moved_right:
            # diagonal step: only a real substitution is worth printing
            if T1[row - 1] != T2[col - 1]:
                pos = row - 1 + shift
                current = current[:pos] + T2[col - 1] + current[pos + 1 :]
                print("change", current)
        elif moved_down:
            # vertical step: a character of T1 is dropped
            shift -= 1
            pos = row + shift
            current = current[:pos] + current[pos + 1 :]
            print("delete", current)
        else:
            # horizontal step: a character of T2 is added
            shift += 1
            pos = row - 1 + shift
            current = current[:pos] + T2[col - 1] + current[pos:]
            print("insert", current)

    # the edit distance is handed back so the caller can print it
    return RT[len(T1), len(T2)]

In [5]:
# Print each edit step turning "los" into "kloc", followed by the returned edit distance.
print(visualise("los", "kloc", delta))

source los
insert klos
change kloc
2.0


In [6]:
# Same demonstration for a word with Polish diacritics and its plain-ASCII spelling.
print(visualise("Łódź", "Lodz", delta))

source Łódź
change Lódź
change Lodź
change Lodz
3.0


In [7]:
# Print the edit steps from "kwintesencja" to "quintessence", then the edit distance.
print(visualise("kwintesencja", "quintessence", delta))

source kwintesencja
change qwintesencja
change quintesencja
insert quintessencja
delete quintessenca
change quintessence
5.0


In [8]:
# Print the edit steps between two equal-length strings over the letters A, C, G, T, then the edit distance.
print(visualise("ATGAATCTTACCGCCTCG", "ATGAGGCTCTGGCCCCTG", delta))

source ATGAATCTTACCGCCTCG
change ATGAGTCTTACCGCCTCG
change ATGAGGCTTACCGCCTCG
insert ATGAGGCTCTACCGCCTCG
change ATGAGGCTCTGCCGCCTCG
change ATGAGGCTCTGGCGCCTCG
change ATGAGGCTCTGGCCCCTCG
delete ATGAGGCTCTGGCCCCTG
7.0


In [9]:
# Print the edit steps from "cbabac" to "abcabba", then the edit distance.
print(visualise("cbabac", "abcabba", delta))

source cbabac
change ababac
insert abcabac
change abcabbc
change abcabba
4.0


# calculating the longest common subsequence

To calculate the longest common subsequence we can reuse the edit distance algorithm, but we need to rule out the case in which a letter is changed into a different one: when looking for the longest common subsequence we can't change characters. Instead we locate the characters that appear in one string and not in the other, and vice versa. In our algorithm these letters are the ones considered deleted or inserted. We achieve this by creating a delta function that never allows the algorithm to change letters.

In [10]:
def delta2(a, b):
    """Delta function that forbids substitutions.

    Equal symbols cost 0. Different symbols cost infinity, so the edit
    distance algorithm never picks a substitution and has to express every
    difference as a deletion and/or an insertion.
    (The original note mentions a cost of 2 as an alternative to infinity.)
    """
    return 0 if a == b else float("inf")

Then we simply run the edit distance function with the new delta function and use the known formula to calculate the length of the longest common subsequence: $|LCS| = (|T_1| + |T_2| - d) / 2$, where $d$ is the edit distance computed without substitutions.

In [11]:
def lcs(T1, T2):
    """Length of the longest common subsequence of T1 and T2.

    Runs ``edit_distance`` with ``delta2`` and converts the resulting distance
    d with (len(T1) + len(T2) - d) / 2.

    Returns a tuple ``(length, RT, parents)``; the two tables are passed along
    for later reference and printing.
    """
    RT, parents = edit_distance(T1, T2, delta2)
    distance = RT[-1, -1]
    common = (len(T1) + len(T2) - distance) / 2
    return common, RT, parents

In [12]:
# Print only the first element of the lcs() result: the length of the longest common subsequence.
print(lcs("cbabac", "abcabba")[0])

4.0


# file tokenization

To tokenize the file we'll use spaCy's tokenizer. We'll create two versions of the text, each with about 3% of the tokens removed at random, and save them in two new files. We'll then use the longest common subsequence function to count the tokens the two versions have in common.

In [13]:
# reading the file; the context manager closes it as soon as the text is loaded,
# even if reading raises
with open("romeo-i-julia-700.txt", "r", encoding="utf-8") as file:
    text = file.read()

# tokenizing the text with a tokenizer built on an empty vocabulary
vocab = Language(Vocab()).vocab
tokenizer = Tokenizer(vocab)
tokens = tokenizer(text)

# creating 2 versions of the original text, each with about 3% of tokens removed:
# every token is dropped from each version independently when randrange(100) < 3
T1, T2 = [], []
for token in tokens:
    if randrange(100) >= 3:
        T1.append(token)
    if randrange(100) >= 3:
        T2.append(token)

# writing both versions to files; each file is flushed and closed before the
# long-running comparison below starts
with open("text1.txt", "w", encoding="utf-8") as f1:
    for token in T1:
        f1.write(token.text_with_ws)
with open("text2.txt", "w", encoding="utf-8") as f2:
    for token in T2:
        f2.write(token.text_with_ws)

# printing token statistics
print("number of original tokens  :", len(tokens))
print("number of tokens in text 1 :", len(T1))
print("number of tokens in text 2 :", len(T2))
d, RT, parents = lcs(T1, T2)
print("number of   common tokens  :", int(d))

number of original tokens  : 2272
number of tokens in text 1 : 2198
number of tokens in text 2 : 2209
number of   common tokens  : 2139


# diff function

To build a diff tool we'll use the longest common subsequence function. We'll be passing two arrays of strings, T1 and T2. Fortunately Python lets us compare strings easily. We'll follow the same approach as in the visualisation function for the edit distance algorithm.

In [14]:
def diff(T1, T2):
    """Print the entries that differ between the sequences T1 and T2.

    The alignment comes from ``lcs(T1, T2)``. An entry present only in T1 is
    printed as ``< index entry`` and an entry present only in T2 as
    ``> index entry``; indexes are 0-based positions in the respective
    sequence. Entries on a diagonal step of the path are not printed.
    """
    # only the parents table of the lcs result is needed here
    _, _, parents = lcs(T1, T2)

    # rebuild the path from the last cell back to (0, 0) and put it in forward order
    path = []
    row, col = len(T1), len(T2)
    while (row, col) != (0, 0):
        path.append((row, col))
        step = parents[row, col]
        row, col = int(step[0]), int(step[1])
    path.append((0, 0))
    path.reverse()

    # classify every step of the path by the direction it moved in
    for (prev_row, prev_col), (row, col) in zip(path, path[1:]):
        if row != prev_row + 1:
            # only the column advanced: the entry exists in T2 only
            print(f"> {col-1} {T2[col-1]}")
        elif col != prev_col + 1:
            # only the row advanced: the entry exists in T1 only
            print(f"< {row-1} {T1[row-1]}")
        # a diagonal step is skipped on purpose: identical lines are not printed

To use the diff tool on our files we simply open them, use the split function to break each text at every '\n' character, and then run the diff function.

In [15]:
# load both texts as lists of lines; the context managers close the files
# right after reading, so no handle stays open while diff runs or if it raises
with open("text1.txt", "r", encoding="utf-8") as f1:
    T1 = f1.read().split("\n")
with open("text2.txt", "r", encoding="utf-8") as f2:
    T2 = f2.read().split("\n")

# print the lines that differ between the two texts
diff(T1, T2)

> 0 Shakespeare
< 0 William Shakespeare
> 19  * JAN — brat z tegoż zgromadzenia
< 19  * JAN z tegoż zgromadzenia
> 22  * ABRAHAM — służący 
> 23  * 
< 22  * ABRAHAM — służący Montekiego
< 23  * APTEKARZ
> 28  * PANI MONTEKI — małżonka Montekiego
> 29  * PANI KAPULET — małżonka Kapuleta
< 28  * PANI MONTEKI — małżonka Montekiego* PANI KAPULET — małżonka Kapuleta
> 32  * Obywatele weroneńscy, różne płci obojej, liczący się do obu domów, maski, straż wojskowa i inne osoby.
< 31  * Obywatele weroneńscy, różne osoby płci obojej, liczący się do przyjaciół obu domów, maski, straż wojskowa i inne osoby.
> 37 Rzecz odbywa się przez większą część sztuki Weronie, przez część piątego aktu w Mantui.
< 36 Rzecz odbywa się przez większą część sztuki w Weronie, przez część piątego aktu w Mantui.
> 45 Dwa rody, zacne jednako i sławne —
> 46 Tam, gdzie się rzecz ta rozgrywa, w Weronie,
> 47 Do nowej zbrodni pchają złości dawne,
< 44 zacne jednako i sławne —
< 45 Tam, gdzie się rzecz ta rozgrywa, w Weron