## Odległość edycyjna

In [2]:
import numpy as np

### 1. Algorytm:

In [3]:
def delta(char1, char2):
    return 0 if char1 == char2 else 1
    

In [4]:

def distance(word1, word2, delta=delta):
    n1, n2 = len(word1) + 1, len(word2) + 1

    dist = np.empty((n1, n2), dtype=int)
    op = np.zeros_like(dist)
    dist[:, 0] = np.arange(n1)
    dist[0, :] = np.arange(n2)

    for i in range(1, n1):
        for j in range(1, n2):
            dist[i, j], op[i, j] = min(
                (dist[i - 1, j] + 1, 1),
                (dist[i, j - 1] + 1, 2),
                (dist[i - 1, j - 1] + delta(word1[i - 1], word2[j - 1]), 3)
            )
    
    return dist, op


In [5]:
def get_sequence(dist, op):
    result = []
    i, j = dist.shape
    i -= 1
    j -= 1
    while(i > 0 and j > 0):
        operation = op[i, j]
        result.append((i - 1, j - 1, operation))

        i -= operation & 1
        j -= operation >> 1 & 1
        operation = op[i, j]
    
    return reversed(result)
    

### 2. Wizualizacja:

In [6]:
def visualize(word1, word2):
    dist, op = distance(word1, word2)
    
    print(word1)
    word = ""
    hint = ""
    for i, j, operation in get_sequence(dist, op):
        if operation == 3:
            if word1[i] == word2[j]:
                word += word1[i]
                continue
            current = word2[j]
            hint = word1[i] + " -> " + word2[j]
        elif operation == 1:
            current = ""
            hint = "   - " + word1[i]
        else:
            current = word2[j]
            hint = "   + " + word2[j]
        print(word + "{" + current + "}" + word1[i+1:] + "\t\t" + hint)
        word += current
    print(word2)
    return dist[-1, -1]


### 3. Demonstracja:

In [7]:
visualize("los", "kloc")

los
lo{c}		s -> c
kloc


2

In [8]:
visualize("Łódź", "Lodz")

Łódź
{L}ódź		Ł -> L
L{o}dź		ó -> o
Lod{z}		ź -> z
Lodz


3

In [9]:
visualize("kwintesencja", "quintessence")

kwintesencja
{q}wintesencja		k -> q
q{u}intesencja		w -> u
quintes{s}encja		   + s
quintessenc{e}a		j -> e
quintessence{}		   - a
quintessence


5

In [10]:
visualize("ATGAATCTTACCGCCTCG", "ATGAGGCTCTGGCCCCTG")

ATGAATCTTACCGCCTCG
ATGA{G}TCTTACCGCCTCG		A -> G
ATGAG{G}CTTACCGCCTCG		T -> G
ATGAGGCT{C}TACCGCCTCG		   + C
ATGAGGCTCT{G}CCGCCTCG		A -> G
ATGAGGCTCTG{G}CCGCCTCG		   + G
ATGAGGCTCTGGCC{}CCTCG		   - G
ATGAGGCTCTGGCCCCT{}G		   - C
ATGAGGCTCTGGCCCCTG


7

### 4. Najdłuższy wspólny podciąg:

In [15]:
def lcs(seq1, seq2, algorithm=distance):
    sequence = []
    
    dist, op = algorithm(seq1, seq2, lambda el1, el2: delta(el1, el2) * 2)
    for i, j, op in get_sequence(dist, op):
        if op == 3:
            sequence.append(seq1[i])
    
    return sequence
    

### 5. Podział tekstu na tokeny:

In [16]:
with open("romeo-i-julia-700.txt") as file:
    text = file.read().split()
    

### 6. Usuwanie losowych tokenów:

In [20]:
a1 = np.random.randint(len(text), size=len(text) * 3 // 100)
a2 = np.random.randint(len(text), size=len(text) * 3 // 100)

text1 = [token for i, token in enumerate(text) if i not in a1]
text2 = [token for i, token in enumerate(text) if i not in a2]


### 7. Najdłuższy podciąg dla tekstów:

In [22]:
print(len(text1))
print(len(text2))

len(lcs(text1, text2))


1832
1837


1785

### 8. Diff:

In [25]:
def diff(seq1, seq2, algorithm=distance):
    sequence = []

    dist, op = algorithm(seq1, seq2, lambda el1, el2: delta(el1, el2) * 2)
    for i, j, op in get_sequence(dist, op):
        if op == 1:
            sequence.append(("<", seq1[i]))
        if op == 2:
            sequence.append((">", seq2[j]))

    return sequence


### 9. Demonstracja:

In [26]:
diff(text1, text2)

[('>', 'ESKALUS'),
 ('>', 'naczelnicy'),
 ('>', 'synowiec'),
 ('<', 'Pani'),
 ('>', '—'),
 ('>', '—'),
 ('>', 'Kapuleta'),
 ('>', 'ABRAHAM'),
 ('<', 'przyjaciół'),
 ('<', 'przez'),
 ('>', 'w'),
 ('>', 'PROLOG'),
 ('>', 'zacne'),
 ('>', 'szlachetną'),
 ('>', 'wzięło'),
 ('<', 'cierpliwymi'),
 ('>', 'my'),
 ('<', 'błędy…'),
 ('>', 'nie'),
 ('<', 'będziem'),
 ('>', 'będziemy'),
 ('<', 'z'),
 ('>', 'zaraz,'),
 ('<', 'tylko'),
 ('<', 'miejscu.'),
 ('>', 'SAMSON'),
 ('>', 'o'),
 ('<', 'SAMSON'),
 ('>', 'miecz'),
 ('<', 'do'),
 ('>', 'oto'),
 ('>', 'SAMSON'),
 ('<', 'GRZEGORZ'),
 ('>', 'za'),
 ('>', 'im'),
 ('>', 'sobie,'),
 ('<', 'sobą,'),
 ('>', 'GRZEGORZ'),
 ('<', 'będzie.'),
 ('>', 'ukazuje'),
 ('<', 'krewnych'),
 ('>', 'o'),
 ('<', 'BENWOLIO'),
 ('<', 'parobkami?'),
 ('>', 'jak'),
 ('>', 'zwady;'),
 ('>', 'z'),
 ('>', 'PIERWSZY'),
 ('>', 'Książę'),
 ('>', 'zwierzęta,'),
 ('<', 'Co'),
 ('>', 'własnych'),
 ('>', 'purpurowym;'),
 ('>', 'wypuśćcie'),
 ('>', 'By'),
 ('<', 'wszystkich'),
 ('<'