# Test of DiffVis

## Settings

In [1]:
import functools
import html
import pathlib
from pprint import pprint
import re
import sys
import unicodedata

In [2]:
import MeCab
from IPython.display import HTML

In [3]:
sys.path.append('../')
from DiffVis.diffvis import DiffVis

## Define

In [4]:
def make_diff2blank(source, target, edit_history, blank='<blank>'):
    """Build a fill-in-the-blank template from ``source`` and an edit script.

    Elements of ``source`` consumed by a ``'match'`` operation are kept as
    they are.  Every maximal run of non-match operations (``'replace'``,
    ``'delete'``, ``'insert'``) is represented by a single ``blank``
    placeholder.

    Args:
        source (Sequence): Source sequence (a string or a list of tokens)
            that ``edit_history`` was computed for.
        target (Sequence): Target sequence.  It is not read; the parameter
            is kept so existing callers keep working.
        edit_history (Iterable[str]): Edit operations in order.  Recognized
            values are ``'match'``, ``'replace'``, ``'delete'`` and
            ``'insert'``; any other value is ignored.
        blank (str): Placeholder emitted for each run of differing elements.
            Defaults to ``'<blank>'``.

    Returns:
        list: The template.  ``[]`` when no recognized operation was seen,
        ``['']`` when the whole template would be a single placeholder.
    """
    template = []
    # Track "the last emitted element is a placeholder" explicitly instead of
    # comparing strings, so a matched source token that happens to equal
    # ``blank`` is never merged away as if it were a placeholder.
    last_is_placeholder = False
    i = 0  # read position in ``source``

    for operation in edit_history:
        if operation == 'match':
            template.append(source[i])
            last_is_placeholder = False
            i += 1
        elif operation in ('replace', 'delete', 'insert'):
            # Collapse consecutive edits into one placeholder.
            if not last_is_placeholder:
                template.append(blank)
                last_is_placeholder = True
            # 'insert' only consumes the target, so ``i`` stays put.
            if operation != 'insert':
                i += 1

    # A template made of nothing but one placeholder carries no information;
    # it is reported as a single empty string.
    if len(template) == 1 and last_is_placeholder:
        return ['']
    return template

In [5]:
class Tokenizer:
    """Tokenize a Japanese sentence into words with MeCab.

    Args:
        mecab_dict_path (str or pathlib.Path): Path to MeCab dictionary.
            When empty (default), no ``-d`` option is passed to MeCab.
        parts_of_speech (list[str]): Parts of speech to be extracted.
            If None or empty (default), these are used:
                [
                    '名詞', '動詞', '形容詞', '副詞', '助詞',
                    '接続詞', '助動詞', '連体詞', '感動詞',
                ]
        normalize (bool): Whether to return the base form of each word
            instead of its surface form.  Defaults to False.

    Attributes:
        tagger (MeCab.Tagger): Tagger.
        parts_of_speech (list[str]): Parts of speech to be extracted.
        normalize (bool): Whether to return base forms.
    """

    # Position of the base form inside the comma-separated feature string.
    # NOTE(review): this matches the IPADIC field layout; other dictionaries
    # may order their fields differently -- confirm for the dictionary in use.
    _BASE_FORM_INDEX = 6

    def __init__(self, mecab_dict_path='', parts_of_speech=None, normalize=False):
        added = ''
        if mecab_dict_path:
            added = ' -d' + str(mecab_dict_path)
        self.tagger = MeCab.Tagger('-Ochasen' + added)
        # NOTE(review): presumably the usual mecab-python3 workaround of
        # parsing an empty string once so that ``node.surface`` is readable
        # in ``parseToNode`` loops -- confirm before removing.
        self.tagger.parse('')
        if parts_of_speech:
            self.parts_of_speech = parts_of_speech
        else:
            self.parts_of_speech = [
                '名詞', '動詞', '形容詞', '副詞', '助詞',
                '接続詞', '助動詞', '連体詞', '感動詞',
            ]
        self.normalize = normalize

    def tokenize(self, sentence):
        """Tokenize a Japanese sentence into words.

        Nodes whose part of speech (first feature field) is not listed in
        ``self.parts_of_speech`` are dropped.  With ``self.normalize`` set,
        the base form is returned when the feature string provides one;
        otherwise (field missing or ``'*'``) the surface form is used.

        Args:
            sentence (str): Sentence to tokenize.

        Returns:
            words (list[str]): Extracted words.
        """
        words = []
        node = self.tagger.parseToNode(sentence)
        while node:
            fields = node.feature.split(',')
            if fields[0] in self.parts_of_speech:
                word = node.surface
                if self.normalize:
                    # Some feature strings are shorter than expected; fall
                    # back to the surface form rather than raising IndexError.
                    if len(fields) > self._BASE_FORM_INDEX:
                        base_form = fields[self._BASE_FORM_INDEX]
                        if base_form != '*':
                            word = base_form
                words.append(word)
            node = node.next
        return words

## Toy Data

### Simple string

In [26]:
# Character-level toy example: the two raw strings are compared as they are.
source = 'すももも桃も桃のうち1234'
target = 'すもももももももものうち1244'
# Lengths of both inputs, printed for reference when reading the cost table.
m, n = len(source), len(target)
print(m, n)

14 16


In [27]:
# Create the comparison object for the two strings and run build() before
# reading its results in the cells below.
dv = DiffVis(source, target)
dv.build()

In [28]:
# Edit distance, normalized and raw. The raw value matches the bottom-right
# entry of dv.cost_table; the normalized value shown equals 5/16.
# NOTE(review): the exact normalization rule lives in DiffVis -- confirm there.
print(dv.distance(normalize=True))
print(dv.distance(normalize=False))

0.3125
5


In [29]:
# Cost table as nested tuples: (m + 1) rows by (n + 1) columns for this input.
dv.cost_table

((0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16),
 (1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
 (2, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14),
 (3, 2, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13),
 (4, 3, 2, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12),
 (5, 4, 3, 2, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12),
 (6, 5, 4, 3, 2, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11),
 (7, 6, 5, 4, 3, 2, 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11),
 (8, 7, 6, 5, 4, 3, 3, 3, 3, 4, 4, 5, 6, 7, 8, 9, 10),
 (9, 8, 7, 6, 5, 4, 4, 4, 4, 4, 5, 4, 5, 6, 7, 8, 9),
 (10, 9, 8, 7, 6, 5, 5, 5, 5, 5, 5, 5, 4, 5, 6, 7, 8),
 (11, 10, 9, 8, 7, 6, 6, 6, 6, 6, 6, 6, 5, 4, 5, 6, 7),
 (12, 11, 10, 9, 8, 7, 7, 7, 7, 7, 7, 7, 6, 5, 4, 5, 6),
 (13, 12, 11, 10, 9, 8, 8, 8, 8, 8, 8, 8, 7, 6, 5, 5, 6),
 (14, 13, 12, 11, 10, 9, 9, 9, 9, 9, 9, 9, 8, 7, 6, 5, 5))

In [30]:
# Sequence of edit operations ('match' / 'replace' / 'insert' appear here).
dv.edit_history

('match',
 'match',
 'match',
 'match',
 'replace',
 'match',
 'replace',
 'insert',
 'insert',
 'match',
 'match',
 'match',
 'match',
 'match',
 'replace',
 'match')

In [31]:
# Console rendering: source on the first line, target on the second, each
# segment wrapped in ANSI color codes. Judging by the output, inserted
# positions show up as empty segments on the source line.
print(dv.generate_comparison(mode='console', padding=True))

[32mす[0m[32mも[0m[32mも[0m[32mも[0m[31m桃[0m[32mも[0m[31m桃[0m[32m[0m[32m[0m[32mの[0m[32mう[0m[32mち[0m[32m1[0m[32m2[0m[31m3[0m[32m4[0m
[32mす[0m[32mも[0m[32mも[0m[32mも[0m[34mも[0m[32mも[0m[34mも[0m[34mも[0m[34mも[0m[32mの[0m[32mう[0m[32mち[0m[32m1[0m[32m2[0m[34m4[0m[32m4[0m


In [None]:
print(dv.generate_comparison(mode='console', padding=False))

In [None]:
HTML(dv.generate_comparison(mode='html', padding=True))

In [33]:
HTML(dv.generate_comparison(mode='html', padding=False))

In [13]:
# Tabular HTML rendering; the captured output shows an index row followed by
# the aligned source row and target row.
HTML(dv.generate_comparison(mode='htmltab'))

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
す,も,も,も,桃,も,桃,,,の,う,ち,1,2,3,4
す,も,も,も,も,も,も,も,も,の,う,ち,1,2,4,4


In [14]:
# Turn the character-level edit script into a template in which every run of
# differing characters is replaced by one '<blank>' placeholder.
make_diff2blank(source, target, dv.edit_history)

['す',
 'も',
 'も',
 'も',
 '<blank>',
 'も',
 '<blank>',
 'の',
 'う',
 'ち',
 '1',
 '2',
 '<blank>',
 '4']

### Tokenize

In [34]:
# Tokenizer with all defaults: no explicit dictionary path, the default
# part-of-speech list, and surface forms (normalize=False).
tokenizer = Tokenizer()

In [35]:
# Same sentences as the character-level example, without the digit suffix.
source = 'すももも桃も桃のうち'
target = 'すもももももももものうち'
# Lengths in characters, before tokenization.
m, n = len(source), len(target)
print(m, n)

10 12


In [36]:
# Replace the raw strings with their token lists; from here on the diff is
# computed over words rather than characters.
source = tokenizer.tokenize(source)
target = tokenizer.tokenize(target)
# Lengths in tokens.
m, n = len(source), len(target)
print(m, n)

7 7


In [37]:
# Rebuild the comparison, this time on the two token lists.
dv = DiffVis(source, target)
dv.build()

In [38]:
# Token-level edit distance, normalized and raw (the output shows 2/7 and 2).
print(dv.distance(normalize=True))
print(dv.distance(normalize=False))

0.2857142857142857
2


In [39]:
dv.cost_table

((0, 1, 2, 3, 4, 5, 6, 7),
 (1, 0, 1, 2, 3, 4, 5, 6),
 (2, 1, 0, 1, 2, 3, 4, 5),
 (3, 2, 1, 1, 2, 3, 4, 5),
 (4, 3, 2, 2, 1, 2, 3, 4),
 (5, 4, 3, 3, 2, 2, 3, 4),
 (6, 5, 4, 4, 3, 3, 2, 3),
 (7, 6, 5, 5, 4, 4, 3, 2))

In [40]:
dv.edit_history

('match', 'match', 'replace', 'match', 'replace', 'match', 'match')

In [41]:
print(dv.generate_comparison(mode='console', padding=True))

[32mすもも[0m[32mも[0m[31m桃　[0m[32mも[0m[31m桃　[0m[32mの[0m[32mうち[0m
[32mすもも[0m[32mも[0m[34mもも[0m[32mも[0m[34mもも[0m[32mの[0m[32mうち[0m


In [None]:
print(dv.generate_comparison(mode='console', padding=False))

In [None]:
HTML(dv.generate_comparison(mode='html', padding=True))

In [42]:
HTML(dv.generate_comparison(mode='html', padding=False))

In [43]:
HTML(dv.generate_comparison(mode='htmltab'))

0,1,2,3,4,5,6
すもも,も,桃,も,桃,の,うち
すもも,も,もも,も,もも,の,うち


In [44]:
# Token-level template: each replaced token becomes a '<blank>' placeholder.
make_diff2blank(source, target, dv.edit_history)

['すもも', 'も', '<blank>', 'も', '<blank>', 'の', 'うち']