# Test of DiffVis when tokenizing

## Settings

In [1]:
import sys
import MeCab
from IPython.display import HTML

In [2]:
# Extend the import search path so the local DiffVis package can be imported.
# NOTE(review): assumes the package root sits two directories above the
# notebook's working directory — confirm. This must run before the import below.
sys.path.append('../../')
from DiffVis.diffvis import DiffVis

In [3]:
# Raise the interpreter's recursion limit to 2000.
# NOTE(review): presumably needed because DiffVis recurses deeply on longer
# sequences — confirm against the DiffVis implementation.
sys.setrecursionlimit(2000)

## Define

In [4]:
class Tokenizer(object):
    """Tokenize a Japanese sentence into words with MeCab.

    Args:
        mecab_dict_path (str or pathlib.Path): Path to MeCab dictionary.
            When empty (default), no ``-d`` option is passed to MeCab.
        parts_of_speech (list[str]): List that contains part of speeches to be extracted.
            If None or empty (default), these are used:
                [
                    '名詞', '動詞', '形容詞', '副詞', '助詞',
                    '接続詞', '助動詞', '連体詞', '感動詞',
                ]
        normalize (bool): Flag to determine whether to transform to base form or not.
            Defaults to False.

    Attributes:
        BASE_FORM_INDEX (int): Position, within a node's comma-separated
            feature string, of the field used as the base form.
        tagger (MeCab.Tagger): Tagger.
        parts_of_speech (list[str]): List that contains part of speeches to be extracted.
        normalize (bool): Flag to determine whether to transform to base form or not.
    """
    # Field of the feature string that is read as the base form.
    # NOTE(review): 6 presumably matches the IPAdic feature layout — confirm
    # before using a dictionary with a different layout.
    BASE_FORM_INDEX = 6

    def __init__(self, mecab_dict_path='', parts_of_speech=None, normalize=False,):
        added = ''
        if mecab_dict_path:
            added = ' -d' + str(mecab_dict_path)
        self.tagger = MeCab.Tagger('-Ochasen' + added)
        # NOTE(review): parsing an empty string right after construction looks
        # like the usual mecab-python3 workaround that keeps node surfaces
        # valid when using parseToNode — confirm before removing.
        self.tagger.parse('')
        if parts_of_speech:
            self.parts_of_speech = parts_of_speech
        else:
            self.parts_of_speech = [
                '名詞', '動詞', '形容詞', '副詞', '助詞',
                '接続詞', '助動詞', '連体詞', '感動詞',
                ]
        self.normalize = normalize

    def tokenize(self, sentence):
        """Tokenize Japanese sentence to words.

        Nodes whose part of speech (first feature field) is not listed in
        ``self.parts_of_speech`` are skipped. When ``self.normalize`` is
        true, the base form is emitted instead of the surface form; the
        surface form is kept when the base form is the ``'*'`` placeholder
        or when the feature string is too short to contain a base-form field.

        Args:
            sentence (str): Sentence to tokenize.

        Returns:
            words (list[str]): Extracted words.
        """
        words = []
        node = self.tagger.parseToNode(sentence)
        # Walk the linked list of nodes until ``next`` yields a falsy value.
        while node:
            features = node.feature.split(',')
            if features[0] in self.parts_of_speech:
                word = node.surface
                # Guard the index: a feature string with fewer fields would
                # otherwise raise IndexError.
                if self.normalize and len(features) > self.BASE_FORM_INDEX:
                    base_form = features[self.BASE_FORM_INDEX]
                    if base_form != '*':
                        word = base_form
                words.append(word)
            node = node.next
        return words

## Test

In [5]:
# Alignment algorithm name passed to DiffVis further down.
alignment = 'LCS'  # Levenshtein or LCS
# Two sentences to compare: the source writes two words as the kanji 桃,
# the target writes everything in hiragana.
source = 'すももも桃も桃のうち'
target = 'すもももももももものうち'

In [6]:
# Show the raw sentences; "Length" here is the number of characters.
print('Source (Length: {}): {}'.format(len(source), source))
print('Target (Length: {}): {}'.format(len(target), target))

Source (Length: 10): すももも桃も桃のうち
Target (Length: 12): すもももももももものうち


In [7]:
# Build a tokenizer with all defaults: no explicit dictionary path, the default
# part-of-speech filter, and no base-form normalization.
tokenizer = Tokenizer()

In [8]:
# Split both sentences into word lists.
source_tokenized = tokenizer.tokenize(source)
target_tokenized = tokenizer.tokenize(target)
# "Length" here is the number of tokens, not the number of characters.
print('Source (Length: {}): {}'.format(len(source_tokenized), source_tokenized))
print('Target (Length: {}): {}'.format(len(target_tokenized), target_tokenized))

Source (Length: 7): ['すもも', 'も', '桃', 'も', '桃', 'の', 'うち']
Target (Length: 7): ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち']


In [9]:
# Compare the two token lists (rather than raw characters) with the chosen alignment.
dv = DiffVis(source_tokenized, target_tokenized, alignment=alignment)
# NOTE(review): build() presumably computes the alignment that the cells below
# inspect — confirm in DiffVis.
dv.build()

In [10]:
# Report the distance between the token sequences, first raw and then normalized.
# NOTE(review): how DiffVis normalizes is not visible here — confirm in DiffVis.
print('Distance: {}'.format(dv.distance(normalize=False)))
print('Normalized Distance: {}'.format(dv.distance(normalize=True)))

Distance: 2
Normalized Distance: 0.2857142857142857


In [11]:
# Inspect the cost table held by the DiffVis instance
# (presumably filled in by build() — confirm in DiffVis).
dv.cost_table

((0, 0, 0, 0, 0, 0, 0, 0),
 (0, 1, 1, 1, 1, 1, 1, 1),
 (0, 1, 2, 2, 2, 2, 2, 2),
 (0, 1, 2, 2, 2, 2, 2, 2),
 (0, 1, 2, 2, 3, 3, 3, 3),
 (0, 1, 2, 2, 3, 3, 3, 3),
 (0, 1, 2, 2, 3, 3, 4, 4),
 (0, 1, 2, 2, 3, 3, 4, 5))

In [12]:
# Print the formatted edit history (one operation per line in the recorded output).
print(dv.format_edit_history())

Edit History
	match
	match
	replace
	match
	replace
	match
	match


In [13]:
# Console rendering with padding enabled; in the recorded output the shorter
# token of a replaced pair is followed by a full-width space.
print(dv.visualize(mode='console', padding=True))

[32mすもも[0m[32mも[0m[31m桃　[0m[32mも[0m[31m桃　[0m[32mの[0m[32mうち[0m
[32mすもも[0m[32mも[0m[34mもも[0m[32mも[0m[34mもも[0m[32mの[0m[32mうち[0m


In [14]:
# Console rendering with padding disabled, for comparison with the padded version.
print(dv.visualize(mode='console', padding=False))

[32mすもも[0m[32mも[0m[31m桃[0m[32mも[0m[31m桃[0m[32mの[0m[32mうち[0m
[32mすもも[0m[32mも[0m[34mもも[0m[32mも[0m[34mもも[0m[32mの[0m[32mうち[0m


In [15]:
# Render the 'html' mode output inline in the notebook, with padding enabled.
HTML(dv.visualize(mode='html', padding=True))

In [16]:
# Render the 'html' mode output inline in the notebook, with padding disabled.
HTML(dv.visualize(mode='html', padding=False))

In [17]:
# Render the 'htmltab' mode output; the recorded output shows one column per
# token position with the source and target tokens in separate rows.
HTML(dv.visualize(mode='htmltab'))

0,1,2,3,4,5,6
すもも,も,桃,も,桃,の,うち
すもも,も,もも,も,もも,の,うち


In [18]:
# Build a template and return it as a single string; in the recorded output the
# replaced tokens appear as '<blank>'.
dv.make_template(return_str=True)

'すももも<blank>も<blank>のうち'

In [19]:
# Inspect the template attribute; the recorded output shows it as a token list
# (presumably set by make_template() above — confirm in DiffVis).
dv.template

['すもも', 'も', '<blank>', 'も', '<blank>', 'の', 'うち']