## Basic Idea

I want to see what a diff looks like when BPE is used to encode the input, so that the diff works on tokens instead of characters.


In [None]:
from de.mindscan.fluentgenesis.bpe.bpe_model import BPEModel
from de.mindscan.fluentgenesis.bpe.bpe_encoder_decoder import SimpleBPEEncoder

In [None]:
# Load the BPE model description file and the hyper-parameter file.
# NOTE(review): the second argument is a machine-specific absolute Windows path;
# adjust it before running this notebook on another machine.
model = BPEModel("16K-full","D:\\Projects\\SinglePageApplication\\Angular\\FluentGenesis-Classifier\\src\\de\\mindscan\\fluentgenesis\\bpe\\")
model.load_hparams()

# Load the associated vocabulary and BPE pairs.
model_vocabulary = model.load_tokens()
model_bpe_data = model.load_bpe_pairs()
    
# The encoder needs both the vocabulary and the byte-pair occurrences, so pass both to it.
bpe_encoder = SimpleBPEEncoder(model_vocabulary, model_bpe_data)

## Case Number 0x01, this has substitutions and insertions

* "tt" became "span"
* "text-monospace" was added

In [None]:
# Example input: the removed line (del_line) and the line that replaced it (add_line).
# The leading whitespace inside the literals is part of the data and is kept as-is.
del_line = '				<app-content-change-set-side-by-side-diff [contentChangeSet]="contentChangeSet.line_diff_data"></app-content-change-set-side-by-side-diff>'
add_line = '				<app-experimental-content-change-set-side-by-side-diff [contentChangeSet]="contentChangeSet.line_diff_data"></app-experimental-content-change-set-side-by-side-diff>'


In [None]:
# Encode both lines with the BPE encoder; each line is passed as a one-element list.
bpe_del_line = bpe_encoder.encode([del_line])
bpe_add_line = bpe_encoder.encode([add_line])

In [None]:
# Show the encoded form of the removed line, then of the added line.
print(bpe_del_line)
print(bpe_add_line)

## Stretch out

We want to figure out where the two lines have identical parts, substitutions, deletions and insertions.

* Insertions and deletions can be calculated by stretching the arrays with a neutral element, e.g. "0". The goal is to bring both arrays to equal length, so that they can be compared element-wise.

Let's assume we have such an algorithm

In [None]:
def v1_find_relative(bpe_del_line,bpe_add_line):
    """Locate each element of ``bpe_del_line`` in ``bpe_add_line``, relative to its own position.

    For every index ``i`` the result holds the distance from ``i`` to the first
    occurrence of ``bpe_del_line[i]`` in ``bpe_add_line`` at or after index ``i``.
    The entry stays ``None`` when the element does not occur in
    ``bpe_add_line`` at or after index ``i`` (including when it only occurs
    earlier, or not at all).

    Args:
        bpe_del_line: sequence of tokens to look up.
        bpe_add_line: sequence (supporting ``.index``) that is searched.

    Returns:
        A list with one entry per element of ``bpe_del_line``: a non-negative
        relative offset, or ``None``.

    Side effects:
        Prints the label ``rel_find`` followed by the computed list.
    """
    rel_find = [None]*len(bpe_del_line)
    for i, element in enumerate(bpe_del_line):
        try:
            # This returns the first match at or after i, although one of the
            # later matches might actually be the better choice.
            # Searching with a start index avoids copying a slice and, together
            # with the handler below, covers elements that only occur before i.
            rel_find[i] = bpe_add_line.index(element, i) - i
        except ValueError:
            # Not present at or after position i: keep the None placeholder.
            pass
    print("rel_find")
    print(rel_find)
    return rel_find


def v1_stretchout(bpe_del_line, rel_find):
    """Return a copy of ``bpe_del_line`` padded with ``0`` elements.

    ``rel_find`` supplies one relative offset (or ``None``) per element.
    Whenever an element's offset exceeds the largest offset handled so far,
    the difference is inserted as ``0`` elements in front of that element.

    Args:
        bpe_del_line: sequence of tokens to pad.
        rel_find: per-element offsets, indexable by the positions of
            ``bpe_del_line``; ``None`` means "no offset known".

    Returns:
        A new list containing every element of ``bpe_del_line`` in order,
        with the inserted ``0`` padding.
    """
    padded = []
    reached_offset = 0
    for position, token in enumerate(bpe_del_line):
        offset = rel_find[position]
        if offset is not None and offset > reached_offset:
            padded.extend([0] * (offset - reached_offset))
            # Only ever moving forward is a weakness when tokens were moved around.
            reached_offset = offset
        padded.append(token)
    return padded


In [None]:
def stretchout(bpe_del_line, bpe_add_line):
    """Stretch the shorter of the two encoded lines and return both as a pair.

    When both inputs have the same length they are returned unchanged.
    Otherwise the shorter one is passed through ``v1_find_relative`` and
    ``v1_stretchout`` (searching in the longer one), and the longer one is
    returned as-is.

    Returns:
        A tuple ``(del_line, add_line)`` in the same order as the arguments.
    """
    del_count = len(bpe_del_line)
    add_count = len(bpe_add_line)

    if del_count < add_count:
        offsets = v1_find_relative(bpe_del_line, bpe_add_line)
        return v1_stretchout(bpe_del_line, offsets), bpe_add_line

    if del_count > add_count:
        offsets = v1_find_relative(bpe_add_line, bpe_del_line)
        return bpe_del_line, v1_stretchout(bpe_add_line, offsets)

    return bpe_del_line, bpe_add_line

In [None]:
# Stretch the shorter of the two encoded lines with neutral 0 elements (see stretchout).
bpe_del_line_stretched, bpe_add_line_stretched = stretchout(bpe_del_line, bpe_add_line)

In [None]:
# Show what stretchout() produced for both lines.
print( bpe_del_line_stretched )
print( bpe_add_line_stretched )

# Hand-written stretched arrays: these assignments overwrite the values printed
# above, so every later cell works on these literals rather than on the
# stretchout() result. The 0 entries in the del line are the neutral padding.
bpe_del_line_stretched=[
    61, 3397, 2839, 1756, 539, 46, 51, 2119, 110, 625,    0,  0,    0,   0,    0, 
    10003, 124, 124, 6778, 47, 1755, 6844, 126, 126, 1794, 3397, 63]
bpe_add_line_stretched=[
    61, 3039, 2839, 1756, 539, 46, 51, 2119, 110, 625, 7645, 46, 2339, 450, 1070, 
    10003, 124, 124, 6778, 47, 1755, 6844, 126, 126, 1794, 3039, 63]

We can now compare elementwise
* Two equal elements -> no change
* del is zero and add is non zero -> insertion
* add is zero and del is non zero -> deletion
* two different values -> replacement

and output an array of equal length.

In [None]:
def bpe_syndrome_calculation(del_line: list, add_line: list):
    """Compare two equally long token lines position by position.

    Each position is classified as:

    * ``'_'`` - both elements are equal (no change),
    * ``'I'`` - the del element is ``0`` and the add element differs (insertion),
    * ``'D'`` - the add element is ``0`` and the del element differs (deletion),
    * ``'R'`` - both elements are non-zero and different (replacement).

    Args:
        del_line: stretched token line of the removed text.
        add_line: stretched token line of the added text.

    Returns:
        A list of one-character markers, one per position.

    Raises:
        ValueError: if the two lines differ in length.
    """
    if len(del_line) != len(add_line):
        # Raising a plain string is itself a TypeError in Python 3, so use a real exception.
        raise ValueError("can not calculate syndromes for different array lengths")

    syndrome = []
    for del_token, add_token in zip(del_line, add_line):
        if del_token == add_token:
            syndrome.append('_')
        elif del_token == 0:
            syndrome.append('I')
        elif add_token == 0:
            syndrome.append('D')
        else:
            syndrome.append('R')
    return syndrome

# Classify every position of the two stretched lines (one marker per position).
bpe_diff_syndrome = bpe_syndrome_calculation(bpe_del_line_stretched, bpe_add_line_stretched)

In [None]:
# Show the per-position markers ('_', 'I', 'D', 'R').
print(bpe_diff_syndrome)

In [None]:
from IPython.display import HTML
from html import escape

In [None]:
def bpe_visualize_syndrome_unified(syndrome:[], del_line:[], add_line:[] ):
    """Render a unified (single-line) HTML view of a token diff.

    Every position of ``syndrome`` is turned into one or two ``<tt>`` fragments:
    ``'_'`` shows the del token on a neutral background, ``'D'`` shows the del
    token struck through on red, ``'I'`` shows the add token on green and
    ``'R'`` shows the struck-through del token followed by the add token.
    Any other marker produces no output. Tokens are decoded with the
    notebook-level ``bpe_encoder`` and HTML-escaped.

    Returns:
        The concatenated HTML fragments as one string.
    """
    unchanged = '<tt style="background:#eeeeff;">{}</tt>'
    removed = '<tt style="background:#ffc0c0;"><del>{}</del></tt>'
    inserted = '<tt style="background:#c0ffc0;">{}</tt>'

    def token_text(token):
        # Decode a single token and make the text safe for embedding in HTML.
        return escape("".join(bpe_encoder.decode([token])))

    fragments = []
    for position, marker in enumerate(syndrome):
        if marker == '_':
            fragments.append(unchanged.format(token_text(del_line[position])))
        elif marker == 'D':
            fragments.append(removed.format(token_text(del_line[position])))
        elif marker == 'I':
            fragments.append(inserted.format(token_text(add_line[position])))
        elif marker == 'R':
            fragments.append(removed.format(token_text(del_line[position])))
            fragments.append(inserted.format(token_text(add_line[position])))

    return "".join(fragments)


In [None]:
def bpe_visualize_syndrome_sidebyside(syndrome:[], del_line:[], add_line:[]):
    """Render a split (two-line) HTML view of a token diff.

    The first returned string is the del side, the second the add side.
    ``'_'`` puts the del token on both sides with a neutral background,
    ``'D'`` puts the del token on the del side only, ``'I'`` puts the add
    token on the add side only, and ``'R'`` puts the del token on the del side
    and the add token on the add side. Any other marker produces no output.
    Tokens are decoded with the notebook-level ``bpe_encoder`` and HTML-escaped.

    Returns:
        A tuple ``(del_html, add_html)`` of concatenated ``<tt>`` fragments.
    """
    span = '<tt style="background:{};">{}</tt>'

    def render(colour, token):
        # Decode one token, escape it and wrap it in a coloured <tt> element.
        return span.format(colour, escape("".join(bpe_encoder.decode([token]))))

    del_side = []
    add_side = []

    for position, marker in enumerate(syndrome):
        if marker == '_':
            # Unchanged tokens are identical on both sides, so the del token is shown twice.
            del_side.append(render('#eeeeff', del_line[position]))
            add_side.append(render('#eeeeff', del_line[position]))
        elif marker == 'D':
            del_side.append(render('#ffa0a0', del_line[position]))
        elif marker == 'I':
            add_side.append(render('#c0ffc0', add_line[position]))
        elif marker == 'R':
            del_side.append(render('#ffc0c0', del_line[position]))
            add_side.append(render('#c0ffc0', add_line[position]))

    return "".join(del_side), "".join(add_side)

In [None]:
# Build the unified view: one HTML string containing both deleted and inserted tokens.
html_string = bpe_visualize_syndrome_unified(bpe_diff_syndrome, bpe_del_line_stretched, bpe_add_line_stretched )

# <pre> keeps the whitespace of the decoded tokens intact in the rendered output.
display(HTML('<h4>unified diff</h4>'))
display(HTML('<pre>'+html_string+'</pre>'))

# Build the split view: one HTML string for the del side and one for the add side.
html_del, html_add = bpe_visualize_syndrome_sidebyside(bpe_diff_syndrome, bpe_del_line_stretched, bpe_add_line_stretched )

# The two sides are shown one below the other, del side first.
display(HTML('<h4>split diff</h4>'))
display(HTML('<pre>'+html_del+'</pre>'))
display(HTML('<pre>'+html_add+'</pre>'))