In [1]:
from transformers import GPT2TokenizerFast, GPT2Tokenizer
from copy import deepcopy
import numpy as np
from random import random
from time import time

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

from data_loader import get_imput
from embedding_mask import create_mask_embedding, sum_masks, process_token, encode_em


In [3]:
# Load the pretrained "gpt2" tokenizer (slow implementation); the fast variant is left disabled.
# NOTE(review): tokenize_atoms passes a list of single-character strings to encode();
# presumably that is why the slow tokenizer is used — confirm before switching.
# tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# vocab is the tokenizer's vocabulary mapping; vocabi is the same mapping inverted (values -> keys).
vocab = tokenizer.get_vocab()
vocabi = {v:k for k, v in vocab.items()}



In [45]:

def tokenize_atoms(txt, tokenizer):
    """Encode ``txt`` one symbol at a time.

    The text is first split with ``tokenizer.tokenize``; the resulting pieces
    are glued back together and broken into single characters, and that list
    of characters is handed to ``tokenizer.encode``.

    Args:
        txt: Text to atomize.
        tokenizer: Object exposing ``tokenize(str)`` and ``encode(list[str])``.
            NOTE(review): assumes ``encode`` treats a list of strings as
            ready-made tokens — confirm for the tokenizer in use.

    Returns:
        Whatever ``tokenizer.encode`` returns for the list of single characters.
    """
    pieces = tokenizer.tokenize(txt)
    symbols = [symbol for symbol in "".join(pieces)]
    return tokenizer.encode(symbols)

def past_tokens_gather(atokens, tokenizer):
    """Re-encode every cleanly decodable prefix of ``atokens``.

    Each prefix ``atokens[:1]``, ``atokens[:2]``, ... is decoded; prefixes whose
    decoded text contains the replacement character are dropped, and the rest
    are encoded again from their decoded text.

    Args:
        atokens: Sequence of atom-level token ids.
        tokenizer: Object exposing ``decode(ids)`` and ``encode(str)``.

    Returns:
        List with one encoding per kept prefix, in prefix order. It can be
        shorter than ``atokens`` because skipped prefixes leave no entry.
    """
    gathered = []
    for end in range(len(atokens)):
        text = tokenizer.decode(atokens[:end + 1])
        if '�' not in text:
            gathered.append(tokenizer.encode(text))
    return gathered

def create_emb_pos_mask(emb_pos):
    """Flatten prefix encodings into tokens, positions and a visibility mask.

    Each entry of ``emb_pos`` is a sequence of ``(token, position)`` pairs. The
    last pair of every entry becomes one output element; every earlier pair of
    that entry must equal an element collected from a previous entry, and the
    first such element is marked in the entry's mask row.

    Args:
        emb_pos: Sequence of non-empty sequences of ``(token, position)`` pairs.

    Returns:
        Tuple ``(tokens, positions, mask)``: two 1-D arrays holding the first
        and second component of each entry's last pair, and a boolean array of
        shape ``(len(emb_pos), len(emb_pos))`` with the diagonal set. For empty
        input the two arrays are empty and the mask has shape ``(0, 0)``.

    Raises:
        ValueError: If a non-final pair of an entry matches none of the
            previously collected elements.
    """
    n_rows = len(emb_pos)
    mask = np.eye(n_rows).astype(bool)
    if n_rows == 0:
        # np.array([])[:, 0] would raise IndexError; return well-formed empties instead.
        return np.array([], dtype=int), np.array([], dtype=int), mask

    res = []
    for embt, m in zip(emb_pos, mask):
        n_prev = len(res)  # elements collected before this entry
        res.append(embt[-1])
        if n_prev == 0:
            # Kept from the original: with nothing collected yet, the leading
            # pairs of the first entry are accepted without being matched.
            # NOTE(review): this silently drops those pairs — confirm it is intended.
            continue
        for ie in embt[:-1]:
            for j in range(n_prev):
                if ie == res[j]:
                    m[j] = True  # m is a view into mask, so this updates mask in place
                    break
            else:
                raise ValueError("target len != input len")

    pairs = np.array(res)
    return pairs[:, 0], pairs[:, 1], mask

def tokenizex_encode(txt, tokenizer):
    """Run the full atom-level encoding pipeline on ``txt``.

    Chains ``tokenize_atoms`` -> ``past_tokens_gather`` -> pairing every id of
    each prefix encoding with its index -> ``create_emb_pos_mask``.

    Args:
        txt: Text to encode.
        tokenizer: Tokenizer forwarded to the helper functions.

    Returns:
        The ``(tokens, positions, mask)`` triple produced by
        ``create_emb_pos_mask``.
    """
    atom_ids = tokenize_atoms(txt, tokenizer)
    prefix_encodings = past_tokens_gather(atom_ids, tokenizer)
    paired = [
        [(tok, idx) for tok, idx in zip(enc, np.arange(len(enc)))]
        for enc in prefix_encodings
    ]
    tokens, positions, mask = create_emb_pos_mask(paired)
    return tokens, positions, mask


In [46]:
# Probe input: a space followed by "ż"; the next cell's output shows it atomized to three ids.
txt = " ż"

In [49]:
# Debug trace mirroring past_tokens_gather: for every prefix of the atom ids
# that decodes without a replacement character, print the prefix, its decoded
# text (wrapped in ';' so whitespace is visible) and its re-encoding.
tenc = tokenize_atoms(txt, tokenizer)
print(tenc)
for i in range(1, len(tenc)+1):
    partial = tenc[:i]
    pdec = tokenizer.decode(partial)
    if '�' in pdec:
        # Decoded text contains the replacement character; skip this prefix.
        continue
    else:
        print(f"partial: {partial}")
        print(f"pdec: ;{pdec};")
        penc = tokenizer.encode(pdec)
    # Only reached for prefixes that were not skipped above.
    print(f"penc: {penc}")
    print()


[220, 129, 120]
partial: [220]
pdec: ; ;
penc: [220]

partial: [220, 129, 120]
pdec: ; ż;
penc: [25370, 120]



In [44]:
# Decode the ids that the trace above printed as the re-encoding of the full text.
tokenizer.decode([25370, 120])

' ż'

In [33]:
# Same steps as tokenizex_encode(txt, tokenizer), run one by one so the
# intermediates (tenc, res, g) stay available for inspection.
tenc = tokenize_atoms(txt, tokenizer)
res = past_tokens_gather(tenc, tokenizer)
# Pair every id of each prefix encoding with its index inside that encoding.
g = [list(zip(e, np.arange(len(e)))) for e in res]
e, p, m = create_emb_pos_mask(g)


Exception: target len == input len

In [20]:
# Encode txt, then decode only the elements of e selected by the last mask row.
e, p, m = tokenizex_encode(txt, tokenizer)
tokenizer.decode(e[m[-1]])

AssertionError: 

In [7]:
# Atom-level ids for the current txt; the bare name displays them as cell output.
at = tokenize_atoms(txt, tokenizer)
at

[220, 127, 111]

In [8]:
# Show the re-encoded prefixes for the atom ids computed in the previous cell.
past_tokens_gather(at, tokenizer)

[[220], [20543], [6184, 111]]

In [9]:
# Show the pieces the tokenizer produces for " ó".
tokenizer.tokenize(" ó")

['ĠÃ', '³']

In [10]:
# Decode the three atom ids shown above back to text.
tokenizer.decode([220, 127, 111])

' ó'

In [12]:
# NOTE(review): `res` is only assigned in the cell labelled In [33]; the recorded
# NameError shows this cell was executed before that one (out-of-order execution).
res

NameError: name 'res' is not defined

In [None]:
# For each mask row, collect the elements of e that the row selects, then
# compare the resulting lists with the prefix encodings in res.
# Relies on e, m and res left in the kernel by earlier cells.
l = []
for r in m:
    l.append(e[r.astype(bool)].tolist())
l == res


In [None]:

def time_valid_tokenization(txt_data, tokenizer, def_tokenizer, C, lbl_val = False):
    """Time ``tokenizer`` over ``C`` sampled texts, optionally validating each result.

    Args:
        txt_data: Source passed to ``get_imput(txt_data, 512)``.
            NOTE(review): presumably 512 is a sample length — confirm in data_loader.
        tokenizer: Callable taking a text and returning ``(emb, pos, mask)``.
        def_tokenizer: Reference tokenizer with ``encode``/``decode``; each
            sample is round-tripped through it before being timed.
        C: Number of samples to time.
        lbl_val: When true, assert that the number of returned elements equals
            ``len(tokenize_atoms(sample, def_tokenizer))`` and that the
            positions selected by the last mask row are ``0, 1, 2, ...``.

    Returns:
        Total seconds spent inside ``tokenizer(...)`` calls only (``0`` when
        ``C`` is 0). Sampling and validation time are excluded.

    Raises:
        AssertionError: If ``lbl_val`` is true and a check fails; the offending
            text is printed first when the lengths differ.
    """
    # perf_counter is monotonic and high-resolution, unlike wall-clock time(),
    # which can jump when the system clock is adjusted.
    from time import perf_counter

    st = 0
    for _ in range(C):
        txti = def_tokenizer.decode(def_tokenizer.encode(get_imput(txt_data, 512)))
        t1 = perf_counter()
        emb, pos, mask = tokenizer(txti)
        st += perf_counter() - t1

        if lbl_val:
            lemb = tokenize_atoms(txti, def_tokenizer)
            if len(lemb) != len(emb):
                print(txti)
            assert len(lemb) == len(emb)

            # Positions selected by the last mask row must count up from 0.
            comp = pos[mask[-1].astype(bool)]
            for a, b in zip(comp.astype(int), list(range(len(comp)))):
                assert a == b

    return st


In [None]:
def validate(txt, emb, pos, mask, tokenizer):
    """Assert that ``(emb, pos, mask)`` is a consistent encoding of ``txt``.

    Three checks are made:
      * decoding the elements of ``emb`` selected by the last mask row gives ``txt``;
      * ``emb`` has as many elements as ``tokenize_atoms(txt, tokenizer)``;
      * the positions selected by the last mask row are ``0, 1, 2, ...``.

    Raises:
        AssertionError: If any check fails.
    """
    last_row = mask[-1].astype(bool)

    decoded = tokenizer.decode(emb[last_row])
    assert decoded == txt

    atom_ids = tokenize_atoms(txt, tokenizer)
    assert len(atom_ids) == len(emb)

    selected_pos = pos[last_row].astype(int)
    for expected, actual in enumerate(selected_pos):
        assert actual == expected

In [None]:
# Validate encode_em on " ósiem", which a later cell lists among the faulty examples.
txt = """ ósiem"""
emb, pos, mask = encode_em(txt, tokenizer)
validate(txt, emb, pos, mask, tokenizer)

In [None]:
# Manual per-token accumulation for " ósiem": same loop body as
# embedding_mask_atomizer, but calling process_token directly and stopping
# before the partial masks are combined.
txt = " ósiem"

# NOTE(review): process_token receives token strings here, while create_atomizer
# passes it vocabulary ids — confirm which form process_token expects.
tokens = tokenizer.tokenize(txt)
res_emb = []
res_pos = []
partial_masks = []
pos_len = 0

for t in tokens:
    emb, pos, mask = process_token(t, tokenizer)

    res_emb.extend(emb)
    # Shift this token's positions by the running offset.
    res_pos.extend(pos+pos_len)
    # Advance the offset by the token's last position + 1.
    pos_len += pos[-1] + 1
    partial_masks.append(mask)



In [None]:
# Experiment: encode two texts separately, insert a single hand-built element
# between them, and combine embeddings, positions and masks.
# The first t1/t3 pair is overwritten by the shorter pair right below it.
t1  = "elo źle żal ósemka no i śląska"
t3 = "example text to tokenize and, żargon ołówka źle wpływa detokenize, ślóza."
t1  = "elo źle żaló"
t3 = "example."
# Hand-built middle segment: one id with position 0.
# NOTE(review): m2 is 1-D with shape (1,); confirm sum_masks accepts that shape.
e2  = np.array([69])
p2  = np.array([0])
m2  = np.array([1])
t2 = tokenizer.decode(e2)

e1, p1, m1 = encode_em(t1, tokenizer)
e3, p3, m3 = encode_em(t3, tokenizer)
txt = t1+t2+t3
# This bare expression is not the cell's last statement, so it displays nothing.
txt

emb = np.concatenate([e1,e2,e3])
# NOTE(review): sum_pos is neither imported nor defined in any visible cell;
# presumably it lives in embedding_mask — add it to the import cell or this raises NameError.
pos = sum_pos([p1,p2,p3])
mlen = m1.shape[0]+m2.shape[0]+m3.shape[0]
mask = sum_masks(mlen, [m1, m2, m3])


In [None]:
# faulty examples
' ósiem'
"ó"
""" ›"""
''

In [None]:

def text_emb_pos(txt, tokenizer):
    """Build ``(token, index)`` pair lists for ``txt``.

    Every character of ``txt`` is encoded on its own first. If any character
    encodes to more than one token, the result has one entry per character;
    otherwise it has one entry per prefix ``txt[:1]``, ``txt[:2]``, ...

    Args:
        txt: Input text.
        tokenizer: Object exposing ``encode(str)`` returning a sequence of ids.

    Returns:
        List of lists; each inner list pairs the ids of one encoded piece with
        their indices ``0..len-1``.
    """
    def _with_positions(piece):
        ids = tokenizer.encode(piece)
        return list(zip(ids, range(len(ids))))

    # A list (not a generator) inside any() so every character is encoded,
    # exactly as in a full scan without early exit.
    needs_split = any([len(tokenizer.encode(char)) > 1 for char in txt])

    if needs_split:
        pieces = list(txt)
    else:
        pieces = [txt[:end] for end in range(1, len(txt) + 1)]

    return [_with_positions(piece) for piece in pieces]

In [None]:
# Consistency check: for every id in the atomizer table, the letter-by-letter
# encoding of its decoded text must be as long as the table's first component.
# NOTE(review): `atomizer` is not assigned in any visible cell (presumably
# create_atomizer(tokenizer)), and tokenize_letters is defined in a later cell.
for t in atomizer.keys():
    txt = tokenizer.decode(t)
    emb = tokenize_letters(txt, tokenizer)
    a_emb = atomizer[t][0]
    # Print the mismatching lengths before asserting; with the assert first
    # this diagnostic could never be reached.
    if len(emb) != len(a_emb):
        print(len(emb), len(a_emb))
    assert len(emb) == len(a_emb)

In [None]:
def tokenize_letters(txt, tokenizer):
    """Encode ``txt`` character by character and concatenate the results.

    Args:
        txt: Input text.
        tokenizer: Object exposing ``encode(str)`` returning an iterable of ids.

    Returns:
        Flat list of ids, in character order.
    """
    return [token_id for char in txt for token_id in tokenizer.encode(char)]


In [None]:
def create_atomizer(tokenizer):
    """Precompute ``process_token`` results for the whole vocabulary.

    The vocabulary returned by ``tokenizer.get_vocab()`` is inverted and
    ``process_token(key, tokenizer)`` is called for every key of the inverted
    mapping.

    Args:
        tokenizer: Tokenizer exposing ``get_vocab()``; also forwarded to
            ``process_token``.

    Returns:
        Dict mapping each inverted-vocabulary key to the 3-tuple returned by
        ``process_token`` for it.
        NOTE(review): presumably (embedding ids, positions, mask), going by the
        names used elsewhere in this notebook — confirm in embedding_mask.
    """
    inverted_vocab = dict(
        (token_id, token) for token, token_id in tokenizer.get_vocab().items()
    )

    table = {}
    for token_id in inverted_vocab:
        emb, pos, mask = process_token(token_id, tokenizer)
        table[token_id] = (emb, pos, mask)
    return table
        
def embedding_mask_atomizer(txt, tokenizer, tokenizex_atomizer):
    """Encode ``txt`` by looking each token up in a precomputed atomizer table.

    For every id from ``tokenizer.encode(txt)`` the table entry
    ``(emb, pos, mask)`` is fetched. Embeddings are concatenated, positions are
    shifted by a running offset that grows by ``pos[-1] + 1`` per token, and
    the per-token masks are combined with ``sum_masks``.

    Args:
        txt: Text to encode.
        tokenizer: Object exposing ``encode(str)``.
        tokenizex_atomizer: Mapping from token id to ``(emb, pos, mask)``.
            ``pos`` must support ``pos + int`` and ``pos[-1]``.

    Returns:
        Tuple ``(np.array(embeddings), np.array(positions), full_mask)`` where
        ``full_mask`` is the result of ``sum_masks``.
    """
    all_emb, all_pos, token_masks = [], [], []
    offset = 0

    for token_id in tokenizer.encode(txt):
        tok_emb, tok_pos, tok_mask = tokenizex_atomizer[token_id]
        all_emb.extend(tok_emb)
        all_pos.extend(tok_pos + offset)
        # The next token's positions start right after this token's last position.
        offset += tok_pos[-1] + 1
        token_masks.append(tok_mask)

    combined_mask = sum_masks(len(all_emb), token_masks)
    return np.array(all_emb), np.array(all_pos), combined_mask
