In [None]:
from transformers import GPT2TokenizerFast, GPT2Tokenizer
from copy import deepcopy
import numpy as np
from random import random
from time import time

In [None]:

from data_loader import get_imput
from embedding_mask import create_mask_embedding, sum_masks, process_token, encode_em

# def process_token(t, tokenizer):
#     td = tokenizer.decode(t)
#     if len(td)==1 or "�" in td:
#         if "�" in td:
#             vocab = tokenizer.get_vocab()
#             vocabi = {v:k for k, v in vocab.items()}
#             s_lett = vocabi[t]
#             emb = []
#             for e in s_lett:
#                 emb.append(vocab[e])
#             pos = np.arange(len(emb))
#             mask = np.tril(np.ones(len(emb)))
#         else:
#             emb = np.array([t]) 
#             pos = np.zeros((1))
#             mask = np.ones((1,1))
#     else:
#         emb, pos, mask = create_mask_embedding(td, tokenizer)
#         if len(emb) == 1:
#             raise Exception("Used!")
        
#     return emb, pos, mask

# def encode_em(txt, tokenizer):
#     tokens = tokenizer.encode(txt)

#     res_emb = []
#     res_pos = []
#     partial_masks = []
#     pos_len = 0

#     for t in tokens:
#         emb, pos, mask = process_token(t, tokenizer)

#         res_emb.extend(emb)
#         res_pos.extend(pos+pos_len)
#         pos_len += pos[-1] + 1
#         partial_masks.append(mask)

#     return np.array(res_emb), np.array(res_pos), sum_masks(len(res_emb), partial_masks)

def encode_em(txt, tokenizer):
    """Encode ``txt`` into flat embedding ids, positions and a combined mask.

    Note: this definition shadows the ``encode_em`` imported from
    ``embedding_mask`` at the top of the notebook.

    Args:
        txt: Text to encode.
        tokenizer: Object exposing ``tokenize(txt)``; it is also forwarded to
            ``process_token`` for every token.

    Returns:
        Tuple ``(emb, pos, mask)``: the concatenated per-token embeddings and
        positions as numpy arrays, and the result of ``sum_masks`` applied to
        the per-token masks.
    """
    flat_emb, flat_pos, token_masks = [], [], []
    offset = 0  # position at which the next token's local positions start

    for token in tokenizer.tokenize(txt):
        tok_emb, tok_pos, tok_mask = process_token(token, tokenizer)

        flat_emb.extend(tok_emb)
        # Shift the token-local positions behind everything emitted so far.
        flat_pos.extend(tok_pos + offset)
        offset += tok_pos[-1] + 1
        token_masks.append(tok_mask)

    return (
        np.array(flat_emb),
        np.array(flat_pos),
        sum_masks(len(flat_emb), token_masks),
    )



In [None]:
def create_atomizer(tokenizer):
    """Precompute ``process_token`` for every id in the tokenizer vocabulary.

    Args:
        tokenizer: Object exposing ``get_vocab()``; it is also forwarded to
            ``process_token``.

    Returns:
        dict mapping each token id to the tuple ``(embedding, pe, mask)``
        returned by ``process_token`` for that id.
    """
    # get_vocab() is keyed the other way round, so invert it to iterate ids.
    id_to_token = {tok_id: tok for tok, tok_id in tokenizer.get_vocab().items()}

    lookup = {}
    for tok_id in id_to_token:
        emb, pos, mask = process_token(tok_id, tokenizer)
        lookup[tok_id] = (emb, pos, mask)

    return lookup
        
def embedding_mask_atomizer(txt, tokenizer, tokenizex_atomizer):
    """Encode ``txt`` using a precomputed per-token lookup table.

    Args:
        txt: Text to encode.
        tokenizer: Object exposing ``encode(txt)`` returning token ids.
        tokenizex_atomizer: Mapping from token id to an
            ``(embedding, positions, mask)`` triple, as built by
            ``create_atomizer``.

    Returns:
        Tuple ``(emb, pos, mask)``: concatenated embeddings and positions as
        numpy arrays, and the per-token masks combined by ``sum_masks``.
    """
    flat_emb, flat_pos, token_masks = [], [], []
    offset = 0  # position at which the next token's local positions start

    for token_id in tokenizer.encode(txt):
        tok_emb, tok_pos, tok_mask = tokenizex_atomizer[token_id]
        flat_emb.extend(tok_emb)
        # Shift the token-local positions behind everything emitted so far.
        flat_pos.extend(tok_pos + offset)
        offset += tok_pos[-1] + 1
        token_masks.append(tok_mask)

    merged_mask = sum_masks(len(flat_emb), token_masks)
    return np.array(flat_emb), np.array(flat_pos), merged_mask

def time_valid_tokenization(txt_data, tokenizer, def_tokenizer, C, lbl_val = False):
    """Time ``tokenizer`` over ``C`` sampled inputs, optionally validating it.

    Args:
        txt_data: Source data handed to ``get_imput(txt_data, 512)``.
        tokenizer: Callable taking a text and returning ``(emb, pos, mask)``.
        def_tokenizer: Reference tokenizer; each sample is round-tripped
            through its ``encode``/``decode`` before being timed.
        C: Number of samples to process.
        lbl_val: When true, check each result against a letter-by-letter
            encoding and check the positions selected by the last mask row.

    Returns:
        Total seconds spent inside the ``tokenizer`` calls only.

    Raises:
        AssertionError: If validation is enabled and a check fails.
    """
    elapsed = 0
    for _ in range(C):
        sample = get_imput(txt_data, 512)
        sample_txt = def_tokenizer.decode(def_tokenizer.encode(sample))

        started = time()
        emb, pos, mask = tokenizer(sample_txt)
        elapsed += time() - started

        if not lbl_val:
            continue

        letter_ids = tokenize_letters(sample_txt, def_tokenizer)
        if len(letter_ids) != len(emb):
            # Show the offending text before the assertion aborts the run.
            print(sample_txt)
        assert len(letter_ids) == len(emb)

        # Positions picked by the last mask row must count up 0, 1, 2, ...
        selected = pos[mask[-1].astype(bool)]
        for expected, actual in enumerate(selected.astype(int)):
            assert actual == expected

    return elapsed


In [None]:
# Load the pretrained "gpt2" tokenizer with the non-fast GPT2Tokenizer class
# (the GPT2TokenizerFast alternative is kept here, commented out).
# tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# NOTE(review): `atomizer` is only created by the commented-out line below, yet
# a later cell iterates over it — confirm whether it should be enabled.
# atomizer = create_atomizer(tokenizer)
# Vocabulary as returned by the tokenizer, plus its inverse (value -> key).
vocab = tokenizer.get_vocab()
vocabi = {v:k for k, v in vocab.items()}

In [None]:
def sum_pos(poses):
    """Concatenate per-segment position arrays into one continuous sequence.

    Every segment carries positions local to that segment. Each segment is
    shifted so that it starts one past the last position of the previous
    segment — the same ``pos[-1] + 1`` offset rule used by ``encode_em`` and
    ``embedding_mask_atomizer``. (The previous version shifted by ``pos[-1]``
    only, which repeated a position at every segment boundary.)

    Args:
        poses: Sequence of 1-D array-likes of positions.

    Returns:
        1-D numpy array with all shifted positions; an empty integer array
        when ``poses`` is empty or contains only empty segments.
    """
    shifted = []
    offset = 0  # first position available to the next segment
    for segment in poses:
        segment = np.asarray(segment)
        if segment.size == 0:
            # Nothing to place; an empty segment must not move the offset.
            continue
        moved = segment + offset
        shifted.append(moved)
        offset = moved[-1] + 1

    if not shifted:
        return np.array([], dtype=int)
    return np.concatenate(shifted)


In [None]:
# Manual check: run create_mask_embedding on a sample word and display the result.
txt = "wikipediawolna"
create_mask_embedding(txt, tokenizer)

In [None]:

def tokenize_atoms(txt, tokenizer):
    """Encode ``txt`` one symbol at a time.

    The text is tokenized, the token strings are joined back together, and
    the resulting string is split into single characters which are handed to
    ``tokenizer.encode`` as a list.

    Args:
        txt: Text to encode.
        tokenizer: Object exposing ``tokenize(txt)`` and ``encode(list_of_str)``.

    Returns:
        Whatever ``tokenizer.encode`` returns for the list of characters.
    """
    joined = "".join(tokenizer.tokenize(txt))
    symbols = [*joined]
    return tokenizer.encode(symbols)

In [157]:
def past_tokens_gather(atokens, tokenizer):
    """Re-encode every non-empty prefix of ``atokens``.

    Args:
        atokens: Sequence of token ids.
        tokenizer: Object exposing ``decode(ids)`` and ``encode(text)``.

    Returns:
        List with one entry per prefix length ``1..len(atokens)``; entry ``k``
        is ``tokenizer.encode(tokenizer.decode(atokens[:k + 1]))``.
    """
    return [
        tokenizer.encode(tokenizer.decode(atokens[:end]))
        for end in range(1, len(atokens) + 1)
    ]

In [160]:
import numpy as np

# Create a 5x5 matrix with ones on the diagonal
matrix = np.eye(5)
print(matrix)

[[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]]


In [219]:
def create_emb_pos_mask(emb_pos):
    """Build embedding ids, positions and a mask from prefix encodings.

    Args:
        emb_pos: List with one entry per prefix; each entry is a non-empty
            sequence of ``(token_id, position)`` pairs.

    Returns:
        Tuple ``(emb, pos, mask)``. ``emb`` and ``pos`` are the two columns
        of the array made from the last pair of every prefix. ``mask`` starts
        as an identity matrix; in row ``i`` a 1 is additionally set at the
        index of the first earlier last-pair equal to each non-final pair of
        prefix ``i``.

    Raises:
        AssertionError: If a non-final pair has no equal earlier last-pair
            (development check; skipped while no earlier pairs exist).
    """
    finals = []
    mask = np.eye(len(emb_pos))

    for row_idx, prefix in enumerate(emb_pos):
        row = mask[row_idx]  # a view, so writes land in ``mask``
        finals.append(prefix[-1])
        earlier = finals[:-1]

        for item in prefix[:-1]:
            for col, seen in enumerate(earlier):
                if item == seen:
                    row[col] = 1
                    break
            else:
                # No match is only tolerated when there is nothing to match.
                assert not earlier #dev

    stacked = np.array(finals)
    return stacked[:, 0], stacked[:, 1], mask

In [171]:
# Round-trip demo: turn the sample word into per-symbol ids with
# tokenize_atoms, then decode those ids back to text.
txt = "wikipediawolna"

tenc = tokenize_atoms(txt, tokenizer)
print(f"tenc: {tenc}")
dec = tokenizer.decode(tenc)
print(f"dec: {dec}")


tenc: [86, 72, 74, 72, 79, 68, 67, 72, 64, 86, 78, 75, 77, 64]
dec: wikipediawolna


In [221]:
# Re-encode every prefix of the atom sequence, then pair each token id with
# its index inside that prefix's encoding.
res = past_tokens_gather(tenc, tokenizer)
g = [list(zip(e, np.arange(len(e)))) for e in res]
print(g)

[[(86, 0)], [(37686, 0)], [(20763, 0)], [(15466, 0)], [(20763, 0), (541, 1)], [(20763, 0), (3757, 1)], [(20763, 0), (46647, 1)], [(20763, 0), (541, 1), (13740, 2)], [(31266, 0)], [(31266, 0), (86, 1)], [(31266, 0), (21638, 1)], [(31266, 0), (86, 1), (349, 2)], [(31266, 0), (86, 1), (10875, 2)], [(31266, 0), (86, 1), (349, 2), (2616, 3)]]


In [233]:
res

[[86],
 [37686],
 [20763],
 [15466],
 [20763, 541],
 [20763, 3757],
 [20763, 46647],
 [20763, 541, 13740],
 [31266],
 [31266, 86],
 [31266, 21638],
 [31266, 86, 349],
 [31266, 86, 10875],
 [31266, 86, 349, 2616]]

In [226]:
# Build embedding ids, positions and mask from the (token id, index) pairs in `g`.
e, p, m = create_emb_pos_mask(g)


In [234]:
# Consistency check: selecting `e` with each mask row should reproduce the
# corresponding prefix encoding stored in `res`.
l = []
for r in m:
    l.append(e[r.astype(bool)].tolist())
l == res


True

In [152]:
# Trace, for every prefix of `tenc`, the decoded text and its re-encoding.
for i in range(1, len(tenc)+1):
    partial = tenc[:i]
    print(f"partial: {partial}")
    pdec = tokenizer.decode(partial)
    # The semicolons delimit the decoded text so surrounding whitespace is visible.
    print(f"pdec: ;{pdec};")
    penc = tokenizer.encode(pdec)
    print(f"penc: {penc}")


partial: [86]
pdec: ;w;
penc: [86]
partial: [86, 72]
pdec: ;wi;
penc: [37686]
partial: [86, 72, 74]
pdec: ;wik;
penc: [20763]
partial: [86, 72, 74, 72]
pdec: ;wiki;
penc: [15466]
partial: [86, 72, 74, 72, 79]
pdec: ;wikip;
penc: [20763, 541]
partial: [86, 72, 74, 72, 79, 68]
pdec: ;wikipe;
penc: [20763, 3757]
partial: [86, 72, 74, 72, 79, 68, 67]
pdec: ;wikiped;
penc: [20763, 46647]
partial: [86, 72, 74, 72, 79, 68, 67, 72]
pdec: ;wikipedi;
penc: [20763, 541, 13740]
partial: [86, 72, 74, 72, 79, 68, 67, 72, 64]
pdec: ;wikipedia;
penc: [31266]
partial: [86, 72, 74, 72, 79, 68, 67, 72, 64, 86]
pdec: ;wikipediaw;
penc: [31266, 86]
partial: [86, 72, 74, 72, 79, 68, 67, 72, 64, 86, 78]
pdec: ;wikipediawo;
penc: [31266, 21638]
partial: [86, 72, 74, 72, 79, 68, 67, 72, 64, 86, 78, 75]
pdec: ;wikipediawol;
penc: [31266, 86, 349]
partial: [86, 72, 74, 72, 79, 68, 67, 72, 64, 86, 78, 75, 77]
pdec: ;wikipediawoln;
penc: [31266, 86, 10875]
partial: [86, 72, 74, 72, 79, 68, 67, 72, 64, 86, 78, 75, 

In [None]:
def validate(txt, emb, pos, mask, tokenizer):
    """Assert that an ``(emb, pos, mask)`` encoding is consistent with ``txt``.

    Three checks are made, in order:
      1. decoding the embeddings selected by the last mask row gives ``txt``;
      2. ``emb`` has as many entries as ``tokenize_atoms(txt, tokenizer)``;
      3. the positions selected by the last mask row are ``0, 1, 2, ...``.

    Args:
        txt: The original text.
        emb: Numpy array of embedding ids.
        pos: Numpy array of positions, aligned with ``emb``.
        mask: 2-D numpy array; only its last row is inspected.
        tokenizer: Object exposing ``decode`` (also passed to
            ``tokenize_atoms``).

    Raises:
        AssertionError: If any check fails.
    """
    last_row = mask[-1].astype(bool)

    assert tokenizer.decode(emb[last_row]) == txt

    atoms = tokenize_atoms(txt, tokenizer)
    assert len(atoms) == len(emb)

    for expected, actual in enumerate(pos[last_row].astype(int)):
        assert actual == expected

In [None]:
# Encode and validate an input that is also listed among the faulty examples
# in a later cell (leading space followed by a non-ASCII letter).
txt = """ ósiem"""
emb, pos, mask = encode_em(txt, tokenizer)
validate(txt, emb, pos, mask, tokenizer)

In [None]:
# Top-level copy of the encode_em loop for the same input, leaving the
# intermediate lists (res_emb, res_pos, partial_masks) in the kernel namespace
# — presumably for manual inspection.
txt = " ósiem"

tokens = tokenizer.tokenize(txt)
res_emb = []
res_pos = []
partial_masks = []
pos_len = 0

for t in tokens:
    emb, pos, mask = process_token(t, tokenizer)

    res_emb.extend(emb)
    # Shift token-local positions behind everything emitted so far.
    res_pos.extend(pos+pos_len)
    pos_len += pos[-1] + 1
    partial_masks.append(mask)



In [None]:
# Experiment: encode two texts separately, put a single hand-made token
# between them, and stitch the three encodings together.
# The first t1/t3 assignments are overwritten by the shorter strings right below.
t1  = "elo źle żal ósemka no i śląska"
t3 = "example text to tokenize and, żargon ołówka źle wpływa detokenize, ślóza."
t1  = "elo źle żaló"
t3 = "example."
# Hand-made middle segment: one token id, position 0, and a one-element mask.
# NOTE(review): m2 is 1-D here — confirm sum_masks accepts a 1-D mask.
e2  = np.array([69])
p2  = np.array([0])
m2  = np.array([1])
t2 = tokenizer.decode(e2)

e1, p1, m1 = encode_em(t1, tokenizer)
e3, p3, m3 = encode_em(t3, tokenizer)
txt = t1+t2+t3
# Bare expression in the middle of the cell: it is not the last expression, so nothing is displayed.
txt

emb = np.concatenate([e1,e2,e3])
pos = sum_pos([p1,p2,p3])
mlen = m1.shape[0]+m2.shape[0]+m3.shape[0]
mask = sum_masks(mlen, [m1, m2, m3])


In [None]:
# Faulty examples: inputs recorded as problematic (bare string literals, kept
# only as notes) — presumably for encode_em/validate; TODO confirm which step fails.
' ósiem'
"ó"
""" ›"""
''

In [None]:

def text_emb_pos(txt, tokenizer):
    """Build ``(token_id, index)`` pair lists for ``txt``.

    If any single character of ``txt`` encodes to more than one token, every
    character is encoded on its own. Otherwise every prefix ``txt[:1]``,
    ``txt[:2]``, ... is encoded.

    Args:
        txt: Text to process.
        tokenizer: Object exposing ``encode(text)`` returning token ids.

    Returns:
        List with one entry per character (or prefix); each entry is a list
        of ``(token_id, index_within_that_encoding)`` tuples.
    """
    # Encode every character (no short-circuit) and note whether any of them
    # needs more than one token.
    multi_token = [len(tokenizer.encode(ch)) > 1 for ch in txt]
    letter_split = any(multi_token)

    if letter_split:
        encodings = [tokenizer.encode(ch) for ch in txt]
    else:
        encodings = [tokenizer.encode(txt[:end]) for end in range(1, len(txt) + 1)]

    return [list(zip(ids, range(len(ids)))) for ids in encodings]

In [None]:
# Check every atomizer entry against a letter-by-letter encoding of the
# token's decoded text: both must contain the same number of ids.
# NOTE(review): this cell needs `atomizer` (its creation is commented out in
# an earlier cell) and `tokenize_letters` (defined in a later cell).
for t in atomizer.keys():
    txt = tokenizer.decode(t)
    emb = tokenize_letters(txt, tokenizer)
    a_emb = atomizer[t][0]
    # Report the mismatching lengths before asserting; with the assert first
    # this diagnostic could never be reached.
    if len(emb) != len(a_emb):
        print(len(emb), len(a_emb))
    assert len(emb) == len(a_emb)

In [None]:
def tokenize_letters(txt, tokenizer):
    """Encode ``txt`` character by character and flatten the result.

    Args:
        txt: Text to encode.
        tokenizer: Object exposing ``encode(text)`` returning an iterable of ids.

    Returns:
        Flat list with the ids of every character's encoding, in text order.
    """
    return [tok_id for ch in txt for tok_id in tokenizer.encode(ch)]
