In [1]:
import sys, os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import IPython

sys.path += ['../src/', '../src/utils']

In [2]:
%load_ext autoreload
%autoreload 1
%aimport datasets
%aimport general
%aimport training

In [3]:
from general import *

In [4]:
actions_path = '../data/actions.pkl'

import pickle
with open(actions_path, 'rb') as f:
    actions = pickle.load(f)

In [5]:
from transformers import GPT2Tokenizer, AutoTokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
tokenizer_name = 'gpt2'

In [6]:
tokenizer('hello there').input_ids + [12]

[31373, 612, 12]

In [7]:
control_tokens = {'eos': tokenizer.eos_token} | datasets.CONTROL_TOKENS
tokenizer.add_tokens([tok for k, tok in control_tokens.items() if k != 'eos'])

for k, tok in control_tokens.items():
    print(k.ljust(20), tok.ljust(20), tokenizer.encode(tok)[0])

eos                  <|endoftext|>        50256
goes                 <|goes|>             50257
eoa                  <|endofaction|>      50258
message              <|message|>          50259
reaction             <|reaction|>         50260
idle                 <|idle|>             50261
attachment           <|attachment|>       50262


In [8]:
ids = tokenizer('p<|goes|><|message|>test').input_ids
print(ids)
for i in range(len(ids)):
    print(tokenizer.decode(ids[:i]))

[79, 50257, 50259, 9288]

p
p <|goes|>
p <|goes|> <|message|>


In [20]:
tokenized = llmap(lambda a: datasets.tokenize_action(a, tokenizer, control_tokens), actions[:])

eta 1335 s
eta 656 s
eta 326 s
eta 162 s
eta 77 s
eta 37 s
eta 17 s


In [21]:
def count_tokens(tokenized):
    all = [t for tt in tokenized for t in tt]
    total = 0
    non_masked = 0
    
    for tok in all:
        ids, mask = tok['ids'], tok['mask']
        total += len(ids)
        non_masked += len(ids[~mask])

    print('Total tokens  ', total)
    print('Non-masked tokens ', non_masked)

count_tokens(tokenized)

Total tokens   14341751
Non-masked tokens  6194936


In [22]:
i = 3
# j = np.random.choice(len(actions[i]))
j = 32
a = actions[i][j]
datasets.tokenize_action(a, tokenizer)

{'ids': array([   79, 50257, 50259,  5450,  1378, 15410,   585,    13,  1130,
           14,    21,   365,    84, 10917, 50258]),
 'mask': array([False, False, False,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True, False])}

In [31]:
i = 3
j = np.random.choice(len(actions[i]))
# j = 30
for a, t in zip(actions[i][j:j+20], tokenized[i][j:j+20]):
    s1, m1 = datasets.action_to_string(a, return_mask=True, control_tokens=control_tokens)
    # print(s1, end='')
    # continue
    datasets.view_masked(s1, m1)
    sbuf = list(map(str, t['ids']))
    display(IPython.display.HTML(f"""<code>        ids {' '.join(sbuf)}</code>"""))
    display(IPython.display.HTML(f"""<code>       mask {' '.join([('^'if m else ' ') * len(s) for m, s in zip(t['mask'], sbuf)])}</code>"""))
    print('\n'*2)





































































































In [32]:
tokenized_path = f"../data/tokenized-{tokenizer_name}.pkl"

import pickle
with open(tokenized_path, 'wb') as f:
    pickle.dump(tokenized, f)