In [57]:
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
import re

In [70]:
def strip(s):
    return re.sub(r'[^a-zA-Z0-9.\"\',!? ]+', '', s)

In [13]:
dset = load_dataset("roneneldan/TinyStories", cache_dir='/scratch/bbjr/mallina1/data/huggingface_cache_dir')
dset

Repo card metadata block was not found. Setting CardData to empty.


DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 2119719
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 21990
    })
})

In [11]:
dset['train'][0]

{'text': 'One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on her shirt.\n\nLily went to her mom and said, "Mom, I found this needle. Can you share it with me and sew my shirt?" Her mom smiled and said, "Yes, Lily, we can share the needle and fix your shirt."\n\nTogether, they shared the needle and sewed the button on Lily\'s shirt. It was not difficult for them because they were sharing and helping each other. After they finished, Lily thanked her mom for sharing the needle and fixing her shirt. They both felt happy because they had shared and worked together.'}

In [101]:
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]"], min_frequency=200)

In [102]:
val = [strip(x['text']) for x in dset['validation']]

In [103]:
val[0]

'Spot. Spot saw the shiny car and said, "Wow, Kitty, your car is so bright and clean!" Kitty smiled and replied, "Thank you, Spot. I polish it every day."After playing with the car, Kitty and Spot felt thirsty. They found a small pond with clear water. They drank the water and felt very happy. They played together all day and became best friends.'

In [104]:
tokenizer.train_from_iterator(val, trainer=trainer, length=dset['validation'].num_rows)






In [105]:
print(tokenizer.get_vocab_size())
tokenizer.get_vocab()

2549


{'sticks': 2226,
 'supp': 2267,
 'stay': 613,
 'heavy': 1375,
 'Ellie': 2499,
 'fted': 2217,
 'careful': 481,
 'Le': 1647,
 'working': 2338,
 'noticed': 780,
 'gives': 1868,
 'tim': 168,
 'gy': 2191,
 'egg': 1960,
 'whale': 2379,
 'best': 578,
 'others': 1180,
 'tre': 306,
 'dirt': 1905,
 'waves': 2162,
 'cra': 939,
 'oug': 276,
 'tie': 2107,
 'nts': 996,
 'ham': 2295,
 'stead': 1422,
 'breath': 1995,
 'ite': 414,
 'band': 1662,
 'weet': 753,
 'duck': 974,
 't': 63,
 'clown': 2167,
 'Dave': 2482,
 'squirrel': 1187,
 'near': 770,
 'Of': 1746,
 'shout': 808,
 'puddle': 1949,
 'ize': 1813,
 'not': 185,
 'hide': 1310,
 'angry': 670,
 'spin': 1618,
 'shoes': 1575,
 'aybe': 824,
 'member': 746,
 'Leo': 2144,
 'ca': 305,
 'ht': 184,
 'ro': 180,
 'runk': 2373,
 'disappeared': 2366,
 'ix': 1466,
 'Sus': 1986,
 'Fluffy': 1533,
 'past': 1705,
 'pract': 1816,
 'ungle': 1863,
 'pleased': 2311,
 'shirt': 1921,
 'uffy': 1277,
 'glass': 2280,
 'bathroom': 2427,
 'places': 1791,
 'pp': 128,
 'apple': 1

In [106]:
print(dset['validation'][0]['text'])
enc = tokenizer.encode(dset['validation'][0]['text'])
tokenizer.decode(enc.ids)

Spot. Spot saw the shiny car and said, "Wow, Kitty, your car is so bright and clean!" Kitty smiled and replied, "Thank you, Spot. I polish it every day."

After playing with the car, Kitty and Spot felt thirsty. They found a small pond with clear water. They drank the water and felt very happy. They played together all day and became best friends.


'Spot . Spot saw the shiny car and said , " Wow , Kitty , your car is so bright and clean !" Kitty smiled and replied , " Thank you , Spot . I pol ish it every day ." After playing with the car , Kitty and Spot felt th irst y . They found a small pond with clear water . They drank the water and felt very happy . They played together all day and became best friends .'