In [1]:
"""Prepares the input pipeline for OpenWebText (OWT).

This script tokenizes the OWT dataset and splits it into train and eval sets.
The train and eval sets are saved as ArrayRecord files.
"""

# from array_record.python import array_record_module
import datasets

# import numpy as np
# import tensorflow as tf
# import tqdm
import transformers

# import time


source = datasets.load_dataset(
    "Skylion007/openwebtext", name="plain_text", split="train", streaming=True
)

_GPT2_TOKENIZER = "gpt2"
tokenizer = transformers.GPT2Tokenizer.from_pretrained(_GPT2_TOKENIZER)

# ArrayRecordWriter = array_record_module.ArrayRecordWriter
# ArrayRecordReader = array_record_module.ArrayRecordReader


def _int64_feature(value):
    """Returns an int64_list from a bool / enum / int / uint."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))


ds_output_file_train = "./data_dir/openwebtext_splits_1024_train"
ds_output_file_eval = "./data_dir/openwebtext_splits_1024_eval"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
from transformers import AutoTokenizer
from joblib import Parallel, delayed

In [3]:
def tokenize_chunk(chunk):
    return tokenizer(chunk)


# Main function
def parallel_tokenize(strings, chunk_size=1000, num_jobs=-1):
    """
    Tokenizes a large list of strings in parallel.

    Args:
        strings (list of str): The list of strings to tokenize.
        chunk_size (int): The size of each chunk for processing.
        num_jobs (int): The number of parallel jobs (-1 uses all CPUs).

    Returns:
        dict: A dictionary with tokenized results combined across all chunks.
    """
    # Split data into chunks
    chunks = [strings[i : i + chunk_size] for i in range(0, len(strings), chunk_size)]

    # Tokenize each chunk in parallel
    tokenized_chunks = Parallel(n_jobs=num_jobs)(
        delayed(tokenize_chunk)(chunk) for chunk in chunks
    )

    return tokenized_chunks

In [4]:
block_size = 1024  # size of the chunk

data_iter = iter(source)

In [6]:
all_tokens = []
count = 0
count_per_save = 0
eval_chunks = []

from tqdm import tqdm

text_list = []
for i, example in tqdm(enumerate(data_iter)):
    text_list.append(example["text"])
    if i >= 100 * 512:
        tokenized_result = parallel_tokenize(text_list, chunk_size=100, num_jobs=512)
        break
    continue
    tokens = tokenizer(example["text"])["input_ids"]
    all_tokens.extend(tokens + [tokenizer.eos_token_id])

51193it [00:30, 3546.37it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1025 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1846 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1253 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1164 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1493 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the spe

In [None]:
tokenized_result = parallel_tokenize(text_list, chunk_size=1000, num_jobs=512)

In [13]:
text_list[0]

'Port-au-Prince, Haiti (CNN) -- Earthquake victims, writhing in pain and grasping at life, watched doctors and nurses walk away from a field hospital Friday night after a Belgian medical team evacuated the area, saying it was concerned about security.\n\nThe decision left CNN Chief Medical Correspondent Sanjay Gupta as the only doctor at the hospital to get the patients through the night.\n\nCNN initially reported, based on conversations with some of the doctors, that the United Nations ordered the Belgian First Aid and Support Team to evacuate. However, Belgian Chief Coordinator Geert Gijs, a doctor who was at the hospital with 60 Belgian medical personnel, said it was his decision to pull the team out for the night. Gijs said he requested U.N. security personnel to staff the hospital overnight, but was told that peacekeepers would only be able to evacuate the team.\n\nHe said it was a "tough decision" but that he accepted the U.N. offer to evacuate after a Canadian medical team, also

In [None]:
import transformers

_GPT2_TOKENIZER = "gpt2"
tokenizer = transformers.GPT2Tokenizer.from_pretrained(_GPT2_TOKENIZER)



In [17]:
tokenizer.decode([13])

'.'

In [7]:
import datasets
from transformers import GPT2Tokenizer
from multiprocessing import Pool
import os

# Constants
_GPT2_TOKENIZER = "gpt2"
CHUNK_SIZE = 100
NUM_WORKERS = os.cpu_count()  # Number of workers (adjust as needed)

# Load the dataset (streaming mode)
source = datasets.load_dataset(
    "Skylion007/openwebtext", name="plain_text", split="train"
)


def process_chunk(chunk_range):
    """
    Tokenize a chunk of data using skip and take for streaming datasets.

    Args:
        chunk_range (tuple): Start and end indices for the chunk.

    Returns:
        list of dict: Tokenized data.
    """
    start, end = chunk_range
    tokenizer = GPT2Tokenizer.from_pretrained(_GPT2_TOKENIZER)

    # Efficiently access the desired range
    chunk = source.skip(start).take(end - start)
    tokenized_data = [tokenizer(text["text"]) for text in chunk]

    return tokenized_data


def create_chunks(total_size, chunk_size):
    """Create chunk ranges for parallel processing."""
    return [
        (i, min(i + chunk_size, total_size)) for i in range(0, total_size, chunk_size)
    ]


def save_tokenized_data(tokenized_data, output_file):
    """Save tokenized data to a file."""
    with open(output_file, "w") as f:
        for item in tokenized_data:
            f.write(str(item) + "\n")


if __name__ == "__main__":
    total_size = 100000  # Adjust to the number of samples in your dataset
    chunks = create_chunks(total_size, CHUNK_SIZE)

    # Run parallel tokenization
    with Pool(NUM_WORKERS) as pool:
        tokenized_results = pool.map(process_chunk, chunks)

    # Combine and save results
    all_tokenized_data = [item for sublist in tokenized_results for item in sublist]
    # save_tokenized_data(all_tokenized_data, "./data_dir/openwebtext_tokenized.txt")
    import pickle

    output_pickle_file = "./data_dir/openwebtext_tokenized.pkl"
    with open(output_pickle_file, "wb") as f:
        pickle.dump(all_tokenized_data, f)

    print(f"Tokenization complete. Total tokenized entries: {len(all_tokenized_data)}")

Downloading data: 100%|██████████| 21/21 [00:00<00:00, 403.05files/s]
Generating train split:   1%|          | 71276/8013769 [00:15<27:53, 4744.70 examples/s]


KeyboardInterrupt: 

In [14]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt")

NameError: name 'GPT2Tokenizer' is not defined

In [2]:
os.cpu_count()

240

In [9]:
from datasets import load_from_disk

In [10]:
source = load_from_disk("./openwebtext_cache")

In [11]:
source[]

Dataset({
    features: ['text'],
    num_rows: 8013769
})

In [None]:
import pickle

output_pickle_file = "./openwebtext_tokenized.pkl"
with open(output_pickle_file, "rb") as f:
    all_tokenized_data = pickle.load(f)

  from .autonotebook import tqdm as notebook_tqdm
Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7fecf6ca1db0>>
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/md4-venv/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


In [None]:
all_tokenized_data_input_ids = [data["input_ids"] for data in all_tokenized_data]

In [13]:
all_tokenized_data_input_ids[-2][-1]

13

In [1]:
import pickle
import numpy as np

output_pickle_file = "./openwebtext_tokenized_inputids.pkl"
with open(output_pickle_file, "rb") as f:
    all_tokenized_data_input_ids = pickle.load(f)

In [2]:
len(all_tokenized_data_input_ids)

8013769

In [None]:
import pickle
import numpy as np

output_pickle_file = "./openwebtext_tokenized_inputids.pkl"
with open(output_pickle_file, "rb") as f:
    all_tokenized_data_input_ids = pickle.load(f)

# ArrayRecordWriter = array_record_module.ArrayRecordWriter
# ArrayRecordReader = array_record_module.ArrayRecordReader


# def _int64_feature(value):
#     """Returns an int64_list from a bool / enum / int / uint."""
#     return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
import transformers
import numpy as np

_GPT2_TOKENIZER = "gpt2"
tokenizer = transformers.GPT2Tokenizer.from_pretrained(_GPT2_TOKENIZER)

ds_output_file_train = "./data_dir/openwebtext_splits_1024_train"
ds_output_file_eval = "./data_dir/openwebtext_splits_1024_eval"

n_examples = 8013769  # tiny: 2; small: 10_000; full: 8013769
save_every_examples = 100_000
block_size = 1024  # size of the chunk

# data_iter = iter(source)

all_tokens = []
count = 0
count_per_save = 0
eval_chunks = []

# writer_train = ArrayRecordWriter(ds_output_file_train, "group_size:1")
# writer_eval = ArrayRecordWriter(ds_output_file_eval, "group_size:1")

from tqdm import tqdm

for tokens in tqdm(all_tokenized_data_input_ids):
    # tokens = tokenizer(example["text"])["input_ids"]
    all_tokens.extend(tokens + [tokenizer.eos_token_id])
    count += 1
    count_per_save += 1
    
all_tokens = np.array(all_tokens)
np.save("openwebtext_tokenized_inputids.npy", all_tokens)

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 8013769/8013769 [03:41<00:00, 36182.86it/s]


In [5]:
all_tokens = np.array(all_tokens)
np.save("openwebtext_tokenized_inputids.npy", all_tokens)

In [4]:
import numpy as np

all_tokens = np.array(all_tokens)

In [7]:
saved_length = (len(all_tokens) // block_size) * block_size
all_tokens_short = all_tokens[:saved_length]

In [9]:
all_tokens_short=all_tokens_short.reshape(-1, block_size)

In [11]:
chunks = all_tokens_short
np.random.shuffle(chunks)
num_eval = int(len(chunks) * 0.02)  # put 2% of chunks into eval split.

In [12]:
eval_chunks = chunks[:num_eval]
train_chunks = chunks[num_eval:]

In [13]:
# np.save("./data_dir/openwebtext_np_train.npy", train_chunks)
# np.save("./data_dir/openwebtext_np_eval.npy", eval_chunks)

In [15]:
train_chunks[0]

array([  257,  1256,   286, ..., 26318,  1757,   371])

In [16]:
train_chunks[1]

array([ 389, 1016,  284, ...,   64,  319, 3909])

In [6]:
import numpy as np

saved_length = (len(all_tokens) // block_size) * block_size
chunks = [
    all_tokens[i : i + block_size] for i in tqdm(range(0, saved_length, block_size))
]

# print("Time taken to tokenize:", time.time() - time1)
print(f"\nsaving chunks @ {count}th example mark...")
np.random.shuffle(chunks)
num_eval = int(len(chunks) * 0.02)  # put 2% of chunks into eval split.

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x79ea1752ddb0>>
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/md4-venv/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


: 

: 

In [3]:
from array_record.python import array_record_module
import datasets
import numpy as np
import tensorflow as tf
import tqdm
import transformers
import time
import os
from datasets import load_from_disk

  from .autonotebook import tqdm as notebook_tqdm
2025-01-18 08:08:44.027518: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1737187724.068846 1569184 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1737187724.080157 1569184 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-18 08:08:44.113937: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
_GPT2_TOKENIZER = "gpt2"
tokenizer = transformers.GPT2Tokenizer.from_pretrained(_GPT2_TOKENIZER)

In [14]:
ArrayRecordWriter = array_record_module.ArrayRecordWriter
ArrayRecordReader = array_record_module.ArrayRecordReader


def _int64_feature(value):
    """Returns an int64_list from a bool / enum / int / uint."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))


ds_output_file_train = "./data_dir/openwebtext_splits_1024_train"
ds_output_file_eval = "./data_dir/openwebtext_splits_1024_eval"

n_examples = 8013769  # tiny: 2; small: 10_000; full: 8013769
save_every_examples = 100_000
block_size = 1024  # size of the chunk

# data_iter = iter(source)

all_tokens = []
count = 0
count_per_save = 0
eval_chunks = []

writer_train = ArrayRecordWriter(ds_output_file_train, "group_size:1")
writer_eval = ArrayRecordWriter(ds_output_file_eval, "group_size:1")

from tqdm import tqdm

for tokens in tqdm(all_tokenized_data_input_ids):
    # tokens = tokenizer(example["text"])["input_ids"]
    all_tokens.extend(tokens + [tokenizer.eos_token_id])
    count += 1
    count_per_save += 1

    # pause to save when having tokenized enough examples for saving.
    time1 = time.time()
    if count_per_save >= save_every_examples:
        # save to disk
        saved_length = (len(all_tokens) // block_size) * block_size
        chunks = [
            all_tokens[i : i + block_size] for i in range(0, saved_length, block_size)
        ]

        print("Time taken to tokenize:", time.time() - time1)
        print(f"\nsaving chunks @ {count}th example mark...")
        np.random.shuffle(chunks)
        num_eval = int(len(chunks) * 0.02)  # put 2% of chunks into eval split.
        for eval_i in tqdm(range(num_eval)):
            feature = {
                "text": _int64_feature(chunks[eval_i]),
            }
            example_proto = tf.train.Example(
                features=tf.train.Features(feature=feature)
            )
            writer_eval.write(example_proto.SerializeToString())

        for train_i in tqdm(range(num_eval, len(chunks))):
            feature = {
                "text": _int64_feature(chunks[train_i]),
            }
            example_proto = tf.train.Example(
                features=tf.train.Features(feature=feature)
            )
            writer_train.write(example_proto.SerializeToString())
        print("Time taken to save:", time.time() - time1)
        # prepare for the next round of tokenize-n-save.
        all_tokens = all_tokens[saved_length:]
        count_per_save = 0

    # stop when having tokenized enough examples for total #.
    if count >= n_examples:
        # save to disk
        saved_length = (len(all_tokens) // block_size) * block_size
        chunks = [
            all_tokens[i : i + block_size] for i in range(0, saved_length, block_size)
        ]

        print(f"\nsaving chunks @ {count}th example mark...")
        np.random.shuffle(chunks)
        num_eval = int(len(chunks) * 0.02)  # put 2% of chunks into eval split.
        for eval_i in tqdm(range(num_eval)):
            feature = {
                "text": _int64_feature(chunks[eval_i]),
            }
            example_proto = tf.train.Example(
                features=tf.train.Features(feature=feature)
            )
            writer_eval.write(example_proto.SerializeToString())

        for train_i in tqdm(range(num_eval, len(chunks))):
            feature = {
                "text": _int64_feature(chunks[train_i]),
            }
            example_proto = tf.train.Example(
                features=tf.train.Features(feature=feature)
            )
            writer_train.write(example_proto.SerializeToString())
        break

writer_train.close()
writer_eval.close()

  0%|          | 0/8013769 [00:00<?, ?it/s]

  1%|          | 96461/8013769 [00:02<03:07, 42165.58it/s]

Time taken to tokenize: 2.49033784866333

saving chunks @ 100000th example mark...


100%|██████████| 2206/2206 [00:00<00:00, 3557.20it/s]
100%|██████████| 108109/108109 [00:34<00:00, 3173.37it/s]]
  1%|▏         | 108634/8013769 [00:40<2:56:11, 747.78it/s]

Time taken to save: 37.19849228858948


  2%|▏         | 196296/8013769 [00:42<03:34, 36501.44it/s] 

Time taken to tokenize: 2.535370111465454

saving chunks @ 200000th example mark...


100%|██████████| 2224/2224 [00:00<00:00, 3560.31it/s]
100%|██████████| 109011/109011 [00:34<00:00, 3134.93it/s]s]
  3%|▎         | 209135/8013769 [01:21<2:59:27, 724.83it/s]

Time taken to save: 37.94989895820618


  4%|▎         | 299868/8013769 [01:24<03:36, 35708.43it/s] 

Time taken to tokenize: 3.1835684776306152

saving chunks @ 300000th example mark...


100%|██████████| 2217/2217 [00:00<00:00, 3185.07it/s]
100%|██████████| 108649/108649 [00:35<00:00, 3019.57it/s]s]
  4%|▍         | 306737/8013769 [02:05<4:13:03, 507.61it/s]

Time taken to save: 39.8833065032959


  5%|▍         | 396846/8013769 [03:44<03:22, 37606.83it/s] 

Time taken to tokenize: 97.51064085960388

saving chunks @ 400000th example mark...


100%|██████████| 2218/2218 [00:00<00:00, 3443.15it/s]
100%|██████████| 108699/108699 [00:36<00:00, 2998.53it/s]
  5%|▌         | 406821/8013769 [04:22<11:52:04, 178.05it/s]

Time taken to save: 134.4242742061615


  6%|▌         | 497283/8013769 [04:25<03:33, 35227.95it/s] 

Time taken to tokenize: 2.538282632827759

saving chunks @ 500000th example mark...


100%|██████████| 2194/2194 [00:00<00:00, 3300.96it/s]
100%|██████████| 107534/107534 [00:34<00:00, 3127.00it/s]s]
  6%|▋         | 507560/8013769 [05:03<3:15:22, 640.34it/s]

Time taken to save: 37.610230445861816


  7%|▋         | 596452/8013769 [05:06<03:27, 35828.49it/s] 

Time taken to tokenize: 2.514292001724243

saving chunks @ 600000th example mark...


100%|██████████| 2196/2196 [00:00<00:00, 2716.80it/s]
100%|██████████| 107621/107621 [00:33<00:00, 3180.31it/s]s]
  8%|▊         | 607459/8013769 [05:44<3:06:46, 660.90it/s]

Time taken to save: 37.1840717792511


  9%|▊         | 699881/8013769 [05:47<03:45, 32483.17it/s] 

Time taken to tokenize: 2.5763652324676514

saving chunks @ 700000th example mark...


100%|██████████| 2214/2214 [00:00<00:00, 3262.33it/s]
100%|██████████| 108507/108507 [00:33<00:00, 3222.50it/s]s]
  9%|▉         | 705610/8013769 [06:25<4:19:59, 468.49it/s]

Time taken to save: 36.946409940719604


 10%|▉         | 799196/8013769 [06:28<03:12, 37569.31it/s] 

Time taken to tokenize: 2.368776798248291

saving chunks @ 800000th example mark...


100%|██████████| 2186/2186 [00:00<00:00, 3577.33it/s]
100%|██████████| 107155/107155 [00:33<00:00, 3239.00it/s]s]
 10%|█         | 807207/8013769 [07:04<3:18:02, 606.48it/s]

Time taken to save: 36.08050489425659


 11%|█         | 897943/8013769 [07:07<03:08, 37687.04it/s] 

Time taken to tokenize: 2.772588014602661

saving chunks @ 900000th example mark...


100%|██████████| 2200/2200 [00:00<00:00, 3526.72it/s]
100%|██████████| 107848/107848 [00:33<00:00, 3193.51it/s]s]
 11%|█▏        | 902700/8013769 [07:45<5:03:37, 390.35it/s]

Time taken to save: 37.1861457824707


 12%|█▏        | 999501/8013769 [07:48<03:48, 30641.34it/s] 

Time taken to tokenize: 3.512913703918457

saving chunks @ 1000000th example mark...


100%|██████████| 2204/2204 [00:00<00:00, 3304.47it/s]
100%|██████████| 108001/108001 [00:34<00:00, 3093.04it/s]s]
 13%|█▎        | 1006163/8013769 [08:28<4:13:11, 461.30it/s] 

Time taken to save: 39.12090468406677


 14%|█▎        | 1098231/8013769 [08:31<03:46, 30478.59it/s] 

Time taken to tokenize: 2.856701612472534

saving chunks @ 1100000th example mark...


100%|██████████| 2200/2200 [00:00<00:00, 2910.67it/s]
100%|██████████| 107814/107814 [00:36<00:00, 2960.23it/s]/s]
 14%|█▎        | 1101820/8013769 [09:12<6:51:43, 279.79it/s]

Time taken to save: 40.055588245391846


 15%|█▍        | 1198155/8013769 [09:16<03:13, 35229.88it/s] 

Time taken to tokenize: 3.341883420944214

saving chunks @ 1200000th example mark...


100%|██████████| 2207/2207 [00:00<00:00, 3286.65it/s]
100%|██████████| 108182/108182 [00:33<00:00, 3276.61it/s]/s]
 15%|█▌        | 1206578/8013769 [09:54<3:17:14, 575.19it/s]

Time taken to save: 37.05030298233032


 16%|█▌        | 1297020/8013769 [09:56<02:59, 37319.47it/s] 

Time taken to tokenize: 3.163818597793579

saving chunks @ 1300000th example mark...


100%|██████████| 2196/2196 [00:00<00:00, 2843.82it/s]
100%|██████████| 107646/107646 [00:33<00:00, 3231.28it/s]/s]
 16%|█▋        | 1302666/8013769 [10:34<4:34:07, 408.04it/s]

Time taken to save: 37.27259039878845


 17%|█▋        | 1399821/8013769 [10:37<02:56, 37553.81it/s] 

Time taken to tokenize: 2.6891086101531982

saving chunks @ 1400000th example mark...


100%|██████████| 2195/2195 [00:00<00:00, 3326.77it/s]
100%|██████████| 107570/107570 [00:36<00:00, 2979.18it/s]/s]
 18%|█▊        | 1402849/8013769 [11:17<5:37:36, 326.37it/s]

Time taken to save: 39.47388219833374


 19%|█▊        | 1496928/8013769 [11:20<03:12, 33916.20it/s] 

Time taken to tokenize: 3.1439762115478516

saving chunks @ 1500000th example mark...


100%|██████████| 2213/2213 [00:00<00:00, 2772.26it/s]
100%|██████████| 108470/108470 [00:33<00:00, 3233.92it/s]/s]
 19%|█▉        | 1502597/8013769 [11:59<4:38:56, 389.04it/s]

Time taken to save: 37.50323796272278


 20%|█▉        | 1599466/8013769 [12:02<02:52, 37281.24it/s] 

Time taken to tokenize: 2.7182204723358154

saving chunks @ 1600000th example mark...


100%|██████████| 2208/2208 [00:00<00:00, 3303.33it/s]
100%|██████████| 108195/108195 [00:35<00:00, 3076.17it/s]/s]
 20%|██        | 1602971/8013769 [12:41<5:10:42, 343.88it/s]

Time taken to save: 38.5762677192688


 21%|██        | 1697452/8013769 [12:44<02:54, 36252.51it/s] 

Time taken to tokenize: 2.745781660079956

saving chunks @ 1700000th example mark...


100%|██████████| 2220/2220 [00:00<00:00, 2413.06it/s]
100%|██████████| 108780/108780 [00:33<00:00, 3277.11it/s]/s]
 21%|██        | 1702757/8013769 [13:22<4:24:14, 398.05it/s]

Time taken to save: 36.87900924682617


 22%|██▏       | 1799248/8013769 [13:25<02:48, 36982.84it/s] 

Time taken to tokenize: 2.850130081176758

saving chunks @ 1800000th example mark...


100%|██████████| 2193/2193 [00:00<00:00, 2562.28it/s]
100%|██████████| 107459/107459 [00:34<00:00, 3146.79it/s]/s]
 23%|██▎       | 1803139/8013769 [14:03<4:47:09, 360.46it/s]

Time taken to save: 37.87693119049072


 24%|██▎       | 1899615/8013769 [14:06<02:48, 36374.25it/s] 

Time taken to tokenize: 2.324234962463379

saving chunks @ 1900000th example mark...


100%|██████████| 2190/2190 [00:00<00:00, 3123.12it/s]
100%|██████████| 107325/107325 [00:35<00:00, 3031.73it/s]/s]
 24%|██▎       | 1902677/8013769 [14:45<5:12:00, 326.43it/s]

Time taken to save: 38.44513988494873


 25%|██▍       | 1997791/8013769 [14:48<02:42, 37000.41it/s] 

Time taken to tokenize: 2.7004101276397705

saving chunks @ 2000000th example mark...


100%|██████████| 2194/2194 [00:00<00:00, 3354.59it/s]
100%|██████████| 107537/107537 [00:31<00:00, 3378.02it/s]/s]
 25%|██▌       | 2006830/8013769 [15:24<2:38:14, 632.68it/s]

Time taken to save: 35.206560373306274


 26%|██▌       | 2099573/8013769 [15:27<02:39, 37162.37it/s] 

Time taken to tokenize: 2.910207986831665

saving chunks @ 2100000th example mark...


100%|██████████| 2216/2216 [00:00<00:00, 3430.17it/s]
100%|██████████| 108615/108615 [00:33<00:00, 3195.83it/s]/s]
 26%|██▌       | 2102658/8013769 [16:05<4:52:46, 336.50it/s]

Time taken to save: 37.560399293899536


 27%|██▋       | 2197321/8013769 [16:08<02:37, 36838.41it/s] 

Time taken to tokenize: 2.7306032180786133

saving chunks @ 2200000th example mark...


100%|██████████| 2213/2213 [00:00<00:00, 3321.98it/s]
100%|██████████| 108465/108465 [00:34<00:00, 3102.82it/s]/s]
 28%|██▊       | 2206248/8013769 [16:48<2:51:30, 564.34it/s]

Time taken to save: 38.37325167655945


 29%|██▊       | 2298235/8013769 [16:50<02:33, 37235.98it/s] 

Time taken to tokenize: 2.7775516510009766

saving chunks @ 2300000th example mark...


100%|██████████| 2213/2213 [00:00<00:00, 3530.62it/s]
100%|██████████| 108470/108470 [00:32<00:00, 3287.73it/s]/s]
 29%|██▊       | 2302677/8013769 [17:27<4:05:57, 387.01it/s]

Time taken to save: 36.413780212402344


 30%|██▉       | 2397782/8013769 [18:53<02:31, 37030.20it/s] 

Time taken to tokenize: 83.4878351688385

saving chunks @ 2400000th example mark...


100%|██████████| 2222/2222 [00:00<00:00, 3198.88it/s]
100%|██████████| 108906/108906 [00:32<00:00, 3323.21it/s]
 30%|███       | 2410626/8013769 [19:28<6:12:11, 250.91it/s] 

Time taken to save: 116.97152948379517


 31%|███       | 2495998/8013769 [19:30<02:39, 34678.86it/s] 

Time taken to tokenize: 2.4804131984710693

saving chunks @ 2500000th example mark...


100%|██████████| 2193/2193 [00:00<00:00, 3039.48it/s]
100%|██████████| 107464/107464 [00:31<00:00, 3412.54it/s]/s]
 31%|███▏      | 2509642/8013769 [20:05<1:47:26, 853.81it/s]

Time taken to save: 34.71020293235779


 32%|███▏      | 2596591/8013769 [20:08<02:19, 38812.72it/s] 

Time taken to tokenize: 2.528794050216675

saving chunks @ 2600000th example mark...


100%|██████████| 2215/2215 [00:00<00:00, 3187.58it/s]
100%|██████████| 108576/108576 [00:32<00:00, 3303.31it/s]/s]
 32%|███▏      | 2603408/8013769 [20:44<3:03:36, 491.11it/s]

Time taken to save: 36.11000418663025


 34%|███▎      | 2696629/8013769 [20:47<02:24, 36908.35it/s] 

Time taken to tokenize: 2.8296701908111572

saving chunks @ 2700000th example mark...


100%|██████████| 2186/2186 [00:00<00:00, 3606.69it/s]
100%|██████████| 107115/107115 [00:32<00:00, 3330.78it/s]/s]
 34%|███▎      | 2702575/8013769 [21:24<3:25:34, 430.60it/s]

Time taken to save: 35.61534094810486


 35%|███▍      | 2797715/8013769 [21:27<02:22, 36500.77it/s] 

Time taken to tokenize: 2.7469143867492676

saving chunks @ 2800000th example mark...


100%|██████████| 2197/2197 [00:00<00:00, 2995.32it/s]
100%|██████████| 107699/107699 [00:32<00:00, 3362.26it/s]/s]
 35%|███▍      | 2803110/8013769 [22:03<3:25:39, 422.28it/s]

Time taken to save: 35.53132629394531


 36%|███▌      | 2898786/8013769 [22:05<02:19, 36640.78it/s] 

Time taken to tokenize: 2.7291758060455322

saving chunks @ 2900000th example mark...


100%|██████████| 2205/2205 [00:00<00:00, 3096.03it/s]
100%|██████████| 108085/108085 [00:32<00:00, 3292.61it/s]/s]
 36%|███▌      | 2902699/8013769 [22:43<3:51:51, 367.40it/s]

Time taken to save: 36.28621959686279


 37%|███▋      | 2997318/8013769 [22:46<02:16, 36747.50it/s] 

Time taken to tokenize: 2.608959197998047

saving chunks @ 3000000th example mark...


100%|██████████| 2196/2196 [00:00<00:00, 3247.44it/s]
100%|██████████| 107645/107645 [00:31<00:00, 3382.04it/s]/s]
 37%|███▋      | 3003326/8013769 [23:21<3:06:02, 448.88it/s]

Time taken to save: 35.130383014678955


 39%|███▊      | 3099401/8013769 [23:24<02:14, 36655.47it/s] 

Time taken to tokenize: 2.522179126739502

saving chunks @ 3100000th example mark...


100%|██████████| 2183/2183 [00:00<00:00, 3277.60it/s]
100%|██████████| 107009/107009 [00:32<00:00, 3325.67it/s]/s]
 39%|███▉      | 3105922/8013769 [24:00<2:31:28, 540.02it/s]

Time taken to save: 35.381640672683716


 40%|███▉      | 3197468/8013769 [24:03<02:12, 36352.16it/s] 

Time taken to tokenize: 2.782078742980957

saving chunks @ 3200000th example mark...


100%|██████████| 2201/2201 [00:00<00:00, 3157.17it/s]
100%|██████████| 107887/107887 [00:32<00:00, 3282.17it/s]/s]
 40%|███▉      | 3203414/8013769 [24:40<3:07:49, 426.85it/s]

Time taken to save: 36.36699557304382


 41%|████      | 3298127/8013769 [24:43<02:07, 36864.43it/s] 

Time taken to tokenize: 2.694697380065918

saving chunks @ 3300000th example mark...


100%|██████████| 2212/2212 [00:00<00:00, 3599.05it/s]
100%|██████████| 108433/108433 [00:32<00:00, 3301.99it/s]/s]
 41%|████      | 3302658/8013769 [25:20<3:23:36, 385.62it/s]

Time taken to save: 36.16498780250549


 42%|████▏     | 3397931/8013769 [25:23<02:06, 36526.92it/s] 

Time taken to tokenize: 2.720646381378174

saving chunks @ 3400000th example mark...


100%|██████████| 2192/2192 [00:00<00:00, 3289.03it/s]
100%|██████████| 107426/107426 [00:32<00:00, 3326.38it/s]/s]
 42%|████▏     | 3403326/8013769 [26:00<3:01:29, 423.39it/s]

Time taken to save: 35.70170545578003


 44%|████▎     | 3498055/8013769 [26:02<02:03, 36671.29it/s]

Time taken to tokenize: 2.796908378601074

saving chunks @ 3500000th example mark...


100%|██████████| 2211/2211 [00:00<00:00, 3212.23it/s]
100%|██████████| 108341/108341 [00:32<00:00, 3325.16it/s]/s]
 44%|████▎     | 3502788/8013769 [26:39<3:11:37, 392.33it/s]

Time taken to save: 36.0848662853241


 45%|████▍     | 3598467/8013769 [26:42<01:59, 37078.69it/s]

Time taken to tokenize: 2.6867377758026123

saving chunks @ 3600000th example mark...


100%|██████████| 2202/2202 [00:00<00:00, 3438.75it/s]
100%|██████████| 107942/107942 [00:32<00:00, 3302.80it/s]/s]
 45%|████▍     | 3603535/8013769 [27:19<2:57:42, 413.63it/s]

Time taken to save: 36.02655816078186


 46%|████▌     | 3698719/8013769 [27:22<01:57, 36595.15it/s]

Time taken to tokenize: 2.756847858428955

saving chunks @ 3700000th example mark...


100%|██████████| 2205/2205 [00:00<00:00, 2988.12it/s]
100%|██████████| 108048/108048 [00:32<00:00, 3354.14it/s]/s]
 46%|████▌     | 3702898/8013769 [27:59<3:07:40, 382.83it/s]

Time taken to save: 35.72539186477661


 47%|████▋     | 3798908/8013769 [28:01<01:54, 36830.43it/s]

Time taken to tokenize: 2.6971068382263184

saving chunks @ 3800000th example mark...


100%|██████████| 2193/2193 [00:00<00:00, 3315.01it/s]
100%|██████████| 107460/107460 [00:32<00:00, 3331.56it/s]/s]
 47%|████▋     | 3803184/8013769 [28:38<2:57:24, 395.57it/s]

Time taken to save: 35.63102149963379


 49%|████▊     | 3898055/8013769 [28:41<01:52, 36459.43it/s]

Time taken to tokenize: 2.809682607650757

saving chunks @ 3900000th example mark...


100%|██████████| 2211/2211 [00:00<00:00, 3086.92it/s]
100%|██████████| 108355/108355 [00:32<00:00, 3367.30it/s]/s]
 49%|████▊     | 3902594/8013769 [29:17<2:56:39, 387.85it/s]

Time taken to save: 35.72614312171936


 50%|████▉     | 3998331/8013769 [29:20<01:48, 37037.87it/s]

Time taken to tokenize: 2.7227861881256104

saving chunks @ 4000000th example mark...


100%|██████████| 2195/2195 [00:00<00:00, 2949.27it/s]
100%|██████████| 107598/107598 [00:32<00:00, 3336.04it/s]/s]
 50%|████▉     | 4003583/8013769 [29:56<2:37:10, 425.25it/s]

Time taken to save: 35.738200187683105


 51%|█████     | 4097543/8013769 [29:59<01:47, 36589.05it/s]

Time taken to tokenize: 2.7380263805389404

saving chunks @ 4100000th example mark...


100%|██████████| 2209/2209 [00:00<00:00, 3223.14it/s]
100%|██████████| 108251/108251 [00:32<00:00, 3333.70it/s]/s]
 51%|█████     | 4102622/8013769 [30:36<2:42:37, 400.84it/s]

Time taken to save: 35.91330933570862


 52%|█████▏    | 4199038/8013769 [30:39<01:42, 37117.37it/s]

Time taken to tokenize: 2.3793039321899414

saving chunks @ 4200000th example mark...


100%|██████████| 2218/2218 [00:00<00:00, 3016.69it/s]
100%|██████████| 108704/108704 [00:32<00:00, 3392.75it/s]/s]
 52%|█████▏    | 4206780/8013769 [31:15<1:46:36, 595.15it/s]

Time taken to save: 35.17500400543213


 54%|█████▎    | 4299205/8013769 [31:18<01:38, 37688.10it/s]

Time taken to tokenize: 2.3994855880737305

saving chunks @ 4300000th example mark...


100%|██████████| 2195/2195 [00:00<00:00, 3054.41it/s]
100%|██████████| 107600/107600 [00:31<00:00, 3461.55it/s]/s]
 54%|█████▎    | 4302793/8013769 [31:53<2:38:41, 389.75it/s]

Time taken to save: 34.22235083580017


 55%|█████▍    | 4398765/8013769 [33:16<01:38, 36748.23it/s]

Time taken to tokenize: 81.60665225982666

saving chunks @ 4400000th example mark...


100%|██████████| 2222/2222 [00:00<00:00, 3082.36it/s]
100%|██████████| 108883/108883 [00:32<00:00, 3310.86it/s]
 55%|█████▌    | 4407890/8013769 [33:51<4:52:28, 205.48it/s]

Time taken to save: 115.23376679420471


 56%|█████▌    | 4497794/8013769 [33:54<01:40, 34983.06it/s]

Time taken to tokenize: 2.4923439025878906

saving chunks @ 4500000th example mark...


100%|██████████| 2209/2209 [00:00<00:00, 3029.35it/s]
100%|██████████| 108284/108284 [00:33<00:00, 3256.89it/s]/s]
 56%|█████▌    | 4502858/8013769 [34:31<2:27:43, 396.11it/s]

Time taken to save: 36.48664093017578


 57%|█████▋    | 4599190/8013769 [34:34<01:33, 36682.50it/s]

Time taken to tokenize: 2.5848641395568848

saving chunks @ 4600000th example mark...


100%|██████████| 2209/2209 [00:00<00:00, 3144.88it/s]
100%|██████████| 108274/108274 [00:31<00:00, 3472.39it/s]/s]
 57%|█████▋    | 4602956/8013769 [35:09<2:26:30, 388.03it/s]

Time taken to save: 34.48715281486511


 59%|█████▊    | 4698791/8013769 [35:12<01:29, 37124.92it/s]

Time taken to tokenize: 2.815823793411255

saving chunks @ 4700000th example mark...


100%|██████████| 2204/2204 [00:00<00:00, 3324.61it/s]
100%|██████████| 108038/108038 [00:31<00:00, 3434.73it/s]/s]
 59%|█████▊    | 4702814/8013769 [35:48<2:22:32, 387.13it/s]

Time taken to save: 34.95064926147461


 60%|█████▉    | 4797409/8013769 [35:51<01:27, 36968.06it/s]

Time taken to tokenize: 2.8006811141967773

saving chunks @ 4800000th example mark...


100%|██████████| 2191/2191 [00:00<00:00, 3148.99it/s]
100%|██████████| 107370/107370 [00:31<00:00, 3437.40it/s]/s]
 60%|█████▉    | 4803395/8013769 [36:26<1:59:02, 449.48it/s]

Time taken to save: 34.74898433685303


 61%|██████    | 4899845/8013769 [36:29<01:24, 36674.30it/s]

Time taken to tokenize: 2.8246090412139893

saving chunks @ 4900000th example mark...


100%|██████████| 2219/2219 [00:00<00:00, 3194.74it/s]
100%|██████████| 108765/108765 [00:31<00:00, 3453.51it/s]/s]
 61%|██████    | 4902787/8013769 [37:05<2:26:25, 354.12it/s]

Time taken to save: 35.02985453605652


 62%|██████▏   | 4999970/8013769 [37:08<01:21, 37072.00it/s]

Time taken to tokenize: 2.719108819961548

saving chunks @ 5000000th example mark...


100%|██████████| 2193/2193 [00:00<00:00, 3411.45it/s]
100%|██████████| 107488/107488 [00:31<00:00, 3419.97it/s]/s]
 62%|██████▏   | 5003441/8013769 [37:44<2:11:16, 382.21it/s]

Time taken to save: 34.80859088897705


 64%|██████▎   | 5099927/8013769 [37:46<01:18, 37199.40it/s]

Time taken to tokenize: 2.8254265785217285

saving chunks @ 5100000th example mark...


100%|██████████| 2220/2220 [00:00<00:00, 3459.72it/s]
100%|██████████| 108783/108783 [00:31<00:00, 3410.06it/s]/s]
 64%|██████▎   | 5102881/8013769 [38:23<2:16:56, 354.25it/s]

Time taken to save: 35.38505291938782


 65%|██████▍   | 5198363/8013769 [38:25<01:15, 37171.68it/s]

Time taken to tokenize: 2.7836763858795166

saving chunks @ 5200000th example mark...


100%|██████████| 2213/2213 [00:00<00:00, 3607.00it/s]
100%|██████████| 108458/108458 [00:31<00:00, 3420.69it/s]/s]
 65%|██████▍   | 5203259/8013769 [39:02<1:52:57, 414.67it/s]

Time taken to save: 35.122440576553345


 66%|██████▌   | 5299741/8013769 [39:04<01:12, 37202.32it/s]

Time taken to tokenize: 2.8502707481384277

saving chunks @ 5300000th example mark...


100%|██████████| 2220/2220 [00:00<00:00, 3632.32it/s]
100%|██████████| 108808/108808 [00:32<00:00, 3400.03it/s]/s]
 66%|██████▌   | 5302553/8013769 [39:41<2:10:44, 345.64it/s]

Time taken to save: 35.48035740852356


 67%|██████▋   | 5398914/8013769 [39:43<01:09, 37548.29it/s]

Time taken to tokenize: 2.7801899909973145

saving chunks @ 5400000th example mark...


100%|██████████| 2184/2184 [00:00<00:00, 3628.53it/s]
100%|██████████| 107064/107064 [00:31<00:00, 3403.68it/s]/s]
 67%|██████▋   | 5403288/8013769 [40:19<1:47:13, 405.79it/s]

Time taken to save: 34.85517597198486


 69%|██████▊   | 5499298/8013769 [40:22<01:08, 36939.63it/s]

Time taken to tokenize: 2.7952518463134766

saving chunks @ 5500000th example mark...


100%|██████████| 2214/2214 [00:00<00:00, 3451.82it/s]
100%|██████████| 108487/108487 [00:31<00:00, 3497.22it/s]/s]
 69%|██████▊   | 5502655/8013769 [40:57<1:52:55, 370.62it/s]

Time taken to save: 34.47511267662048


 70%|██████▉   | 5598619/8013769 [41:00<01:03, 37880.12it/s]

Time taken to tokenize: 2.7575271129608154

saving chunks @ 5600000th example mark...


100%|██████████| 2204/2204 [00:00<00:00, 3142.43it/s]
100%|██████████| 108044/108044 [00:30<00:00, 3522.49it/s]/s]
 70%|██████▉   | 5603497/8013769 [41:35<1:32:51, 432.61it/s]

Time taken to save: 34.148969411849976


 71%|███████   | 5696622/8013769 [41:38<01:02, 37001.24it/s]

Time taken to tokenize: 2.501344680786133

saving chunks @ 5700000th example mark...


100%|██████████| 2216/2216 [00:00<00:00, 3451.89it/s]
100%|██████████| 108597/108597 [00:31<00:00, 3395.88it/s]/s]
 71%|███████   | 5706083/8013769 [42:14<1:01:23, 626.43it/s]

Time taken to save: 35.14048171043396


 72%|███████▏  | 5799086/8013769 [42:17<00:59, 37329.18it/s]

Time taken to tokenize: 2.4699251651763916

saving chunks @ 5800000th example mark...


100%|██████████| 2191/2191 [00:00<00:00, 3594.70it/s]
100%|██████████| 107393/107393 [00:31<00:00, 3421.90it/s]/s]
 72%|███████▏  | 5803410/8013769 [42:52<1:30:01, 409.19it/s]

Time taken to save: 34.48065948486328


 74%|███████▎  | 5899358/8013769 [42:55<00:57, 36844.09it/s]

Time taken to tokenize: 2.8689451217651367

saving chunks @ 5900000th example mark...


100%|██████████| 2210/2210 [00:00<00:00, 3284.88it/s]
100%|██████████| 108310/108310 [00:31<00:00, 3484.25it/s]/s]
 74%|███████▎  | 5902594/8013769 [43:30<1:36:03, 366.27it/s]

Time taken to save: 34.64538311958313


 75%|███████▍  | 5998095/8013769 [43:33<00:54, 37275.76it/s]

Time taken to tokenize: 2.8160552978515625

saving chunks @ 6000000th example mark...


100%|██████████| 2196/2196 [00:00<00:00, 3248.92it/s]
100%|██████████| 107640/107640 [00:31<00:00, 3435.99it/s]/s]
 75%|███████▍  | 6003497/8013769 [44:09<1:16:14, 439.45it/s]

Time taken to save: 34.83650541305542


 76%|███████▌  | 6099575/8013769 [44:11<00:51, 37066.83it/s]

Time taken to tokenize: 2.911435604095459

saving chunks @ 6100000th example mark...


100%|██████████| 2195/2195 [00:00<00:00, 3505.90it/s]
100%|██████████| 107567/107567 [00:31<00:00, 3453.99it/s]/s]
 76%|███████▌  | 6102733/8013769 [44:47<1:26:28, 368.33it/s]

Time taken to save: 34.701364040374756


 77%|███████▋  | 6197821/8013769 [44:49<00:48, 37216.52it/s]

Time taken to tokenize: 2.8570754528045654

saving chunks @ 6200000th example mark...


100%|██████████| 2199/2199 [00:00<00:00, 3362.07it/s]
100%|██████████| 107785/107785 [00:31<00:00, 3416.14it/s]/s]
 77%|███████▋  | 6203708/8013769 [45:25<1:07:20, 447.93it/s]

Time taken to save: 35.08004069328308


 79%|███████▊  | 6299863/8013769 [45:28<00:46, 37064.85it/s]

Time taken to tokenize: 2.794567346572876

saving chunks @ 6300000th example mark...


100%|██████████| 2198/2198 [00:00<00:00, 3572.61it/s]
100%|██████████| 107745/107745 [00:31<00:00, 3392.25it/s]/s]
 79%|███████▊  | 6302871/8013769 [46:04<1:19:48, 357.32it/s]

Time taken to save: 35.190139293670654


 80%|███████▉  | 6396664/8013769 [47:27<00:43, 37159.84it/s]

Time taken to tokenize: 81.3326678276062

saving chunks @ 6400000th example mark...


100%|██████████| 2195/2195 [00:00<00:00, 3528.06it/s]
100%|██████████| 107566/107566 [00:31<00:00, 3452.59it/s]
 80%|███████▉  | 6408376/8013769 [48:01<1:54:34, 233.53it/s]

Time taken to save: 113.12678027153015


 81%|████████  | 6498176/8013769 [48:03<00:44, 34125.87it/s]

Time taken to tokenize: 2.468920946121216

saving chunks @ 6500000th example mark...


100%|██████████| 2213/2213 [00:00<00:00, 3157.77it/s]
100%|██████████| 108451/108451 [00:32<00:00, 3352.34it/s]/s]
 81%|████████  | 6502871/8013769 [48:40<1:03:03, 399.30it/s]

Time taken to save: 35.538190603256226


 82%|████████▏ | 6597622/8013769 [48:42<00:39, 36099.59it/s]

Time taken to tokenize: 2.5922133922576904

saving chunks @ 6600000th example mark...


100%|██████████| 2189/2189 [00:00<00:00, 3551.57it/s]
100%|██████████| 107265/107265 [00:31<00:00, 3452.31it/s]/s]
 82%|████████▏ | 6607217/8013769 [49:17<35:13, 665.50it/s]  

Time taken to save: 34.30036926269531


 84%|████████▎ | 6699587/8013769 [49:20<00:35, 37177.15it/s]

Time taken to tokenize: 2.8370492458343506

saving chunks @ 6700000th example mark...


100%|██████████| 2181/2181 [00:00<00:00, 3059.94it/s]
100%|██████████| 106870/106870 [00:30<00:00, 3460.04it/s]/s]
 84%|████████▎ | 6702864/8013769 [49:55<58:00, 376.66it/s]  

Time taken to save: 34.45401191711426


 85%|████████▍ | 6799049/8013769 [49:58<00:32, 37083.95it/s]

Time taken to tokenize: 3.0644898414611816

saving chunks @ 6800000th example mark...


100%|██████████| 2196/2196 [00:00<00:00, 3379.43it/s]
100%|██████████| 107649/107649 [00:31<00:00, 3418.23it/s]/s]
 85%|████████▍ | 6807163/8013769 [50:34<33:10, 606.12it/s]  

Time taken to save: 35.225547790527344


 86%|████████▌ | 6899425/8013769 [50:37<00:29, 37445.53it/s]

Time taken to tokenize: 2.9567136764526367

saving chunks @ 6900000th example mark...


100%|██████████| 2204/2204 [00:00<00:00, 3586.33it/s]
100%|██████████| 108028/108028 [00:31<00:00, 3472.22it/s]/s]
 86%|████████▌ | 6902764/8013769 [51:12<49:42, 372.54it/s]  

Time taken to save: 34.70098519325256


 87%|████████▋ | 6996274/8013769 [51:15<00:27, 37211.40it/s]

Time taken to tokenize: 2.724414825439453

saving chunks @ 7000000th example mark...


100%|██████████| 2193/2193 [00:00<00:00, 3133.54it/s]
100%|██████████| 107471/107471 [00:31<00:00, 3417.62it/s]/s]
 87%|████████▋ | 7003720/8013769 [51:51<33:49, 497.75it/s]  

Time taken to save: 34.888914585113525


 89%|████████▊ | 7096307/8013769 [51:53<00:24, 37154.50it/s]

Time taken to tokenize: 2.8196020126342773

saving chunks @ 7100000th example mark...


100%|██████████| 2193/2193 [00:00<00:00, 3379.66it/s]
100%|██████████| 107458/107458 [00:30<00:00, 3514.48it/s]/s]
 89%|████████▊ | 7102803/8013769 [52:28<32:27, 467.76it/s]  

Time taken to save: 34.06209969520569


 90%|████████▉ | 7199037/8013769 [52:31<00:21, 37383.34it/s]

Time taken to tokenize: 2.7814478874206543

saving chunks @ 7200000th example mark...


100%|██████████| 2194/2194 [00:00<00:00, 3240.73it/s]
100%|██████████| 107533/107533 [00:30<00:00, 3517.59it/s]/s]
 90%|████████▉ | 7203765/8013769 [53:06<31:22, 430.23it/s]  

Time taken to save: 34.04829668998718


 91%|█████████ | 7296221/8013769 [53:09<00:19, 37541.69it/s]

Time taken to tokenize: 2.885383129119873

saving chunks @ 7300000th example mark...


100%|██████████| 2203/2203 [00:00<00:00, 3602.83it/s]
100%|██████████| 107954/107954 [00:31<00:00, 3475.90it/s]/s]
 91%|█████████ | 7302751/8013769 [53:44<25:36, 462.74it/s]  

Time taken to save: 34.57092905044556


 92%|█████████▏| 7396941/8013769 [53:47<00:16, 37678.62it/s]

Time taken to tokenize: 2.695134162902832

saving chunks @ 7400000th example mark...


100%|██████████| 2184/2184 [00:00<00:00, 3368.04it/s]
100%|██████████| 107056/107056 [00:30<00:00, 3474.14it/s]/s]
 92%|█████████▏| 7403647/8013769 [54:22<20:57, 485.09it/s]  

Time taken to save: 34.17591714859009


 94%|█████████▎| 7499598/8013769 [54:25<00:13, 37065.58it/s]

Time taken to tokenize: 2.800828456878662

saving chunks @ 7500000th example mark...


100%|██████████| 2191/2191 [00:00<00:00, 3185.24it/s]
100%|██████████| 107392/107392 [00:31<00:00, 3412.11it/s]/s]
 94%|█████████▎| 7502844/8013769 [55:01<23:18, 365.36it/s]  

Time taken to save: 34.98054313659668


 95%|█████████▍| 7597663/8013769 [55:04<00:11, 36608.27it/s]

Time taken to tokenize: 2.7611470222473145

saving chunks @ 7600000th example mark...


100%|██████████| 2198/2198 [00:00<00:00, 3140.93it/s]
100%|██████████| 107720/107720 [00:31<00:00, 3458.13it/s]/s]
 95%|█████████▍| 7607234/8013769 [55:39<10:20, 655.13it/s]  

Time taken to save: 34.630077838897705


 96%|█████████▌| 7699318/8013769 [55:42<00:08, 37106.40it/s]

Time taken to tokenize: 2.803788185119629

saving chunks @ 7700000th example mark...


100%|██████████| 2212/2212 [00:00<00:00, 3641.71it/s]
100%|██████████| 108416/108416 [00:30<00:00, 3519.93it/s]/s]
 96%|█████████▌| 7700000/8013769 [56:17<19:31, 267.76it/s]  

Time taken to save: 34.22974944114685


 97%|█████████▋| 7797294/8013769 [56:20<00:05, 37142.03it/s]

Time taken to tokenize: 2.855628490447998

saving chunks @ 7800000th example mark...


100%|██████████| 2213/2213 [00:00<00:00, 3649.53it/s]
100%|██████████| 108482/108482 [00:31<00:00, 3410.48it/s]/s]
 97%|█████████▋| 7806930/8013769 [56:56<05:22, 641.91it/s]  

Time taken to save: 35.287567138671875


 99%|█████████▊| 7899446/8013769 [56:59<00:03, 36671.73it/s]

Time taken to tokenize: 2.8178677558898926

saving chunks @ 7900000th example mark...


100%|██████████| 2193/2193 [00:00<00:00, 3507.93it/s]
100%|██████████| 107458/107458 [00:30<00:00, 3577.52it/s]/s]
 99%|█████████▊| 7908224/8013769 [57:33<02:34, 683.35it/s]  

Time taken to save: 33.49688220024109


100%|█████████▉| 7997733/8013769 [57:36<00:00, 36942.20it/s]

Time taken to tokenize: 2.4149179458618164

saving chunks @ 8000000th example mark...


100%|██████████| 2205/2205 [00:00<00:00, 3375.42it/s]
100%|██████████| 108089/108089 [00:31<00:00, 3444.99it/s]/s]
100%|█████████▉| 8007351/8013769 [58:11<00:09, 668.11it/s]  

Time taken to save: 34.462011098861694


100%|█████████▉| 8011000/8013769 [58:11<00:02, 957.70it/s]


saving chunks @ 8013769th example mark...


100%|██████████| 301/301 [00:00<00:00, 3661.54it/s]
100%|██████████| 14771/14771 [00:04<00:00, 3457.95it/s]
100%|█████████▉| 8013768/8013769 [58:16<00:00, 2291.79it/s]


In [8]:
tokens

(0,
 [13924,
  12,
  559,
  12,
  35784,
  11,
  25051,
  357,
  18474,
  8,
  1377,
  45591,
  4970,
  11,
  1319,
  44556,
  287,
  2356,
  290,
  44787,
  379,
  1204,
  11,
  7342,
  7519,
  290,
  20669,
  2513,
  1497,
  422,
  257,
  2214,
  4436,
  3217,
  1755,
  706,
  257,
  21402,
  3315,
  1074,
  23724,
  262,
  1989,
  11,
  2282,
  340,
  373,
  5213,
  546,
  2324,
  13,
  198,
  198,
  464,
  2551,
  1364,
  8100,
  5953,
  8366,
  34428,
  298,
  2986,
  33708,
  42095,
  355,
  262,
  691,
  6253,
  379,
  262,
  4436,
  284,
  651,
  262,
  3871,
  832,
  262,
  1755,
  13,
  198,
  198,
  18474,
  7317,
  2098,
  11,
  1912,
  319,
  10275,
  351,
  617,
  286,
  262,
  7519,
  11,
  326,
  262,
  1578,
  7973,
  6149,
  262,
  21402,
  3274,
  22225,
  290,
  7929,
  4816,
  284,
  36316,
  13,
  2102,
  11,
  21402,
  5953,
  36831,
  2269,
  861,
  402,
  2926,
  82,
  11,
  257,
  6253,
  508,
  373,
  379,
  262,
  4436,
  351,
  3126,
  21402,
  3315,
  8213