In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from simple_ml.tokenizers import bpe
import json
from pathlib import Path
from tqdm.auto import tqdm
import concurrent.futures
import orjson

In [3]:
# Folder that is searched for `*.jsonl` files further down. The path is
# relative, so it resolves against the kernel's current working directory.
datafolder = Path('../../data/arxiv/')

In [4]:
def process_file(filename):
    """Read one JSONL file and collect the ``abstract`` field of each record.

    Parameters
    ----------
    filename : str or path-like
        Path to a file holding one JSON document per line.

    Returns
    -------
    tuple[int, list[str]]
        ``(line_count, abstracts)``. ``line_count`` is the number of lines
        read, including lines that were skipped. ``abstracts`` has one entry
        per line that parsed to a JSON object; the entry is ``''`` when the
        record has no ``abstract`` key or its value is not a string.
    """
    local_abstracts = []
    local_count = 0
    with open(filename, 'rb') as f:
        for line in f:
            local_count += 1
            try:
                obj = orjson.loads(line)
            except orjson.JSONDecodeError:
                # Malformed line: skip it, but it still counts as a line read.
                continue
            if not isinstance(obj, dict):
                # Valid JSON that is not an object (list, number, null, ...)
                # has no fields; calling .get() on it would raise
                # AttributeError, so treat it like a malformed line.
                continue
            abstract = obj.get('abstract', '')
            # orjson already decodes JSON strings to `str`, so no bytes
            # handling is needed. Any other value (e.g. JSON null) is replaced
            # by '' so callers can safely join the returned strings.
            local_abstracts.append(abstract if isinstance(abstract, str) else '')
    return local_count, local_abstracts

In [5]:
# Collect all JSONL files. Directory listing order is not guaranteed, so sort
# the paths: the processing order (and therefore the order of the collected
# abstracts) is then the same on every run and machine.
files = sorted(datafolder.glob('*.jsonl'))

In [6]:
# Start from empty accumulators; the aggregation step fills them in.
abstracts = []
n_samples = 0

# Use ThreadPoolExecutor for I/O-bound tasks (at most 10 worker threads).
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    # executor.map yields per-file results in the order of `files`;
    # wrapping it in tqdm shows a progress bar while they are drained.
    results = [
        outcome
        for outcome in tqdm(
            executor.map(process_file, files),
            total=len(files),
            desc="Processing files",
        )
    ]

Processing files:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
# Aggregate results.
# Both totals are rebuilt from `results` instead of being accumulated in place,
# so re-running this cell cannot double-count lines or duplicate abstracts.
n_samples = sum(count for count, _ in results)
abstracts = [
    abstract
    for _, abstracts_list in results
    for abstract in abstracts_list
]

In [8]:
# Join all abstracts into a single string.
# NOTE(review): the separator is empty, so the end of one abstract runs
# straight into the start of the next unless each abstract already carries
# its own leading/trailing whitespace — confirm this is intended.
s = ''.join(abstracts)

In [16]:
# Tokenizer with a target vocabulary of 1024 entries.
# Presumably 256 raw-byte tokens plus 768 learned merges, which would match the
# 768-step "Training BPE" progress bar — confirm against simple_ml.tokenizers.bpe.
tokenizer = bpe.BasicTokenizer(vocab_size=1024)

In [17]:
# Train on only a prefix of the corpus to avoid memory overflow.
# The prefix length (in characters) is a named constant so it can be tuned in
# one place instead of being buried in the slice.
TRAIN_CHARS = 10_000
tokenizer.train(s[:TRAIN_CHARS])

Training BPE:   0%|          | 0/768 [00:00<?, ?it/s]

In [6]:
# Short Devanagari sample used for the byte-level experiments below.
# NOTE: this rebinds `s`, which earlier held the joined abstracts.
s = "नमस्ते"

In [7]:
# UTF-8 encode the sample and unpack the bytes into a list of ints (0-255).
a = [*s.encode('utf-8')]

In [8]:
# Base vocabulary: every byte value 0-255 maps to its one-byte `bytes` object.
vocab = dict((byte_id, bytes((byte_id,))) for byte_id in range(256))

In [9]:
# First three byte ids; the following cell decodes exactly these three bytes
# back into the first character of the sample.
a[:3]

[224, 164, 168]

In [10]:
# Look up the first three byte ids in the vocab, concatenate the resulting
# byte strings, and decode them as UTF-8.
b"".join(map(vocab.get, a[:3])).decode('utf-8')

'न'

In [11]:
# Drop the last six byte ids, rebuild the byte string through the vocab, and
# decode it; errors='replace' substitutes U+FFFD for any invalid byte sequence
# instead of raising.
b"".join(map(vocab.get, a[:-6])).decode('utf-8', errors='replace')

'नमस्'