In [20]:
import os
import numpy as np
from cached_path import cached_path

from olmo.config import TrainConfig
from olmo.data import build_memmap_dataset
from olmo.tokenizer import Tokenizer

---

In [11]:
def get_batch_instances(
    dataset: np.memmap, global_indices: np.memmap, batch_size: int, batch_idx: int
) -> np.ndarray:
    """Collect the token IDs of every instance belonging to one batch.

    Batch ``batch_idx`` covers positions
    ``[batch_idx * batch_size, (batch_idx + 1) * batch_size)`` of
    ``global_indices``; every value found there is used to index ``dataset``.

    Args:
        dataset: Indexable collection whose items expose their token IDs under
            the ``"input_ids"`` key. (Annotated as ``np.memmap``, but only
            ``dataset[i]["input_ids"]`` is actually required.)
        global_indices: Array of dataset indices; a batch is a contiguous
            slice of this array.
        batch_size: Number of instances per batch; must be at least 1.
        batch_idx: Zero-based index of the batch to fetch; must not be negative.

    Returns:
        The result of ``np.array`` applied to the list of per-instance token
        IDs (one entry per instance, in slice order). If the slice runs past
        the end of ``global_indices`` the result has fewer than ``batch_size``
        entries, possibly none.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 or ``batch_idx`` is
            negative.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if batch_idx < 0:
        # A negative index would turn into a negative slice and silently
        # select the wrong instances (or none at all for batch_idx == -1).
        raise ValueError(f"batch_idx must be >= 0, got {batch_idx}")

    batch_start = batch_idx * batch_size
    batch_indices = global_indices[batch_start : batch_start + batch_size]
    return np.array([dataset[index]["input_ids"] for index in batch_indices])


---

In [12]:
# Specify the path to the data order file and the train config file.
# The checkpoint directory can be overridden with the OLMO_CHECKPOINT_DIR
# environment variable so the notebook also runs outside the original cluster;
# the original location is kept as the default.
checkpoint_dir = os.environ.get(
    "OLMO_CHECKPOINT_DIR",
    "/network/scratch/m/marius.mosbach/olmo/checkpoints/slim_pajama/OLMo-136M-84M-9ec76061bd866c49",
)
data_order_file_path = os.path.join(checkpoint_dir, "train_data", "global_indices.npy")
train_config_path = os.path.join(checkpoint_dir, "config.yaml")

In [16]:
# load the train config
cfg = TrainConfig.load(train_config_path)

# build the memmap dataset
dataset = build_memmap_dataset(cfg, cfg.data)

# map the data order file as raw uint32 values.
# Opened read-only ("r"): this notebook only inspects the data order, and a
# writable mapping ("r+") could modify the checkpoint's file by accident and
# fails on read-only storage.
global_indices = np.memmap(data_order_file_path, mode="r", dtype=np.uint32)

In [18]:
# fetch the token IDs of every instance in the first batch (batch index 0)
batch_idx = 0
batch_instances = get_batch_instances(
    dataset=dataset,
    global_indices=global_indices,
    batch_size=cfg.global_train_batch_size,
    batch_idx=batch_idx,
)
batch_instances.shape

(1024, 1024)

In [22]:
# load the tokenizer from the identifier stored in the train config
tokenizer_file = cfg.tokenizer.identifier
tokenizer = Tokenizer.from_file(tokenizer_file)

In [27]:
# turn each token-ID sequence of the batch back into text
decoded_batch = list(map(tokenizer.decode, batch_instances))
print(len(decoded_batch))

1024


In [30]:
# show the decoded text of the first sequence in the batch
# note: a sequence may span several documents, separated by '<|endoftext|>'
# note: a "document" is one individual instance of the source dataset
first_sequence_text = decoded_batch[0]
print(first_sequence_text)

 headers: { Accept: 'application/octet-stream' },
          parser: proc { |body, _|
                    if body.encoding == Encoding::ASCII_8BIT # binary response
                      ::Gitlab::FileResponse.new StringIO.new(body, 'rb+')
                    else # error with json response
                      ::Gitlab::Request.parse(body)
                    end
                  })
    end

    # Gets a list of builds for specific commit in a project.
    #
    # @example
    #   Gitlab.commit_builds(5, 'asdf')
    #   Gitlab.commit_builds(5, 'asdf', { per_page: 10, page: 2 })
    #
    # @param  [Integer, String] project The ID or name of a project.
    # @param  [String] sha The SHA checksum of a commit.
    # @param  [Hash] options A customizable set of options.
    # @option options [Integer] :page The page number.
    # @option options [Integer] :per_page The number of results per page.
    # @return [Array<Gitlab::ObjectifiedHash>] The list of builds.
    def commit_builds(pro

In [38]:
# map every token ID of every sequence to its token string
individual_tokens = [
    [tokenizer.base_tokenizer.id_to_token(token_id) for token_id in sequence]
    for sequence in batch_instances
]
# number of sequences, then number of tokens in the first sequence
print(len(individual_tokens), len(individual_tokens[0]), sep="\n")


1024
1024


In [46]:
# check whether the first sequence consists of multiple documents:
# collect the position of every end-of-text token in a single pass
eos_token = '<|endoftext|>'
first_sequence_tokens = individual_tokens[0]
document_boundaries = [i for i, token in enumerate(first_sequence_tokens) if token == eos_token]
print(document_boundaries)

# get the first document: everything before the first boundary, or the whole
# sequence when it contains no end-of-text token (so `first_document` is
# always defined for later cells)
if document_boundaries:
    first_document = first_sequence_tokens[:document_boundaries[0]]
else:
    first_document = first_sequence_tokens
print(first_document)


[615]
['Ġheaders', ':', 'Ġ{', 'ĠAccept', ':', "Ġ'", 'application', '/octet', '-stream', "'", 'Ġ},Ċ', 'ĠĠĠĠĠĠĠĠĠ', 'Ġparser', ':', 'Ġproc', 'Ġ{', 'Ġ|', 'body', ',', 'Ġ_', '|Ċ', 'ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ', 'Ġif', 'Ġbody', '.encoding', 'Ġ==', 'ĠEncoding', '::', 'ASCII', '_', '8', 'BIT', 'Ġ#', 'Ġbinary', 'Ġresponse', 'Ċ', 'ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ', 'Ġ::', 'Git', 'lab', '::', 'File', 'Response', '.new', 'ĠStringIO', '.new', '(body', ',', "Ġ'", 'rb', '+', "')Ċ", 'ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ', 'Ġelse', 'Ġ#', 'Ġerror', 'Ġwith', 'Ġjson', 'Ġresponse', 'Ċ', 'ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ', 'Ġ::', 'Git', 'lab', '::', 'Request', '.parse', '(body', ')Ċ', 'ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ', 'Ġend', 'Ċ', 'ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ', 'Ġ})Ċ', 'ĠĠĠ', 'Ġend', 'ĊĊ', 'ĠĠĠ', 'Ġ#', 'ĠGets', 'Ġa', 'Ġlist', 'Ġof', 'Ġbuilds', 'Ġfor', 'Ġspecific', 'Ġcommit', 'Ġin', 'Ġa', 'Ġproject', '.Ċ', 'ĠĠĠ', 'Ġ#Ċ', 'ĠĠĠ', 'Ġ#', 'Ġ@', 'example', 'Ċ', 'ĠĠĠ', 'Ġ#', 'ĠĠ', 'ĠGit', 'lab', '.commit', '_build', 's', '(', '5', ',', "Ġ'", 'asdf', "')Ċ", 'ĠĠĠ', 'Ġ#', 'ĠĠ', 'ĠGit'