In [1]:
import os

import numpy as np
from transformers import AutoTokenizer
import pandas as pd

from olmo.data import build_memmap_dataset
from olmo.config import TrainConfig
from olmo.tokenizer import Tokenizer

In [2]:
# Read the training configuration from disk and keep the model's maximum
# sequence length handy for the cells below.
config_path = "/home/mila/m/marius.mosbach/projects/olmo/configs/slim_pajama/OLMo-59M.yaml"
training_config = TrainConfig.load(config_path)
model_config = training_config.model
seq_length = model_config.max_sequence_length

In [None]:
# Instantiate the tokenizer from the identifier stored in the training config.
# `Tokenizer.from_file` is used, so the identifier is treated as a file path.
tokenizer_config = training_config.tokenizer
tokenizer_name = tokenizer_config.identifier
print("Loading tokenizer:", tokenizer_name)
tokenizer = Tokenizer.from_file(tokenizer_name)

In [4]:
# Construct the memmap dataset for the training split (`training_config.data`).
train_data_config = training_config.data
dataset = build_memmap_dataset(training_config, train_data_config)

In [5]:
# Disabled alternative: build one memmap dataset per "lm" evaluator listed in
# the training config and collect them in `validation_datasets`.
# NOTE(review): if re-enabled, the loop rebinds `dataset`, overwriting the
# training dataset built in the previous cell.
# # Build a memmap dataset from the validation data
# validation_datasets = []
# for evaluator in training_config.evaluators:
#     if evaluator.type == "lm":
#         evaluator.paths = None
#         dataset = build_memmap_dataset(training_config, evaluator.data)
#         validation_datasets.append(dataset)

In [None]:
# Disabled: switch the cells below to the first validation dataset instead of
# the training dataset (requires the validation cell above to be re-enabled).
# dataset = validation_datasets[0]

In [None]:
# Total token count of the dataset, reported in billions, millions and thousands.
# Assumes every dataset item holds exactly `max_sequence_length` tokens.
tokens_per_sequence = training_config.model.max_sequence_length
num_tokens = len(dataset) * tokens_per_sequence

num_tokens_billions = num_tokens / 1e9
num_tokens_millions = num_tokens / 1e6
num_tokens_thousands = num_tokens / 1e3

for scaled_count, unit_suffix in (
    (num_tokens_billions, "B"),
    (num_tokens_millions, "M"),
    (num_tokens_thousands, "k"),
):
    print(f"Number of tokens in the dataset: {scaled_count:.2f}{unit_suffix}")

In [None]:
# Compute the number of training steps needed for one pass over `num_tokens`.
#
# `num_tokens` comes from the token-count cell above; run that cell first.
# To estimate steps for a different token budget, assign `num_tokens` here
# before the computation (the former `num_tokens = num_tokens` line was a no-op).
num_tokens_billions = num_tokens / 1e9
print(f"Number of tokens in the dataset: {num_tokens_billions:.2f}B")
num_tokens_millions = num_tokens / 1e6
print(f"Number of tokens in the dataset: {num_tokens_millions:.2f}M")

# Sequence length and batch size default to the values in the training config;
# uncomment the overrides to explore other settings.
max_sequence_length = training_config.model.max_sequence_length
# max_sequence_length = 512
batch_size = training_config.global_train_batch_size
# batch_size = 200

# Both divisions floor, so a trailing partial chunk and a trailing partial
# batch are not counted.
num_chunks = num_tokens // max_sequence_length  # number of chunks of sequence_length tokens
num_steps = num_chunks // batch_size  # how many batches we need to iterate over the chunks
print(f"Number of steps: {num_steps}")

In [None]:
# Fetch the first dataset item, show it, and keep its token ids for decoding.
first_index = 0
example = dataset[first_index]
print(example)
input_ids = example["input_ids"]
print(input_ids.shape)

In [None]:
# Peek at the first item produced by iterating over the dataset: show which
# keys it carries and its first ten token ids, then stop.
for example in dataset:
    available_keys = example.keys()
    print(available_keys)
    leading_ids = example["input_ids"][:10]
    print(leading_ids)
    break

In [None]:
# Decode the token ids of the example back into text.
# `.tolist()` hands the tokenizer plain Python ints in one call; `list(...)`
# over an array/tensor would instead produce one scalar object per element.
example_text = tokenizer.decode(input_ids.tolist())
print(example_text)

---

In [70]:
# Open a tokenized Dolma shard as a read-only memory map so it can be decoded.
# NOTE(review): the file is read as a flat array of uint32 values with no
# header, despite the .npy extension — confirm this matches how it was written.

tokenizer = AutoTokenizer.from_pretrained("allenai/dolma2-tokenizer")
path = '/network/scratch/m/marius.mosbach/olmo/training_data/olmo2/slim_pajama/part-0-00000.npy'

token_dtype = np.dtype('uint32')
size = os.path.getsize(path)
# Number of whole uint32 entries that fit in the file (4 bytes each).
num_file_tokens = size // token_dtype.itemsize
data = np.memmap(path, dtype=token_dtype, mode='r', shape=(num_file_tokens,))

In [None]:
# Sanity check: decode the first 100 entries of the memory-mapped token file.
leading_tokens = data[:100]
print(tokenizer.decode(leading_tokens))

In [None]:
# Load the gzip-compressed CSV that sits next to the token file.
# The file is read without a header row, so column labels are supplied here.
# NOTE(review): the labels are assumptions about the file layout; the third
# column's meaning is not identified — verify against the tool that wrote it.
path = "/network/scratch/m/marius.mosbach/olmo/training_data/olmo2/slim_pajama/part-0-00000.csv.gz"
columns = ["start", "end", "unknown", "path", "id"]  # fixed typo: was "unkown"
df = pd.read_csv(path, compression='gzip', names=columns)
df.head()

In [None]:
# Show the rows whose `id` column equals 1.
id_is_one = df["id"] == 1
df[id_is_one]

In [None]:
# Decode a single token span from the memory-mapped data.
# presumably these offsets are a start/end pair taken from the metadata table — verify
span_start, span_end = 233290, 234051
print(tokenizer.decode(data[span_start:span_end]))