In [214]:
from pathlib import Path

import datasets
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from tqdm import tqdm

In [199]:
# Load the 'train' split of the activations dataset.
ds = datasets.load_dataset("naraca/mi-dataset-activaciones-llama3_2")['train']
# The underlying data is fp16, stored as fp32. The 'numpy' format keeps it as
# fp32 in memory; the default format would imply fp64 (Python's default float).
# Caveat: with map(batched=True) this yields 1-axis np arrays of 2-axis np
# arrays instead of a single 3-axis np array. Not ideal, but better than the
# alternatives.
ds.set_format('numpy')
ds

Dataset({
    features: ['activacion'],
    num_rows: 1000
})

In [200]:
def explode(batch):
    """Flatten a batch of matrices into a single list of their rows.

    Args:
        batch: mapping whose 'activacion' entry is an iterable of matrices
            (each matrix is itself an iterable of rows).

    Returns:
        A dict with one key, 'activacion', holding every row of every matrix
        in order (matrix order first, then row order within each matrix).
    """
    rows = []
    for matrix in batch['activacion']:
        # Iterating a matrix yields its rows one at a time.
        rows.extend(matrix)
    return {'activacion': rows}

In [202]:
# Turn each stored matrix into one dataset row per activation vector.
# The output has a different number of rows than the input, so the original
# 'activacion' column is removed and replaced by the one explode() returns.
ds = ds.map(
    function=explode,
    remove_columns=['activacion'],
    batched=True,
    batch_size=10,
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [203]:
# Shard sizing: how many activation vectors fit in 0.5 GiB of raw fp16 data.
d_model = 2048  # length of one activation vector
bytes_per_float = 2  # llama uses fp16

bytes_per_GibiByte = 1 << 30
desired_size_in_bytes = bytes_per_GibiByte // 2
activation_vectors_per_file = desired_size_in_bytes // (bytes_per_float * d_model)

# Regroup the dataset so that each row holds one file's worth of vectors.
ds = ds.batch(batch_size=activation_vectors_per_file)

Batching examples:   0%|          | 0/668501 [00:00<?, ? examples/s]

In [204]:
# Sanity check: the per-file byte budget (half a GiB).
desired_size_in_bytes

536870912

In [None]:
# Write every batch of activation vectors to its own zstd-compressed Parquet
# shard, storing each vector as a fixed-size list of d_model float16 values.
output_dir = Path("/workspace/local-dataset/data")
# pq.write_table does not create missing parent directories.
output_dir.mkdir(parents=True, exist_ok=True)

for i, arr in enumerate(tqdm(ds)):
    # tried to convert to float16 in explode() but here these activations were
    # detected as float32, so the cast to fp16 happens here at write time.
    flat = pa.array(arr['activacion'].astype(np.float16).ravel(), type=pa.float16())
    # Use d_model rather than a repeated literal so the list width always
    # agrees with the value used to size the shards.
    vec_col = pa.FixedSizeListArray.from_arrays(flat, d_model)
    table = pa.table({'activations': vec_col})
    pq.write_table(
        table,
        str(output_dir / f"activations{i}.parquet"),
        compression="zstd",
        data_page_size=1024*1024,
        row_group_size=8192,
    )

100%|██████████| 6/6 [01:13<00:00, 12.21s/it]
