In [1]:
import mp
import os
import pyarrow as pa
import numpy as np
from streaming import MDSWriter
from tqdm import tqdm

In [2]:
from streaming.base.format.mds.encodings import Encoding, _encodings

class Int32(Encoding):
    """MDS column encoding that stores an array as raw ``int32`` bytes.

    ``encode`` and ``decode`` are inverses only for one-dimensional data:
    the array shape is not stored, so ``decode`` always returns a flat array.
    """

    def encode(self, obj) -> bytes:
        """Serialize ``obj`` to the raw bytes of an ``int32`` array.

        Args:
            obj: A NumPy array or any array-like (e.g. a list of ints).

        Returns:
            The C-order byte representation of ``obj`` viewed as ``np.int32``.
        """
        # Coerce to int32 so the bytes written always match what decode() reads.
        # Calling obj.tobytes() directly would emit 8-byte items for an int64
        # array (decoded as garbage) and would fail outright on a plain list.
        # For an array that is already int32 this is a no-op.
        # NOTE: values are converted with NumPy's default casting, so callers
        # remain responsible for making sure they fit in 32 bits.
        return np.asarray(obj, dtype=np.int32).tobytes()

    def decode(self, data: bytes):
        """Reinterpret ``data`` as a flat ``np.int32`` array.

        Args:
            data: Bytes previously produced by ``encode``.

        Returns:
            A one-dimensional ``np.int32`` array backed by ``data``.
        """
        return np.frombuffer(data, dtype=np.int32)


# Register the encoding under the name used in the ``columns`` schema.
# NOTE(review): presumably this must also be executed in any process that later
# reads the dataset, since the registry is only patched in memory - confirm.
_encodings['int32'] = Int32

In [3]:
# Settings passed to every MDSWriter created by the conversion workers.

# Sample field name -> encoding name; 'int32' is the custom encoding
# registered in `_encodings` earlier in this notebook.
columns = dict(input_ids='int32')

# Compression setting forwarded to MDSWriter.
compression = 'zstd'

# Hash names forwarded to MDSWriter.
hashes = ('sha1', 'xxh64')

In [6]:
from glob import glob

# Gather the tokenized shards from the current working directory in sorted
# (lexicographic) order; the trailing bare expression displays the list.
files = sorted(glob('combine-lm_*_of_00030.jsonl-tokenized'))
files

['combine-lm_00000_of_00030.jsonl-tokenized',
 'combine-lm_00001_of_00030.jsonl-tokenized',
 'combine-lm_00002_of_00030.jsonl-tokenized',
 'combine-lm_00003_of_00030.jsonl-tokenized',
 'combine-lm_00004_of_00030.jsonl-tokenized',
 'combine-lm_00005_of_00030.jsonl-tokenized',
 'combine-lm_00006_of_00030.jsonl-tokenized',
 'combine-lm_00007_of_00030.jsonl-tokenized',
 'combine-lm_00008_of_00030.jsonl-tokenized',
 'combine-lm_00009_of_00030.jsonl-tokenized',
 'combine-lm_00010_of_00030.jsonl-tokenized',
 'combine-lm_00011_of_00030.jsonl-tokenized',
 'combine-lm_00012_of_00030.jsonl-tokenized',
 'combine-lm_00013_of_00030.jsonl-tokenized',
 'combine-lm_00014_of_00030.jsonl-tokenized',
 'combine-lm_00015_of_00030.jsonl-tokenized',
 'combine-lm_00016_of_00030.jsonl-tokenized',
 'combine-lm_00017_of_00030.jsonl-tokenized',
 'combine-lm_00018_of_00030.jsonl-tokenized',
 'combine-lm_00019_of_00030.jsonl-tokenized',
 'combine-lm_00020_of_00030.jsonl-tokenized',
 'combine-lm_00021_of_00030.jsonl-

In [7]:
def loop(files):
    """Convert one worker's share of Arrow IPC stream files into an MDS dataset.

    Args:
        files: A ``(paths, index)`` pair. ``paths`` is an iterable of Arrow IPC
            stream file paths and ``index`` selects the output directory
            ``nanot5-{index}``. (Presumably this pair is what
            ``mp.multiprocessing`` hands to each worker - confirm against that
            helper.)

    Side effects:
        Removes any existing ``nanot5-{index}`` directory, then writes every
        row of every record batch to it through ``MDSWriter``. Relies on the
        notebook-level ``columns``, ``compression`` and ``hashes`` settings.
    """
    # Local import keeps this cell self-contained; the notebook's import cell
    # does not bring in shutil.
    import shutil

    paths, index = files
    out_root = f'nanot5-{index}'
    # Start from an empty output directory. shutil.rmtree replaces the previous
    # `rm -rf` shell-out: no shell string interpolation, and it is portable.
    shutil.rmtree(out_root, ignore_errors=True)
    with MDSWriter(out=out_root, columns=columns, compression=compression, hashes=hashes,
                   size_limit=67108864 * 2) as out:  # 2 x 64 MiB = 128 MiB
        for path in paths:
            # Release the memory map as soon as the file has been consumed
            # instead of leaking one mapping per input file.
            with pa.memory_map(path) as source:
                reader = pa.ipc.open_stream(source)
                for batch in tqdm(reader):
                    # Each element of the struct array is one row, exposed as a
                    # mapping of field name -> Arrow scalar. Take the row once
                    # rather than re-indexing the array for every field.
                    for row in batch.to_struct_array():
                        sample = {
                            key: np.array(value.as_py()).astype(np.int32)
                            for key, value in row.items()
                        }
                        out.write(sample)

In [8]:
# Fan the shard list out to worker processes, each of which runs `loop`.
# NOTE(review): cores=30 presumably gives one worker per shard (the file names
# end in `_of_00030`); returned=False presumably discards worker results -
# confirm both against the `mp` helper module.
mp.multiprocessing(
    files,
    loop,
    cores=30,
    returned=False,
)

4976it [00:57, 85.94it/s]
4976it [00:58, 85.71it/s]
4976it [01:10, 70.29it/s]
4976it [01:11, 69.36it/s]
4886it [01:11, 67.82it/s]
4976it [01:12, 68.86it/s]
4976it [01:12, 68.75it/s]
447it [01:12,  5.30it/s]]
2375it [01:13, 34.55it/s]
2108it [01:36, 18.61it/s]
4978it [02:25, 34.32it/s]
4976it [02:25, 34.29it/s]
4978it [02:26, 34.02it/s]
4978it [02:31, 32.77it/s]
4977it [02:32, 32.61it/s]
4976it [02:33, 32.50it/s]
4976it [02:49, 29.28it/s]
5012it [03:00, 27.81it/s]
4976it [03:03, 27.14it/s]
4976it [03:04, 26.92it/s]
4976it [03:12, 25.86it/s]
5010it [03:15, 25.63it/s]
4977it [03:21, 24.66it/s]
5022it [03:36, 23.18it/s]
5002it [05:05, 16.38it/s] 
5076it [05:54, 14.33it/s]
6080it [07:31, 13.46it/s]
6083it [13:22,  7.58it/s]
5702it [14:09,  6.72it/s]
7612it [16:25,  7.73it/s]
