In [1]:
import os
import zipfile
from datetime import datetime
from glob import glob
from tqdm import tqdm
from huggingface_hub import HfApi
from huggingface_hub import HfFileSystem
import time
from multiprocess import Pool
import itertools

# Size threshold in bytes (5e9 = 5 GB) used by `loop` to decide when to start a new zip.
# It is compared against the summed on-disk (pre-compression) sizes reported by `get_size`,
# so the resulting archives can be smaller than this once deflated.
partition_size = 5e+9

def chunks(l, n):
    """
    Lazily split `l` into consecutive slices of at most `n` items.

    Yields `(slice, position)` tuples, where `position` is the zero-based
    ordinal of the slice. The last slice may be shorter than `n`.
    `n == 0` raises ValueError from `range()` once iteration starts.
    """
    starts = range(0, len(l), n)
    for position, begin in enumerate(starts):
        piece = l[begin: begin + n]
        yield (piece, position)

def multiprocessing(strings, function, cores=6, returned=True):
    """
    Split `strings` into chunks and apply `function` to each chunk in a process pool.

    Parameters
    ----------
    strings : sequence
        Items to distribute; must support `len()` and slicing.
    function : callable
        Called once per chunk with a `(chunk, chunk_index)` tuple, as produced by `chunks`.
    cores : int
        Number of worker processes. The chunk size is `len(strings) // cores`, so when the
        length is not divisible by `cores` one extra, shorter chunk is produced.
    returned : bool
        If True, each worker result must be iterable; the results are flattened into a
        single list and returned. If False, results are discarded and None is returned.
    """
    # Floor division yields 0 when there are fewer items than cores (or no items at all),
    # and `range()` inside `chunks` rejects a zero step, so clamp to at least 1.
    chunk_size = max(1, len(strings) // cores)
    df_split = chunks(strings, chunk_size)

    pool = Pool(cores)
    try:
        pooled = pool.map(function, df_split)
        pool.close()
    except BaseException:
        # Do not leave worker processes behind when a worker raises or the run is interrupted.
        pool.terminate()
        raise
    finally:
        pool.join()

    if returned:
        return list(itertools.chain(*pooled))

def get_size(start_path = '.'):
    """
    Return the combined size in bytes of all files found by walking `start_path`.

    Entries that are symbolic links are left out of the sum. A path that
    `os.walk` yields nothing for (e.g. a missing directory) gives 0.
    """
    def _walk_files():
        # Flatten os.walk into a stream of full file paths.
        for parent, _subdirs, names in os.walk(start_path):
            for name in names:
                yield os.path.join(parent, name)

    return sum(
        os.path.getsize(path)
        for path in _walk_files()
        if not os.path.islink(path)
    )

In [2]:
def _zip_and_upload(api, folders, part_name,
                    repo_id="mesolitica/Malaysian-Emilia-Audio-Tokens",
                    repo_type="dataset",
                    retry_wait=60):
    """Zip every file under `folders` into `part_name`, upload it, then delete the local zip."""
    try:
        with zipfile.ZipFile(part_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for folder in folders:
                for root, _dirs, names in os.walk(folder):
                    for name in names:
                        path = os.path.join(root, name)
                        # Keep the on-disk relative path as the name inside the archive.
                        zipf.write(path, arcname=path)

        # Keep retrying until the upload succeeds, but report each failure so a
        # permanent error (bad token, missing repo, ...) is visible rather than a silent hang.
        while True:
            try:
                api.upload_file(
                    path_or_fileobj=part_name,
                    path_in_repo=part_name,
                    repo_id=repo_id,
                    repo_type=repo_type,
                )
                break
            except Exception as e:
                # `Exception` (not a bare except) so Ctrl-C / SystemExit can still stop the worker.
                print(f"upload of {part_name} failed ({e!r}); retrying in {retry_wait}s", flush=True)
                time.sleep(retry_wait)
    finally:
        # Remove the local archive even when zipping or uploading was aborted.
        if os.path.exists(part_name):
            os.remove(part_name)


def loop(files):
    """
    Pack one chunk of folders into size-bounded zip archives and upload each archive.

    Parameters
    ----------
    files : tuple
        `(folders, index)` as produced by `chunks`: the list of folder paths for this
        worker and the chunk index, which is used in the archive names.

    Folders are accumulated until adding the next one would bring the summed `get_size`
    total to `partition_size` or more; the accumulated folders are then written to
    `dac-{index}-{n}.zip`, uploaded, and the local file removed. Whatever remains after
    the last folder is flushed the same way. Returns None.
    """
    files, index = files
    api = HfApi()
    current_index = 0
    total = 0
    temp = []
    for folder in tqdm(files):
        s = get_size(folder)
        if s + total >= partition_size:
            # Skip the flush when nothing has been accumulated yet (the very first folder
            # alone reaches the threshold); otherwise an empty zip would be uploaded.
            if temp:
                _zip_and_upload(api, temp, f"dac-{index}-{current_index}.zip")
                current_index += 1
            temp = [folder]
            total = s
        else:
            temp.append(folder)
            total += s

    # Flush the final, partially filled partition.
    if temp:
        _zip_and_upload(api, temp, f"dac-{index}-{current_index}.zip")

In [3]:
# NOTE(review): this notebook-level client is not referenced by any later cell shown here;
# each worker in `loop` constructs its own HfApi(). Confirm whether this cell is still needed.
api = HfApi()

In [4]:
# Collect every entry one level below the directories matching `*_dac` (relative to the
# current working directory), sorted so the chunks handed to the workers are contiguous
# ranges of this ordering.
folders = sorted(glob('*_dac/*'))

In [5]:
# Number of entries that will be split across the worker processes.
len(folders)

138812

In [6]:
# Run `loop` over the folders with 10 worker processes. The chunk size is
# len(folders) // 10, so a non-divisible count produces an extra short chunk with
# index 10 (hence a `dac-10-*.zip`). `returned=False` is required because `loop`
# returns None, which the result-flattening step could not iterate over.
multiprocessing(folders, loop, cores = 10, returned = False)

 66%|████████████████████████████████████████████████████▏                          | 9166/13881 [00:19<00:00, 12999.96it/s]

dac-8-0.zip:   0%|          | 0.00/1.90G [00:00<?, ?B/s]

dac-1-0.zip:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

dac-4-0.zip:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

dac-2-0.zip:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

dac-5-0.zip:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

dac-9-0.zip:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

dac-3-0.zip:   0%|          | 0.00/1.87G [00:00<?, ?B/s]

dac-0-0.zip:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

dac-7-0.zip:   0%|          | 0.00/1.84G [00:00<?, ?B/s]

dac-6-0.zip:   0%|          | 0.00/1.84G [00:00<?, ?B/s]

100%|█████████████████████████████████████████████████████████████████████████████████| 13881/13881 [09:12<00:00, 25.12it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 13881/13881 [09:25<00:00, 24.53it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 13881/13881 [09:30<00:00, 24.34it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 13881/13881 [09:30<00:00, 24.34it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 13881/13881 [09:30<00:00, 24.32it/s]
 68%|███████████████████████████████████████████████████████▊                          | 9449/13881 [09:50<11:44,  6.29it/s]

dac-8-1.zip:   0%|          | 0.00/292M [00:00<?, ?B/s]

100%|███████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 7364.89it/s]


dac-10-0.zip:   0%|          | 0.00/242k [00:00<?, ?B/s]

dac-7-1.zip:   0%|          | 0.00/891M [00:00<?, ?B/s]

dac-6-1.zip:   0%|          | 0.00/899M [00:00<?, ?B/s]

dac-1-1.zip:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

dac-2-1.zip:   0%|          | 0.00/1.82G [00:00<?, ?B/s]

dac-4-1.zip:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

dac-9-1.zip:   0%|          | 0.00/1.90G [00:00<?, ?B/s]

dac-5-1.zip:   0%|          | 0.00/1.87G [00:00<?, ?B/s]

dac-0-1.zip:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

dac-3-1.zip:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

100%|█████████████████████████████████████████████████████████████████████████████████| 13881/13881 [18:13<00:00, 12.70it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 13881/13881 [18:13<00:00, 12.69it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 13881/13881 [18:13<00:00, 12.69it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 13881/13881 [18:13<00:00, 12.69it/s]


dac-0-2.zip:   0%|          | 0.00/68.2M [00:00<?, ?B/s]

 72%|██████████████████████████████████████████████████████████▍                      | 10005/13881 [18:30<22:04,  2.93it/s]

dac-4-2.zip:   0%|          | 0.00/160M [00:00<?, ?B/s]

dac-3-2.zip:   0%|          | 0.00/581M [00:00<?, ?B/s]

dac-5-2.zip:   0%|          | 0.00/582M [00:00<?, ?B/s]

dac-9-2.zip:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

 81%|█████████████████████████████████████████████████████████████████▏               | 11175/13881 [26:50<14:03,  3.21it/s]

dac-9-3.zip:   0%|          | 0.00/1.90G [00:00<?, ?B/s]

100%|█████████████████████████████████████████████████████████████████████████████████| 13881/13881 [34:44<00:00,  6.66it/s]


dac-9-4.zip:   0%|          | 0.00/1.23G [00:00<?, ?B/s]