In [1]:
from glob import glob
import json

In [2]:
# glob() returns paths in arbitrary, filesystem-dependent order; sorting keeps
# files[100] and the per-worker chunks identical from one run to the next.
files = sorted(glob('output-malaysia-vehicle/*.json'))
len(files)

97287

In [3]:
# Parse one arbitrary output file so its record structure can be inspected.
with open(files[100]) as sample_file:
    d = json.loads(sample_file.read())

In [5]:
from streaming import MDSWriter
from streaming.base.format.mds.encodings import Encoding, _encodings
from streaming import LocalDataset
import streaming
import numpy as np
from tqdm import tqdm
import os

class Float32(Encoding):
    """MDS column encoding that stores an array as raw float32 bytes.

    Only the flat values are serialized; the shape is not recorded, so
    ``decode`` always yields a 1-D array.
    """

    def encode(self, obj) -> bytes:
        """Serialize ``obj`` as float32 bytes.

        The input is coerced to float32 first. Without the coercion an array
        of another dtype (e.g. float64) would be written with its own item
        size and then misread by ``decode``, which always assumes float32.
        Arrays that are already float32 are not copied by the coercion.
        """
        return np.asarray(obj, dtype=np.float32).tobytes()

    def decode(self, data: bytes):
        """Rebuild a 1-D float32 array from ``data``.

        ``np.frombuffer`` returns a view over ``data``, so the result is
        read-only when ``data`` is an immutable ``bytes`` object.
        """
        return np.frombuffer(data, np.float32)

# Register the custom codec under the name used in the column schema below.
_encodings['float32'] = Float32

# Schema of every sample written by this notebook: column name -> encoding name.
columns = dict(embedding='float32', filename='str', index='int')

# Hash algorithms handed to MDSWriter.
hashes = ('sha1', 'xxh64')

In [6]:
# -p keeps the cell re-runnable: plain mkdir reports an error once the folder exists.
!mkdir -p embedding-vehicle

In [12]:
def loop(files):
    """Convert one chunk of JSON embedding files into its own MDS folder.

    Args:
        files: A ``(paths, index)`` pair. ``paths`` is the list of JSON files
            to convert; ``index`` selects the output folder
            ``embedding-vehicle/embedding-{index}``.

    Each JSON file is iterated as a sequence of row dicts; the ``embedding``
    entry of every row is converted to a float32 array before the row is
    written. Files whose content cannot be read or parsed as JSON are
    skipped. Any existing output folder for ``index`` is removed first.
    """
    import shutil

    files, index = files
    out_root = f'embedding-vehicle/embedding-{index}'
    # Clear leftovers from a previous run without shelling out to `rm -rf`.
    shutil.rmtree(out_root, ignore_errors=True)
    with MDSWriter(out=out_root, columns=columns, compression=None, hashes=hashes) as out:
        for f in tqdm(files):
            with open(f) as fopen:
                try:
                    data = json.load(fopen)
                except Exception:
                    # Best-effort: skip unreadable or corrupt files, but let
                    # KeyboardInterrupt / SystemExit stop the worker.
                    continue
            # The input file is already closed here; only the rows are kept.
            for row in data:
                row['embedding'] = np.array(row['embedding'], dtype=np.float32)
                out.write(row)

In [13]:
# Smoke-test the conversion on the first two files. This writes
# embedding-vehicle/embedding-0; loop() wipes that folder again whenever it is
# later called with index 0.
loop((files[:2], 0))

100%|██████████| 2/2 [00:00<00:00, 778.24it/s]


In [14]:
import mp

# Run loop() over the whole file list in parallel, capped at 30 cores.
# NOTE(review): `mp` is a local helper module; presumably it splits `files`
# into chunks and calls loop() with a (chunk, index) pair — confirm.
n_cores = min(30, len(files))
mp.multiprocessing(files, loop, cores=n_cores, returned=False)

100%|██████████| 3242/3242 [01:04<00:00, 50.13it/s]  
100%|██████████| 27/27 [00:00<00:00, 1018.47it/s]s]
100%|██████████| 3242/3242 [01:17<00:00, 42.05it/s]
100%|██████████| 3242/3242 [01:19<00:00, 40.72it/s]
100%|██████████| 3242/3242 [01:24<00:00, 38.56it/s]
100%|██████████| 3242/3242 [01:24<00:00, 38.49it/s]
100%|██████████| 3242/3242 [01:25<00:00, 37.71it/s]
100%|██████████| 3242/3242 [01:26<00:00, 37.58it/s]
100%|██████████| 3242/3242 [01:26<00:00, 37.47it/s]
100%|██████████| 3242/3242 [01:26<00:00, 37.31it/s]
100%|██████████| 3242/3242 [01:28<00:00, 36.77it/s]
100%|██████████| 3242/3242 [01:28<00:00, 36.72it/s]
100%|██████████| 3242/3242 [01:28<00:00, 36.77it/s]
100%|██████████| 3242/3242 [01:28<00:00, 36.74it/s]
100%|██████████| 3242/3242 [01:29<00:00, 36.34it/s]
100%|██████████| 3242/3242 [01:29<00:00, 36.31it/s]
100%|██████████| 3242/3242 [01:29<00:00, 36.21it/s]
100%|██████████| 3242/3242 [01:29<00:00, 36.13it/s]
100%|██████████| 3242/3242 [01:30<00:00, 35.71it/s]
100%|█████

In [15]:
# Collect the per-worker output folders and order them numerically by the
# integer suffix after the last '-' (so embedding-10 comes after embedding-9).
folders = glob('embedding-vehicle/embedding-*')
folders.sort(key=lambda path: int(path.rsplit('-', 1)[-1]))
len(folders)

31

In [16]:
# Remove any previous merge result so the next cell writes into a fresh `combine` folder.
!rm -rf combine

In [17]:
# Merge every per-worker MDS folder into a single dataset under `combine`.
failed_folders = []
with MDSWriter(out='combine', columns=columns, compression=None, hashes=hashes) as out:
    for folder in folders:
        try:
            folder_dataset = LocalDataset(local=folder)
            for i in tqdm(range(len(folder_dataset))):
                out.write(folder_dataset[i])
        except Exception as e:
            # Best-effort merge: keep going with the remaining folders, but
            # record which one failed. Rows written before the error stay in
            # `combine`, so a failed folder may be only partially merged.
            failed_folders.append(folder)
            print(f'{folder}: {e!r}')

if failed_folders:
    print(f'{len(failed_folders)} folder(s) were not merged completely: {failed_folders}')

100%|██████████| 25800/25800 [00:01<00:00, 21773.72it/s]
100%|██████████| 25871/25871 [00:01<00:00, 22363.53it/s]
100%|██████████| 25839/25839 [00:01<00:00, 21800.75it/s]
100%|██████████| 25816/25816 [00:01<00:00, 20927.47it/s]
100%|██████████| 25888/25888 [00:01<00:00, 22818.10it/s]
100%|██████████| 25832/25832 [00:01<00:00, 22548.79it/s]
100%|██████████| 25864/25864 [00:01<00:00, 22658.14it/s]
100%|██████████| 25856/25856 [00:01<00:00, 20940.21it/s]
100%|██████████| 25864/25864 [00:01<00:00, 23226.82it/s]
100%|██████████| 25848/25848 [00:01<00:00, 23652.96it/s]
100%|██████████| 25800/25800 [00:01<00:00, 23527.09it/s]
100%|██████████| 25816/25816 [00:01<00:00, 21205.74it/s]
100%|██████████| 25864/25864 [00:01<00:00, 24247.13it/s]
100%|██████████| 25856/25856 [00:01<00:00, 24434.63it/s]
100%|██████████| 25832/25832 [00:01<00:00, 23070.21it/s]
100%|██████████| 25864/25864 [00:01<00:00, 22402.73it/s]
100%|██████████| 25880/25880 [00:01<00:00, 25479.93it/s]
100%|██████████| 25872/25872 [0

In [18]:
# Re-open the merged output so its rows can be read back by position.
dataset = LocalDataset('combine')

In [19]:
# Number of rows in the merged dataset.
len(dataset)

775637

In [20]:
# Spot-check one arbitrary row; 'filename' apparently holds the path of the
# source parquet file the row was taken from — TODO confirm.
dataset[100000]['filename']

'/home/ubuntu/.cache/huggingface/hub/datasets--malaysia-ai--crawl-google-image-malaysian-vehicle/snapshots/ef336aad83e1410dc55d0abb3e40be8e39b0838e/data/train-00002-of-00165-792f4cc576d400c7.parquet'

In [21]:
# Build a lookup from '<source file basename>-<row index>' to the row's
# position in the merged dataset.
mapping = {}
for i in tqdm(range(len(dataset))):
    # Fetch the sample once per iteration instead of indexing the dataset
    # twice, which would read the same row from disk again.
    sample = dataset[i]
    f = os.path.basename(sample['filename'])
    key = f"{f}-{sample['index']}"
    mapping[key] = i

100%|██████████| 775637/775637 [00:57<00:00, 13557.39it/s]


In [22]:
# Sanity check: a count equal to len(dataset) means no two rows produced the
# same key (a duplicate key would have overwritten an earlier entry).
len(mapping)

775637

In [23]:
# Persist the key -> row-position lookup as JSON so it can be uploaded later.
with open('mapping-index-vehicle.json', 'w') as mapping_file:
    mapping_file.write(json.dumps(mapping))

In [24]:
from huggingface_hub import HfApi
# Hub client used for the uploads below. No token is passed here, so it
# presumably relies on a cached login or environment token — TODO confirm.
api = HfApi()

In [25]:
# Publish the merged MDS shards under `embedding/` in the dataset repository.
embedding_repo = 'mesolitica/google-image-malaysian-vehicle-dedup'
api.upload_folder(
    repo_id=embedding_repo,
    repo_type='dataset',
    folder_path='combine',
    path_in_repo='embedding',
)

shard.00001.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00004.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00002.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00003.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00000.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Upload 39 LFS files:   0%|          | 0/39 [00:00<?, ?it/s]

shard.00005.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00006.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00007.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00008.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00009.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00010.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00011.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00012.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00013.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00014.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00015.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00016.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00017.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00018.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00019.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00020.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00021.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00022.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00023.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00024.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00025.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00026.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00027.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00028.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00029.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00030.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00031.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00032.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00033.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00034.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00035.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00036.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00037.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00038.mds:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/google-image-malaysian-vehicle-dedup/commit/a89f3e56ac35a898019776453c4041f1f8cf7af2', commit_message='Upload folder using huggingface_hub', commit_description='', oid='a89f3e56ac35a898019776453c4041f1f8cf7af2', pr_url=None, pr_revision=None, pr_num=None)

In [26]:
# Publish the key -> row-position lookup at the root of the same dataset repository.
mapping_filename = 'mapping-index-vehicle.json'
api.upload_file(
    repo_id='mesolitica/google-image-malaysian-vehicle-dedup',
    repo_type='dataset',
    path_or_fileobj=mapping_filename,
    path_in_repo=mapping_filename,
)

mapping-index-vehicle.json:   0%|          | 0.00/47.8M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/google-image-malaysian-vehicle-dedup/commit/4fcc338a5f87f9c910e9cb4e65057317d948d24d', commit_message='Upload mapping-index-vehicle.json with huggingface_hub', commit_description='', oid='4fcc338a5f87f9c910e9cb4e65057317d948d24d', pr_url=None, pr_revision=None, pr_num=None)