In [1]:
import json
import os
import pandas as pd
from glob import glob
from PIL import Image
from io import BytesIO
from huggingface_hub import snapshot_download
from collections import defaultdict
import mp

In [2]:
data = defaultdict(list)
with open('dedup-0.7.jsonl') as fopen:
    for l in fopen:
        l = json.loads(l)
        data[l['filename']].extend(l['selected_indices'])


In [3]:
!mkdir image-vehicle

mkdir: cannot create directory ‘image-vehicle’: File exists


In [4]:
folder = snapshot_download(repo_id='malaysia-ai/crawl-google-image-malaysian-vehicle', repo_type = 'dataset')

Fetching 167 files:   0%|          | 0/167 [00:00<?, ?it/s]

In [5]:
from tqdm import tqdm

def loop(files):
    files, index = files
    results = []
    for f in tqdm(files):
        f_ = os.path.split(f)[1]
        df = pd.read_parquet(f)
        for i in range(len(data[f_])):
            
            index = data[f_][i]
            filename = os.path.join('image-vehicle', f'{f_}-{index}.jpg')
            results.append({
                'filename': filename,
                'filename_df': f_,
                'keyword': df.iloc[index]['keyword'],
                'alt_text': df.iloc[index]['alt_text'],
            })
            if os.path.exists(filename):
                continue
            stream = BytesIO(df.iloc[index]['image']['bytes'])
            image = Image.open(stream)
            image.save(filename, format = 'jpeg')
    return results

In [6]:
files = glob(f'{folder}/data/*.parquet')
len(files)

165

In [7]:
r = loop((files[:1], 0))

100%|██████████| 1/1 [00:02<00:00,  2.97s/it]


In [8]:
r = mp.multiprocessing(files, loop, cores = 10)

100%|██████████| 16/16 [06:50<00:00, 25.63s/it]
100%|██████████| 16/16 [07:02<00:00, 26.38s/it]
100%|██████████| 16/16 [07:04<00:00, 26.54s/it]
100%|██████████| 16/16 [07:04<00:00, 26.55s/it]
100%|██████████| 16/16 [07:08<00:00, 26.76s/it]
100%|██████████| 16/16 [07:12<00:00, 27.01s/it]
100%|██████████| 16/16 [07:12<00:00, 27.02s/it]
100%|██████████| 16/16 [07:14<00:00, 27.18s/it]
100%|██████████| 16/16 [07:17<00:00, 27.36s/it]
100%|██████████| 16/16 [07:18<00:00, 27.39s/it]
100%|██████████| 5/5 [00:36<00:00,  7.25s/it]


In [9]:
len(r)

97598

In [10]:
with open('save-image-vehicle.jsonl', 'w') as fopen:
    for r_ in r:
        fopen.write(f'{json.dumps(r_)}\n')

In [11]:
!tail -n 3 save-image-vehicle.jsonl

{"filename": "image-vehicle/train-00031-of-00165-470a596d9d15c78c.parquet-477.jpg", "filename_df": "train-00031-of-00165-470a596d9d15c78c.parquet", "keyword": "2020 Chevrolet Spark LS AUTOMATIC", "alt_text": "2020 Chevrolet Spark Hatchback 5D LS ..."}
{"filename": "image-vehicle/train-00031-of-00165-470a596d9d15c78c.parquet-479.jpg", "filename_df": "train-00031-of-00165-470a596d9d15c78c.parquet", "keyword": "2020 Chevrolet Spark LS AUTOMATIC", "alt_text": "New & Used Chevrolet Spark for Sale ..."}
{"filename": "image-vehicle/train-00031-of-00165-470a596d9d15c78c.parquet-497.jpg", "filename_df": "train-00031-of-00165-470a596d9d15c78c.parquet", "keyword": "2020 Chevrolet Spark LS AUTOMATIC", "alt_text": "2020 CHEVROLET SPARK FWD LS AUTOMATIC ..."}


In [12]:
!du -hs image-vehicle

11G	image-vehicle


In [14]:
# !sudo apt install zip -y

In [19]:
!zip -q -r -s 5000m image-vehicle.zip image-vehicle

In [18]:
from huggingface_hub import HfApi
api = HfApi()

api.upload_file(
    path_or_fileobj='save-image-vehicle.jsonl',
    path_in_repo='save-image-vehicle.jsonl',
    repo_id='mesolitica/google-image-malaysian-vehicle-dedup',
    repo_type='dataset',
)

save-image-vehicle.jsonl:   0%|          | 0.00/24.3M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/google-image-malaysian-vehicle-dedup/commit/d2117b767f85602e86788a65e1727b6bbb8d7fdb', commit_message='Upload save-image-vehicle.jsonl with huggingface_hub', commit_description='', oid='d2117b767f85602e86788a65e1727b6bbb8d7fdb', pr_url=None, pr_revision=None, pr_num=None)

In [20]:
for f in glob('image-vehicle.z*'):
    print(f)
    api.upload_file(
        path_or_fileobj=f,
        path_in_repo=f,
        repo_id='mesolitica/google-image-malaysian-vehicle-dedup',
        repo_type='dataset',
    )

image-vehicle.z02


image-vehicle.z02:   0%|          | 0.00/5.24G [00:00<?, ?B/s]

image-vehicle.zip


image-vehicle.zip:   0%|          | 0.00/412M [00:00<?, ?B/s]

image-vehicle.z01


image-vehicle.z01:   0%|          | 0.00/5.24G [00:00<?, ?B/s]