In [34]:
import json
import os
import pandas as pd
from glob import glob
from PIL import Image
from io import BytesIO
from huggingface_hub import snapshot_download
from collections import defaultdict
import mp

In [10]:
data = defaultdict(list)
with open('dedup-0.6.jsonl') as fopen:
    for l in fopen:
        l = json.loads(l)
        data[l['filename']].extend(l['selected_indices'])


In [4]:
!mkdir image-location

mkdir: cannot create directory ‘image-location’: File exists


In [5]:
folder = snapshot_download(repo_id='malaysia-ai/crawl-google-image-malaysia-location', repo_type = 'dataset')

Fetching 1002 files:   0%|          | 0/1002 [00:00<?, ?it/s]

In [6]:
df = pd.read_parquet(os.path.join(folder, 'data', data[0]['filename']))

In [37]:
from tqdm import tqdm

def loop(files):
    files, index = files
    results = []
    for f in tqdm(files):
        f_ = os.path.split(f)[1]
        df = pd.read_parquet(f)
        for i in range(len(data[f_])):
            
            index = data[f_][i]
            filename = os.path.join('image-location', f'{f_}-{index}.jpg')
            results.append({
                'filename': filename,
                'filename_df': f_,
                'keyword': df.iloc[index]['keyword'],
                'alt_text': df.iloc[index]['alt_text'],
            })
            if os.path.exists(filename):
                continue
            stream = BytesIO(df.iloc[index]['image']['bytes'])
            image = Image.open(stream)
            image.save(filename, format = 'jpeg')
    return results

In [32]:
files = glob(f'{folder}/data/*.parquet')
len(files)

1000

In [38]:
r = loop((files[:1], 0))

100%|██████████| 1/1 [00:00<00:00,  1.48it/s]


In [40]:
r = mp.multiprocessing(files, loop, cores = 20)

100%|██████████| 50/50 [32:59<00:00, 39.60s/it]
100%|██████████| 50/50 [33:07<00:00, 39.75s/it]
100%|██████████| 50/50 [33:10<00:00, 39.81s/it]
100%|██████████| 50/50 [33:14<00:00, 39.90s/it]
100%|██████████| 50/50 [33:17<00:00, 39.95s/it]
100%|██████████| 50/50 [33:28<00:00, 40.17s/it]
100%|██████████| 50/50 [33:29<00:00, 40.18s/it]
100%|██████████| 50/50 [33:39<00:00, 40.39s/it]
100%|██████████| 50/50 [33:41<00:00, 40.42s/it]
100%|██████████| 50/50 [33:41<00:00, 40.43s/it]
100%|██████████| 50/50 [33:44<00:00, 40.50s/it]
100%|██████████| 50/50 [33:52<00:00, 40.66s/it]
100%|██████████| 50/50 [33:59<00:00, 40.79s/it]
100%|██████████| 50/50 [34:07<00:00, 40.96s/it]
100%|██████████| 50/50 [34:09<00:00, 40.98s/it]
100%|██████████| 50/50 [34:10<00:00, 41.02s/it]
100%|██████████| 50/50 [34:13<00:00, 41.08s/it]
100%|██████████| 50/50 [34:20<00:00, 41.21s/it]
100%|██████████| 50/50 [34:20<00:00, 41.22s/it]
100%|██████████| 50/50 [34:30<00:00, 41.40s/it]


In [54]:
len(r)

487301

In [55]:
with open('save-image.jsonl', 'w') as fopen:
    for r_ in r:
        fopen.write(f'{json.dumps(r_)}\n')

In [56]:
!tail -n 3 save-image.jsonl

{"filename": "image-location/train-00817-of-01000.parquet-3017.jpg", "filename_df": "train-00817-of-01000.parquet", "keyword": "Taman Merbah, Olak Lempit Banting", "alt_text": "NADIAZ HOMESTAY OLAK LEMPIT,BANTING ..."}
{"filename": "image-location/train-00817-of-01000.parquet-3024.jpg", "filename_df": "train-00817-of-01000.parquet", "keyword": "Taman Merbah, Olak Lempit Banting", "alt_text": "Rasvilla - Google hotels"}
{"filename": "image-location/train-00817-of-01000.parquet-3054.jpg", "filename_df": "train-00817-of-01000.parquet", "keyword": "Taman Merbah, Olak Lempit Banting", "alt_text": "Terrace For Sale in Kampung Olak Lempit ..."}


In [58]:
!du -hs image-location

61G	image-location


In [59]:
# !sudo apt install zip -y

In [61]:
# !zip -q -r -s 10000m image.zip image-location

In [65]:
from huggingface_hub import HfApi
api = HfApi()

api.upload_file(
    path_or_fileobj='save-image.jsonl',
    path_in_repo='save-image.jsonl',
    repo_id='mesolitica/google-image-malaysia-location-dedup',
    repo_type='dataset',
)

save-image.jsonl:   0%|          | 0.00/102M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/google-image-malaysia-location-dedup/commit/b308277c77102184c45f7dc733e46a4273420405', commit_message='Upload save-image.jsonl with huggingface_hub', commit_description='', oid='b308277c77102184c45f7dc733e46a4273420405', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
for f in glob('image.z*'):
    print(f)
    api.upload_file(
        path_or_fileobj=f,
        path_in_repo=f,
        repo_id='mesolitica/google-image-malaysia-location-dedup',
        repo_type='dataset',
    )

image.z02
