# Custom Datasets

- [HuggingFace MagicBrush](https://huggingface.co/datasets/osunlp/MagicBrush)
- [HuggingFace AURORA](https://huggingface.co/datasets/McGill-NLP/AURORA)

In [1]:
!pip install pyarrow
!pip install tqdm

Collecting pyarrow
  Downloading pyarrow-18.1.0-cp39-cp39-manylinux_2_28_x86_64.whl (40.1 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 40.1/40.1 MB 20.7 MB/s eta 0:00:00
Installing collected packages: pyarrow
Successfully installed pyarrow-18.1.0


In [2]:
import pandas as pd
from PIL import Image, ImageFile
from tqdm import tqdm
import hashlib
import time
import os
import io

In [3]:
# Let Pillow load images whose data stream is cut short instead of raising an
# error. Pillow only consults this flag on the ImageFile module; setting an
# attribute of the same name on the Image module has no effect, so only the
# ImageFile assignment is kept.
ImageFile.LOAD_TRUNCATED_IMAGES = True

### How to get data

- Clone the dataset repository, e.g. for MagicBrush: `git clone https://huggingface.co/datasets/osunlp/MagicBrush`
    - This gives you the data as parquet files (10+ files)
- Use the code below to save the images from the parquet files into folders, one folder per parquet file
- Columns of the resulting dataframes, which contain the paths to the images: img_id, turn_index, source_img (str path), target_img (str path), instruction
    - one df for the train data (8,807)
    - one df for the dev data (528)
- For training: use the image paths stored in the dataframes as parameters

In [67]:
# Root folder that receives the extracted images and the resulting CSV files.
# Switch datasets by swapping in one of the alternatives:
#   '/home/jovyan/BA/Github/AURORA/'      (AURORA)
#   '/home/jovyan/BA/Github/MagicBrush/'  (MagicBrush)
base_image_dir = '/home/jovyan/BA/Github/MagicBrush/'

# Folder holding the downloaded parquet files. Alternatives:
#   "/home/jovyan/BA/Github/AURORA/data"       (AURORA)
#   '/home/jovyan/BA/Github/MagicBrush/data/'  (MagicBrush)
parquet_dir = '/home/jovyan/BA/Github/MagicBrush/data/'

# Make sure the output root exists before anything is written into it.
os.makedirs(base_image_dir, exist_ok=True)

# Empty accumulators for the train and dev splits. Only the mask path and the
# instruction are pre-declared here; to also track the source/target images use
#   columns=['source_img', 'target_img', 'mask_img', 'instruction']
train_df, dev_df = (
    pd.DataFrame(columns=['mask_img', 'instruction']) for _ in range(2)
)

In [68]:
def generate_unique_image_name(base_name, idx):
    """Return a ``.png`` file name derived from an MD5 digest.

    The digest is taken over ``base_name``, ``idx`` and the current time in
    milliseconds, so repeated calls with the same arguments generally yield
    different names.
    """
    timestamp_ms = int(time.time() * 1000)
    seed = "{}_{}_{}".format(base_name, idx, timestamp_ms)
    digest = hashlib.md5(seed.encode()).hexdigest()
    return digest + ".png"

In [69]:
def process_image(image, save_dir, save=False): # AURORA: add input_name
    """Return the target path of a parquet image, optionally writing the image.

    Args:
        image: Mapping holding the encoded image under ``'bytes'`` and its
            file name under ``'path'``.
        save_dir: Directory the returned path is built under.
        save: When true, decode the image, resize it to 224x224 and write it
            to the returned path. When false, nothing is decoded or written
            and only the path is computed.

    Returns:
        ``os.path.join(save_dir, image['path'])``.
    """
    save_path = os.path.join(save_dir, image['path'])  # AURORA: only image
    if not save:
        # Decoding and resizing is only worthwhile if the result is written.
        return save_path

    # image['path'] may contain sub-folders; make sure the target folder exists.
    target_dir = os.path.dirname(save_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    with Image.open(io.BytesIO(image['bytes'])) as img:
        img.resize((224, 224)).save(save_path)
    return save_path

In [71]:
def get_new_row(idx, row, parquet_image_dir):
    """Build the output record for one parquet row (MagicBrush layout).

    Args:
        idx: Index of the row; not used by the MagicBrush layout (the AURORA
            variant sketched below uses it to derive unique file names).
        row: Row providing ``'img_id'``, ``'turn_index'``, ``'mask_img'`` and
            ``'instruction'``.
        parquet_image_dir: Folder passed to ``process_image`` for the mask.

    Returns:
        Dict with the keys ``'img_id'``, ``'turn_index'``, ``'mask_img'``
        (path returned by ``process_image``) and ``'instruction'``.
    """
    # MagicBrush: source and target images are currently disabled. To enable:
    #   source_path = process_image(row['source_img'], parquet_image_dir)
    #   target_path = process_image(row['target_img'], parquet_image_dir)
    # and add 'source_img' / 'target_img' to the record below.
    #
    # AURORA variant (needs process_image to accept a file name):
    #   unique_input_name = generate_unique_image_name("input", idx)
    #   unique_output_name = generate_unique_image_name("output", idx)
    #   source_path = process_image(row['input'], parquet_image_dir, unique_input_name)
    #   target_path = process_image(row['output'], parquet_image_dir, unique_output_name)
    #   record = {'input': source_path, 'output': target_path,
    #             'instruction': row['instruction']}
    mask_location = process_image(row['mask_img'], parquet_image_dir, save=True)

    record = dict(
        img_id=row['img_id'],
        turn_index=row['turn_index'],
        mask_img=mask_location,
        instruction=row['instruction'],
    )
    return record

"""
    # AURORA
    new_row = {
        'input': source_img_path,
        'output': target_img_path,
        'instruction': row['instruction']
    }
    return new_row
"""

"\n    # AURORA\n    new_row = {\n        'input': source_img_path,\n        'output': target_img_path,\n        'instruction': row['instruction']\n    }\n    return new_row\n"

In [None]:
# Loop over all parquet files: turn every row into a record of image paths,
# collect the records per split and write one CSV per split.

# Column layout used when a parquet file yields no rows at all.
output_columns = ['img_id', 'turn_index', 'mask_img', 'instruction']  # MagicBrush
# AURORA: output_columns = ['input', 'output', 'instruction']

train_frames = []
dev_frames = []

# sorted(): os.listdir returns entries in arbitrary order, which would make
# the row order of the resulting CSV files differ between runs/machines.
for filename in sorted(os.listdir(parquet_dir)):
    if not filename.endswith(".parquet"):
        continue

    temp_df = pd.read_parquet(os.path.join(parquet_dir, filename))

    # One image folder per parquet file, named after the file (no extension).
    parquet_image_dir = os.path.join(base_image_dir, os.path.splitext(filename)[0])
    os.makedirs(parquet_image_dir, exist_ok=True)

    # Only for MagicBrush, if the mask images are not needed, drop them first:
    #   if 'mask_img' in temp_df.columns:
    #       temp_df = temp_df.drop(columns=['mask_img'])

    processed_rows = [
        get_new_row(idx, row, parquet_image_dir)
        for idx, row in tqdm(temp_df.iterrows(), total=len(temp_df), desc="Processing rows")
    ]
    # Build the frame directly from the records instead of concatenating onto
    # an empty frame; fall back to the bare column layout for empty files.
    if processed_rows:
        temp_processed_df = pd.DataFrame(processed_rows)
    else:
        temp_processed_df = pd.DataFrame(columns=output_columns)

    if filename.startswith("train"):
        train_frames.append(temp_processed_df)
    elif filename.startswith("dev"):
        dev_frames.append(temp_processed_df)

# Concatenate once per split (instead of once per file) and renumber the rows
# so the accumulated frames do not carry duplicate index labels. The existing
# train_df/dev_df stay first so the column order of the output is unchanged.
train_df = pd.concat([train_df, *train_frames], ignore_index=True)
dev_df = pd.concat([dev_df, *dev_frames], ignore_index=True)

train_df.to_csv(os.path.join(base_image_dir, 'train_data_mask_111.csv'), index=False)
dev_df.to_csv(os.path.join(base_image_dir, 'dev_data_mask_111.csv'), index=False)

print("Images processed and dataframes saved!")

Processing rows: 100%|██████████| 173/173 [00:03<00:00, 44.16it/s]
Processing rows: 100%|██████████| 173/173 [00:03<00:00, 44.96it/s]
Processing rows: 100%|██████████| 173/173 [00:03<00:00, 43.40it/s]
Processing rows: 100%|██████████| 173/173 [00:03<00:00, 44.21it/s]
Processing rows: 100%|██████████| 172/172 [00:03<00:00, 44.84it/s]
Processing rows: 100%|██████████| 173/173 [00:03<00:00, 44.67it/s]
Processing rows: 100%|██████████| 173/173 [00:03<00:00, 43.41it/s]
Processing rows: 100%|██████████| 172/172 [00:03<00:00, 44.54it/s]
Processing rows: 100%|██████████| 132/132 [00:02<00:00, 45.29it/s]
Processing rows: 100%|██████████| 173/173 [00:03<00:00, 44.12it/s]
Processing rows: 100%|██████████| 173/173 [00:03<00:00, 43.88it/s]
Processing rows: 100%|██████████| 173/173 [00:03<00:00, 44.82it/s]
Processing rows: 100%|██████████| 173/173 [00:03<00:00, 43.45it/s]
Processing rows: 100%|██████████| 173/173 [00:03<00:00, 44.22it/s]
Processing rows: 100%|██████████| 132/132 [00:02<00:00, 44.51it/s]

In [None]:
# NOTE(review): looks like a leftover scratch cell — confirm it can be removed.
print(1)