# Preprocess Datasets

See DATASET.md for more details.

Installing required libraries.

In [None]:
!pip install opencv-python -q
!pip install pandas -q
!pip install imageio-ffmpeg -q
!pip install fastparquet -q
!pip install matplotlib --user -q
!pip install numpy==1.26.4 -q
!pip install pyarrow --upgrade -q

In [3]:
import cv2
import os
import pandas as pd
import numpy as np
import io
import random
from PIL import Image
from PIL import Image as PILImage
from IPython.display import display
from tqdm import tqdm

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


Dataset options include *MagicBrush* (used for fine-tuning OpenCLIP and correlation evaluation of fine-tuned ViFi-CLIP) and *HumanEdit* (utilized for fine-tuning ViFi-CLIP).

In [4]:
# Dataset to preprocess; must be "MagicBrush" or "HumanEdit".
dataset = "MagicBrush"  # 'HumanEdit'
# dataset = os.getenv('DATASET')

# Fail fast: every later cell branches on this value, so continuing with an
# unsupported (or missing) name would only surface as a confusing error later.
if dataset not in ["MagicBrush", "HumanEdit"]:
    raise ValueError(
        f"Invalid dataset specified: '{dataset}'. Choose from: 'MagicBrush', 'HumanEdit'."
    )

# The lower-cased dataset name is used as the root directory for all paths below.
dataset_path = dataset.lower()

Read parquet file format.

In [5]:
# Pick one sample parquet shard of the selected dataset; it is only used to
# inspect the column schema in the following cells.
if dataset == "MagicBrush":
    file_path = "magicbrush_test/MagicBrush/data/train-00016-of-00051-2ce592a2aa4c0b2d.parquet"
elif dataset == "HumanEdit":
    file_path = "humanedit_test/HumanEdit/data/train-00000-of-00034-d4bc386d6caf62fb.parquet"

In [3]:
# HumanEdit: load the sample shard selected via `file_path` and list its column
# dtypes (the output below was captured from a run with the HumanEdit dataset).
df = pd.read_parquet(path=file_path, engine="pyarrow")
print(df.dtypes)

IMAGE_ID                   object
EDITING_TYPE               object
CORE                        int32
MASK                        int32
EDITING_INSTRUCTION        object
OUTPUT_DESCRIPTION         object
INPUT_CAPTION_BY_LLAMA     object
OUTPUT_CAPTION_BY_LLAMA    object
INPUT_IMG                  object
MASK_IMG                   object
OUTPUT_IMG                 object
dtype: object


In [6]:
# MagicBrush: load the sample shard selected via `file_path` and list its column
# dtypes (the output below was captured from a run with the MagicBrush dataset).
df = pd.read_parquet(path=file_path, engine="pyarrow")
print(df.dtypes)

img_id         object
turn_index      int32
source_img     object
mask_img       object
instruction    object
target_img     object
dtype: object


Specify relevant columns.

In [7]:
# Per-dataset column names, in the order:
# (input image, output image, instruction, image id).
# The MagicBrush id column "unique_id" does not exist yet; it will be created
# in the next section.
if dataset in ("HumanEdit", "MagicBrush"):
    input_img_column, output_img_column, instr_column, image_id_column = {
        "HumanEdit": ("INPUT_IMG", "OUTPUT_IMG", "EDITING_INSTRUCTION", "IMAGE_ID"),
        "MagicBrush": ("source_img", "target_img", "instruction", "unique_id"),
    }[dataset]

Read all parquet files into a single dataframe.

In [8]:
# Directory that holds all parquet shards of the selected dataset.
data_dir = f"{dataset_path}/{dataset}/data"

# os.listdir() returns entries in arbitrary order. Sorting keeps the row order
# of `combined_df` stable across runs and machines; the instruction ids and the
# seeded shuffle further below both depend on that order.
parquet_files = sorted(f for f in os.listdir(data_dir) if f.endswith(".parquet"))

if not parquet_files:
    raise FileNotFoundError(f"No .parquet files found in '{data_dir}'.")

# Read every shard once. The comprehension keeps its loop variable local, so
# the sample `df` / `file_path` from the cells above are not overwritten.
df_list = [
    pd.read_parquet(os.path.join(data_dir, parquet_file))
    for parquet_file in tqdm(parquet_files, "Processing parquet files")
]

combined_df = pd.concat(df_list, ignore_index=True)

Processing parquet files: 100%|██████████| 55/55 [03:55<00:00,  4.28s/it]


In [10]:
# MagicBrush only: build the id column "<img_id>_<turn_index>" for every row.
if dataset == "MagicBrush":
    combined_df["unique_id"] = (
        combined_df["img_id"]
        .astype(str)
        .add("_")
        .add(combined_df["turn_index"].astype(str))
    )

In [6]:
# HumanEdit: preview the first rows of the combined dataframe
# (the output below was captured from a run with the HumanEdit dataset).
combined_df.head()

Unnamed: 0,IMAGE_ID,EDITING_TYPE,CORE,MASK,EDITING_INSTRUCTION,OUTPUT_DESCRIPTION,INPUT_CAPTION_BY_LLAMA,OUTPUT_CAPTION_BY_LLAMA,INPUT_IMG,MASK_IMG,OUTPUT_IMG
0,44977,Counting,0,0,The number of parachutes reduces to one.,"There is a red parachute floating in the sky, ...",The image depicts a vibrant scene of kites soa...,The image depicts a serene scene of people fly...,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...
1,48665,Counting,0,0,The number of oranges reduces to two.,There are two oranges and a banana on the table,The image depicts a banana and three oranges a...,"A fruit arrangement on a wooden table, featuri...",{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...
2,50827,Counting,0,0,The quantity of pizza reduces to two pieces.,Two white plates with two slices of pizza on a...,"Pizza, the ultimate comfort food.","Two slices of pizza on paper plates, viewed fr...",{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...
3,53892,Counting,0,0,The number of cakes reduces to four.,Four cakes are placed on the yellow paper,"The image shows a box of donuts with a blue ""V...",The image shows a close-up view of four square...,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...
4,56624,Counting,0,0,The number of trees changes from three to one.,The bench is covered with snow and there is a ...,"The image depicts a serene winter scene, with ...",A serene winter scene unfolds in a snow-covere...,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...


In [11]:
# MagicBrush: preview the first rows of the combined dataframe, including the
# generated "unique_id" column (output captured from a MagicBrush run).
combined_df.head()

Unnamed: 0,img_id,turn_index,source_img,mask_img,instruction,target_img,unique_id
0,237608,2,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,make flames coming out of the fireplace,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,237608_2
1,237608,3,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,have a dog wait at the door,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,237608_3
2,125472,1,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,Have the person be holding a guitar,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,125472_1
3,32816,1,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,get rid of the notebooks,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,32816_1
4,32816,2,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,change the books to knickknacks,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,32816_2


### Create 2-frame Videos out of Images
The ViFi-CLIP architecture requires video inputs rather than image inputs. To this end, each image pair (input and edited image) is transformed into a video with two frames.

In [12]:
def create_video(row, input_img_column, output_img_column, image_id_column):
    """Write a two-frame .mp4 (input image, then output image) for one row.

    Both images are decoded from their encoded bytes, resized to 224x224 and
    written at 1 fps to ``{dataset_path}/videos/<id>.mp4``. Whitespace is
    removed from the id when building the file name so that the file matches
    the ``VIDEO_NAME`` entries written to train.txt / test.txt, which strip
    whitespace from the id as well.

    Args:
        row: Mapping-like row; the two image cells must be dicts with a
            "bytes" entry holding the encoded image.
        input_img_column: Name of the column with the input image.
        output_img_column: Name of the column with the output image.
        image_id_column: Name of the column with the id used as file name.

    Raises:
        ValueError: If image bytes are missing or cannot be decoded.
        IOError: If the video file cannot be opened for writing.
    """
    import re  # local import: only needed for the file-name clean-up below

    edit_id = row[image_id_column]
    input_img_bytes = row[input_img_column].get("bytes", None)
    output_img_bytes = row[output_img_column].get("bytes", None)
    if input_img_bytes is None or output_img_bytes is None:
        raise ValueError(f"Missing image bytes for id '{edit_id}'.")

    frames = []
    for img_bytes in (input_img_bytes, output_img_bytes):
        img = cv2.imdecode(np.frombuffer(img_bytes, np.uint8), cv2.IMREAD_COLOR)
        # cv2.imdecode signals failure by returning None instead of raising.
        if img is None:
            raise ValueError(f"Could not decode an image for id '{edit_id}'.")
        frames.append(cv2.resize(img, (224, 224)))

    # cv2.VideoWriter does not create missing directories and fails silently,
    # so make sure the target directory exists before opening the writer.
    video_dir = f"{dataset_path}/videos"
    os.makedirs(video_dir, exist_ok=True)
    video_name = re.sub(r"\s", "", str(edit_id))
    video_path = f"{video_dir}/{video_name}.mp4"

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    fps = 1
    height, width, _ = frames[0].shape
    out = cv2.VideoWriter(video_path, fourcc, fps, (width, height))
    if not out.isOpened():
        out.release()
        raise IOError(f"Could not open video writer for '{video_path}'.")

    try:
        for frame in frames:
            out.write(frame)
    finally:
        # Always release the writer so the file is finalized.
        out.release()

In [13]:
# iterrows() is a generator without a length, so pass `total` explicitly to let
# tqdm show a percentage and a remaining-time estimate.
for _, row in tqdm(
    combined_df.iterrows(), "Creating videos", total=len(combined_df)
):
    create_video(row, input_img_column, output_img_column, image_id_column)

print("Videos created!")

Creating videos: 9335it [05:21, 29.03it/s]

Videos created!





### Create .csv File with Instruction Classes
Fine-tuning of ViFi-CLIP requires a .csv file of unique classes and their ids. As *instruction-guided image editing* datasets usually contain instructions instead of classes, each unique instruction is treated as its own class and assigned an id.

In [14]:
# Distinct editing instructions, in order of first appearance.
instruction_series = combined_df[instr_column]
editing_instructions = instruction_series.unique()

In [18]:
# HumanEdit: show the unique instructions
# (the output below was captured from a run with the HumanEdit dataset).
editing_instructions

array(['The number of parachutes reduces to one.',
       'The number of oranges reduces to two.',
       'The quantity of pizza reduces to two pieces.', ...,
       'The girl has one hand on the waist.',
       'The sika deer twist their heads.', 'A lady looks up at the sky.'],
      dtype=object)

In [15]:
# MagicBrush: show the unique instructions
# (the output below was captured from a run with the MagicBrush dataset).
editing_instructions

array(['make flames coming out of the fireplace',
       'have a dog wait at the door',
       'Have the person be holding a guitar', ...,
       "There should be a bird on the bear's back.",
       'change the cat into an owl',
       'change the glass of water into a glass of cola'], dtype=object)

In [16]:
# Build the table of instruction classes: one id per normalized instruction.
# Instructions are lower-cased and stripped of periods. De-duplicate AFTER this
# normalization (dict.fromkeys keeps first-seen order): instructions that differ
# only in case or periods would otherwise get several ids for the same "name",
# and the later merge on "name" would then duplicate dataset rows.
normalized_instructions = [
    instr.lower().replace(".", "") for instr in editing_instructions
]
unique_instruction_names = list(dict.fromkeys(normalized_instructions))

instructions_df = pd.DataFrame(
    {
        "id": range(len(unique_instruction_names)),
        "name": unique_instruction_names,
    }
)

In [20]:
# HumanEdit: show the id/name table
# (the output below was captured from a run with the HumanEdit dataset).
instructions_df

Unnamed: 0,id,name
0,0,the number of parachutes reduces to one
1,1,the number of oranges reduces to two
2,2,the quantity of pizza reduces to two pieces
3,3,the number of cakes reduces to four
4,4,the number of trees changes from three to one
...,...,...
4529,4529,put the girl's left hand down
4530,4530,tilt the girl's head to look to the side
4531,4531,the girl has one hand on the waist
4532,4532,the sika deer twist their heads


In [17]:
# MagicBrush: show the id/name table
# (the output below was captured from a run with the MagicBrush dataset).
instructions_df

Unnamed: 0,id,name
0,0,make flames coming out of the fireplace
1,1,have a dog wait at the door
2,2,have the person be holding a guitar
3,3,get rid of the notebooks
4,4,change the books to knickknacks
...,...,...
8788,8788,let the sign board be red
8789,8789,only one bear should be visible
8790,8790,there should be a bird on the bear's back
8791,8791,change the cat into an owl


In [18]:
# Persist the id/name table as labels.csv in the dataset root (no index column).
instructions_df.to_csv(path_or_buf=f"{dataset_path}/labels.csv", index=False)

### Create .txt Files for Train / Test Split
ViFi-CLIP also requires two .txt files with corresponding data for training and testing. Each line contains the video identifier (e.g., *vvpaOa08GGY.mp4*) along with the instruction id.

The data is shuffled (seed=42) before it is divided into train (80%) and test (20%) splits.

In [19]:
# Load the id/name table that was written to labels.csv above.
labels_df = pd.read_csv(filepath_or_buffer=f"{dataset_path}/labels.csv")

In [20]:
# Work on a small copy holding only the id and instruction columns. This keeps
# `combined_df` untouched, so re-running the cell gives the same result, and it
# avoids merging/shuffling the large image-byte columns.
split_df = combined_df[[image_id_column, instr_column]].copy()

# Video file name: the id without any whitespace (one image name contains a
# faulty blank character) plus the .mp4 extension.
split_df["VIDEO_NAME"] = (
    split_df[image_id_column].astype(str).str.replace(r"\s", "", regex=True) + ".mp4"
)

# lowercase instructions and remove sentence points (same normalization as
# used for the "name" column of labels.csv)
split_df["EDITING_INSTRUCTION_CLEAN"] = (
    split_df[instr_column].str.lower().str.replace(".", "", regex=False)
)

# A repeated "name" in labels.csv would make the merge emit one output row per
# match and silently inflate the dataset; keep the first id for each name.
labels_unique = labels_df.drop_duplicates(subset="name", keep="first")
split_df = split_df.merge(
    labels_unique,
    how="left",
    left_on="EDITING_INSTRUCTION_CLEAN",
    right_on="name",
    validate="many_to_one",
)

# Rows without a matching label would otherwise be written with the id "nan".
missing_label = split_df["id"].isna()
if missing_label.any():
    raise ValueError(
        f"{int(missing_label.sum())} row(s) have no matching entry in labels.csv."
    )
split_df["id"] = split_df["id"].astype(int)

# shuffle data (seed 42)
split_df = split_df.sample(frac=1, random_state=42).reset_index(drop=True)

# split in test (20%) and train (80%)
train_size = int(0.8 * len(split_df))
train_df = split_df.iloc[:train_size]
test_df = split_df.iloc[train_size:]

# One "<video name> <instruction id>" line per sample.
for split_name, part_df in (("train", train_df), ("test", test_df)):
    split_lines = part_df["VIDEO_NAME"] + " " + part_df["id"].astype(str) + "\n"
    with open(f"{dataset_path}/{split_name}.txt", "w", encoding="utf-8") as split_file:
        split_file.writelines(split_lines)

print("Files train.txt und test.txt were created successfully.")

Files train.txt und test.txt were created successfully.


### Create Dataframes for OpenCLIP (ONLY MagicBrush)

Fine-tuning Vanilla CLIP through image fusion, as conducted during the preliminary studies, requires the dataset to be in a different format. Instead of the elements mentioned above, it requires .csv files containing the following fields: id, turn, instruction, and directory paths to the input and output images.


In [None]:
# Two independent, empty frames (train / dev) sharing the OpenCLIP column layout.
train_df, dev_df = (
    pd.DataFrame(
        columns=["img_id", "turn_index", "source_img", "target_img", "instruction"]
    )
    for _ in range(2)
)

# Decode the image bytes, resize to 512x512, and save the image.
# Each image is a dict with the keys 'bytes' and 'path'.


def process_image(image, save_dir):
    """Decode, resize to 512x512 and save one image; return the saved path.

    Args:
        image: Dict with the encoded image under "bytes" and the file name to
            save under in "path".
        save_dir: Directory the image is written to.

    Returns:
        ``save_dir`` joined with ``image["path"]``.
    """
    resized = Image.open(io.BytesIO(image["bytes"])).resize((512, 512))

    target_file = os.path.join(save_dir, image["path"])
    resized.save(target_file)
    return target_file


parquet_path = f"{dataset_path}/images"

# Columns every parquet file must provide; also the layout of the saved CSVs.
required_columns = ["img_id", "turn_index", "source_img", "target_img", "instruction"]

# This section applies to MagicBrush only, so guard the processing as well as
# the final save; for any other dataset the cell does nothing.
if dataset == "MagicBrush":
    train_records = []
    dev_records = []

    # loop over all parquet files (sorted, so the row order is reproducible)
    for filename in tqdm(sorted(os.listdir(parquet_path)), "Process parquet files"):
        if not filename.endswith(".parquet"):
            continue

        temp_df = pd.read_parquet(os.path.join(parquet_path, filename))

        # create a directory for the images related to this parquet file
        parquet_image_dir = os.path.join(parquet_path, os.path.splitext(filename)[0])
        os.makedirs(parquet_image_dir, exist_ok=True)

        # Check the schema once per file instead of once per row; files that
        # lack a required column contribute no rows.
        if not set(required_columns).issubset(temp_df.columns):
            continue

        # Save both images of every row and keep their paths in place of the
        # raw bytes. Selecting only the required columns drops the masks.
        processed_rows = [
            {
                "img_id": row["img_id"],
                "turn_index": row["turn_index"],
                "source_img": process_image(row["source_img"], parquet_image_dir),
                "target_img": process_image(row["target_img"], parquet_image_dir),
                "instruction": row["instruction"],
            }
            for row in temp_df[required_columns].to_dict("records")
        ]

        # append to the corresponding split (decided by the file-name prefix)
        if filename.startswith("train"):
            train_records.extend(processed_rows)
        elif filename.startswith("dev"):
            dev_records.extend(processed_rows)

    # Build each frame once from the collected records instead of growing it
    # with pd.concat inside the loop.
    train_df = pd.DataFrame(train_records, columns=required_columns)
    dev_df = pd.DataFrame(dev_records, columns=required_columns)

    train_df.to_csv(f"{dataset_path}/train_data.csv", index=False)
    dev_df.to_csv(f"{dataset_path}/dev_data.csv", index=False)

    print("Images processed and DataFrames saved successfully!")

### Generate 8-frame videos (alternate)

Different configurations were tested, including the number of frames per video. As [research](https://arxiv.org/pdf/2104.08860) suggests that the number of frames should be at least 6, or optimally 8, additional intermediate frames are generated via alpha blending.

In [25]:
# Source directory with the 2-frame videos and target directory for the
# 8-frame versions.
input_dir = f"{dataset_path}/videos"
output_dir = f"{dataset_path}/videos_8_frames"

# exist_ok=True makes this safe to re-run and avoids the check-then-create gap.
os.makedirs(output_dir, exist_ok=True)

# alpha blending


def alpha_blend(frame1, frame2, alpha):
    """Blend two frames: ``frame1`` weighted by ``1 - alpha``, ``frame2`` by ``alpha``."""
    weight_first = 1 - alpha
    weight_second = alpha
    return cv2.addWeighted(frame1, weight_first, frame2, weight_second, 0)


def process_video(video_path, output_path, num_frames=8, fps=30):
    """Extend a video to ``num_frames`` frames by blending first and last frame.

    The first and last frame of ``video_path`` are read, and
    ``num_frames - 2`` intermediate frames are generated with evenly spaced
    blend weights ``i / (num_frames - 1)``. The result is written to
    ``output_path``.

    Args:
        video_path: Path of the source video.
        output_path: Path of the video to write.
        num_frames: Total number of frames in the output (default 8).
        fps: Frame rate of the output video (default 30).

    Raises:
        ValueError: If ``num_frames`` is smaller than 2.

    On I/O problems (source cannot be opened/read, writer cannot be opened) an
    error is printed and the function returns without writing.
    """
    if num_frames < 2:
        raise ValueError("num_frames must be at least 2 (first and last frame).")

    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            print(f"Error: Could not open video {video_path}")
            return

        ret, first_frame = cap.read()  # First frame
        if not ret:
            print(f"Error: Could not read a frame from video {video_path}")
            return

        # Read sequentially to the end instead of seeking; the last frame that
        # was read successfully is the last frame of the video.
        last_frame = first_frame
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            last_frame = frame
    finally:
        cap.release()

    first_frame = np.float32(first_frame)
    last_frame = np.float32(last_frame)

    # First frame has weight 0, last frame weight 1; dividing by
    # (num_frames - 1) spaces all frames evenly between them.
    frames = [first_frame]  # first frame
    for i in range(1, num_frames - 1):
        alpha = i / (num_frames - 1)
        frames.append(alpha_blend(first_frame, last_frame, alpha))
    frames.append(last_frame)  # last frame

    height, width, _ = first_frame.shape
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
    if not out.isOpened():
        out.release()
        print(f"Error: Could not open video writer for {output_path}")
        return

    try:
        for frame in frames:
            # Frames are float32 at this point; the writer gets 8-bit data.
            out.write(np.uint8(frame))
    finally:
        out.release()

In [None]:
# Extend every .mp4 in the input directory; other directory entries are skipped.
for video_filename in tqdm(os.listdir(input_dir), "Extend videos"):
    if not video_filename.endswith(".mp4"):
        continue
    process_video(
        os.path.join(input_dir, video_filename),
        os.path.join(output_dir, video_filename),
    )

print("Videos extended and saved!")

### Create label .csv files specifically for the train and test splits
Not necessarily required. Generates .csv files containing unique instructions and their IDs, but additionally separates them into individual files for the train and test splits.

In [27]:
# Load the id/name table of all instruction classes from labels.csv.
labels_df = pd.read_csv(filepath_or_buffer=f"{dataset_path}/labels.csv")


def extract_ids(txt_file):
    """Return the set of instruction ids (as strings) listed in a split file.

    Each line is expected to look like ``<video name> <id>``. The id is taken
    from the last whitespace-separated token. Blank lines and lines with fewer
    than two tokens are skipped instead of raising an IndexError.

    Args:
        txt_file: Path of the train/test .txt file.

    Returns:
        Set of id strings.
    """
    ids = set()
    with open(txt_file, "r", encoding="utf-8") as file:
        for line in file:
            parts = line.split()
            if len(parts) < 2:
                continue
            ids.add(parts[-1])
    return ids


# Instruction ids (as strings) that occur in each split file.
train_ids = extract_ids(f"{dataset_path}/train.txt")
test_ids = extract_ids(f"{dataset_path}/test.txt")

# Compare as text because the ids read from the .txt files are strings.
label_ids_as_text = labels_df["id"].astype(str)

# Keep only the labels used by the respective split, one row per id.
train_filtered = labels_df[label_ids_as_text.isin(train_ids)].drop_duplicates(
    subset="id"
)
test_filtered = labels_df[label_ids_as_text.isin(test_ids)].drop_duplicates(
    subset="id"
)

train_filtered.to_csv(f"{dataset_path}/train.csv", index=False)
test_filtered.to_csv(f"{dataset_path}/test.csv", index=False)

print("CSV-files successfully created.")

CSV-files successfully created.


## Generate splits for 5-fold cross-validation
To accurately assess the model's performance, cross-validation is conducted. This requires dividing the dataset into five distinct subsets.

In [3]:
# get whole dataset
with open("humanedit/test.txt", encoding='utf-8') as file:
    test_split = file.read()

with open("humanedit/train.txt", encoding='utf-8') as file:
    train_split = file.read()
    
entire_split = test_split + "\n" + train_split
entire_split = entire_split.split('\n')

In [13]:
# shuffle
random.shuffle(entire_split)

In [17]:
# get k=5 splits
k=5
folds = [entire_split[i::k] for i in range(k)]
k_fold_datasets = []
for i in range(k):
    test_set = folds[i]
    train_set = [item for j, fold in enumerate(folds) if j != i for item in fold]
    k_fold_datasets.append((train_set, test_set))

In [22]:
# save files
output_dir="humanedit/5f_cv"
os.makedirs(output_dir, exist_ok=True)

for i, (train, test) in enumerate(k_fold_datasets):
    with open(f"{output_dir}/train_fold_{i+1}.txt", "w") as f:
        f.writelines("\n".join(train) + "\n")
    with open(f"{output_dir}/test_fold_{i+1}.txt", "w") as f:
        f.writelines("\n".join(test) + "\n")