In [1]:
import json
import pandas as pd
import datasets

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the benchmark spreadsheet into the working DataFrame `df`.
df = pd.read_excel("../data/Full_Benchmark.xlsx")

In [3]:
# Display the column names of the loaded DataFrame.
df.columns

Index(['QID', 'prompt', 'solution', 'question_type', 'categories',
       'failure_modes', 'link', 'author', 'valid_flag', 'valid_reason',
       'sheet', 'img_path', 'comments', 'missing_image'],
      dtype='object')

In [4]:
# Pseudonymise the 'author' column: every distinct non-null author name is
# replaced by the first 8 hex characters of its SHA-256 digest, and the
# name -> hash mapping is written to ../data/author_map.json.
import hashlib

unique_authors = df['author'].dropna().unique()
author_map = {
    author: hashlib.sha256(str(author).encode()).hexdigest()[:8]
    for author in unique_authors
}

# Truncating the digest to 8 hex characters (32 bits) makes collisions possible.
# Two authors sharing a hash would be silently merged in the dataset, so fail
# loudly instead of continuing with an ambiguous mapping.
if len(set(author_map.values())) != len(author_map):
    raise ValueError("Author hash collision: two distinct authors map to the same 8-character hash")

# Replace author names with their hashes in the dataframe (missing authors stay missing)
df['author'] = df['author'].map(author_map)

# Save the author map to a JSON file in the data/ directory.
# NOTE(review): re-running this cell on the already-hashed frame would overwrite
# the file with a hash -> hash mapping; re-load `df` before re-running.
with open("../data/author_map.json", "w", encoding="utf-8") as f:
    json.dump(author_map, f, indent=4)

In [5]:
# Keep only the rows whose 'missing_image' cell is empty and report how many were dropped
initial_len = len(df)
no_missing_image = df['missing_image'].isna()
df = df.loc[no_missing_image]
n_removed = initial_len - len(df)
print(f"Removed {n_removed} rows with missing images")

Removed 1 rows with missing images


In [6]:
# Copy every referenced input image from data/original_images/ into data/images/,
# renamed to img_q<QID><original extension>, and point 'img_path' at the copy.
# Missing originals and failed copies are printed instead of raised.
import os
import shutil

# The destination folder must exist before anything is copied into it
os.makedirs("../data/images", exist_ok=True)

for index, row in df.iterrows():
    if pd.isna(row['img_path']):
        # This question has no input image
        continue

    source_name = str(row['img_path'])
    original_path = os.path.join("../data/original_images", source_name)

    if not os.path.exists(original_path):
        print(f"Error: Original image not found for QID {row['QID']}: {original_path}")
        continue

    # Keep the source extension; the base name is derived from the question id
    ext = os.path.splitext(source_name)[1]
    new_path = os.path.join("../data/images", f"img_q{row['QID']}{ext}")

    try:
        shutil.copy2(original_path, new_path)
        # Only reached when the copy succeeded: record the new location
        df.at[index, 'img_path'] = new_path
    except Exception as e:
        print(f"Error copying image for QID {row['QID']}: {e}")

In [7]:
# Display the rows whose valid_flag is not 1 (this includes rows with no flag at all).
df[df["valid_flag"] != 1]

Unnamed: 0,QID,prompt,solution,question_type,categories,failure_modes,link,author,valid_flag,valid_reason,sheet,img_path,comments,missing_image
9,10,Rotate the table clockwise,The correct output is the given table rotated ...,text-only,,,https://chatgpt.com/share/68e3bbf2-4a84-800d-9...,5e7f2ab4,0.0,the user prompt is incomplete and there are mi...,chengkun-questions,,,
13,14,,I asked the model to count the number of disti...,text-only,,,https://chatgpt.com/share/e/68e3d089-1be8-800e...,5e7f2ab4,0.0,The link is inaccessible,chengkun-questions,,,
14,15,Look at this image On the left is a maze and o...,Models fail to solve simple maze puzzles. Othe...,multi-to-image,image-generation,OOD,https://chatgpt.com/share/68e62234-53dc-800b-b...,866efed1,0.0,Link is not working,chengkun-questions,,,
15,16,"what's the top view, choose from the following...",A,multi-to-text,logic,visual reasoning,https://chatgpt.com/share/694029ac-9ec0-800b-a...,884bb8f3,0.0,The image does not appear in the linked questi...,chengkun-questions,,,
16,17,Label the vertices in this L shaped polygon in...,image,multi-to-image,counting,counting,https://chatgpt.com/share/68e62273-b4ac-800b-9...,866efed1,0.0,Link is not working,chengkun-questions,,,
23,24,Generate the image of a dice pair showing 4 a...,"TBH, I don't fully understand the question. Be...",text-to-image,image-generation,pattern-matching,https://chatgpt.com/share/68e3b42f-ee38-8008-9...,109ed478,0.0,,chengkun-questions,,,
79,80,generate a picture of a husky and a giant pand...,The correct output is two generated images:\n\...,text-to-image,image gen,OOD,https://chatgpt.com/share/68e42d97-fabc-8001-a...,72730b85,0.0,multi-turn,zhipeng - questions,,,
91,92,What happens if you execute the following move...,"right moves to left, left moves to back, back ...",multi-to-image,image gen,OOD,https://chatgpt.com/share/68e39319-4bfc-8004-b...,c2edafb3,0.0,duplicate,zhipeng - questions,../data/images/img_q92.png,,
95,96,Let's say that I have two boolean input variab...,"Not valid. The model answers first no linear, ...",text-only,math,"understanding, hard instruction",https://chatgpt.com/share/68e42d78-10a4-800c-8...,a1579311,0.0,,zhipeng - questions,,,
96,97,Is KL divergence bilinear to the two input ? o...,Not valid. The model answers not linear.,text-only,math,"understanding, hard instruction",https://chatgpt.com/share/68e42e52-172c-800c-9...,a1579311,0.0,,zhipeng - questions,,,


In [8]:
# Report how many questions are not flagged valid, then keep only the valid ones
is_valid = df["valid_flag"] == 1
print(f'{(~is_valid).sum()} invalid questions')

df = df[is_valid]
print(f'{len(df)} valid questions')

40 invalid questions
245 valid questions


In [9]:
# Remove the listed columns from the frame; no later cell in this notebook reads them
columns_to_drop = ["missing_image", "sheet", "valid_reason", "valid_flag", "comments"]
df = df.drop(columns=columns_to_drop)

In [10]:
# Shrink, in place, every referenced image that is wider or taller than 1024 px.
# Image.thumbnail keeps the aspect ratio; failures are printed, not raised.
from PIL import Image

max_size = (1024, 1024)
max_width, max_height = max_size

for img_path in df['img_path'].dropna():
    try:
        with Image.open(img_path) as img:
            width, height = img.size
            too_large = width > max_width or height > max_height
            if too_large:
                img.thumbnail(max_size)
                # Overwrite the copied file with the downscaled version
                img.save(img_path)
    except Exception as e:
        print(f"Error processing image {img_path}: {e}")

In [11]:
def clean_link_field(link):
    """Remove a model annotation such as "(ChatGPT 5)" from a 'link' cell.

    Everything from the first occurrence of a known annotation onwards is
    discarded, and the remaining text is stripped of surrounding whitespace.

    Args:
        link: Raw cell value from the 'link' column.

    Returns:
        The cleaned link when ``link`` is a string. Any other value (e.g. the
        NaN/None of an empty cell) is returned unchanged.
    """
    # Empty spreadsheet cells are not strings; `v in link` would raise TypeError on them.
    if not isinstance(link, str):
        return link

    # Annotations are listed without surrounding whitespace or newline so they
    # are removed whether they end the cell or are followed by more text; the
    # final strip() removes whatever whitespace preceded them.
    values_to_remove = (
        "(Gemini 2.5 Flash)",
        "(ChatGPT 5)",
        "(Gemini 2.5 Pro)",
        "(Gemini Pro 2.5)",
    )
    for v in values_to_remove:
        # partition() yields the text before the first match, or the whole string if absent
        link = link.partition(v)[0]

    return link.strip()


# Normalise every 'link' cell with clean_link_field
df['link'] = df['link'].apply(clean_link_field)

# A row is kept when it has a prompt or an image path; rows with neither are dropped
initial_len = len(df)
has_prompt = df['prompt'].notna()
has_image = df['img_path'].notna()
df = df[has_prompt | has_image]
print(f"Removed {initial_len - len(df)} rows with both 'prompt' and 'image' missing")

# Image-only questions get an empty prompt string instead of NaN
df['prompt'] = df['prompt'].fillna('')

Removed 1 rows with both 'prompt' and 'image' missing


In [12]:
# Build a Hugging Face DatasetDict with a single "test" split from the DataFrame
from datasets import Dataset, DatasetDict

# Rename img_path to image for standard naming
if 'img_path' in df.columns:
    df = df.rename(columns={'img_path': 'image'})

def get_image_path(path):
    """Return `path` when it is a string naming an existing filesystem entry, else None."""
    if isinstance(path, str) and os.path.exists(path):
        return path
    return None

had_path = df['image'].notna()
df['image'] = df['image'].apply(get_image_path)

# Paths that do not exist on disk are replaced by None above; report them so
# the loss of an expected input image does not go unnoticed.
n_unresolved = int((had_path & df['image'].isna()).sum())
if n_unresolved:
    print(f"Warning: {n_unresolved} image paths were not found on disk and were set to None")

# Create dataset
ds = Dataset.from_pandas(df, preserve_index=False)

# Cast the image column to the Image feature. The feature is accessed through
# the `datasets` module (imported in the first cell) instead of
# `from datasets import Image`, which would rebind the name `Image` that the
# resize cell binds to PIL's Image module.
ds = ds.cast_column("image", datasets.Image())

df_dict = DatasetDict({"test": ds})

In [13]:
# Upload the DatasetDict to its dataset repository on the Hugging Face Hub
hub_repo_id = "matsant01/blind-spots-bench"
df_dict.push_to_hub(hub_repo_id)

Map: 100%|██████████| 244/244 [00:00<00:00, 14892.03 examples/s]hards/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 48.18ba/s]
Processing Files (1 / 1): 100%|██████████| 11.4MB / 11.4MB, 9.50MB/s  
New Data Upload: 100%|██████████|  307kB /  307kB,  256kB/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.37s/ shards]


CommitInfo(commit_url='https://huggingface.co/datasets/matsant01/blind-spots-bench/commit/63a2ffab518bd88a4bdd4900968f87177b5e9cc1', commit_message='Upload dataset', commit_description='', oid='63a2ffab518bd88a4bdd4900968f87177b5e9cc1', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/matsant01/blind-spots-bench', endpoint='https://huggingface.co', repo_type='dataset', repo_id='matsant01/blind-spots-bench'), pr_revision=None, pr_num=None)

In [14]:
# Count the rows of the final DataFrame per question_type.
df["question_type"].value_counts()

question_type
text-only                114
text-to-image             80
multi-to-text             37
multi-to-image            12
text-to-image-to-text      1
Name: count, dtype: int64