In [None]:
import json
import pandas as pd
import datasets

In [None]:
# Read the benchmark spreadsheet into the working DataFrame
benchmark_path = "../data/Full_Benchmark.xlsx"
df = pd.read_excel(benchmark_path)

In [None]:
# Display the column labels of the loaded DataFrame
df.columns

In [None]:
# Pseudonymise the 'author' column: every distinct non-null author is replaced by the
# first 8 hex digits of the SHA-256 of its string form, and the name -> hash mapping is
# saved to ../data/author_map.json.
#
# NOTE: this cell is not idempotent. Re-running it after the replacement below would hash
# the already-hashed values and overwrite the JSON file with a hash -> hash map, so reload
# `df` from the spreadsheet before running it again.
import hashlib
import json

unique_authors = df['author'].dropna().unique()
author_map = {
    author: hashlib.sha256(str(author).encode()).hexdigest()[:8]
    for author in unique_authors
}

# An 8-hex-digit prefix carries only 32 bits, so two distinct authors could receive the
# same hash. Fail loudly instead of silently merging them under one identifier.
if len(set(author_map.values())) != len(author_map):
    raise ValueError("Hash collision between distinct authors; use a longer hash prefix")

# Replace author names with their hashes in the dataframe (missing authors stay missing)
df['author'] = df['author'].map(author_map)

# Save the author map to a JSON file in the data/ directory
with open("../data/author_map.json", "w", encoding="utf-8") as f:
    json.dump(author_map, f, indent=4)

In [None]:
# Keep only the rows whose 'missing_image' cell is empty and report how many rows were dropped
has_no_missing_flag = df['missing_image'].isna()
initial_len = len(df)
df = df.loc[has_no_missing_flag]
print(f"Removed {initial_len - len(df)} rows with missing images")

In [None]:
# Copy every referenced image from data/original_images/ to data/images/ under the name
# img_q<QID><original extension>, and point 'img_path' at the copy. Rows without an
# 'img_path' are skipped; missing source files and copy failures are printed, and the
# row's 'img_path' is then left untouched.
import os
import shutil

# Make sure the output folder is there before copying anything into it
os.makedirs("../data/images", exist_ok=True)

for idx, record in df.iterrows():
    source_name = record['img_path']
    if pd.isna(source_name):
        continue

    qid = record['QID']
    source_path = os.path.join("../data/original_images", str(source_name))

    if not os.path.exists(source_path):
        print(f"Error: Original image not found for QID {qid}: {source_path}")
        continue

    # Keep the original file extension on the renamed copy
    extension = os.path.splitext(str(source_name))[1]
    target_path = os.path.join("../data/images", f"img_q{qid}{extension}")

    try:
        shutil.copy2(source_path, target_path)
        # Only reached when the copy succeeded
        df.at[idx, 'img_path'] = target_path
    except Exception as err:
        print(f"Error copying image for QID {qid}: {err}")

In [None]:
# Display the rows whose 'valid_flag' is anything other than 1
not_valid = df["valid_flag"] != 1
df.loc[not_valid]

In [None]:
# Report how many rows are not flagged valid, then keep only those with valid_flag == 1
invalid_rows = df[df["valid_flag"] != 1.0]
print(f'{len(invalid_rows)} invalid questions')

is_valid = df["valid_flag"] == 1
df = df.loc[is_valid]
print(f'{len(df)} valid questions')

In [None]:
# Drop the bookkeeping columns that are no longer needed after filtering.
# Reassigning (instead of inplace=True) avoids mutating a frame that was produced by
# boolean filtering, which can raise SettingWithCopyWarning.
df = df.drop(columns=["missing_image", "sheet", "valid_reason", "valid_flag", "comments"])

In [None]:
# Shrink every image that is wider or taller than 1024 px so it fits inside 1024x1024.
# Image.thumbnail keeps the aspect ratio; the file is overwritten in place.
from PIL import Image

max_size = (1024, 1024)
max_width, max_height = max_size

for image_path in df['img_path'].dropna():
    try:
        with Image.open(image_path) as picture:
            width, height = picture.size
            # Check if resize is needed; images already within bounds are left as they are
            if width <= max_width and height <= max_height:
                continue
            picture.thumbnail(max_size)
            picture.save(image_path)
    except Exception as err:
        print(f"Error processing image {image_path}: {err}")

In [None]:
def clean_link_field(link):
    """Strip a trailing model annotation from a link cell.

    If the text contains one of the known model markers (e.g. "(ChatGPT 5)"),
    everything from the first occurrence of that marker onwards is discarded.
    The result is returned with surrounding whitespace removed.

    Args:
        link: The raw cell value. Usually a string, but empty spreadsheet cells
            arrive as NaN (a float) or None.

    Returns:
        The cleaned string, or ``link`` unchanged when it is not a string, so
        that missing values stay missing instead of raising ``TypeError``.
    """
    if not isinstance(link, str):
        return link

    values_to_remove = [
        "(Gemini 2.5 Flash)",
        "(ChatGPT 5)",
        "(Gemini 2.5 Pro)\n",
        " (Gemini Pro 2.5)\n",
        " (Gemini 2.5 Flash)\n",
    ]
    for v in values_to_remove:
        if v in link:
            # Keep only the text that precedes the first occurrence of the marker
            link = link.split(v)[0]

    return link.strip()


# Clean the 'link' column. assign() returns a new frame, so no value is written into a
# frame that was produced by filtering (which would raise SettingWithCopyWarning).
df = df.assign(link=df['link'].apply(clean_link_field))

# Drop rows where both 'prompt' and 'img_path' (the image) are missing
initial_len = len(df)
both_missing = df['prompt'].isna() & df['img_path'].isna()
# Explicit copy so the fillna assignment below targets an independent frame
df = df.loc[~both_missing].copy()
print(f"Removed {initial_len - len(df)} rows with both 'prompt' and 'image' missing")

# Image-only questions keep an empty prompt rather than a missing value
df['prompt'] = df['prompt'].fillna('')

In [None]:
# Create a Dataset from the DataFrame and push to huggingface hub
from datasets import Dataset, Image, DatasetDict

# Use the conventional column name 'image' in place of 'img_path' (only if it is present)
if 'img_path' in df:
    df = df.rename({'img_path': 'image'}, axis='columns')

def get_image_path(path):
    """Return ``path`` if it is a string naming an existing filesystem entry, else ``None``."""
    usable = isinstance(path, str) and os.path.exists(path)
    return path if usable else None

# Blank out every 'image' entry that does not point at an existing file
resolved_images = df['image'].apply(get_image_path)
df['image'] = resolved_images

# Build the dataset without the pandas index, then cast the 'image' column
# to the Image feature
ds = Dataset.from_pandas(df, preserve_index=False).cast_column("image", Image())

# Publish everything under a single "test" split
df_dict = DatasetDict({"test": ds})

In [None]:
# Load the "test" split of the previously published dataset, for comparison with the new one
published_repo = "matsant01/blind-spots-bench"
old_ds = datasets.load_dataset(published_repo, split="test")

In [None]:
# Compare the freshly built "test" split with the previously published one, row by row,
# and print every field that differs.
new_ds = df_dict["test"]
MISSING = "<missing>"  # placeholder shown when a column exists on one side only

# Rows are compared by position, so a differing row count is reported up front and only
# the common prefix is compared (instead of raising IndexError or ignoring extra rows).
if len(old_ds) != len(new_ds):
    print(f"Length mismatch: OLD has {len(old_ds)} rows, NEW has {len(new_ds)} rows")

for i in range(min(len(old_ds), len(new_ds))):
    # Fetch each row once; every indexing call materialises the whole row again
    old_row = old_ds[i]
    new_row = new_ds[i]

    if old_row != new_row:
        print(f"ID mismatch at index {i}:")

        # Old columns first, followed by columns that only exist in the new dataset
        all_keys = list(old_row) + [k for k in new_row if k not in old_row]
        for key in all_keys:
            old_value = old_row.get(key, MISSING)
            new_value = new_row.get(key, MISSING)
            if old_value != new_value:
                print(f"Mismatch in field '{key}':")
                print("OLD:", old_value)
                print("NEW:", new_value)

        print("\n\n" + "-" * 40 + "\n\n")

In [None]:
# Display how many rows of df fall under each 'question_type' value
df["question_type"].value_counts()

In [None]:
# Upload the DatasetDict (single "test" split) to the Hugging Face Hub repository
target_repo = "matsant01/blind-spots-bench"
df_dict.push_to_hub(target_repo)