In [1]:
import ast
import os, csv, random, json, shutil
from functools import partial
from multiprocessing import Pool

from datasets import load_dataset
from huggingface_hub import HfApi
from PIL import Image, ImageDraw

def generate_images(start_idx, end_idx, image_width, image_height, radius, image_dir):
    """Render one white-circle image per index in ``[start_idx, end_idx)``.

    Every image is a black ``image_width`` x ``image_height`` RGB canvas with a
    single filled white circle of the given ``radius`` at a random position.
    It is saved into ``image_dir`` as ``image_<i>.png``.

    Args:
        start_idx: First sample index to generate (inclusive).
        end_idx: Last sample index to generate (exclusive).
        image_width: Canvas width in pixels.
        image_height: Canvas height in pixels.
        radius: Circle radius in pixels.
        image_dir: Existing directory the PNG files are written to.

    Returns:
        A list of ``(file_name, texts)`` tuples, one per generated image, in
        index order. ``texts`` is the ``repr()`` of a one-element list holding
        a dict with the keys ``"assistant"`` (the circle center formatted as
        ``"[x,y]"``), ``"source"`` and ``"user"`` (the instruction prompt).

    Raises:
        ValueError: If ``radius`` is negative or the circle cannot fit
            entirely inside the canvas.
    """
    # Nothing to render for an empty range.
    if start_idx >= end_idx:
        return []

    # Valid pixel indices are 0..size-1 and the ellipse bounding box includes
    # both corners, so the center has to stay within
    # [radius, size - 1 - radius] for the whole circle to be inside the image.
    max_x = image_width - 1 - radius
    max_y = image_height - 1 - radius
    if radius < 0 or max_x < radius or max_y < radius:
        raise ValueError(
            f"A circle of radius {radius} does not fit in a "
            f"{image_width}x{image_height} image"
        )

    # The prompt is identical for every sample, so build it once.
    instruction = "Return the coordinates of the white circle in the image. It's used to locate points of interest and if we click it we can select the circle for further analysis."

    results = []
    for i in range(start_idx, end_idx):
        # Create image
        img = Image.new("RGB", (image_width, image_height), "black")
        draw = ImageDraw.Draw(img)

        # Generate random coordinates for circle center
        x = random.randint(radius, max_x)
        y = random.randint(radius, max_y)

        # Draw the circle
        draw.ellipse((x - radius, y - radius, x + radius, y + radius), fill="white")

        # Save image
        image_name = f"image_{i}.png"
        img.save(os.path.join(image_dir, image_name))

        # Same structure as the reference dataset: a list holding one dict.
        messages = [
            {
                "assistant": f"[{x},{y}]",
                "source": "synthetic_generator",
                "user": instruction
            }
        ]

        # Store the messages as a repr string so the Python structure survives
        # the round trip through the CSV metadata file.
        results.append((image_name, repr(messages)))

        # Print progress every 1000 samples
        if i > 0 and i % 1000 == 0:
            print(f"Processed {i} samples...")

    return results

if __name__ == "__main__":
    # End-to-end driver: render the synthetic images in parallel, write the
    # metadata CSV next to them, load everything as an image-folder dataset,
    # decode the ``texts`` column back into Python objects and push to the Hub.

    # ---- Configuration ----
    username = "jwaters8978"
    repo_name = "synthetic_dataset"
    num_samples = 16000
    image_width, image_height = 1280, 720
    radius = 4

    # Setup directories
    dataset_dir = "synthetic_dataset"

    # Start from an empty directory so files from a previous run cannot end
    # up in the new dataset.
    if os.path.exists(dataset_dir):
        print(f"Removing existing directory: {dataset_dir}")
        shutil.rmtree(dataset_dir)

    # Create fresh directories
    image_dir = os.path.join(dataset_dir, "train", "class0")
    os.makedirs(image_dir, exist_ok=True)

    # Use at most one worker per CPU core (and never more workers than
    # samples); a fixed, large process count only oversubscribes the machine.
    num_processes = max(1, min(os.cpu_count() or 1, num_samples))
    chunk_size = num_samples // num_processes
    # Contiguous index ranges; the last range also takes the remainder.
    ranges = [(i*chunk_size, (i+1)*chunk_size if i < num_processes-1 else num_samples)
              for i in range(num_processes)]

    # Prepare arguments for multiprocessing
    args = [(start, end, image_width, image_height, radius, image_dir) for start, end in ranges]

    # Generate images and metadata using multiple processes
    with Pool(processes=num_processes) as pool:
        results_list = pool.starmap(generate_images, args)

    # Flatten results (starmap keeps the order of ``args``)
    all_results = [item for sublist in results_list for item in sublist]

    # Write metadata to the same directory as the images
    metadata_path = os.path.join(image_dir, "metadata.csv")
    with open(metadata_path, mode="w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["file_name", "texts"])
        writer.writerows(all_results)

    # Load the dataset and rename the image column to 'images'
    dataset = load_dataset("imagefolder", data_dir=dataset_dir)
    dataset = dataset.rename_column('image', 'images')

    def convert_texts(example):
        """Turn the repr string stored in ``texts`` back into a Python object."""
        # literal_eval only accepts Python literals, so unlike eval() it
        # cannot execute code that might be embedded in the CSV contents.
        example['texts'] = ast.literal_eval(example['texts'])
        return example

    # Apply the conversion
    dataset = dataset.map(convert_texts)

    # Fetch the first example once; both verification lines use it.
    first_texts = dataset['train'][0]['texts']
    print("\nDataset verification:")
    print("First example texts type:", type(first_texts))
    print("First example texts content:", first_texts)

    # Push to HuggingFace
    repo_id = f"{username}/{repo_name}"
    api = HfApi()
    api.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True)
    dataset.push_to_hub(repo_id)
    print(f"Pushed dataset to {username}/{repo_name}")

Removing existing directory: synthetic_dataset
Processed 4000 samples...
Processed 8000 samples...
Processed 12000 samples...
Processed 5000 samples...
Processed 1000 samples...
Processed 9000 samples...
Processed 13000 samples...
Processed 14000 samples...
Processed 6000 samples...
Processed 10000 samples...
Processed 2000 samples...
Processed 7000 samples...
Processed 11000 samples...
Processed 15000 samples...
Processed 3000 samples...


Resolving data files:   0%|          | 0/16001 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/16002 [00:00<?, ?files/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]


Dataset verification:
First example texts type: <class 'list'>
First example texts content: [{'assistant': '[349,214]', 'source': 'synthetic_generator', 'user': "Return the coordinates of the white circle in the image. It's used to locate points of interest and if we click it we can select the circle for further analysis."}]


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/160 [00:00<?, ?ba/s]

Pushed dataset to jwaters8978/synthetic_dataset


In [12]:
# Print the structure, column names and first 'texts' entry of the original
# dataset and of the synthetic dataset so the two can be compared by eye.
from datasets import load_dataset

# ---- Original dataset ----
print("=== Original Dataset Analysis ===")
original_dataset = load_dataset("jwaters8978/web_scraper_dataset_2")
print("\nOriginal dataset structure:")
print(original_dataset)

original_train = original_dataset['train']
print("\nColumn names:", original_train.column_names)
print("\nFirst example:")
example = original_train[0]
original_texts = example['texts']
print("Text type:", type(original_texts))
print("Text content:", original_texts)

# ---- Synthetic dataset ----
print("\n=== Synthetic Dataset Analysis ===")
synthetic_dataset = load_dataset("jwaters8978/synthetic_dataset")
print("\nSynthetic dataset structure:")
print(synthetic_dataset)

synthetic_train = synthetic_dataset['train']
print("\nColumn names:", synthetic_train.column_names)
print("\nFirst example:")
example = synthetic_train[0]
# Report a missing column instead of failing on the lookup.
if 'texts' not in example:
    print("No 'texts' column found!")
else:
    synthetic_texts = example['texts']
    print("Text type:", type(synthetic_texts))
    print("Text content:", synthetic_texts)

=== Original Dataset Analysis ===

Original dataset structure:
DatasetDict({
    train: Dataset({
        features: ['images', 'texts'],
        num_rows: 15937
    })
})

Column names: ['images', 'texts']

First example:
Text type: <class 'list'>
Text content: [{'assistant': '[79.92, 7.64, 85.31, 10.42]', 'source': 'web_scraper', 'user': "Return the bounding box of the Text link with the words 'About Us'. It's used to navigate to the About Us section of the website and if we click it the About Us section will load."}]

=== Synthetic Dataset Analysis ===


README.md:   0%|          | 0.00/312 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/831k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2000 [00:00<?, ? examples/s]


Synthetic dataset structure:
DatasetDict({
    train: Dataset({
        features: ['images', 'texts'],
        num_rows: 2000
    })
})

Column names: ['images', 'texts']

First example:
Text type: <class 'str'>
Text content: [{"assistant": "[1116,349]", "source": "synthetic_generator", "user": "Return the coordinates of the white circle in the image. It's used to locate points of interest and if we click it we can select the circle for further analysis."}]
