In [46]:
import wandb
from pixel_datasets.squad_dataset_for_pixel import SquadDatasetForPixel
from pixel_datasets.real_dataset import HistoricDatasetForPretraining
from pixel_datasets.dataset_transformations import (SimpleTorchTransform, SyntheticDatasetTransform)
from pixel_datasets.utils.squad_utils import convert_pixel_mask_to_patch_mask
from PIL import Image
import numpy as np
import datasets
from datasets import load_dataset, load_from_disk
import random
from glob import glob
import os

In [44]:
# Natural-language question asked for each annotated attribute of an
# advertisement. Only attributes listed here are turned into examples.
QUESTIONS = {
    "Disease": "What kind of diseases does the person have?",
    "Runaway date": "What was the date of the event?",
    "Total reward": "How much reward is offered?",
    "Given name": "What is the given name of the person?",
    "Origin": "Where does the person originate from?",
    "Owner": "Who is the owner of the person?",
    "Given surname": "What is the last name of the person?",
    "Injuries": "How was the person injured?",
    "Literacy": "What is the literacy level of the person?",
    "Name of contact": "Who is the contact person for the ad?",
    "Ran from region": "What is the name of the region from which the person escaped?",
    "Skills": "What is the set of skills of the person?",
    "Owner occupation": "What does the owner of the person do for a living?",
    "Motivation": "Why did the person escape his owner?",
    "Religion": "What is the religion of the person?",
    "Ran from specified": "What is the name of the place from which the person escaped?",
    "Contact occupation": "What does the contact of the ad do for a living?",
    "Physical characteristics": "What are the physical characteristics of the person?",
    "Stutters": "Does the person stutter?",
    "Plantation marks": "What plantation marks does the person have?",
    "Accused of crime": "What crimes did the person commit?",
    "Specified occupation": "What does the person do for a living?",
    "Racial descriptor": "What is the ethnicity of the person?",
    "Other reward": "What other rewards were offered?",
    "Owner address": "Where does the owner of the person live?",
    "Clothing": "What clothes did the person wear?",
    "Country marks": "What country marks does the person have?",
    "Destination (specified)": "What is the name of the destination?",
    "Also known as": "What other aliases does the person have?",
    "Destination (region)": "What is the destination region of the person?",
    "Language": "What are the communication skills of the person?",
    "Contact address": "Where does the contact person of the ad live?",
    "Physical scars": "What scars does the person have?",
    "Companions": "What are the names of the friends of the person?",
    "Personality": "What are the personality traits of the person?",
}

# Attribute names that have no entry in QUESTIONS, grouped by kind of value.
NUMERAL_ATTRIBUTES = {
    "Value pounds", "Value shillings", "Value pence",
    "Reward pounds", "Reward shillings", "Reward pence",
    "Age", "Height", "Total value",
}
BINARY_ATTRIBUTES = {
    "Warning notice",
    "Ran from ship",
    "Gender",
}

In [33]:
# Path template for a scanned advertisement; "{}" is filled with the scan ID.
IMAGE_FOLDER = "/projects/copenlu/data/nadav/pixel/runaways_scans/{}.png"
# Root under which the generation loops write <split>/images and <split>/labels.
# NOTE(review): a later cell reassigns this name to a different dataset root, so
# re-running the generation loops after that cell writes to the other location.
dataset_root_path = "/projects/copenlu/data/nadav/Datasets/runaways_visual/"

# One seeded generator is shared by the transform and the dataset below, so the
# order of the two constructor calls affects the random stream each one sees.
rng = np.random.RandomState(42)
# wandb.config is used throughout the notebook as the args/config object.
# NOTE(review): mode="disabled" presumably prevents a run from being logged — confirm.
wandb.init(config="/home/knf792/PycharmProjects/pixel-2/configs/squad_config.yaml", mode="disabled")
transform = SyntheticDatasetTransform(wandb.config, rng=rng)
squad_dataset = SquadDatasetForPixel(config=wandb.config, transform=transform, rng=rng)

Found cached dataset squad_v2 (/home/knf792/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d)


In [23]:
def _pad_to_canvas(image, canvas_size):
    """Paste ``image`` at the top-left corner of a white RGB canvas of ``canvas_size``."""
    canvas = Image.new("RGB", canvas_size, (255, 255, 255))
    canvas.paste(image, (0, 0))
    return canvas


def resize_image(args, image):
    """
    Resize the image to the specified size.

    The image is scaled to ``args.image_width`` (aspect ratio preserved),
    cropped from the top to at most ``args.image_height`` rows and, if it is
    still shorter than ``args.image_height``, pasted onto a white canvas of
    ``args.image_width`` x ``args.image_height``.

    Args:
        args: config object exposing ``image_width``, ``image_height`` and
            ``embed_real_image``; ``patch_base_size`` is additionally read when
            ``embed_real_image`` is falsy.
        image: a PIL image, or a numpy array accepted by ``Image.fromarray``.

    Returns:
        ``(image, num_patches)`` where ``image`` is a PIL image that is
        ``args.image_width`` wide and ``args.image_height`` tall.
    """
    # Hard-coded default (23 * 23).
    # NOTE(review): presumably the patch count of a full canvas — confirm and
    # derive it from the config instead.
    num_patches = 529
    if isinstance(image, np.ndarray):
        image = Image.fromarray(image)

    width, height = image.size
    if width != args.image_width:
        # Scale to the target width without changing the aspect ratio; keep at
        # least one row so an extremely wide image cannot produce a 0-px height.
        scale = width / args.image_width
        scaled_height = max(1, int(height / scale))
        image = image.resize((args.image_width, scaled_height), Image.LANCZOS)

    if image.size[1] > args.image_height:  # crop the image if it's too long
        image = image.crop((0, 0, image.size[0], args.image_height))

    if not args.embed_real_image and image.size[1] != args.image_width:
        # NOTE(review): this formula is kept exactly as it was. It mixes
        # image_width with the image height and looks like it assumes a square
        # canvas; whether it counts padded or real patches is unclear — confirm.
        num_patches = (
            (args.image_width - image.size[1])
            // args.patch_base_size[0]
        ) * (args.image_height // args.patch_base_size[1])

    # The padding decision used to compare the height against image_width,
    # which is only right for square canvases; the canvas height is what
    # matters. Padding is white in both modes.
    if image.size[1] < args.image_height:
        image = _pad_to_canvas(image, (args.image_width, args.image_height))
    return image, num_patches


def build_squad_example_from_instance(instance, squad_generator, args):
    """
    Render one (question, scanned advertisement) example and its answer mask.

    The question image produced by ``squad_generator`` is stacked on top of the
    resized scan and the result is truncated to ``args.image_height`` rows.

    Args:
        instance: dataset row with ``"ID"``, ``"attribute"`` and
            ``"answers"`` (a mapping with a ``"text"`` list). Characters 1-4 of
            ``"ID"`` select the scan file.
        squad_generator: object providing ``_generate_question`` and
            ``generate_pixel_mask``.
        args: config object forwarded to ``resize_image``; ``image_height`` is
            also read here.

    Returns:
        ``(ad_image, pixel_mask)``: the combined image array and the pixel mask
        returned by ``squad_generator.generate_pixel_mask``.

    Raises:
        FileNotFoundError: if the scan for this instance does not exist.
        KeyError: if ``instance["attribute"]`` has no entry in ``QUESTIONS``.
    """
    image_id = instance["ID"][1:5]
    # Pillow opens files lazily; convert() forces the read, so the file handle
    # can be released right away. Normalising to RGB guarantees three channels
    # for the concatenation with the question image below.
    with Image.open(IMAGE_FOLDER.format(image_id)) as scan_file:
        scan = scan_file.convert("RGB")
    resized_image, _ = resize_image(args, scan)
    resized_image = np.array(resized_image)

    question = QUESTIONS[instance["attribute"]]
    question_image = squad_generator._generate_question(question)
    # Replicate the question image across three channels to match the scan.
    question_image = np.stack([question_image] * 3, axis=2)
    ad_image = np.concatenate([question_image, resized_image], axis=0)
    ad_image = ad_image[: args.image_height]

    # An empty answer list yields an empty answer string.
    answer = instance["answers"]["text"][0] if instance["answers"]["text"] else ""
    pixel_mask = squad_generator.generate_pixel_mask(ad_image, answer)
    return ad_image, pixel_mask

In [37]:
text_dataset = load_from_disk("/projects/copenlu/data/nadav/pixel/runaway_dataset")
# Unique scan IDs (characters 1-4 of each instance ID). Sorted because set
# iteration order for strings can differ between interpreter runs.
all_ids = sorted({instance_id[1:5] for instance_id in text_dataset["ID"]})

# Seeded, dedicated generator: the split must be identical on every run. The
# generation loops skip files that already exist, so a split that changes
# between runs would put the same scan in both train/ and test/.
# NOTE: changing the seed invalidates any train/ and test/ folders already on
# disk; regenerate them.
random.Random(42).shuffle(all_ids)
test_size = int(len(all_ids) * 0.2)
test_ids = all_ids[:test_size]
train_ids = all_ids[test_size:]

In [None]:
# Render the test split: one PNG and one patch-level label per instance whose
# scan ID is in test_ids and whose attribute has a question.
test_id_lookup = set(test_ids)  # set membership instead of a list scan per instance
test_images_dir = f"{dataset_root_path}/test/images"
test_labels_dir = f"{dataset_root_path}/test/labels"
os.makedirs(test_images_dir, exist_ok=True)
os.makedirs(test_labels_dir, exist_ok=True)

test_missing_scans = []
for instance in text_dataset:
    instance_id = instance["ID"]
    # In-memory filters first; the filesystem is only queried for real candidates.
    if instance["attribute"] not in QUESTIONS or instance_id[1:5] not in test_id_lookup:
        continue
    image_path = f"{test_images_dir}/{instance_id}.png"
    if os.path.exists(image_path):
        continue  # already rendered by an earlier run
    print(instance_id)
    try:
        image, mask = build_squad_example_from_instance(instance, squad_dataset.image_generator, wandb.config)
    except FileNotFoundError:
        # Best effort: some scans are not available. Record the ID rather than
        # dropping it silently.
        test_missing_scans.append(instance_id)
        continue
    mask = convert_pixel_mask_to_patch_mask(
        mask,
        wandb.config.patch_base_size[0],
        wandb.config.mask_patching_tolerance,
    )
    # Label first, image last: the image file is what marks an instance as done.
    np.save(f"{test_labels_dir}/{instance_id}.npy", mask.astype("uint8"))
    Image.fromarray(image).save(image_path)

print(f"{len(test_missing_scans)} test instances skipped because their scan is missing")
    


In [49]:
# Render the train split: one PNG and one patch-level label per instance whose
# scan ID is in train_ids and whose attribute has a question.
train_id_lookup = set(train_ids)  # set membership instead of a list scan per instance
train_images_dir = f"{dataset_root_path}/train/images"
train_labels_dir = f"{dataset_root_path}/train/labels"
os.makedirs(train_images_dir, exist_ok=True)
os.makedirs(train_labels_dir, exist_ok=True)

train_missing_scans = []
for instance in text_dataset:
    instance_id = instance["ID"]
    # In-memory filters first; the filesystem is only queried for real candidates.
    if instance["attribute"] not in QUESTIONS or instance_id[1:5] not in train_id_lookup:
        continue
    image_path = f"{train_images_dir}/{instance_id}.png"
    if os.path.exists(image_path):
        continue  # already rendered by an earlier run
    print(instance_id)
    try:
        image, mask = build_squad_example_from_instance(instance, squad_dataset.image_generator, wandb.config)
    except FileNotFoundError:
        # Best effort: some scans are not available. Record the ID rather than
        # dropping it silently.
        train_missing_scans.append(instance_id)
        continue
    mask = convert_pixel_mask_to_patch_mask(
        mask,
        wandb.config.patch_base_size[0],
        wandb.config.mask_patching_tolerance,
    )
    # Label first, image last: the image file is what marks an instance as done.
    np.save(f"{train_labels_dir}/{instance_id}.npy", mask.astype("uint8"))
    Image.fromarray(image).save(image_path)

print(f"{len(train_missing_scans)} train instances skipped because their scan is missing")

r0002Injuries
r0002Stutters
r0002Owner occupation
r0002Contact address
r0002Racial descriptor
r0002Country marks
r0002Destination (region)
r0002Ran from specified
r0002Origin
r0002Plantation marks
r0002Name of contact
r0002Owner
r0002Skills
r0002Disease
r0002Physical scars
r0002Clothing
r0002Accused of crime
r0002Personality
r0002Given surname
r0002Also known as
r0002Total reward
r0002Ran from region
r0002Owner address
r0002Destination (specified)
r0002Language
r0002Other reward
r0002Motivation
r0002Literacy
r0002Runaway date
r0002Specified occupation
r0002Given name
r0002Religion
r0002Physical characteristics
r0002Companions
r0002Contact occupation
r0003Injuries
r0003Stutters
r0003Owner occupation
r0003Contact address
r0003Racial descriptor
r0003Country marks
r0003Destination (region)
r0003Ran from specified
r0003Origin
r0003Plantation marks
r0003Name of contact
r0003Owner
r0003Skills
r0003Disease
r0003Physical scars
r0003Clothing
r0003Accused of crime
r0003Personality
r0003Given surn

In [69]:
# NOTE: this rebinds dataset_root_path to a different dataset than the one the
# generation loops were configured with in the config cell.
dataset_root_path = "/projects/copenlu/data/nadav/Datasets/pixel_squad_cannon/"

# Sorted so that the row order is reproducible; glob makes no ordering promise.
test_images = sorted(glob(f"{dataset_root_path}/test/images/*.png"))

# Build each label path from the image's file name only. A blanket
# str.replace("images", "labels") would also rewrite any other "images"
# occurring in the root path or in the file name.
test_labels_dir = f"{dataset_root_path}/test/labels"
test_labels = [
    np.load(os.path.join(test_labels_dir, os.path.splitext(os.path.basename(image_path))[0] + ".npy"))
    for image_path in test_images
]
test_dataset = datasets.Dataset.from_dict(
    {"image": test_images, "label": test_labels},
    features=datasets.Features(
        {
            "image": datasets.Image(),
            "label": datasets.Array2D(shape=(23, 23), dtype="uint8"),
        }
    ),
)

In [70]:
# Sorted so that the row order is reproducible; glob makes no ordering promise.
train_images = sorted(glob(f"{dataset_root_path}/train/images/*.png"))

# Build each label path from the image's file name only. A blanket
# str.replace("images", "labels") would also rewrite any other "images"
# occurring in the root path or in the file name.
train_labels_dir = f"{dataset_root_path}/train/labels"
train_labels = [
    np.load(os.path.join(train_labels_dir, os.path.splitext(os.path.basename(image_path))[0] + ".npy"))
    for image_path in train_images
]
train_dataset = datasets.Dataset.from_dict(
    {"image": train_images, "label": train_labels},
    features=datasets.Features(
        {
            "image": datasets.Image(),
            "label": datasets.Array2D(shape=(23, 23), dtype="uint8"),
        }
    ),
)

In [71]:
dataset = datasets.DatasetDict({"train": train_dataset, "test": test_dataset})
print(dataset)
dataset.save_to_disk("/projects/copenlu/data/nadav/Datasets/pixel_squad_cannon/dataset")
# SECURITY: never commit an access token to a notebook. The token that used to
# be hard-coded here must be treated as leaked and revoked in the Hugging Face
# account settings. The token is now read from the HF_TOKEN environment
# variable; when it is unset, None is passed and the credentials stored by
# `huggingface-cli login` are used instead.
dataset.push_to_hub("pixel_squad_cannon", token=os.environ.get("HF_TOKEN"))

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 222844
    })
    test: Dataset({
        features: ['image', 'label'],
        num_rows: 11873
    })
})


Saving the dataset (0/16 shards):   0%|          | 0/222844 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/11873 [00:00<?, ? examples/s]

Pushing split train to the Hub.


Map:   0%|          | 0/13928 [00:00<?, ? examples/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/16 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/13928 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/13928 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/13928 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/13928 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/13928 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/13928 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/13928 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/13928 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/13928 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/13928 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/13928 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/13927 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/13927 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/13927 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/13927 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split test to the Hub.


Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

In [73]:
# Mean over the training set of each label mask's maximum value.
# NOTE(review): if the masks are 0/1, this is the fraction of training examples
# with at least one positive patch — confirm.
# Iterating a label-only view avoids materialising the image column for every
# row, which indexing dataset["train"][i] does.
label = [np.max(row["label"]) for row in dataset["train"].remove_columns("image")]

np.mean(label)

0.5962736263933514