In [1]:
from pixel_datasets.glue_dataset_generator import GlueDatasetForPixel
from pixel_datasets.dataset_transformations import SyntheticDatasetTransform, SimpleTorchTransform
import numpy as np
import pandas as pd
from PIL import Image
import datasets
import wandb
import glob
from pixel_datasets.utils.dataset_utils import CustomFont
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from datasets import Dataset, DatasetDict

In [2]:
# Start a W&B run in "disabled" mode, seeding its config from the GLUE YAML
# file; later cells read and update `wandb.config`.
glue_config_path = "/home/knf792/PycharmProjects/pixel-2/configs/glue_config.yaml"
wandb.init(config=glue_config_path, mode="disabled")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.




In [3]:
def get_datasets(args, seed=42):
    """Build the train and validation ``GlueDatasetForPixel`` objects.

    A single ``np.random.RandomState`` seeded with ``seed`` is created and
    handed both to the ``SimpleTorchTransform`` and to each dataset, so all
    three share the same random stream.

    Args:
        args: Configuration object; it is passed as ``config`` to the datasets
            and to the transform, and its ``task_name`` attribute selects the
            GLUE task.
        seed: Seed for the shared random state (defaults to 42).

    Returns:
        A ``(train, validation)`` tuple: the first dataset is built with
        ``split="train"``, the second with ``split="validation"``.
    """
    shared_rng = np.random.RandomState(seed)
    common_kwargs = dict(
        config=args,
        task=args.task_name,
        transform=SimpleTorchTransform(args, shared_rng),
        rng=shared_rng,
    )

    # Build the train split before the validation split: both receive the same
    # RNG object, so the construction order is kept fixed.
    train_split = GlueDatasetForPixel(split="train", **common_kwargs)
    validation_split = GlueDatasetForPixel(split="validation", **common_kwargs)
    return train_split, validation_split


def convert_instance_to_dataset(instance):
    """Convert one dataset item into an ``{"image", "label"}`` record.

    Args:
        instance: Mapping with a ``"pixel_values"`` entry exposing ``.numpy()``
            and a ``"label"`` entry.
            NOTE(review): presumably a channels-first float tensor with values
            in [0, 1] -- confirm against GlueDatasetForPixel.

    Returns:
        A dict whose ``"image"`` is a PIL image built from the pixel values and
        whose ``"label"`` is the input label, passed through unchanged.
    """
    pixels = instance["pixel_values"].numpy()
    # Scale by 255 and cast to uint8 for PIL.
    pixels_u8 = (pixels * 255).astype(np.uint8)
    # Move the first axis to the end: (a, b, c) -> (b, c, a).
    pixels_last = pixels_u8.transpose(1, 2, 0)
    return {
        "image": Image.fromarray(pixels_last),
        "label": instance["label"],
    }

In [30]:
# Pick the GLUE task, record it in the W&B config (allowing the existing
# value to change), then build both splits from that config.
task = "mrpc"
task_override = {"task_name": task}
wandb.config.update(task_override, allow_val_change=True)
train_dataset, test_dataset = get_datasets(wandb.config)

Found cached dataset glue (/projects/copenlu/data/nadav/cache/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
Loading cached shuffled indices for dataset at /projects/copenlu/data/nadav/cache/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-d50e90fd6eb0dceb.arrow
Found cached dataset glue (/projects/copenlu/data/nadav/cache/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
Loading cached shuffled indices for dataset at /projects/copenlu/data/nadav/cache/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-88b88f4d7a1e8d49.arrow


In [31]:
# Convert every item of `test_dataset` and collect the results column-wise.
new_test_dataset_as_dict = {"image": [], "label": []}
for i in tqdm(range(len(test_dataset))):
    instance = convert_instance_to_dataset(test_dataset[i])
    for column in ("image", "label"):
        new_test_dataset_as_dict[column].append(instance[column])

# Number of distinct label values observed in this split.
distinct_test_labels = set(new_test_dataset_as_dict["label"])
num_labels = len(distinct_test_labels)
print(f"Number of labels: {num_labels}")

  0%|          | 0/408 [00:00<?, ?it/s]

Number of labels: 2


In [32]:
# Convert every item of `train_dataset` and collect the results column-wise.
new_train_dataset_as_dict = {"image": [], "label": []}
for i in tqdm(range(len(train_dataset))):
    instance = convert_instance_to_dataset(train_dataset[i])
    for column in ("image", "label"):
        new_train_dataset_as_dict[column].append(instance[column])

# Number of distinct label values observed in this split; this assignment
# (re)binds the notebook-level `num_labels`.
distinct_train_labels = set(new_train_dataset_as_dict["label"])
num_labels = len(distinct_train_labels)
print(f"Number of labels: {num_labels}")

  0%|          | 0/3668 [00:00<?, ?it/s]

Number of labels: 2


In [35]:
def _to_image_label_dataset(columns):
    """Wrap a ``{"image": [...], "label": [...]}`` dict as a typed ``Dataset``."""
    # A fresh Features object is created per call; the class count comes from
    # the notebook-level `num_labels`.
    schema = datasets.Features(
        {
            "image": datasets.Image(),
            "label": datasets.ClassLabel(num_classes=num_labels),
        }
    )
    return Dataset.from_dict(columns, features=schema)


new_test_dataset = _to_image_label_dataset(new_test_dataset_as_dict)
new_train_dataset = _to_image_label_dataset(new_train_dataset_as_dict)

# The converted validation data is stored under the "validation" key.
dataset = DatasetDict({"train": new_train_dataset, "validation": new_test_dataset})
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['image', 'label'],
        num_rows: 408
    })
})


In [36]:
import os

# Persist the DatasetDict locally, then upload it to the Hugging Face Hub.
dataset.save_to_disk(f"/projects/copenlu/data/nadav/Datasets/pixel_glue_{task}/dataset")

# SECURITY: never embed an access token in a notebook. The token is read from
# the HF_TOKEN environment variable; if it is unset, None is passed so the
# library can fall back to locally stored credentials (e.g. from
# `huggingface-cli login`). The token that used to be hardcoded here must be
# treated as leaked and revoked.
dataset.push_to_hub(f"pixel_glue_{task}", token=os.environ.get("HF_TOKEN"))

Saving the dataset (0/1 shards):   0%|          | 0/3668 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/408 [00:00<?, ? examples/s]

Pushing split train to the Hub.


Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split validation to the Hub.


Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]