In [2]:
import os

import numpy as np
import torch
from datasets import load_dataset
from transformers import PerceiverFeatureExtractor, PerceiverTokenizer, PerceiverForMultimodalAutoencoding

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Create dummy multimodal inputs: a random video tensor, a random audio tensor
# and an all-zero label tensor with 700 columns.
# The random tensors are drawn from a locally seeded generator so that a
# Restart & Run All reproduces exactly the same inputs without touching
# torch's global RNG state.
SEED = 0
rng = torch.Generator().manual_seed(SEED)
images = torch.randn((1, 16, 3, 224, 224), generator=rng)
audio = torch.randn((1, 30720, 1), generator=rng)
inputs = dict(image=images, audio=audio, label=torch.zeros((images.shape[0], 700)))

In [4]:
# Load the "20220301.en" configuration of the "wikipedia" dataset.
# The cache directory can be overridden through the DATASETS_CACHE_DIR
# environment variable so the notebook is not tied to one machine's drive
# layout; the original hard-coded location is kept as the fallback.
DATASETS_CACHE_DIR = os.environ.get("DATASETS_CACHE_DIR", "E:/Datasets/")
wikipedia = load_dataset("wikipedia", "20220301.en", cache_dir=DATASETS_CACHE_DIR)

Downloading builder script: 35.9kB [00:00, 8.97MB/s]                   
Downloading metadata: 30.4kB [00:00, 3.04MB/s]                   
Reusing dataset wikipedia (E:/Datasets/wikipedia\20220301.en\2.0.0\aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559)
100%|██████████| 1/1 [03:09<00:00, 189.21s/it]


In [10]:
# Number of rows in the "train" split.
wikipedia["train"].num_rows

6458670

In [4]:
# Load the pretrained multimodal autoencoding Perceiver by its checkpoint name.
checkpoint_name = "deepmind/multimodal-perceiver"
model = PerceiverForMultimodalAutoencoding.from_pretrained(checkpoint_name)

In [5]:
# The Perceiver IO paper auto-encodes videos chunk by chunk; every chunk
# subsamples a different range of index dimensions for the image and audio
# decoder queries.
nchunks = 128
image_chunk_size = np.prod((16, 224, 224)) // nchunks
audio_chunk_size = audio.shape[1] // model.config.samples_per_patch // nchunks

# Only the first chunk is processed here.
chunk_idx = 0
image_points = torch.arange(image_chunk_size * chunk_idx, image_chunk_size * (chunk_idx + 1))
audio_points = torch.arange(audio_chunk_size * chunk_idx, audio_chunk_size * (chunk_idx + 1))

# The label modality is not subsampled.
subsampling = dict(image=image_points, audio=audio_points, label=None)

In [26]:
# Display the loaded model's configuration object.
model.config

PerceiverConfig {
  "_name_or_path": "deepmind/multimodal-perceiver",
  "architectures": [
    "PerceiverForMultimodalAutoencoding"
  ],
  "attention_probs_dropout_prob": 0.1,
  "audio_samples_per_frame": 1920,
  "cross_attention_shape_for_attention": "kv",
  "cross_attention_widening_factor": 1,
  "d_latents": 512,
  "d_model": 704,
  "hidden_act": "gelu",
  "id2label": {
    "0": "abseiling",
    "1": "acting in play",
    "2": "adjusting glasses",
    "3": "air drumming",
    "4": "alligator wrestling",
    "5": "answering questions",
    "6": "applauding",
    "7": "applying cream",
    "8": "archaeological excavation",
    "9": "archery",
    "10": "arguing",
    "11": "arm wrestling",
    "12": "arranging flowers",
    "13": "arresting",
    "14": "assembling bicycle",
    "15": "assembling computer",
    "16": "attending conference",
    "17": "auctioning",
    "18": "baby waking up",
    "19": "backflip (human)",
    "20": "baking cookies",
    "21": "bandaging",
    "22": "bar

In [14]:
# Run the model's multimodal input preprocessor and keep only the first of its
# three return values; the other two are discarded.
#
# The raw modality dict is rebuilt from `images` / `audio` instead of reading
# `inputs`, so re-running this cell does not feed the already-preprocessed
# tensor back into the preprocessor.
#
# `subsampling` is intentionally not passed: it holds decoder output indices
# (used with `subsampled_output_points`), whereas the preprocessor's second
# positional parameter is `pos`.
#
# NOTE: `inputs` is rebound here from the raw dict to the preprocessed tensor;
# the cells that follow read it as a tensor.
raw_inputs = dict(image=images, audio=audio, label=torch.zeros((images.shape[0], 700)))
inputs, _, _ = model.perceiver.input_preprocessor(raw_inputs)

In [16]:
# Print the shape of the preprocessed input tensor.
print(inputs.shape)

torch.Size([1, 52097, 704])


In [18]:
# Unpack the first two dimensions of the 3-D preprocessed tensor; the last
# dimension is not needed.
batch_size, seq_length, _ = inputs.shape

In [19]:
# Ask the Perceiver's embeddings module for the latent array for this batch size.
latent_array = model.perceiver.embeddings(batch_size=batch_size)

In [20]:
# Shape of the latent array.
latent_array.shape

torch.Size([1, 784, 512])

In [21]:
# Run the Perceiver encoder directly: the latent array is passed as
# `hidden_states` and the preprocessed tensor as `inputs`.
encoded = model.perceiver.encoder(hidden_states=latent_array, inputs=inputs)

In [25]:
# Shape of the encoder's last hidden state.
encoded.last_hidden_state.shape

torch.Size([1, 784, 512])

In [20]:
# Full forward pass through the autoencoding model, also requesting the
# hidden states.
#
# The model is given a freshly built raw modality dict rather than `inputs`:
# when the notebook is run top to bottom, `inputs` has already been rebound to
# the preprocessed tensor by an earlier cell, while the model expects the
# per-modality dict and runs its own input preprocessing.
raw_model_inputs = dict(image=images, audio=audio, label=torch.zeros((images.shape[0], 700)))
outputs = model(inputs=raw_model_inputs, subsampled_output_points=subsampling, output_hidden_states=True)

In [63]:
# Display the `d_latents` value from the model configuration.
model.config.d_latents

512

In [1]:
# `outputs.hidden_states` is a tuple of tensors rather than a single tensor,
# so it cannot be passed to tensor functions as-is. Keep one tensor from it.
# NOTE(review): the last entry is assumed to be the one of interest — confirm.
y = outputs.hidden_states[-1]

NameError: name 'outputs' is not defined

In [51]:
# Apply instance normalization to `y` and show the shape of the result.
y_normalized = torch.nn.functional.instance_norm(y)
y_normalized.shape

torch.Size([1, 784, 512])

In [35]:
# Shorthand handle for the model's input preprocessor, used by the cells below.
preprocessor = model.perceiver.input_preprocessor

In [48]:
# Run only the "image" modality preprocessor on the raw video tensor and show
# the shape of the first element it returns.
image_preprocessed = preprocessor.modalities["image"](images)
image_preprocessed[0].shape

torch.Size([1, 50176, 243])

In [49]:
# Run only the "audio" modality preprocessor on the raw audio tensor and show
# the shape of the first element it returns.
audio_preprocessed = preprocessor.modalities["audio"](audio)
audio_preprocessed[0].shape

torch.Size([1, 1920, 401])

In [64]:
# Preprocess all three modalities in one call and keep all three return values.
# (The spelling `inputs_prcessed` is kept as-is so any later use of the name
# still resolves.)
zero_labels = torch.zeros((images.shape[0], 700))
multimodal_batch = {"image": images, "audio": audio, "label": zero_labels}
inputs_prcessed, modality_sizes, inputs_without_pos = preprocessor(multimodal_batch)