In [1]:
import sys
import os
import io

# sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))

import torch
import numpy as np
from transformers import PerceiverFeatureExtractor, PerceiverTokenizer, PerceiverForMultimodalAutoencoding, PerceiverForImageClassificationFourier
from datasets import load_dataset
import hydra

  from .autonotebook import tqdm as notebook_tqdm


In [44]:
# create multimodal inputs
images = torch.randn((1, 16, 3, 224, 224))
audio = torch.randn((1, 30720, 1))
# 700 label slots per batch element — presumably the class count the checkpoint expects; TODO confirm against its config
inputs = dict(image=images, audio=audio, label=torch.zeros((images.shape[0], 700)))

multimodal_perceiver = PerceiverForMultimodalAutoencoding.from_pretrained("deepmind/multimodal-perceiver")

# in the Perceiver IO paper, videos are auto-encoded in chunks
# each chunk subsamples different index dimensions of the image and audio modality decoder queries
nchunks = 128
# Read the image index dimensions off `images` instead of repeating the literal
# (16, 224, 224), so editing the tensor above cannot leave the chunk size stale.
# assumes `images` is laid out as (batch, frames, channels, height, width) — TODO confirm
num_frames, height, width = images.shape[1], images.shape[3], images.shape[4]
image_chunk_size = np.prod((num_frames, height, width)) // nchunks
audio_chunk_size = audio.shape[1] // multimodal_perceiver.config.samples_per_patch // nchunks
# process the first chunk
chunk_idx = 0
# Contiguous index range for chunk `chunk_idx` of each modality; "label" is left unsubsampled (None).
subsampling = {
    "image": torch.arange(image_chunk_size * chunk_idx, image_chunk_size * (chunk_idx + 1)),
    "audio": torch.arange(audio_chunk_size * chunk_idx, audio_chunk_size * (chunk_idx + 1)),
    "label": None,
}

In [43]:
# Forward pass over the dummy batch, decoding only the subsampled output positions.
out = multimodal_perceiver(
    inputs=inputs,
    subsampled_output_points=subsampling,
)

In [47]:
# Load the pretrained PerceiverForImageClassificationFourier weights published as 'deepmind/vision-perceiver-fourier'.
vision_perceiver = PerceiverForImageClassificationFourier.from_pretrained('deepmind/vision-perceiver-fourier')

In [30]:
# Grab the inner `perceiver` module first, then the input preprocessor attached to it.
# NOTE(review): `multimodal_encoder` is bound to the whole `.perceiver` module, not a sub-encoder — confirm this is intended.
multimodal_encoder = multimodal_perceiver.perceiver
multimodal_preprocessor = multimodal_encoder.input_preprocessor

In [45]:
# Run only the input preprocessor on the dummy multimodal batch; the result is indexable (element 0 is inspected in the next cell).
multimodal_preprocessor_outputs = multimodal_preprocessor(inputs)

In [50]:
# Shape of the first element returned by the multimodal preprocessor.
multimodal_preprocessor_outputs[0].shape

torch.Size([1, 52097, 704])

In [37]:
# Look up the entry registered under the "image" key of the multimodal preprocessor's `modalities` mapping
# (presumably the image-specific preprocessor — TODO confirm).
multimodal_preprocessor_vision = multimodal_preprocessor.modalities["image"]

In [48]:
# Input preprocessor of the vision classifier, applied to a random batch of 32 tensors shaped (3, 224, 224).
vision_preprocessor = vision_perceiver.perceiver.input_preprocessor

image = torch.randn(32, 3, 224, 224)

# The preprocessor returns three values; only the first is kept here.
image_out, _, _ = vision_preprocessor(image)

In [49]:
# Shape of the preprocessed image batch.
image_out.shape

torch.Size([32, 50176, 261])

In [42]:
# Load the tokenizer stored under the 'deepmind/language-perceiver' checkpoint name.
perceiver_tokenizer = PerceiverTokenizer.from_pretrained('deepmind/language-perceiver')

Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.


In [53]:
import transformers
# Query the current transformers logging threshold (the recorded result, 30, equals the stdlib `logging.WARNING` level).
transformers.utils.logging.get_verbosity()

30

In [15]:
from transformers import Wav2Vec2FeatureExtractor

import numpy as np

extractor = Wav2Vec2FeatureExtractor.from_pretrained('facebook/wav2vec2-base')

# One random waveform of 16000 samples (matches the sampling_rate passed below, i.e. one second of audio).
input_values = np.random.randn(16000)

pad_to_multiple_of = 96

# Expected padded length: the smallest multiple of `pad_to_multiple_of` that is >= the raw length.
# Ceiling division is used because the previous `(n // m + 1) * m` form adds a whole extra
# block when n is already an exact multiple of m.
max_length = -(-input_values.shape[0] // pad_to_multiple_of) * pad_to_multiple_of

# Pass the variable rather than a second literal 96 so the expectation above and the call cannot diverge.
padded = extractor(
    input_values,
    pad_to_multiple_of=pad_to_multiple_of,
    padding='longest',
    return_tensors='pt',
    sampling_rate=16000,
)

# Make the hand-computed expectation an explicit check instead of a dead variable.
assert padded.input_values.shape[-1] == max_length, (
    f"expected padded length {max_length}, got {padded.input_values.shape[-1]}"
)

print(padded.input_values.size())

In [49]:
# Tokenize one sentence without `return_tensors`; the result is indexed by 'input_ids' in the next cell.
tokens = perceiver_tokenizer("This is an incomplete sentence where some words are missing.")

In [51]:
# Number of token ids produced for the 60-character sentence
# (the recorded 62 is presumably one id per byte plus two special tokens — TODO confirm).
print(len(tokens['input_ids']))

62


In [None]:
# Cache location for the Wikipedia dump. Defaults to the original local path; set the
# DATASETS_CACHE_DIR environment variable to relocate it on a machine without that drive.
wikipedia_cache_dir = os.environ.get("DATASETS_CACHE_DIR", "E:/Datasets/")
wikipedia = load_dataset("wikipedia", "20220301.en", cache_dir=wikipedia_cache_dir)

In [10]:
# Number of examples in the 'train' split of the loaded Wikipedia dataset.
len(wikipedia['train'])

6458670

In [13]:
# NOTE(review): near-duplicate of the earlier chunking cell; it differs in using a (1, 224, 224)
# image index shape and in reading `samples_per_patch` from `model` instead of `multimodal_perceiver`.
# NOTE(review): `model` is not assigned in any visible cell — this cell depends on hidden kernel
# state and will raise NameError on a fresh "Restart & Run All"; confirm which object it should be.
# in the Perceiver IO paper, videos are auto-encoded in chunks
# each chunk subsamples different index dimensions of the image and audio modality decoder queries
nchunks = 128
image_chunk_size = np.prod((1, 224, 224)) // nchunks # np.prod((16, 224, 224)) // nchunks
audio_chunk_size = audio.shape[1] // model.config.samples_per_patch // nchunks
# process the first chunk
chunk_idx = 0
# Contiguous index range for chunk `chunk_idx` of each modality; "label" is left as None.
subsampling = {
    "image": torch.arange(image_chunk_size * chunk_idx, image_chunk_size * (chunk_idx + 1)),
    "audio": torch.arange(audio_chunk_size * chunk_idx, audio_chunk_size * (chunk_idx + 1)),
    "label": None,
}

In [None]:
# NOTE(review): `model` is not assigned in any visible cell (hidden kernel state).
# NOTE(review): this rebinds `inputs` from its own argument, so re-running the cell feeds the
# previous result back in instead of the original batch — the cell is not idempotent.
inputs, _, _ = model(inputs, subsampling)

In [22]:
import ast
# Parse a tuple from its string form; `literal_eval` only accepts Python literals, so no arbitrary code is executed.
ast.literal_eval("(1920, 434)")

(1920, 434)

In [3]:
from transformers import Wav2Vec2FeatureExtractor

In [5]:
class AbstractPreprocessor(torch.nn.Module):
    """Base class for input preprocessors; subclasses must report their output channel count."""

    @property
    def num_channels(self) -> int:
        """Returns size of preprocessor output."""
        raise NotImplementedError()


class PerceiverTextPreprocessor(AbstractPreprocessor):
    """
    Text preprocessing for Perceiver Encoder. Embeds token ids and adds learned absolute
    position embeddings of the same width.

    Args:
        d_model (`int`):
            Width of both embedding tables; also the value reported by `num_channels`.
        vocab_size (`int`):
            Number of rows in the token embedding table.
        max_position_embeddings (`int`):
            Number of rows in the position embedding table, i.e. the longest sequence
            `forward` can handle.
    """

    def __init__(
        self,
        d_model: int,
        vocab_size: int,
        max_position_embeddings: int
        ) -> None:
        super().__init__()
        self.d_model = d_model
        self.embeddings = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)
        self.position_embeddings = torch.nn.Embedding(max_position_embeddings, d_model)

    @property
    def num_channels(self) -> int:
        """Returns the embedding width (`d_model`)."""
        return self.d_model

    def forward(self, inputs: torch.LongTensor) -> tuple:
        """
        Embed a batch of token ids and add position embeddings.

        Args:
            inputs: Integer tensor of token ids; dimension 1 is used as the sequence length.

        Returns:
            A 3-tuple `(embeddings, None, None)`. `embeddings` has the shape of `inputs`
            with a trailing `d_model` dimension. The two `None` entries are placeholders
            (presumably so callers can unpack three values as with other preprocessors —
            TODO confirm).

        Raises:
            IndexError: If the sequence is longer than the position embedding table.
        """
        seq_length = inputs.shape[1]
        max_positions = self.position_embeddings.num_embeddings
        # Fail early with a readable message; otherwise the embedding lookup itself fails
        # with an opaque out-of-range error.
        if seq_length > max_positions:
            raise IndexError(
                f"sequence length {seq_length} exceeds max_position_embeddings {max_positions}"
            )

        position_ids = torch.arange(0, seq_length, device=inputs.device)
        # Position embeddings are (seq, d_model) and broadcast over the batch dimension.
        embeddings = self.embeddings(inputs) + self.position_embeddings(position_ids)

        return embeddings, None, None

In [6]:
# Pretrained tokenizer paired with a newly constructed text preprocessor
# (its embedding tables are freshly initialised, not loaded from a checkpoint).
tokenizer = PerceiverTokenizer.from_pretrained('deepmind/language-perceiver')
# NOTE(review): vocab_size / max_position_embeddings are hard-coded; confirm they match the tokenizer.
preprocessor = PerceiverTextPreprocessor(
    d_model=512,
    vocab_size=262,
    max_position_embeddings=2048,
)

Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.


In [7]:
text = "The quick brown fox jumps over the lazy dog."
# Tokenize with return_tensors='pt' and keep only the 'input_ids' entry.
# NOTE(review): this rebinds `tokens`, which an earlier cell used for a full tokenizer output.
tokens = tokenizer(text, return_tensors='pt')['input_ids']

In [11]:
# Embed the token ids; the preprocessor's second and third return values are discarded.
# NOTE(review): this rebinds `inputs`, which the multimodal cell defined as a dict.
inputs, _, _ = preprocessor(tokens)
inputs.shape

torch.Size([1, 46, 512])

In [12]:
from datasets import load_dataset

In [24]:
# Load the 'clean' configuration of the 'patrickvonplaten/librispeech_asr_dummy' dataset.
libri_dummy = load_dataset('patrickvonplaten/librispeech_asr_dummy', 'clean')

Reusing dataset librispeech_asr_dummy (C:\Users\marco\.cache\huggingface\datasets\patrickvonplaten___librispeech_asr_dummy\clean\2.1.0\f2c70a4d03ab4410954901bde48c54b85ca1b7f9bf7d616e7e2a72b5ee6ddbfc)
100%|██████████| 1/1 [00:00<00:00, 499.44it/s]


In [36]:
# Check the Python type of the `audio.array` field of the first validation example.
type(libri_dummy['validation'][0]['audio']['array'])
# Uncomment the next line to display the array itself instead of its type.
# libri_dummy['validation'][0]['audio']['array']

numpy.ndarray

In [6]:
import torch

# Two independent random tensors of identical shape (x is drawn before y).
x, y = torch.randn(3, 64, 64), torch.randn(3, 64, 64)

# Stacking adds a new leading dimension (torch.stack defaults to dim=0).
z = torch.stack((x, y))

print(z.shape)

torch.Size([2, 3, 64, 64])
