In [2]:
import torch

### Flava standard model without heads

In [3]:
from PIL import Image
import requests

from transformers import BertTokenizer, FlavaImageProcessor, FlavaModel, FlavaProcessor

# Base FLAVA model (no task heads) and the processor that bundles its image
# processor and tokenizer, both loaded from the same checkpoint.
model = FlavaModel.from_pretrained("facebook/flava-full")
processor = FlavaProcessor.from_pretrained("facebook/flava-full")

# Download the sample COCO image. The timeout keeps the cell from hanging on a
# stalled connection, and raise_for_status() reports HTTP errors here instead
# of letting PIL fail later on an error page.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

## Pass text and image together
inputs = processor(
    text=["a photo of a cat", "a photo of a dog"],
    images=[image, image],
    return_tensors="pt",
    padding="max_length",
    max_length=77,
)

outputs = model(**inputs)
image_embeddings = (
    outputs.image_embeddings
)  # Batch size X (Number of image patches + 1) x Hidden size => 2 X 197 X 768
text_embeddings = (
    outputs.text_embeddings
)  # Batch size X (Text sequence length + 1) X Hidden size => 2 X 77 X 768
multimodal_embeddings = (
    outputs.multimodal_embeddings
)  # Batch size X (Number of image patches + Text Sequence Length + 3) X Hidden size => 2 X 275 x 768
# Multimodal embeddings can be used for multimodal tasks such as VQA


## Pass only image
# FlavaImageProcessor is the current replacement for the deprecated
# FlavaFeatureExtractor; the variable keeps its original name so that any
# cell referring to `feature_extractor` continues to work.
feature_extractor = FlavaImageProcessor.from_pretrained("facebook/flava-full")
inputs = feature_extractor(images=[image, image], return_tensors="pt")
outputs = model(**inputs)
image_embeddings = outputs.image_embeddings

## Pass only text
tokenizer = BertTokenizer.from_pretrained("facebook/flava-full")
inputs = tokenizer(
    ["a photo of a cat", "a photo of a dog"],
    return_tensors="pt",
    padding="max_length",
    max_length=77,
)
outputs = model(**inputs)
text_embeddings = outputs.text_embeddings

  from .autonotebook import tqdm as notebook_tqdm
`text_config_dict` is provided which will be used to initialize `FlavaTextConfig`. The value `text_config["id2label"]` will be overriden.
`multimodal_config_dict` is provided which will be used to initialize `FlavaMultimodalConfig`. The value `multimodal_config["id2label"]` will be overriden.
`image_codebook_config_dict` is provided which will be used to initialize `FlavaImageCodebookConfig`. The value `image_codebook_config["id2label"]` will be overriden.
Some weights of the model checkpoint at facebook/flava-full were not used when initializing FlavaModel: ['image_codebook.blocks.group_4.group.block_1.res_path.path.conv_1.bias', 'image_codebook.blocks.group_2.group.block_1.res_path.path.conv_2.bias', 'mlm_head.transform.dense.weight', 'mmm_text_head.transform.LayerNorm.weight', 'image_codebook.blocks.group_3.group.block_1.res_path.path.conv_2.bias', 'mmm_image_head.bias', 'image_codebook.blocks.group_4.group.block_2.res_path.path.conv

In [4]:
# Display the nested module structure of the currently loaded model.
model

FlavaModel(
  (text_model): FlavaTextModel(
    (embeddings): FlavaTextEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): FlavaEncoder(
      (layer): ModuleList(
        (0-11): 12 x FlavaLayer(
          (attention): FlavaAttention(
            (attention): FlavaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): FlavaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=

## Flava pretraining with heads

In [16]:
from PIL import Image
import requests

from transformers import FlavaProcessor, FlavaForPreTraining

# FLAVA with its pretraining heads, plus the paired processor.
model = FlavaForPreTraining.from_pretrained("facebook/flava-full")
processor = FlavaProcessor.from_pretrained("facebook/flava-full")

# Sample COCO image, streamed over HTTP and decoded by PIL.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(
    requests.get(url, stream=True).raw,
)

print(image.size)

# Besides the usual text/image tensors, also request the codebook pixels and
# the image patch mask.
# Other things such as mlm_labels, itm_labels can be passed here. See docs
inputs = processor(
    images=[image, image],
    text=["a photo of a cat", "a photo of a dog"],
    padding="max_length",
    max_length=77,
    return_tensors="pt",
    return_image_mask=True,
    return_codebook_pixels=True,
)
# Reset every entry of the patch mask to zero, in place.
inputs.bool_masked_pos.zero_()

print(inputs.keys())

outputs = model(**inputs)

# Batch size X (Number of image patches + 1) x Hidden size => 2 X 197 X 768
image_embeddings = outputs.image_embeddings
# Batch size X (Text sequence length + 1) X Hidden size => 2 X 77 X 768
text_embeddings = outputs.text_embeddings
# Multimodal embeddings can be used for multimodal tasks such as VQA
# Batch size X (Number of image patches + Text Sequence Length + 3) X Hidden size => 2 X 275 x 768
multimodal_embeddings = outputs.multimodal_embeddings

# Loss
loss = outputs.loss  # probably NaN due to missing labels

# Global contrastive loss logits
image_contrastive_logits = outputs.contrastive_logits_per_image
text_contrastive_logits = outputs.contrastive_logits_per_text

# ITM logits
itm_logits = outputs.itm_logits

`text_config_dict` is provided which will be used to initialize `FlavaTextConfig`. The value `text_config["id2label"]` will be overriden.
`multimodal_config_dict` is provided which will be used to initialize `FlavaMultimodalConfig`. The value `multimodal_config["id2label"]` will be overriden.
`image_codebook_config_dict` is provided which will be used to initialize `FlavaImageCodebookConfig`. The value `image_codebook_config["id2label"]` will be overriden.
`input_ids_masked` isn't passed which means MLM loss won't be calculated correctlySetting it to `input_ids` so that model can work. Please pass it if this is unintentional. This is usually OKAY if you are doing inference on unmasked text...


(640, 480)
dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'pixel_values', 'codebook_pixel_values', 'bool_masked_pos'])




In [5]:
# Check the shape of the preprocessed image batch.
inputs.pixel_values.shape

torch.Size([2, 3, 224, 224])

In [19]:
import torch
from transformers import FlavaImageProcessor, FlavaProcessor


# NOTE: this rebinds `processor` to an image-only FlavaImageProcessor with
# resizing switched off; cells run afterwards that call `processor(...)` will
# use this object unless `processor` is reassigned again.
processor = FlavaImageProcessor(do_resize=False)
# processor = FlavaProcessor(image_processor=processor)
# Feed a random, non-square (3, 128, 1000) tensor and additionally request the
# codebook pixels and the image patch mask.
proc_image = processor(
    images=torch.rand(3, 128, 1000), return_codebook_pixels=True, return_image_mask=True
)
proc_image.keys()

dict_keys(['pixel_values', 'codebook_pixel_values', 'bool_masked_pos'])

In [20]:
# The recorded shape below is 224 x 224 even though the input was 128 x 1000
# and resizing was disabled - presumably the processor's center crop/padding
# produces the fixed size (TODO confirm).
proc_image.pixel_values[0].shape

(3, 224, 224)

In [2]:
# Display the module tree. Evaluate the bare name: the recorded output of this
# cell is the model's repr, whereas `model()` would invoke the forward pass
# without any inputs.
model

FlavaForPreTraining(
  (flava): FlavaModel(
    (text_model): FlavaTextModel(
      (embeddings): FlavaTextEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (encoder): FlavaEncoder(
        (layer): ModuleList(
          (0-11): 12 x FlavaLayer(
            (attention): FlavaAttention(
              (attention): FlavaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.0, inplace=False)
              )
              (output): FlavaSelfOutput(
                (dense): Linear(in_features=768, out_f

In [23]:
# List the fields currently held in `inputs`.
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'pixel_values', 'codebook_pixel_values', 'bool_masked_pos'])

In [26]:
# Inspect the shape of the codebook pixel batch returned by the processor.
inputs.codebook_pixel_values.shape

torch.Size([2, 3, 112, 112])

In [11]:
# Preprocess a single image. No `return_tensors` is requested here; the
# `pixel_values` recorded in the following cell is a list of NumPy arrays.
inputs = processor(images=[image])

In [18]:
# Show the pixel values produced by the processor.
# NOTE(review): this prints the whole array; a shape or a small slice would be
# easier to read.
inputs["pixel_values"]

[array([[[ 0.2807141 ,  0.38290307,  0.42669836, ..., -0.28862458,
          -0.2740262 , -0.28862458],
         [ 0.32450938,  0.38290307,  0.41209993, ..., -0.28862458,
          -0.28862458, -0.31782144],
         [ 0.2807141 ,  0.3537062 ,  0.36830464, ..., -0.37621516,
          -0.3470183 , -0.31782144],
         ...,
         [ 1.6383677 ,  1.5361787 ,  1.4193913 , ...,  1.3901944 ,
           1.2880055 ,  1.2442101 ],
         [ 1.6091708 ,  1.5507771 ,  1.5069818 , ...,  1.2150133 ,
           0.9814385 ,  0.85005265],
         [ 1.6091708 ,  1.477785  ,  1.4923834 , ...,  0.12013142,
          -0.12804192, -0.39081356]],
 
        [[-1.3919107 , -1.3919107 , -1.3919107 , ..., -1.5419884 ,
          -1.5419884 , -1.5569961 ],
         [-1.3468874 , -1.3468874 , -1.3468874 , ..., -1.5269806 ,
          -1.5119728 , -1.5269806 ],
         [-1.4069184 , -1.3769029 , -1.3468874 , ..., -1.5569961 ,
          -1.5419884 , -1.5419884 ],
         ...,
         [-0.3413669 , -0.461429 

## FLAVA Processor

In [1]:
from transformers import FlavaProcessor, FlavaForPreTraining

# Load FLAVA with its pretraining heads and the matching processor from the
# same checkpoint.
model = FlavaForPreTraining.from_pretrained("facebook/flava-full")
processor = FlavaProcessor.from_pretrained("facebook/flava-full")

  from .autonotebook import tqdm as notebook_tqdm
`text_config_dict` is provided which will be used to initialize `FlavaTextConfig`. The value `text_config["id2label"]` will be overriden.
`multimodal_config_dict` is provided which will be used to initialize `FlavaMultimodalConfig`. The value `multimodal_config["id2label"]` will be overriden.
`image_codebook_config_dict` is provided which will be used to initialize `FlavaImageCodebookConfig`. The value `image_codebook_config["id2label"]` will be overriden.


### Flava Image Model 

In [30]:
from transformers import (
    AutoImageProcessor,
    FlavaConfig,
    FlavaForPreTraining,
    FlavaImageConfig,
    FlavaImageModel,
    FlavaImageProcessor,
)
import torch
import numpy as np
from datasets import load_dataset

# Sample cat image from the Hugging Face hub.
dataset = load_dataset("huggingface/cats-image")
image = dataset["test"]["image"][0]
print(f"Image shape {np.array(image).shape}")

# Load the preprocessing settings published with the checkpoint. Passing the
# checkpoint name to the constructor would not do this: the constructor's
# first positional parameter is `do_resize`, so the string would merely act as
# a truthy flag and every other setting would silently fall back to defaults.
image_processor = FlavaImageProcessor.from_pretrained("facebook/flava-full")

# Randomly initialised image encoder configured for single-channel input.
image_config = FlavaImageConfig(num_channels=1)
model = FlavaImageModel(image_config)

# Randomly initialised pretraining model whose image tower also takes one channel.
flava_pre_config = FlavaConfig(image_config={"num_channels": 1})
flava_pretrained = FlavaForPreTraining(flava_pre_config)

inputs = image_processor(
    image,
    return_tensors="pt",
)

# NOTE(review): left disabled. `model` is configured with num_channels=1 while
# the processed RGB image has 3 channels, so this forward pass presumably does
# not match the model - confirm before re-enabling.
# with torch.no_grad():
#     outputs = model(**inputs)

# last_hidden_states = outputs.last_hidden_state
# list(last_hidden_states.shape)

No config specified, defaulting to: cats-image/image
Found cached dataset cats-image (/Users/lukas/.cache/huggingface/datasets/huggingface___cats-image/image/1.9.0/68fbc793fb10cd165e490867f5d61fa366086ea40c73e549a020103dcb4f597e)
100%|██████████| 1/1 [00:00<00:00, 1187.52it/s]


Image shape (480, 640, 3)


In [27]:
# Random single-channel 224 x 224 input with a leading batch dimension of 1;
# show which fields the model output contains.
img = torch.rand(1, 1, 224, 224)
model(img).keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [29]:
# NOTE: the recorded run of this cell raised a ValueError asking for codebook
# pixel values (see the output below); only `pixel_values` is passed here.
flava_pretrained(pixel_values=img)

ValueError: `codebook_pixel_value` are required to generate `mim_labels` if loss is expected. Call `AutoProcessor` with `return_codebook_pixels` set to True

In [8]:
# Check the batch shape produced by the image processor.
inputs.pixel_values.shape

torch.Size([1, 3, 224, 224])

In [2]:
import torch

# Random 3 x 224 x 224 tensor - note there is no batch dimension.
img = torch.rand(3, 224, 224)

# NOTE: the recorded run failed with "not enough values to unpack (expected 4,
# got 3)"; presumably the model expects a 4-D (batch, channels, height, width)
# tensor, e.g. `img.unsqueeze(0)` - TODO confirm.
model(pixel_values=img)

ValueError: not enough values to unpack (expected 4, got 3)

In [20]:
# random torch tensor image
import torch

audio_image = torch.rand(1, 80, 200)

out = processor(images=audio_image)

ValueError: mean must have 1 elements if it is an iterable, got 3

In [None]:
import os

# Directory holding the debug audio clips. Set the AUDIO_DEBUG_DIR environment
# variable to point at your own copy; the fallback is the original author's
# local checkout, which keeps the previous behaviour when the variable is unset.
path = os.environ.get(
    "AUDIO_DEBUG_DIR",
    "/Users/lukas/Desktop/Projects/MIT/MIT_prosody/data/audio_debug",
)

# os.listdir returns entries in arbitrary order; sort them so repeated runs
# see the files in the same sequence.
files = sorted(os.listdir(path))

In [None]:
# Tokenize the two captions, padding to the longest one in the batch.
tokenizer = BertTokenizer.from_pretrained("facebook/flava-full")
inputs = tokenizer(
    ["a photo of a cat", "a photo of a dog"],
    padding="longest",
    max_length=10,
    return_tensors="pt",
)

# Pull out the individual tensors used by the masking cells.
input_ids, attention_mask, token_type_ids = (
    inputs["input_ids"],
    inputs["attention_mask"],
    inputs["token_type_ids"],
)
input_ids

tensor([[ 101, 1037, 6302, 1997, 1037, 4937,  102],
        [ 101, 1037, 6302, 1997, 1037, 3899,  102]])

In [None]:
def _mask_tokens(inputs, attention_mask=None, mask_prob=0.3):
    labels = inputs.clone()

    # Probability matrix should only allow masking where attention_mask is 1
    if attention_mask is not None:
        # Use the attention_mask to limit where tokens can be masked
        probability_matrix = torch.full(labels.shape, mask_prob) * attention_mask
    else:
        # If no attention_mask is provided, tokens can be masked anywhere
        probability_matrix = torch.full(labels.shape, mask_prob)

    # Determine which tokens to mask
    masked_indices = torch.bernoulli(probability_matrix).bool()

    # Mask tokens
    inputs[masked_indices] = tokenizer.convert_tokens_to_ids("[MASK]")

    # Replace -100 in labels that we do not want to compute the loss for
    labels[~masked_indices] = -100

    return inputs, labels

In [None]:
# Mask the tokenized batch with the default mask_prob (0.3) and display the
# masked ids next to the MLM labels.
masked_inputs, labels = _mask_tokens(input_ids, attention_mask=attention_mask)
masked_inputs, labels

(tensor([[ 103, 1037, 6302, 1997, 1037,  103,  102],
         [ 103,  103,  103, 1997, 1037,  103,  102]]),
 tensor([[ 101, -100, -100, -100, -100, 4937, -100],
         [ 101, 1037, 6302, -100, -100, 3899, -100]]))

In [None]:
# Forward pass with the masked ids and their MLM labels; the unmasked
# `input_ids` argument is left commented out.
output = model(
    # input_ids=input_ids,
    input_ids_masked=masked_inputs,
    attention_mask=attention_mask,
    token_type_ids=token_type_ids,
    mlm_labels=labels,
)

In [None]:
# Inspect the loss returned by the masked-token forward pass.
output.loss

tensor(9.1305, grad_fn=<AddBackward0>)

## Flava text model only

In [None]:
from PIL import Image

from transformers import BertTokenizer, FlavaTextModel

# Load only the text encoder and its tokenizer; per the warning recorded
# below, the checkpoint's remaining weights are not used.
model = FlavaTextModel.from_pretrained("facebook/flava-full")
tokenizer = BertTokenizer.from_pretrained("facebook/flava-full")

# `padding="max_length"` is used without an explicit `max_length`; the hidden
# state shape recorded in the following cell is (2, 512, 768), i.e. the batch
# is padded out to 512 tokens.
inputs = tokenizer(
    text=["a photo of a dog", "a photo of a cat or so "],
    return_tensors="pt",
    padding="max_length",
)

# Freshly initialised linear layer from hidden size to vocabulary size; it is
# only created here and not applied in this cell.
mlm_head = torch.nn.Linear(model.config.hidden_size, model.config.vocab_size, bias=True)

outputs = model(**inputs)
text_embeddings = outputs.last_hidden_state

Some weights of the model checkpoint at facebook/flava-full were not used when initializing FlavaTextModel: ['flava.image_model.encoder.layer.5.attention.attention.query.weight', 'mlm_head.transform.dense.weight', 'flava.multimodal_model.encoder.layer.4.attention.attention.query.bias', 'flava.image_model.encoder.layer.1.attention.output.dense.bias', 'mmm_image_head.bias', 'flava.image_model.encoder.layer.8.attention.output.dense.bias', 'image_codebook.blocks.group_3.group.block_1.res_path.path.conv_2.weight', 'flava.image_model.encoder.layer.1.layernorm_after.weight', 'flava.image_model.encoder.layer.10.attention.attention.key.bias', 'image_codebook.blocks.group_1.group.block_1.res_path.path.conv_2.bias', 'image_codebook.blocks.group_4.group.block_2.res_path.path.conv_2.weight', 'image_codebook.blocks.group_1.group.block_1.res_path.path.conv_1.weight', 'flava.image_model.encoder.layer.4.attention.output.dense.weight', 'flava.image_model.encoder.layer.3.layernorm_after.bias', 'flava.ima

In [None]:
# Shape of the text encoder's final hidden states.
outputs.last_hidden_state.shape

torch.Size([2, 512, 768])

In [None]:
# Look up the vocabulary id of the [MASK] token.
tokenizer.convert_tokens_to_ids("[MASK]")

103

In [None]:
# batch tokenize
# NOTE(review): this repeats the model/tokenizer loading and the tokenizer
# call from the "Flava text model only" cell.

from transformers import BertTokenizer, FlavaTextModel

model = FlavaTextModel.from_pretrained("facebook/flava-full")
tokenizer = BertTokenizer.from_pretrained("facebook/flava-full")

# Tokenize both sentences as one batch, padded with the "max_length" strategy.
inputs = tokenizer(
    text=["a photo of a dog", "a photo of a cat or so "],
    return_tensors="pt",
    padding="max_length",
)

Some weights of the model checkpoint at facebook/flava-full were not used when initializing FlavaTextModel: ['flava.image_model.encoder.layer.5.attention.attention.query.weight', 'mlm_head.transform.dense.weight', 'flava.multimodal_model.encoder.layer.4.attention.attention.query.bias', 'flava.image_model.encoder.layer.1.attention.output.dense.bias', 'mmm_image_head.bias', 'flava.image_model.encoder.layer.8.attention.output.dense.bias', 'image_codebook.blocks.group_3.group.block_1.res_path.path.conv_2.weight', 'flava.image_model.encoder.layer.1.layernorm_after.weight', 'flava.image_model.encoder.layer.10.attention.attention.key.bias', 'image_codebook.blocks.group_1.group.block_1.res_path.path.conv_2.bias', 'image_codebook.blocks.group_4.group.block_2.res_path.path.conv_2.weight', 'image_codebook.blocks.group_1.group.block_1.res_path.path.conv_1.weight', 'flava.image_model.encoder.layer.4.attention.output.dense.weight', 'flava.image_model.encoder.layer.3.layernorm_after.bias', 'flava.ima

## Audio Spectrogram Transformer

In [None]:
from transformers import (
    AutoFeatureExtractor,
    ASTForAudioClassification,
    ASTFeatureExtractor,
    ASTModel,
    ASTConfig,
)
from datasets import load_dataset
import torch
import numpy as np

dataset = load_dataset(
    "hf-internal-testing/librispeech_asr_demo", "clean", split="validation"
)
dataset = dataset.sort("id")
# Native rate of the demo recordings. Stored under its own name so that it is
# not silently overwritten by the synthetic rate used for the random signals.
dataset_sampling_rate = dataset.features["audio"].sampling_rate

# The synthetic signals below use a toy rate of 100 samples per second. Define
# it once so the feature extractor and the generated audio cannot drift apart.
sampling_rate = 100

feature_extractor = ASTFeatureExtractor(
    sampling_rate=sampling_rate, max_length=250, return_attention_mask=True
)
# Randomly initialised AST encoder with the default configuration.
ast_config = ASTConfig()
model = ASTModel(ast_config)

# random audio signal
# Length of audio samples in seconds
lengths_in_seconds = [
    1,
    2,
    3,
    4,
]  # The four audio samples will be of 1, 2, 3, and 4 seconds respectively

# Creating the 4 audio samples from a seeded generator so that re-running the
# cell reproduces the same signals.
rng = np.random.default_rng(0)
audio_samples = [
    rng.standard_normal(sampling_rate * length) for length in lengths_in_seconds
]
for sample in audio_samples:
    print(sample.shape)
inputs = feature_extractor(
    audio_samples, sampling_rate=sampling_rate, return_tensors="pt"
)

# NOTE(review): the disabled snippet below reads `.logits` / `.loss`, which
# presumably require ASTForAudioClassification rather than the bare ASTModel
# built above - confirm before re-enabling.
# with torch.no_grad():
#     logits = model(**inputs).logits

# predicted_class_ids = torch.argmax(logits, dim=-1).item()
# predicted_label = model.config.id2label[predicted_class_ids]
# predicted_label

# # compute loss - target_label is e.g. "down"
# target_label = model.config.id2label[0]
# inputs["labels"] = torch.tensor([model.config.label2id[target_label]])
# loss = model(**inputs).loss
# round(loss.item(), 2)

Found cached dataset librispeech_asr_demo (/Users/lukas/.cache/huggingface/datasets/hf-internal-testing___librispeech_asr_demo/clean/2.1.0/d3bc4c2bc2078fcde3ad0f0f635862e4c0fef78ba94c4a34c4c250a097af240b)


(100,)
(200,)
(300,)
(400,)


In [None]:
# List the fields returned by the feature extractor call.
inputs.keys()

dict_keys(['input_values', 'attention_mask'])

In [None]:
# Shape of the extracted features for the batch of synthetic clips.
inputs.input_values.shape

torch.Size([4, 250, 128])

In [None]:
# flava tokenizer

from transformers import AutoTokenizer

# Let AutoTokenizer pick the tokenizer class for the checkpoint.
tokenizer = AutoTokenizer.from_pretrained("facebook/flava-full")

In [None]:
# Display the resolved tokenizer class and its settings.
tokenizer

BertTokenizerFast(name_or_path='facebook/flava-full', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [None]:
from transformers import BertTokenizerFast

# Load the checkpoint's tokenizer through the concrete fast BERT tokenizer class.
tokenizer = BertTokenizerFast.from_pretrained("facebook/flava-full")

In [None]:
# Save the tokenizer files (configs, vocab.txt, tokenizer.json) into the
# local "flava-full-tokenizer" directory.
tokenizer.save_pretrained("flava-full-tokenizer")

('flava-full-tokenizer/tokenizer_config.json',
 'flava-full-tokenizer/special_tokens_map.json',
 'flava-full-tokenizer/vocab.txt',
 'flava-full-tokenizer/added_tokens.json',
 'flava-full-tokenizer/tokenizer.json')

## Whisper Feature extractor