### ASTModel 

In [33]:
import numpy as np
from transformers import AutoFeatureExtractor, ASTForAudioClassification
from datasets import load_dataset
import torch

dataset = load_dataset(
    "hf-internal-testing/librispeech_asr_demo", "clean", split="validation"
)
dataset = dataset.sort("id")
sampling_rate = dataset.features["audio"].sampling_rate

feature_extractor = AutoFeatureExtractor.from_pretrained(
    "MIT/ast-finetuned-audioset-10-10-0.4593"
)
model = ASTForAudioClassification.from_pretrained(
    "MIT/ast-finetuned-audioset-10-10-0.4593"
)
print(np.array(dataset[0]["audio"]["array"]).shape)
# audio file is decoded on the fly
inputs = feature_extractor(
    dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt"
)
print(inputs)

with torch.no_grad():
    logits = model(**inputs).logits

predicted_class_ids = torch.argmax(logits, dim=-1).item()
predicted_label = model.config.id2label[predicted_class_ids]
predicted_label

# compute loss - target_label is e.g. "down"
target_label = model.config.id2label[0]
inputs["labels"] = torch.tensor([model.config.label2id[target_label]])
loss = model(**inputs).loss
round(loss.item(), 2)

Found cached dataset librispeech_asr_demo (/Users/lukas/.cache/huggingface/datasets/hf-internal-testing___librispeech_asr_demo/clean/2.1.0/d3bc4c2bc2078fcde3ad0f0f635862e4c0fef78ba94c4a34c4c250a097af240b)


(93680,)
{'input_values': tensor([[[-0.9894, -1.2776, -0.9066,  ..., -0.5855, -0.7328, -0.7346],
         [-0.9942, -1.2776, -0.9058,  ..., -0.6302, -0.7277, -0.8872],
         [-0.8979, -1.2094, -0.8326,  ..., -0.5787, -0.6236, -0.7860],
         ...,
         [ 0.4670,  0.4670,  0.4670,  ...,  0.4670,  0.4670,  0.4670],
         [ 0.4670,  0.4670,  0.4670,  ...,  0.4670,  0.4670,  0.4670],
         [ 0.4670,  0.4670,  0.4670,  ...,  0.4670,  0.4670,  0.4670]]])}


0.17

## My Reconstruction AST

In [1]:
from transformers import ASTConfig
import torch
from src.models.components.ast_reconstruction import MaskedASTModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
audio_root = "/Users/lukas/Desktop/Projects/MIT/data/peoples_speech/audio_debug"

In [3]:
from src.data.components.datasets import ASTAudioDataset

dataset = ASTAudioDataset(root_dir=audio_root, return_attention_mask=False)

Dataset found 5 flac files
Counting samples...
Counting took 0.5991921424865723 seconds
Dataset has 618 samples


In [4]:
# dataloader
from torch.utils.data import DataLoader

dataloader = DataLoader(dataset, batch_size=8, shuffle=False)

In [5]:
batch = next(iter(dataloader))
batch.input_values.shape

torch.Size([8, 1024, 128])

In [6]:
config = ASTConfig()
model = MaskedASTModel(config)

In [8]:
outputs = model(
    **batch, mask_rate=0.5, return_loss=True, return_dict=True, loss_fn="l2"
)
outputs.loss

embedding_output.shape: torch.Size([8, 1214, 768])
masked_expanded.shape: torch.Size([8, 1214, 768])
masked_sequence_output.shape: torch.Size([3729408])
masked_original.shape: torch.Size([3729408])
reconstruction_loss: 1188.104736328125


tensor(1188.1047, grad_fn=<MseLossBackward0>)

In [15]:
outputs = model(
    **batch, mask_rate=0.5, return_loss=True, return_dict=True, loss_fn="l1"
)
outputs.loss

embedding_output.shape: torch.Size([8, 1214, 768])
masked_embedding_output.shape: torch.Size([8, 1214, 768])
mask shape torch.Size([8, 1214])
masked_expanded.shape: torch.Size([8, 1214, 768])
masked_sequence_output.shape: torch.Size([3729408])
masked_original.shape: torch.Size([3729408])
reconstruction_loss: 25.198837280273438


tensor(25.1988, grad_fn=<MeanBackward0>)

In [10]:
outputs = model(
    **batch, mask_rate=0.5, return_loss=True, return_dict=True, loss_fn="cos_sim"
)
outputs.loss

embedding_output.shape: torch.Size([8, 1214, 768])
masked_expanded.shape: torch.Size([8, 1214, 768])
masked_sequence_output.shape: torch.Size([3729408])
masked_original.shape: torch.Size([3729408])


AttributeError: 'MaskedASTModel' object has no attribute 'cos_sim_loss'