In [19]:
import torch
import torchaudio

# Seed PyTorch's global RNG with a fixed value.
torch.manual_seed(0)

# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(device)

cuda


In [20]:
import IPython

# Audio clip to transcribe (relative path).
SPEECH_FILE = "corpus/clips/common_voice_pl_20547774.mp3"

# Pretrained pipeline bundle; it supplies the model, its expected sample rate
# and its label set.
# NOTE(review): the clip's file name suggests Polish Common Voice audio, while
# the bundle name suggests an English (LibriSpeech 960h) checkpoint — confirm
# this pairing is intentional, since it would explain a nonsensical transcript.
bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H

# Build the bundle's model, then place it on the selected device.
model = bundle.get_model()
model = model.to(device)

In [21]:
# Read the clip from disk and move the samples to the compute device.
waveform, sample_rate = torchaudio.load(SPEECH_FILE)
waveform = waveform.to(device)

# Resample only when the file's rate differs from the rate the bundle reports.
if sample_rate != bundle.sample_rate:
    waveform = torchaudio.functional.resample(
        waveform,
        orig_freq=sample_rate,
        new_freq=bundle.sample_rate,
    )

In [22]:
# Feature extraction under inference mode (no autograd tracking). Only the
# first element of the returned pair is kept; the second is discarded.
with torch.inference_mode():
    (features, _) = model.extract_features(waveform)

In [23]:
# Full forward pass under inference mode (no autograd tracking). Only the
# first element of the returned pair is kept; the second is discarded.
with torch.inference_mode():
    (emission, _) = model(waveform)

In [24]:
class GreedyCTCDecoder(torch.nn.Module):
    """Greedy (best-path) CTC decoder.

    Takes the highest-scoring label at every time step, merges consecutive
    repeats, removes the blank token and joins the remaining labels.
    """

    def __init__(self, labels, blank=0):
        """
        Args:
          labels: Container mapping an integer label index to its text
            (e.g. a tuple or list of strings).
          blank (int): Index of the CTC blank token. Defaults to 0.
        """
        super().__init__()
        self.labels = labels
        self.blank = blank

    def forward(self, emission: torch.Tensor) -> str:
        """Given a sequence emission over labels, get the best path string
        Args:
          emission (Tensor): Logit tensors. Shape `[num_seq, num_label]`.

        Returns:
          str: The resulting transcript
        """
        best_path = torch.argmax(emission, dim=-1)  # [num_seq,]
        collapsed = torch.unique_consecutive(best_path, dim=-1)
        # Convert to plain Python ints in a single transfer. Iterating the
        # tensor directly yields 0-dim tensors, so every blank comparison and
        # label lookup would go through a tensor (one device sync per frame
        # when the emission lives on a GPU).
        return "".join(
            self.labels[index]
            for index in collapsed.tolist()
            if index != self.blank
        )

In [26]:
# Build the decoder from the bundle's label set (blank index left at its default).
decoder = GreedyCTCDecoder(bundle.get_labels())

# Decode the first entry along the emission's leading dimension.
transcript = decoder(emission[0])

In [27]:
# Print the decoded text, then leave the audio widget as the cell's last
# expression so the notebook renders it below the transcript.
print(transcript)
IPython.display.Audio(SPEECH_FILE)

E|DO|HER|GO|ONASHU|SHOWN|SAMOTENE|ICIET|PIOSPAVODUSFALI|SAMATNASCI|
