In [27]:
import torch
import torch.nn as nn
import torchaudio.transforms as T
import numpy as np
from datasets import load_dataset
from transformers import (
    Wav2Vec2ForCTC, Wav2Vec2Processor,
    HubertForCTC, 
    WhisperProcessor, WhisperForConditionalGeneration
)

In [28]:
# Open the German ASR evaluation set in streaming mode: `dataset_stream` is an
# iterable that yields samples on demand rather than a fully materialised table.
print("1. Loading Real German Data (flozi00/asr-german-mixed-evals)...")

dataset_stream = load_dataset("flozi00/asr-german-mixed-evals", streaming=True, split="train")

1. Loading Real German Data (flozi00/asr-german-mixed-evals)...


In [29]:
# Remembers which stream is being consumed and the live iterator over it, so
# that consecutive calls advance through the data instead of restarting at the
# first record each time.
_sample_stream_state = {"source": None, "iterator": None}


def get_next_sample(target_sr=16000):
    """Return the next ``(audio, text)`` pair from ``dataset_stream``.

    Each call advances one record further into the stream. The iterator is
    rebuilt automatically if ``dataset_stream`` has been rebound to a new
    object (e.g. the loading cell was re-run).

    Args:
        target_sr: Sampling rate in Hz the returned audio is resampled to.

    Returns:
        A tuple ``(audio_tensor, text)`` where ``audio_tensor`` is a float32
        tensor built from ``sample["audio"]["array"]`` and ``text`` is the
        value stored under ``sample["references"]``.

    Raises:
        StopIteration: If the stream has no further records.
    """
    state = _sample_stream_state
    if state["source"] is not dataset_stream:
        # First call, or the stream object was replaced: start a fresh pass.
        state["source"] = dataset_stream
        state["iterator"] = iter(dataset_stream)
    sample = next(state["iterator"])

    audio = sample["audio"]
    # np.array accepts plain Python lists as well as ndarrays and always
    # returns a fresh, writable buffer that torch can wrap safely.
    audio_tensor = torch.from_numpy(np.array(audio["array"], dtype=np.float32))

    orig_sr = audio["sampling_rate"]
    if orig_sr != target_sr:
        audio_tensor = T.Resample(orig_sr, target_sr)(audio_tensor)

    return audio_tensor, sample["references"]

In [30]:
# Smoke test: pull one sample and report its transcript and waveform shape.
audio_check, text_check = get_next_sample()
print(
    "   Data Loaded Successfully.",
    f"   Sample Text: {text_check}",
    f"   Audio Shape: {audio_check.shape}",
    sep="\n",
)

   Data Loaded Successfully.
   Sample Text: Sie hätten jedenfalls sogleich die sicherste Kontrolle für meine Darstellung an ihr, auf der anderen Seite gewinnt aber diese vielleicht an Unbefangenheit und historischer Treue.
   Audio Shape: torch.Size([193280])


In [31]:
class BaselineCNN(nn.Module):
    """Tiny 1-D convolutional baseline that scores every waveform sample.

    Three ``Conv1d`` layers (kernel size 3, padding 1, default stride) are
    stacked with ReLU activations in between, so the time dimension keeps its
    length and the channel dimension ends at ``n_classes``.

    Args:
        n_classes: Number of output channels, i.e. scores per time step.
    """

    def __init__(self, n_classes=32):
        super().__init__()
        self.cnn = nn.Sequential(
            nn.Conv1d(1, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv1d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv1d(64, n_classes, kernel_size=3, padding=1)
        )

    def forward(self, x):
        """Compute per-step scores for a waveform.

        Args:
            x: Tensor shaped ``(time,)``, ``(batch, time)`` or
                ``(batch, 1, time)``.

        Returns:
            Tensor shaped ``(batch, time, n_classes)``; a 1-D input is
            treated as a batch of one.
        """
        if x.ndim == 1:
            # A single unbatched waveform: add the batch dimension first.
            x = x.unsqueeze(0)
        if x.ndim == 2:
            # Conv1d expects an explicit channel dimension.
            x = x.unsqueeze(1)
        # Move channels last so the layout is (batch, time, classes).
        return self.cnn(x).permute(0, 2, 1)


# Instantiated with its default initialisation; no weights are loaded here.
model_1 = BaselineCNN()

In [32]:
def manual_ctc_decode(logits, vocab, blank_idx=0):
    """Greedy (best-path) CTC decoding of one utterance.

    Takes the highest-scoring class at every time step, collapses runs of
    identical consecutive ids, drops the blank id and joins the remaining
    ids' characters.

    Args:
        logits: Scores shaped ``(batch, time, classes)`` or ``(time, classes)``.
            For batched input only the first item is decoded.
        vocab: Mapping with a ``.get`` method from class id to its string;
            ids missing from it contribute an empty string.
        blank_idx: Class id treated as the CTC blank.

    Returns:
        The decoded string.
    """
    if logits.ndim == 2:
        # Unbatched input: add a batch dimension so indexing below is uniform.
        logits = logits.unsqueeze(0)

    # argmax is invariant under softmax, so the raw scores are used directly.
    # A single .tolist() avoids one tensor -> Python conversion per step.
    best_path = torch.argmax(logits, dim=-1)[0].tolist()

    decoded_chars = []
    prev_idx = None
    for idx in best_path:
        # Emit only on a change of id, and never for the blank.
        if idx != prev_idx and idx != blank_idx:
            decoded_chars.append(vocab.get(idx, ""))
        prev_idx = idx

    return "".join(decoded_chars)

In [33]:
# NOTE(review): this checkpoint is the HuBERT-large model fine-tuned on English
# LibriSpeech (960 h), not a German model, so the previous "(German)" label was
# misleading. Swap `hubert_id` for a German fine-tune if a like-for-like
# comparison on German audio is intended.
hubert_id = "facebook/hubert-large-ls960-ft"
print("   Loading HuBERT (English, LibriSpeech 960h fine-tune)...")
processor_3 = Wav2Vec2Processor.from_pretrained(hubert_id)
model_3 = HubertForCTC.from_pretrained(hubert_id)

   Loading HuBERT (German)...


In [34]:
# Model and processor are both fetched from the same checkpoint id.
whisper_id = "openai/whisper-small"
print("   Loading Whisper (German)...")
model_4 = WhisperForConditionalGeneration.from_pretrained(whisper_id)
processor_4 = WhisperProcessor.from_pretrained(whisper_id)

   Loading Whisper (German)...


In [35]:
# One checkpoint id drives both the CTC model and its processor.
w2v_id = "facebook/wav2vec2-large-xlsr-53-german"
print("   Loading Wav2Vec2 XLS-R (German)...")
model_5 = Wav2Vec2ForCTC.from_pretrained(w2v_id)
processor_5 = Wav2Vec2Processor.from_pretrained(w2v_id)

   Loading Wav2Vec2 XLS-R (German)...


In [38]:
class SimpleRNN(nn.Module):
    """Bidirectional GRU followed by a linear layer producing per-step scores.

    Args:
        input_size: Number of features per time step fed to the GRU.
        hidden_size: Hidden units per GRU direction.
        n_classes: Size of the linear layer's output for each time step.
    """

    def __init__(self, input_size=1, hidden_size=128, n_classes=32):
        super().__init__()
        gru_options = dict(batch_first=True, bidirectional=True)
        self.rnn = nn.GRU(input_size, hidden_size, **gru_options)
        # Both directions are concatenated, hence twice the hidden width.
        self.fc = nn.Linear(2 * hidden_size, n_classes)

    def forward(self, x):
        """Run the GRU and project every time step to ``n_classes`` scores.

        A 1-D input is reshaped to ``(1, time, 1)``; any other input is
        passed to the GRU unchanged.
        """
        batch = x[None, :, None] if x.ndim == 1 else x
        features, _final_state = self.rnn(batch)
        return self.fc(features)


# Instantiated with its default initialisation; no weights are loaded here.
model_6 = SimpleRNN()