In [None]:
from datasets import load_dataset, DownloadConfig
from huggingface_hub import login

# Log in to Hugging Face Hub using your token
# NOTE(review): `hf_token` is not defined in the visible part of this notebook; it has to
# exist in the kernel already (e.g. from an earlier cell), otherwise this raises NameError.
login(token=hf_token)

# Create a DownloadConfig with default settings. No token is passed to it here;
# presumably the credentials stored by login() above authorise the download — TODO confirm.
download_config = DownloadConfig()

# Load only the first 30 examples of the English "train" split (split="train[:30]"),
# passing the download_config to load_dataset
cv = load_dataset("mozilla-foundation/common_voice_13_0", "en", split="train[:30]", download_config=download_config)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import torch
import torchaudio
from torchaudio.transforms import MelSpectrogram, Resample

# Shared mel-spectrogram transform, built once and reused for every example:
# 128 mel bins on audio sampled at 16 kHz (the rate preprocess() resamples to).
mel_transform = MelSpectrogram(
    n_mels=128,
    sample_rate=16000,
)

def preprocess(batch):
    """Attach mel-spectrogram features and a lower-cased transcript to one example.

    Reads ``batch["audio"]["array"]``, ``batch["audio"]["sampling_rate"]`` and
    ``batch["sentence"]``; writes ``batch["input"]`` (mel spectrogram laid out as
    (time, mel)) and ``batch["target"]`` (the sentence in lower case), then
    returns the same mapping.
    """
    audio = batch["audio"]

    # Build the waveform as float32 explicitly (not float64) and add a leading
    # channel axis -> (1, num_samples)
    samples = torch.tensor(audio["array"], dtype=torch.float32)
    waveform = samples.unsqueeze(0)

    # Bring the clip to 16 kHz, the rate the mel transform is configured for
    to_16k = Resample(orig_freq=audio["sampling_rate"], new_freq=16000)
    waveform_16k = to_16k(waveform)

    # Mel spectrogram, channel axis removed and axes swapped -> (time, mel)
    mel = mel_transform(waveform_16k)
    features = mel.squeeze(0).transpose(0, 1)

    batch["input"] = features
    batch["target"] = batch["sentence"].lower()
    return batch

# Apply preprocessing: run preprocess() over the dataset so each example gains the
# "input" and "target" fields. The mapped dataset replaces `cv`.
cv = cv.map(preprocess)






In [None]:
# Fixed character inventory (lower-case a-z, space, apostrophe); it is hard-coded
# here rather than derived from the data.
vocab = [ch for ch in "abcdefghijklmnopqrstuvwxyz '"]
# Character -> class id, where the id is the character's position in vocab
char2idx = dict(zip(vocab, range(len(vocab))))
# The CTC blank takes the first id after the real characters
blank_idx = len(char2idx)

def encode_text(text):
    """Map ``text`` to a list of class ids, skipping characters not in ``char2idx``."""
    encoded = []
    for ch in text:
        if ch in char2idx:
            encoded.append(char2idx[ch])
    return encoded


In [None]:
from torch.nn import TransformerEncoder, TransformerEncoderLayer

In [None]:
import torch
import torch.nn as nn
import math

class Transformer_ASRModel(nn.Module):
    """Transformer-encoder acoustic model that emits per-frame log-probabilities.

    Forward pipeline: per-frame linear projection to ``hidden_dim`` -> add a
    learned positional embedding -> multiply by ``sqrt(hidden_dim)`` -> stack of
    ``num_layers`` Transformer encoder layers -> linear projection to
    ``output_dim`` classes -> log-softmax over the class axis.

    No padding mask is passed to the encoder, so when a batch is zero-padded the
    padded frames take part in self-attention.

    Args:
        input_dim: Size of each input feature frame (last axis of ``x``).
        hidden_dim: Model width used by the projection, positions and encoder.
        output_dim: Number of output classes per frame.
        num_heads: Attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        max_seq_len: Number of rows in the learned positional table; also the
            longest sequence ``forward`` accepts.
        dropout: Dropout probability inside each encoder layer.
    """

    def __init__(self, input_dim=128, hidden_dim=256, output_dim=29, num_heads=4, num_layers=4, max_seq_len=5000, dropout=0.1):
        super().__init__()
        self.input_fc = nn.Linear(input_dim, hidden_dim)

        # Learned positional table: one hidden_dim vector per time step
        self.positional_encoding = nn.Parameter(torch.zeros(max_seq_len, hidden_dim))
        nn.init.normal_(self.positional_encoding, mean=0, std=math.sqrt(2.0 / hidden_dim))  # Scaled initialization

        # NOTE(review): forward() only calls self.transformer, and nn.TransformerEncoder
        # stacks copies of this layer, so the weights held by this attribute itself look
        # unused. It is kept so attribute access and state_dict keys do not change.
        self.encoder_layer = TransformerEncoderLayer(
            d_model=hidden_dim,
            nhead=num_heads,
            dim_feedforward=hidden_dim * 4,
            dropout=dropout,
            batch_first=True
        )
        self.transformer = TransformerEncoder(self.encoder_layer, num_layers=num_layers)

        self.fc_output = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Compute per-frame log-probabilities.

        Args:
            x: Tensor of shape ``(batch, time, input_dim)``.

        Returns:
            Tensor of shape ``(batch, time, output_dim)`` holding log-softmax
            scores over the class axis.

        Raises:
            RuntimeError: If ``x`` is not 3-dimensional, or if its time axis is
                longer than the positional table (``max_seq_len``).
        """
        if x.dim() != 3:
            raise RuntimeError(
                f"expected input of shape (batch, time, features), got {tuple(x.shape)}"
            )

        # Fail early with a readable message instead of a broadcasting error
        # when there are more frames than positional rows.
        seq_len = x.size(1)
        max_seq_len = self.positional_encoding.size(0)
        if seq_len > max_seq_len:
            raise RuntimeError(
                f"sequence length {seq_len} exceeds max_seq_len={max_seq_len}; "
                "construct the model with a larger max_seq_len or shorten the input"
            )

        x = self.input_fc(x)

        # Add the first seq_len positional rows, broadcast over the batch axis
        x = x + self.positional_encoding[:seq_len, :].unsqueeze(0)
        # Scale by sqrt(hidden_dim); this happens after the addition, so the
        # positional term is scaled together with the projected features.
        x = x * math.sqrt(self.positional_encoding.size(-1))
        x = self.transformer(x)
        x = self.fc_output(x)

        return x.log_softmax(dim=-1)


In [None]:
from torch.utils.data import DataLoader

def collate_fn(batch):
    """Assemble a list of examples into padded inputs and flat CTC targets.

    Each example supplies ``"input"`` (a nested list, converted to a float32
    tensor, or an object used as-is) and ``"target"`` (text encoded with
    ``encode_text``).

    Returns a 4-tuple:
        padded inputs (batch first, zero-padded along the first axis of each
        input), all encoded targets concatenated into one int32 tensor,
        a tensor of unpadded input lengths, and a tensor of target lengths.
    """
    features = []
    for example in batch:
        feats = example["input"]
        if isinstance(feats, list):
            feats = torch.tensor(feats, dtype=torch.float32)
        features.append(feats)

    labels = []
    for example in batch:
        labels.append(torch.tensor(encode_text(example["target"]), dtype=torch.int32))

    # Lengths are taken before padding so they describe the real data
    feature_lengths = [feats.shape[0] for feats in features]
    label_lengths = [len(label) for label in labels]

    padded = nn.utils.rnn.pad_sequence(features, batch_first=True)
    flat_labels = torch.cat(labels)

    return padded, flat_labels, torch.tensor(feature_lengths), torch.tensor(label_lengths)


# Batches of two examples, reshuffled each epoch and assembled by collate_fn
dataloader = DataLoader(
    cv,
    batch_size=2,
    shuffle=True,
    collate_fn=collate_fn,
)


In [None]:
# Model with its default hyper-parameters; the default output_dim=29 provides one
# class per character of the 28-entry alphabet plus one for the CTC blank.
model = Transformer_ASRModel()
# blank=blank_idx selects the CTC blank class; zero_infinity=True zeroes infinite losses.
criterion = nn.CTCLoss(blank=blank_idx, zero_infinity=True)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

model.train()
for epoch in range(101):
    total_loss = 0  # running sum of the per-batch losses in this epoch
    for inputs, targets, input_lens, target_lens in dataloader:
        logits = model(inputs)  # [B, T, C]; the model already applies log_softmax
        log_probs = logits.permute(1, 0, 2)  # [T, B, C] for CTC
        loss = criterion(log_probs, targets, input_lens, target_lens)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    # Report every 10th epoch (printed as 1, 11, ..., 101). The value is the sum of
    # the batch losses, not their mean, so it scales with the number of batches.
    if epoch%10==0:
        print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

Epoch 1, Loss: 144.5386
Epoch 11, Loss: 122.5357
Epoch 21, Loss: 70.7645
Epoch 31, Loss: 70.9828
Epoch 41, Loss: 70.9625
Epoch 51, Loss: 70.9639
Epoch 61, Loss: 71.0234
Epoch 71, Loss: 70.8023
Epoch 81, Loss: 70.9369
Epoch 91, Loss: 70.7535
Epoch 101, Loss: 71.0757


In [None]:
# Make sure vocab ends with exactly one placeholder entry for the CTC blank.
# Trimming back to the real characters (those indexed by char2idx) first keeps this
# cell idempotent: re-running it neither appends a second placeholder nor moves
# blank_idx away from the class id the loss and the model were set up with.
del vocab[len(char2idx):]
vocab.append(' ')  # or whatever you want to use to represent "blank"
blank_idx = len(vocab) - 1

In [None]:
def greedy_decoder(logits):
    """Greedy CTC decoding: best class per frame, repeats collapsed, blanks dropped.

    ``logits`` is scored along its last axis; each row of the resulting id
    matrix ([B, T]) becomes one decoded string. Returns a list of strings.
    """
    best_ids = logits.argmax(dim=-1)  # Shape [B, T]

    transcripts = []
    for frame_ids in best_ids:
        ids = frame_ids.tolist()
        # Pair every id with the one before it; -1 stands in before the first frame
        previous = [-1] + ids[:-1]
        kept = [
            vocab[cur]
            for prev, cur in zip(previous, ids)
            if cur != prev and cur != blank_idx
        ]
        transcripts.append(''.join(kept))
    return transcripts

# Run on a single example. Note that `cv` is the same dataset the DataLoader trains
# on, so this is a training example rather than held-out test data.
model.eval()
with torch.no_grad():
    sample = cv[0]

    # Rebuild the feature tensor and add a leading batch axis of size 1
    input_tensor = torch.tensor(sample["input"], dtype=torch.float32).unsqueeze(0)

    output = model(input_tensor)

    # greedy_decoder returns one string per batch row; there is only one row here
    prediction = greedy_decoder(output)

    print("Predicted:", prediction[0])
    print("Actual   :", sample["target"])


Predicted: ts dece h a c o de i ide a  ane
Actual   : this device has a cathode inside an anode wire cage.


In [None]:
# Show the character inventory; a character's position in this list is its class id.
print(vocab)

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', ' ', "'"]
