In [None]:
from datasets import load_dataset, DownloadConfig
from huggingface_hub import login

# Log in to Hugging Face Hub using your token.
# NOTE(review): `hf_token` is not defined in any visible cell; it has to exist in
# the kernel already (e.g. from an earlier cell), otherwise this raises NameError.
login(token=hf_token)

# Create a DownloadConfig with default settings (no token is passed to it here)
download_config = DownloadConfig()

# Load only the first 30 training samples of the English config,
# passing the download_config to load_dataset
cv = load_dataset("mozilla-foundation/common_voice_13_0", "en", split="train[:30]", download_config=download_config)

In [None]:
import torch
import torchaudio
from torchaudio.transforms import MelSpectrogram, Resample

# Shared mel-spectrogram transform with 128 mel bins, configured for 16 kHz audio.
# The sample rate here has to match the rate `preprocess` resamples to.
mel_transform = MelSpectrogram(sample_rate=16000, n_mels=128)

# Resample transforms keyed by (orig_freq, new_freq). A Resample object precomputes
# its resampling kernel when constructed, so one instance per rate pair is built
# once and reused instead of being rebuilt for every example.
_RESAMPLER_CACHE = {}


def _get_resampler(orig_freq, new_freq=16000):
    """Return the cached Resample transform for this rate pair, creating it on first use."""
    key = (orig_freq, new_freq)
    if key not in _RESAMPLER_CACHE:
        _RESAMPLER_CACHE[key] = Resample(orig_freq=orig_freq, new_freq=new_freq)
    return _RESAMPLER_CACHE[key]


def preprocess(batch):
    """Turn one dataset example into model features and a lowercase transcript.

    Args:
        batch: A single example (the function is mapped without batching) that
            provides ``batch["audio"]["array"]``, ``batch["audio"]["sampling_rate"]``
            and ``batch["sentence"]``.

    Returns:
        The same mapping with two added fields:
        ``"input"``  - mel spectrogram laid out as (time, mel);
        ``"target"`` - the sentence converted to lowercase.
    """
    audio = batch["audio"]
    speech_array = audio["array"]
    orig_sample_rate = audio["sampling_rate"]

    # Convert to float32 tensor directly (not float64!)
    waveform = torch.tensor(speech_array, dtype=torch.float32).unsqueeze(0)  # shape: (1, num_samples)

    # Resample to 16kHz, reusing the transform built for this source rate
    audio_resampled = _get_resampler(orig_sample_rate)(waveform)

    # Compute mel spectrogram
    mel_spec = mel_transform(audio_resampled).squeeze(0).transpose(0, 1)  # shape: (time, mel)

    # Add fields to the batch
    batch["input"] = mel_spec
    batch["target"] = batch["sentence"].lower()
    return batch

# Apply preprocessing to every example (adds the "input" and "target" fields).
# Note: this rebinds `cv`, so the un-preprocessed dataset is no longer referenced.
cv = cv.map(preprocess)




Map: 100%|██████████| 30/30 [00:13<00:00,  2.26 examples/s]


In [None]:
# Fixed character vocabulary (not derived from the dataset):
# lowercase a-z, space and apostrophe -> 28 symbols.
vocab = list("abcdefghijklmnopqrstuvwxyz '")
# Character -> integer id, in vocab order
char2idx = {c: i for i, c in enumerate(vocab)}
# The CTC blank uses the first id after the real characters
blank_idx = len(vocab)

def encode_text(text):
    """Convert a string to a list of vocabulary ids.

    Characters that have no entry in ``char2idx`` are skipped rather than
    raising, so the result can be shorter than ``text``.
    """
    encoded = []
    for symbol in text:
        if symbol not in char2idx:
            continue
        encoded.append(char2idx[symbol])
    return encoded


In [None]:
import torch.nn as nn

class CNN_ASRModel(nn.Module):
    """Three stacked 1-D convolutions followed by a per-frame linear classifier.

    The forward pass takes features shaped [B, T, F] and returns per-frame
    log-probabilities shaped [B, T, output_dim]. Every convolution uses
    kernel_size=3 with padding=1, so the time dimension T is preserved.
    """

    def __init__(self, input_dim=128, hidden_dim=256, output_dim=29):  # 28 chars + blank
        super().__init__()
        conv_out_channels = hidden_dim * 2

        # (in_channels, out_channels) for each convolution, in order
        channel_plan = [
            (input_dim, 128),
            (128, 256),
            (256, conv_out_channels),
        ]
        layers = []
        for in_channels, out_channels in channel_plan:
            layers.append(nn.Conv1d(in_channels, out_channels, kernel_size=3, padding=1))
            layers.append(nn.ReLU())

        self.cnn = nn.Sequential(*layers)
        self.fc = nn.Linear(conv_out_channels, output_dim)

    def forward(self, x):
        """Return log-softmax scores of shape [B, T, output_dim] for input [B, T, F]."""
        channels_first = x.transpose(1, 2)  # [B, T, F] -> [B, F, T]
        conv_features = self.cnn(channels_first)
        time_major = conv_features.transpose(1, 2)  # [B, F, T] -> [B, T, F]
        scores = self.fc(time_major)
        return nn.functional.log_softmax(scores, dim=-1)


In [None]:
from torch.utils.data import DataLoader

def collate_fn(batch):
    """Collate dataset rows into padded inputs plus flattened CTC targets.

    Args:
        batch: Sequence of examples, each with an ``"input"`` feature matrix
            whose first dimension is time (nested list, NumPy array or tensor)
            and a ``"target"`` string.

    Returns:
        A 4-tuple ``(inputs_padded, targets_concatenated, input_lengths, target_lengths)``:
        inputs zero-padded along time with batch first, all encoded targets
        concatenated into one 1-D int32 tensor, and the unpadded length of each
        input and each target as tensors.
    """
    # as_tensor accepts nested lists, NumPy arrays and tensors alike and always
    # yields float32; a tensor that is already float32 is reused without copying.
    inputs = [torch.as_tensor(b["input"], dtype=torch.float32) for b in batch]
    targets = [torch.tensor(encode_text(b["target"]), dtype=torch.int32) for b in batch]

    # Lengths are taken before padding so the loss can ignore the padded frames
    input_lengths = [i.shape[0] for i in inputs]
    target_lengths = [len(t) for t in targets]

    inputs_padded = nn.utils.rnn.pad_sequence(inputs, batch_first=True)
    targets_concatenated = torch.cat(targets)

    return inputs_padded, targets_concatenated, torch.tensor(input_lengths), torch.tensor(target_lengths)


# Batches of 2 examples, reshuffled each epoch; collate_fn pads the inputs and
# flattens the targets for the CTC loss.
dataloader = DataLoader(cv, batch_size=2, shuffle=True, collate_fn=collate_fn)


In [None]:
# Model with default sizes, CTC loss using `blank_idx` as the blank class
# (zero_infinity=True zeroes out infinite losses and their gradients), and Adam.
model = CNN_ASRModel()
criterion = nn.CTCLoss(blank=blank_idx, zero_infinity=True)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Train for 200 passes over the dataloader
model.train()
for epoch in range(200):
    # Sum (not mean) of the per-batch losses seen in this epoch
    total_loss = 0
    for inputs, targets, input_lens, target_lens in dataloader:
        logits = model(inputs)  # [B, T, C]
        log_probs = logits.permute(1, 0, 2)  # [T, B, C] for CTC
        loss = criterion(log_probs, targets, input_lens, target_lens)

        # Clear old gradients, backpropagate, then update the weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    # Report every 10th epoch; the printed epoch number is 1-based
    if epoch%10==0:
        print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

Epoch 1, Loss: 401.9208
Epoch 11, Loss: 47.0652
Epoch 21, Loss: 46.3440
Epoch 31, Loss: 45.3573
Epoch 41, Loss: 44.5494
Epoch 51, Loss: 43.2141
Epoch 61, Loss: 41.7673
Epoch 71, Loss: 38.8777
Epoch 81, Loss: 35.7319
Epoch 91, Loss: 33.4455
Epoch 101, Loss: 32.9738
Epoch 111, Loss: 26.1256
Epoch 121, Loss: 27.7236
Epoch 131, Loss: 21.9867
Epoch 141, Loss: 20.4134
Epoch 151, Loss: 18.3801
Epoch 161, Loss: 26.4442
Epoch 171, Loss: 17.7505
Epoch 181, Loss: 17.6687
Epoch 191, Loss: 15.9362


In [None]:
# Reserve one placeholder slot in `vocab` for the CTC blank so that
# vocab[blank_idx] is indexable. `char2idx` is never mutated, so its size is a
# stable anchor: the blank always sits right after the real characters and the
# placeholder is appended only once. Re-running this cell therefore no longer
# grows `vocab` or shifts `blank_idx` away from the class the model was trained with.
blank_idx = len(char2idx)
if len(vocab) == blank_idx:
    vocab.append(' ')

In [None]:
def greedy_decoder(logits):
    """Greedy CTC decoding: best class per frame, collapse repeats, drop blanks.

    Args:
        logits: Per-frame scores whose last dimension is the class axis; the
            first dimension is iterated as the batch.

    Returns:
        A list with one decoded string per batch element.
    """
    best_paths = logits.argmax(dim=-1)
    transcripts = []
    for path in best_paths:
        chars = []
        last_id = -1
        for token in path:
            token_id = token.item()
            is_repeat = token_id == last_id
            # Track the previous frame even when it is a blank, so a blank
            # between two equal characters keeps both of them.
            last_id = token_id
            if is_repeat or token_id == blank_idx:
                continue
            chars.append(vocab[token_id])
        transcripts.append(''.join(chars))
    return transcripts

# Qualitative check on a single example. `cv` is the same dataset the dataloader
# trains on, so this shows how well the model fits a training sample, not how
# well it generalises.
model.eval()
with torch.no_grad():
    sample = cv[0]


    # Rebuild a float32 tensor from the stored features and add a batch dimension
    input_tensor = torch.tensor(sample["input"], dtype=torch.float32).unsqueeze(0)  # [1, T, F]

    output = model(input_tensor)
    prediction = greedy_decoder(output)

    print("Predicted:", prediction[0])
    print("Actual   :", sample["target"])


Predicted: ts device has a cathode iside an as
Actual   : this device has a cathode inside an anode wire cage.


In [None]:
# Show the current vocabulary list. Its length depends on whether the cell that
# appends the blank placeholder has already been run in this kernel.
print(vocab)

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', ' ', "'"]
