In [2]:
from transformers import AutoFeatureExtractor, AutoModel
import torch
print(torch.cuda.is_available())  # Should return True
import torchaudio
from datasets import Dataset
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the feature extractor and model
feature_extractor = AutoFeatureExtractor.from_pretrained("ntu-spml/distilhubert")
model = AutoModel.from_pretrained("ntu-spml/distilhubert").to(device)


True


In [3]:
# import dataset
training_dataset = torchaudio.datasets.LIBRISPEECH(
    root="/scratch/pippalin2/jupyter/GMM-DistilHuBERT/data",    # where your LibriSpeech folder lives
    url="train-clean-100",       # this must match the subfolder name
    download=False            
)
waveform, sample_rate, transcript, _, _, _ = training_dataset[0]
print("Transcript:", transcript)

# Resample if needed: DHuBERT requires 16kHz
if sample_rate != 16000:
    resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
    waveform = resampler(waveform)


# Wrap your torchaudio dataset into a generator format (for huggingface preprocessor)
def generator():
    for waveform, sample_rate, transcript, *_ in training_dataset:
        yield {
            "audio": {
                "array": waveform.squeeze(0).numpy(),  # remove channel dim if needed
                "sampling_rate": sample_rate
            },
            "text": transcript
        }

hf_dataset = Dataset.from_generator(generator)

Transcript: CHAPTER ONE MISSUS RACHEL LYNDE IS SURPRISED MISSUS RACHEL LYNDE LIVED JUST WHERE THE AVONLEA MAIN ROAD DIPPED DOWN INTO A LITTLE HOLLOW FRINGED WITH ALDERS AND LADIES EARDROPS AND TRAVERSED BY A BROOK


### Test DHuBERT

In [4]:
inputs = feature_extractor(waveform.squeeze(), sampling_rate=16000, return_tensors="pt")
inputs = {k: v.to(device) for k, v in inputs.items()}
'''
{
  'input_values': tensor of shape [1, num_samples]
}
'''
# Run inference
with torch.no_grad(): # no need gradient since we are just testing
    outputs = model(**inputs)

# Output hidden state shape
print("Hidden state shape:", outputs.last_hidden_state.shape)
# 704 tokens (acoustic representation), each a 768-d vector

Hidden state shape: torch.Size([1, 704, 768])


### Try Fine-Tuning on ASR

In [6]:
from transformers import AutoProcessor
from transformers import HubertForCTC
import jiwer
from transformers import TrainingArguments, Trainer

In [8]:
processor = AutoProcessor.from_pretrained("ntu-spml/distilhubert")

# Preprocessing function
def prepare(example):
    audio = example["audio"]

    # Extract audio features
    example["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]

    # Encode labels (character-level)
    with processor.as_target_processor():
        example["labels"] = processor(example["text"]).input_ids

    return example

# Apply preprocessing
processed_ds = librispeech.map(prepare, remove_columns=librispeech.column_names)

OSError: Can't load tokenizer for 'ntu-spml/distilhubert'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'ntu-spml/distilhubert' is the correct path to a directory containing all relevant files for a Wav2Vec2CTCTokenizer tokenizer.

In [None]:
model = HubertForCTC.from_pretrained(
    "ntu-spml/distilhubert",
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id
)

In [None]:
def compute_metrics(pred):
    pred_ids = pred.predictions.argmax(-1)
    pred_str = processor.batch_decode(pred_ids)
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)
    wer = jiwer.wer(label_str, pred_str)
    return {"wer": wer}

In [None]:
training_args = TrainingArguments(
    output_dir="./distilhubert-asr",
    group_by_length=True,
    per_device_train_batch_size=8,
    evaluation_strategy="steps",
    num_train_epochs=3,
    fp16=True,
    save_steps=100,
    eval_steps=100,
    logging_steps=25,
    learning_rate=1e-4,
    warmup_steps=500,
    save_total_limit=2,
    gradient_checkpointing=True,
    logging_dir="./logs"
)

trainer = Trainer(
    model=model,
    data_collator=processor.feature_extractor.pad,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=processed_ds,
    tokenizer=processor
)

trainer.train()



In [None]:
from datasets import load_dataset

# Load test sample: here I already have training data downloaded
test_ds = load_dataset("librispeech_asr", "clean", split="test.clean[:1%]")
test_ds = test_ds.map(prepare)

# Predict
pred = trainer.predict(test_ds)
print("WER:", compute_metrics(pred))
