# Use pretrained model for test predictions

In [1]:
!pip3 install kenlm
!pip3 install -r requirements.txt

Collecting kenlm
  Downloading kenlm-0.2.0.tar.gz (427 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/427.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.1/427.4 kB[0m [31m5.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m427.4/427.4 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: kenlm
  Building wheel for kenlm (pyproject.toml) ... [?25l[?25hdone
  Created wheel for kenlm: filename=kenlm-0.2.0-cp310-cp310-linux_x86_64.whl size=3184433 sha256=215a2ef667a2a91f0b788130383a82a1cc6465eca0103f0f8e73979982487771
  Stored in directory: /root/.cache/pip/wheels/fd/80/e0/18f4148e863fb137bd87e21ee2bf423b81b3ed6989dab95135
Success

In [2]:
from huggingface_hub import login
from utils import WRITE_ACCESS_TOKEN

# Authenticate this session with the Hugging Face Hub using the token exposed by utils.
# NOTE(review): make sure utils reads the token from the environment or a secrets
# store rather than hard-coding it in the repository.
login(token=WRITE_ACCESS_TOKEN)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [4]:
import os
import torch
import pandas as pd

from load_fleurs_nl import load_fleurs_nl
from load_fleurs_zu import load_fleurs_zu

from datasets import Audio, load_dataset
from utils import SR, remove_special_characters_batch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Which dataset to score and which model repo to load.
# For interactive use, replace the hard-coded values with the prompts below:
# dataset_name = input("Dataset name: ")
# repo_name = input("Repo name: ")
dataset_name = "fleurs_zu"
repo_name = "lucas-meyer/xls-r-fleurs_zu-run4"

# Load the validation and test splits for `dataset_name`.
if dataset_name in ("asr_af", "asr_xh"):
    # These datasets are pulled straight from the Hub; only the audio column is
    # re-cast to the SR sampling rate imported from utils.
    hub_dataset_id = "lucas-meyer/" + dataset_name
    val_set = load_dataset(hub_dataset_id, split="validation")
    test_set = load_dataset(hub_dataset_id, split="test")
    val_set = val_set.cast_column("audio", Audio(sampling_rate=SR))
    test_set = test_set.cast_column("audio", Audio(sampling_rate=SR))
else:
    dataset_dir = os.path.join("data", "speech_data", dataset_name)
    if not os.path.exists(dataset_dir):
        # First run for this dataset: build a local audiofolder (audio files plus
        # a metadata.csv with file_name/transcription columns).
        fleurs_loaders = {
            "fleurs_nl": load_fleurs_nl,
            "fleurs_zu": load_fleurs_zu,
        }
        if dataset_name not in fleurs_loaders:
            # Fail before touching the filesystem. Creating the directory with an
            # empty metadata.csv would make every later run take the
            # "directory already exists" path and load a broken dataset.
            raise ValueError(
                f"Unknown dataset_name {dataset_name!r}; expected 'asr_af', 'asr_xh', "
                f"or one of {sorted(fleurs_loaders)}"
            )

        os.makedirs(dataset_dir, exist_ok=True)
        # write_audio=True is passed through to the loader; its return value is
        # used as the rows of metadata.csv.
        csv_entries = list(fleurs_loaders[dataset_name](write_audio=True))
        metadata = pd.DataFrame(csv_entries, columns=['file_name', 'transcription'])
        metadata.to_csv(path_or_buf=os.path.join(dataset_dir, "metadata.csv"), sep=",", index=False)

    # Load the audiofolder, whether it was just created or already on disk.
    dataset = load_dataset("audiofolder", data_dir=dataset_dir)

    # Re-cast audio to the SR sampling rate and expose the reference text under
    # "sentence"; downstream cells rely on that column name for these datasets.
    val_set = dataset["validation"].cast_column("audio", Audio(sampling_rate=SR)).rename_column("transcription", "sentence")
    test_set = dataset["test"].cast_column("audio", Audio(sampling_rate=SR)).rename_column("transcription", "sentence")
    val_set = val_set.map(remove_special_characters_batch)
    test_set = test_set.map(remove_special_characters_batch)

# Acoustic model and its plain (greedy-decoding) processor, both from `repo_name`.
model_basic = Wav2Vec2ForCTC.from_pretrained(repo_name).to(device)
processor_basic = Wav2Vec2Processor.from_pretrained(repo_name)

# Optional second model whose processor decodes with a language model.
# The sentinel string "none" means "skip the LM variant".
# repo_name_with_LM = input("Repo with LM: ")
repo_name_with_LM = "none"
model_with_LM = None
processor_with_LM = None
if repo_name_with_LM != "none":
    model_with_LM = Wav2Vec2ForCTC.from_pretrained(repo_name_with_LM).to(device)
    processor_with_LM = Wav2Vec2ProcessorWithLM.from_pretrained(repo_name_with_LM)

Resolving data files:   0%|          | 0/2059 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/257 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/630 [00:00<?, ?it/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/214 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/328 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

In [5]:
def predict_transcription(audio_sample, model, processor):
    """Transcribe a single audio sample and return the lower-cased prediction.

    Args:
        audio_sample: Mapping with an "audio" entry holding the waveform under
            "array" and its rate under "sampling_rate".
        model: Model called as ``model(**inputs)``; its output must expose
            ``.logits`` and the model itself must expose ``.device``.
        processor: Processor paired with ``model``. A ``Wav2Vec2ProcessorWithLM``
            decodes from the raw logits; any other processor decodes from the
            per-frame argmax ids.

    Returns:
        str: The first (and only) decoded transcription, lower-cased.
    """
    # Build the model inputs and place them on the same device as the model that
    # was passed in, instead of relying on a notebook-level `device` variable.
    inputs = processor(
        audio_sample["audio"]["array"],
        sampling_rate=audio_sample["audio"]["sampling_rate"],
        return_tensors="pt",
        padding=True,
    ).to(model.device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**inputs).logits

    if isinstance(processor, Wav2Vec2ProcessorWithLM):
        # The LM-backed processor consumes the logits as a NumPy array and returns
        # an object whose `.text` holds the decoded strings.
        decoded = processor.batch_decode(logits.cpu().numpy()).text
    else:
        # Greedy decoding: most likely token id per frame, then collapse to text.
        decoded = processor.batch_decode(torch.argmax(logits, dim=-1))

    # A single sample was passed in, so the batch has exactly one entry.
    return decoded[0].lower()

In [6]:
# Number of test samples to print side by side with their predictions.
# 0 keeps the preview switched off; raise it to eyeball a few examples.
N_PREVIEW = 0

for i in range(N_PREVIEW):
    # Fetch the sample once and reuse it for every prediction below.
    sample = test_set[i]
    # The Fleurs splits had "transcription" renamed to "sentence" at load time;
    # the other datasets keep their original column name.
    reference_column = "sentence" if "fleurs" in dataset_name else "transcription"

    print(f"Test {i}:")
    # Get true transcription
    true_transcription = sample[reference_column].lower()
    print(f"  - true (.....): {true_transcription}")
    # Get prediction
    pred_basic = predict_transcription(sample, model_basic, processor_basic)
    print(f"  - pred (.....): {pred_basic}")
    # Get prediction with LM (compute it first, then print it)
    if model_with_LM is not None:
        pred_with_LM = predict_transcription(sample, model_with_LM, processor_with_LM)
        print(f"  - pred (w/ LM): {pred_with_LM}\n")

In [None]:
from evaluate import load

print(f"Results: {repo_name}", end="\n\n")

# Load the metric once, before the long prediction loops, so that a failure to
# load it surfaces immediately rather than after all predictions were made.
wer = load("wer")


def _score_split(split, split_label):
    """Predict every sample of `split`, print its WER score(s), and return the results.

    Args:
        split: Indexable dataset split; each item must work with
            predict_transcription and carry the reference text column.
        split_label: Prefix for the printed score lines (e.g. "Validation").

    Returns:
        tuple: (references, basic_predictions, lm_predictions, basic_wer, lm_wer).
        lm_predictions is empty and lm_wer is None when no LM model is loaded.
    """
    # The Fleurs splits had "transcription" renamed to "sentence" at load time.
    reference_column = "sentence" if "fleurs" in dataset_name else "transcription"
    n_samples = len(split)

    references = []
    basic_predictions = []
    lm_predictions = []

    for index in range(n_samples):
        # Index the split once per sample and reuse the item for the predictions
        # and the reference text.
        sample = split[index]
        basic_predictions.append(predict_transcription(sample, model_basic, processor_basic))
        if model_with_LM is not None:
            lm_predictions.append(predict_transcription(sample, model_with_LM, processor_with_LM))
        references.append(sample[reference_column].lower())

        # Print progress
        print(f"\r{index+1}/{n_samples}\t\t", end="")
    print("")

    basic_wer = wer.compute(predictions=basic_predictions, references=references)
    print(f"{split_label} score: {basic_wer}", end="\n\n")

    lm_wer = None
    if model_with_LM is not None:
        # Second score line with the same label: this one is the LM-decoded model.
        lm_wer = wer.compute(predictions=lm_predictions, references=references)
        print(f"{split_label} score: {lm_wer}", end="\n\n")

    return references, basic_predictions, lm_predictions, basic_wer, lm_wer


# --------------------------------------------------------------
# VALIDATION SET SCORE
# --------------------------------------------------------------
_score_split(val_set, "Validation")

# --------------------------------------------------------------
# TEST SET SCORE
# --------------------------------------------------------------
# The test-set results stay available under the module-level names this cell
# has always left behind.
(
    true_transcriptions,
    model_predictions,
    model_with_LM_predictions,
    wer_score_model,
    wer_score_model_with_LM,
) = _score_split(test_set, "Test")

Results: none

153/257		