In [6]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from datasets import load_dataset
import torch
 
# load model and processor
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
    
# load dummy dataset and read soundfiles
ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")


Some weights of the model checkpoint at facebook/wav2vec2-xlsr-53-espeak-cv-ft were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-xlsr-53-espeak-cv-ft and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should prob

In [13]:
# tokenize
input_values = processor(ds[6]["audio"]["array"], return_tensors="pt", sampling_rate=16000).input_values

# retrieve logits
with torch.no_grad():
  logits = model(input_values).logits

# take argmax and decode
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)
print(transcription)



['ð ə b ʌ z ɚ z w ɚ t ɹ ɪ ɡ ɚ d ɪ z m ʌ s ʊ l z ɪ n t ə k ə m p l iː t ɹ iː l æ k s eɪ ʃ ə n']


In [10]:
print(f"Text: {ds[6]['text']}")
print(len(ds[6]["audio"]["array"]))

Text: THE BUZZER'S WHIRR TRIGGERED HIS MUSCLES INTO COMPLETE RELAXATION
78880


In [22]:
import os, sys, csv, json
import random
from collections import defaultdict
from pandas import DataFrame as df
import torch
import torchaudio
from datasets import Dataset


# wav_file = "/exp/nbafna/data/l2_arctic/l2arctic_release_v5/ABA/wav/arctic_a0001.wav"

# Test on EdAcc dataset
def stm_reader(stm_path):
    stm_data = []
    with open(stm_path, "r") as f:
        for line in f:
            line_data = {}
            parts = line.split()
            line_data["audio_file"] = parts[0]
            line_data["channel"] = parts[1]
            line_data["speaker"] = parts[2]
            line_data["start_time"] = float(parts[3])
            line_data["end_time"] = float(parts[4])
            line_data["label"] = parts[5]
            line_data["transcript"] = " ".join(parts[6:])
            stm_data.append(line_data)
    return stm_data

def load_edacc(num_samples = None):
    test_stm_path = "/exp/nbafna/data/edacc/edacc_v1.0/test/stm"
    dev_stm_path = "/exp/nbafna/data/edacc/edacc_v1.0/dev/stm"
    data_path = "/exp/nbafna/data/edacc/edacc_v1.0/data"

    audio_files = {}
    resampler = torchaudio.transforms.Resample(orig_freq=32000, new_freq=16000)
    for audio_file in os.listdir(data_path):
        audio_files[audio_file[:-4]], sr = torchaudio.load(os.path.join(data_path, audio_file))
        audio_files[audio_file[:-4]] = resampler(audio_files[audio_file[:-4]])
        # audio_files[audio_file[:-4]] = language_id.load_audio(os.path.join(data_path, audio_file))
        sr = 16000 # This is the sampling rate for the language_id model, audio is normalized when loaded
    print(f"Loaded {len(audio_files)} audio files")

    speaker2lang = {}
    # linguistic_background = "/exp/nbafna/data/edacc/edacc_v1.0/linguistic_background.csv"
    # with open(linguistic_background, "r") as f:
    #     reader = csv.reader(f)
    #     for row in reader:
    #         speaker2lang[row[1]] = row[12]
    participant2accent_path = "/exp/nbafna/data/edacc/edacc_v1.0/participant2accent.json"
    with open(participant2accent_path, "r") as f:
        speaker2lang = json.load(f)
    print(f"Recorded {len(speaker2lang)} speakers")

    all_data = []
    stm_data = stm_reader(test_stm_path) + stm_reader(dev_stm_path)
    for line in stm_data[:100]:
        audio_file = line["audio_file"]
        signal = audio_files[audio_file]
        signal = signal.squeeze().numpy()
        segment = signal[int(line["start_time"]*sr):int(line["end_time"]*sr)]
        transcript = line["transcript"]

        if "IGNORE_TIME_SEGMENT_IN_SCORING" in transcript:
            continue
        # Filter out signals with less than 6 seconds
        if segment.shape[0] < 6*16000:
            continue
        # Chunk into uniform windows of K seconds
        K = 6
        for i in range(0, len(segment), K*16000):
            if i+K*16000 > len(segment):
                break
            all_data.append({"signal": segment[i:i+K*16000], \
                             "lang": speaker2lang[line["speaker"]],\
                                "transcript": transcript})

        # segment = segment[:10*16000]
        # lang = speaker2lang[line["speaker"]]
        # all_data.append({"signal": segment, "lang": lang})

        # if len(all_data)%10 == 0:
        #     print(f"Printing out sample")
        #     print(f"Lang: {lang}")
        #     print(f"Speaker: {line['speaker']}")
        #     print(f"Start and end times: {line['start_time']}, {line['end_time']}")
        #     print(f"Expected length: {int((line['end_time']-line['start_time'])*sr)}")
        #     print(f"Length of audio: {segment.shape}")
    
    print(f"Loaded {len(all_data)} segments")
    print(f"Sample: {all_data[0]}")
    all_langs = set([f["lang"] for f in all_data])
    print(f"Languages: {all_langs}")

    if num_samples is not None:
        all_data = random.sample(all_data, min(len(all_data), num_samples))
    all_data = {"signal": [f["signal"] for f in all_data], \
                "lang": [f["lang"] for f in all_data],\
                    "transcript": [f["transcript"] for f in all_data]}
    
    return Dataset.from_dict(all_data)

                
dataset = load_edacc()



Loaded 76 audio files
Recorded 122 speakers
Loaded 28 segments
Sample: {'signal': array([-0.00042685, -0.00044512, -0.00030128, ...,  0.02896704,
        0.01170549, -0.03144089], dtype=float32), 'lang': 'scottish', 'transcript': "RIGHT AND NOW I THINK WE JUST HAVE TO CONVERSE FOR FIFTEEN MINUTES UM LET'S START WITH THE FIRST ONE WHEN YOU WERE A KID WHAT KINDS OF GAMES DID YOU PLAY IS THERE ONE YOU REMEMBER WELL LIKE YOU CAN GO FIRST 'CAUSE YOU HAD A GAME"}
Languages: {'scottish'}


In [23]:
for data in dataset.select(range(20)):
    if "IGNORE_TIME_SEGMENT_IN_SCORING" in data["transcript"]:
        continue
    print(data["transcript"])
    input_values = processor(data["signal"], return_tensors="pt").input_values
    logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
    print(transcription)
    print("\n")


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


RIGHT AND NOW I THINK WE JUST HAVE TO CONVERSE FOR FIFTEEN MINUTES UM LET'S START WITH THE FIRST ONE WHEN YOU WERE A KID WHAT KINDS OF GAMES DID YOU PLAY IS THERE ONE YOU REMEMBER WELL LIKE YOU CAN GO FIRST 'CAUSE YOU HAD A GAME


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


['ɹ aɪ t æ n d n ɑː aɪ θ ɪ ŋ k w iː dʒ ʌ s t h æ f t ə k ə n v ɚ s t ɚ f ɪ f t iː n m ɪ n ɪ t ts m ɐ s']


RIGHT AND NOW I THINK WE JUST HAVE TO CONVERSE FOR FIFTEEN MINUTES UM LET'S START WITH THE FIRST ONE WHEN YOU WERE A KID WHAT KINDS OF GAMES DID YOU PLAY IS THERE ONE YOU REMEMBER WELL LIKE YOU CAN GO FIRST 'CAUSE YOU HAD A GAME


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


['s ɑː w ʌ z f ɚ s t w ʌ n w ɛ n uː ɹ ɪ k ɪ d w ɔ t k aɪ m z ʌ v ɡ eɪ m z d ɪ d j uː p l eɪ ɪ z ð ə w ʌ n j uː ɹ m ɛ m b ɚ w ɛ l aɪ j ɪ ŋ ɡ oʊ f ɚ']


WHICH IS JUST THE WHICH IS JUST A UH BASICALLY YOU HAVE TWO ENDS OF A PITCH AND YOUR GOAL'S TO REACH THE OTHER END OF THE PITCH WITHOUT BEING TAGGED BY SOMEONE IN THE MIDDLE AND NO RULES APPLY PEOPLE WERE BEATEN UP BLOOD EVERYWHERE OH GOOD TIMES


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


['w ɪ tʃ ɪ z dʒ ʌ s t ə æ b eɪ s k ʊ j h æ v t uː ɛ n z ʌ v ɐ p ɪ tʃ æ n d j uː ɡ oː l z s t ɹ uː v iː tʃ ð iː ʌ ð ɚ ɹ ɛ n t ə ð ə p ɪ']


WHICH IS JUST THE WHICH IS JUST A UH BASICALLY YOU HAVE TWO ENDS OF A PITCH AND YOUR GOAL'S TO REACH THE OTHER END OF THE PITCH WITHOUT BEING TAGGED BY SOMEONE IN THE MIDDLE AND NO RULES APPLY PEOPLE WERE BEATEN UP BLOOD EVERYWHERE OH GOOD TIMES


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


['dʒ w ɪ ð aʊ t b iː ɪ ŋ t æ k t b aɪ s ʌ m w ʌ n ɪ n ð ə m ɪ d ʊ æ n d n oʊ ɹ uː l z ʌ p l aɪ p iː p əl w ɚ']


THAT WOULD NOT GO WELL UM SIMILAR TO THE SIMILARLY TO THAT I PLAYED LOTS OF SHARKS AND FISHES IN UM LIKE


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


['h aɪ w ʊ l n ɔ t ɡ oː w ɛ l æ m s ɪ m ɪ l t ɪ ɪ t s ɪ m ɪ ə l i t ə ð æ t ɐ p d l ʌ s ə s ʃ ɑː k s ə n f ɪ ʃ ɪ z']


WE'D UH THE IDEA IT'S ESSENTIALLY THE SAME AS BULLDOG YOU HAVE TO TRY AND RUN TO THE END OF THE FIELD THERE'S ONE PERSON WHO'S THIS SHARK OR MAYBE TWO PEOPLE AND THEY CAN MOVE ANY DIRECTION AND IF THEY TAKE YOU YOU BECOME SEAWEED AND YOU CAN'T MOVE YOUR FEET BUT YOU CAN LIKE LEAN OUT TO CATCH PEOPLE


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


['ð ɪ aɪ d iː ɾ ɪ z ɪ z ɪ s ɛ n ʃ l i ð ə s eɪ m ə z b ʊ ɡ oʊ k æ t ə d ɹ aɪ n m ə n d t ɪ n ð ə f iː l d ð ɛ ɹ ɪ z w ʌ n p ɚ s ə n h uː z ð ɪ ʃ t']


WE'D UH THE IDEA IT'S ESSENTIALLY THE SAME AS BULLDOG YOU HAVE TO TRY AND RUN TO THE END OF THE FIELD THERE'S ONE PERSON WHO'S THIS SHARK OR MAYBE TWO PEOPLE AND THEY CAN MOVE ANY DIRECTION AND IF THEY TAKE YOU YOU BECOME SEAWEED AND YOU CAN'T MOVE YOUR FEET BUT YOU CAN LIKE LEAN OUT TO CATCH PEOPLE


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


['ʃ ɑː k k ɚ m ɪ v iː t uː p iː p əl æ n ð eɪ k ə n m uː v ɛ n i d ə ɹ ɹ ɛ k ʃ ə n æ n ɪ f ð t eɪ k j uː j uː b ɪ k ʌ m s iː w iː d æ n d j uː k']


AND THEN IF THE GAME WAS DRAGGING ON THE SEAWEED WOULD BECOME CRABS SO THEY CAN MOVE SIDE TO SIDE BUT NOT FORWARDS AND BACKWARDS SO IT WAS REALLY IT WAS QUITE QUITE A LITTLE <OVERLAP>


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


['æ n d ð ɛ n n ɪ f ð ə ɡ eɪ m w ʌ z d ɹ æ ɡ ə n ɔ n ð ə s iː w iː d ə b ɪ k ʌ m k ɹ æ b z s l eɪ k ə d m uː v s aɪ d t ə s']


IT WAS VERY FUN THOUGH I LOVED PLAYING IT I ALWAYS WOULD LAST TO THE LAST LIKE TWO OR THREE THAT'D BE SO GOOD UM


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


['ɪ t w ʌ z v ɛ ɹ i f ʌ n n oʊ ɐ l ʌ v l ɪ ŋ h ɛ ɾ oː w ɪ z b oʊ t l æ t ɪ l æ s t l aɪ k t uː ɚ t ɹ i ɛ v i']


WE DID A FEW TIMES WHERE IT WOULD BE LIKE UM LIKE A BIB YOU HAD TO PULL OUT SOMETIMES AS WELL LIKE YOU TOOK THE BIB IN


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


['d ɪ d ə f j uː t aɪ m z w ɚ ɹ ɪ ɾ ʊ d b iː l aɪ k ɐ l aɪ k ɐ b ɪ b j a t ə p ʊ l aʊ t s ʌ m t aɪ m z ə z w ɛ l']


YOUR SHORTS AND YOU HAD TO LIKE YANK IT OUT IN TIME FOR IT TO COUNT AS A CATCH AND I THINK THAT WAS WHEN WE WERE PLAYING TOUCH RUGBY SO WE COULD PRACTICE YOU KNOW TOUCH TACKLES


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


['j ɚ ʃ oː t s æ n d h j iː h æ z l aɪ k j æ ŋ k ɪ aʊ ɪ n t aɪ m f oʊ ɹ ɪ t t ə k aʊ n ɪ z ə k æ tʃ n aɪ ð æ w ʌ z h uː m uː']


AND THEN THERE WAS LIKE A PIG STYLE WHERE YOU LIKE CHASE THE PERSON WITH THE BALL AND JUST THROW IT AT THEM WHICH LIKE


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


['æ n ð ɛ n ð ɛ w ʌ z l aɪ k ɐ t ɪ ɡ s t aɪ l w eɪ j uː l aɪ k tʃ eɪ s ə p ɚ s ə n w ɪ ð ð ə b ɑː l æ n dʒ ʌ s t r oː']


I I SUPPOSE IF PEOPLE WERE JUST REALLY BAD AT THROWING BECAUSE THEY WERE REALLY QUITE YOUNG I MEAN WE WERE ALL REALLY YOUNG BUT IT'S STILL VERY WEIRD


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


['aɪ s aɪ s ə p oʊ z ð ə p iː p oʊ d ɪ s ɹ ɪ l i b aɪ ɾ æ ð ɹ ɪ ŋ k s ɪ ɹ ɪ l i k w aɪ t ɪ j ʌ ŋ ɚ ɹ ɪ l i j ʌ ŋ b ʌ t ɪ t']


YEAH BIG BIG HALL FOR THE SIZE I REMEMBER GOING BACK THERE WHEN I WAS A LOT OLDER AND I REMEMBER SEEING THE BASKETBALL HOOPS IN THE HALL AND I COULD LIKE DUNK ON THEM WHICH WAS SO WEIRD BECAUSE BASKETBALL HOOPS ARE SUPPOSED TO BE LIKE TEN FEET TALL AND AT THE


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


['j ɛ b ɪ ɡ b ɪ ɡ h ɔ l ɔ f ɚ ð ə s aɪ z aɪ m ɛ m b ɚ ɡ oː ɪ ŋ b æ k ð ɛ ɹ w ɛ n aɪ w ʌ z ɐ l ɑː t oː l d ɚ']


YEAH BIG BIG HALL FOR THE SIZE I REMEMBER GOING BACK THERE WHEN I WAS A LOT OLDER AND I REMEMBER SEEING THE BASKETBALL HOOPS IN THE HALL AND I COULD LIKE DUNK ON THEM WHICH WAS SO WEIRD BECAUSE BASKETBALL HOOPS ARE SUPPOSED TO BE LIKE TEN FEET TALL AND AT THE


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


['æ n d aɪ n ɛ v ɚ s iː n ð ə b æ s k w oː l l h y p s ɪ n ð ə h ɑː l æ n aɪ k ʊ d l eɪ k d ɔ ŋ k ɔ n ð ɛ m w ɪ tʃ ɪ z']


PRIMARY SCHOOL IT WASN'T EVEN SEVEN FEET SO IT WAS VERY FUN TO GO BACK THERE THEN MESS ABOUT THEREAFTER I DID SOME UM COACHING


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


['p aɪ m ɪ s k uː l ɪ t w ʌ z ə n t iː v ə n s ɛ v ə n f iː t s ɪ t w ʌ z v ɛ ɹ i f ɔ n t ɪ ɡ oː b æ k ð ɛ']


THE IDEA IS THERE'S THIS MYSTERY GANG THESE FIVE PEOPLE AND ONE TALKING DOG AND THEY'D GO AND SOLVE MYSTERIES AND THERE'D BE LIKE SOME PERSON DRESSED UP AS A MONSTER OR GHOST TERRORIZING SOME NEIGHBORHOOD OR SOMETHING AND THEY'D UM TRY AND FIND OUT WHO IT WAS THEY'D GET SOMEONE SOMEONE WOULD CALL THEM IN LIKE HELP WE NEED TO SOLVE THIS AND THEY'D BE LIKE OKAY WE'RE HERE TO HELP AND AT THE END THEY'D ALWAYS REVEAL THE PERSON THEY HAD TO CATCH THEM IN A BIG ELABORATE PLAN TAKE THEIR MASK OFF AND IT WOULD BE ONE OF THE PEOPLE THAT HAD CALLED THEM FOR HELP SOMETHING LIKE THAT IT WAS ALWAYS QUITE FUNNY AND IT WAS ALWAYS QUITE SILLY IT HAD A VERY SLAP STICK WAY OF RUNNING AND EATING AS WELL IT WAS A VERY ENJOYABLE TV SHOW


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


['eː z ð ɛ z z ð ɪ s k w ʌ ə ŋ k oː m ɪ s t ɚ ɹ i ɡ æ n d iː z f aɪ v p iː p ʊ æ n d w ʌ n']


THE IDEA IS THERE'S THIS MYSTERY GANG THESE FIVE PEOPLE AND ONE TALKING DOG AND THEY'D GO AND SOLVE MYSTERIES AND THERE'D BE LIKE SOME PERSON DRESSED UP AS A MONSTER OR GHOST TERRORIZING SOME NEIGHBORHOOD OR SOMETHING AND THEY'D UM TRY AND FIND OUT WHO IT WAS THEY'D GET SOMEONE SOMEONE WOULD CALL THEM IN LIKE HELP WE NEED TO SOLVE THIS AND THEY'D BE LIKE OKAY WE'RE HERE TO HELP AND AT THE END THEY'D ALWAYS REVEAL THE PERSON THEY HAD TO CATCH THEM IN A BIG ELABORATE PLAN TAKE THEIR MASK OFF AND IT WOULD BE ONE OF THE PEOPLE THAT HAD CALLED THEM FOR HELP SOMETHING LIKE THAT IT WAS ALWAYS QUITE FUNNY AND IT WAS ALWAYS QUITE SILLY IT HAD A VERY SLAP STICK WAY OF RUNNING AND EATING AS WELL IT WAS A VERY ENJOYABLE TV SHOW


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


['t ɔ k ɪ ŋ d ɔ ɡ æ n ð eɪ d ɡ oː ə n d s ɔ l v m ɪ s t ɚ ɹ i z æ n d ɛ d b iː l aɪ k s']


THE IDEA IS THERE'S THIS MYSTERY GANG THESE FIVE PEOPLE AND ONE TALKING DOG AND THEY'D GO AND SOLVE MYSTERIES AND THERE'D BE LIKE SOME PERSON DRESSED UP AS A MONSTER OR GHOST TERRORIZING SOME NEIGHBORHOOD OR SOMETHING AND THEY'D UM TRY AND FIND OUT WHO IT WAS THEY'D GET SOMEONE SOMEONE WOULD CALL THEM IN LIKE HELP WE NEED TO SOLVE THIS AND THEY'D BE LIKE OKAY WE'RE HERE TO HELP AND AT THE END THEY'D ALWAYS REVEAL THE PERSON THEY HAD TO CATCH THEM IN A BIG ELABORATE PLAN TAKE THEIR MASK OFF AND IT WOULD BE ONE OF THE PEOPLE THAT HAD CALLED THEM FOR HELP SOMETHING LIKE THAT IT WAS ALWAYS QUITE FUNNY AND IT WAS ALWAYS QUITE SILLY IT HAD A VERY SLAP STICK WAY OF RUNNING AND EATING AS WELL IT WAS A VERY ENJOYABLE TV SHOW


It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


['ʌ m p ɚ s ə n d ɹ ɛ s t ʌ p ʌ z ɐ m ɔ n s t ɚ ɚ ɡ oʊ s t t ɛ ɹ ɚ ɹ aɪ z ɪ ŋ s ʌ m n eɪ b ɚ h ʊ d ɚ s ʌ m θ ɪ ŋ']


THE IDEA IS THERE'S THIS MYSTERY GANG THESE FIVE PEOPLE AND ONE TALKING DOG AND THEY'D GO AND SOLVE MYSTERIES AND THERE'D BE LIKE SOME PERSON DRESSED UP AS A MONSTER OR GHOST TERRORIZING SOME NEIGHBORHOOD OR SOMETHING AND THEY'D UM TRY AND FIND OUT WHO IT WAS THEY'D GET SOMEONE SOMEONE WOULD CALL THEM IN LIKE HELP WE NEED TO SOLVE THIS AND THEY'D BE LIKE OKAY WE'RE HERE TO HELP AND AT THE END THEY'D ALWAYS REVEAL THE PERSON THEY HAD TO CATCH THEM IN A BIG ELABORATE PLAN TAKE THEIR MASK OFF AND IT WOULD BE ONE OF THE PEOPLE THAT HAD CALLED THEM FOR HELP SOMETHING LIKE THAT IT WAS ALWAYS QUITE FUNNY AND IT WAS ALWAYS QUITE SILLY IT HAD A VERY SLAP STICK WAY OF RUNNING AND EATING AS WELL IT WAS A VERY ENJOYABLE TV SHOW
['æ n d ð eɪ d æ t aɪ n f aɪ d ɑː t h uː ɪ t w ɔ z ɪ t ɡ ɛ t s ʌ m w ʌ n t n t ə k ɑː l ɪ m ɪ n l aɪ k h æ l p n iː d z ɪ']




In [18]:
# TEST ON CV

def load_cv(per_accent = None):
    files = {}
    accents = {'indian', 'singapore', 'scotland', 'us', 'canada', 'wales', 'england', 'philippines', 'african', 'newzealand', 'ireland', 'malaysia', 'hongkong', 'australia'}
    for line in open("/export/common/data/corpora/ASR/commonvoice/en/train.tsv"):
        if len(line.strip().split("\t")) < 8:
            continue
        audio, transcript, accent = line.strip().split("\t")[1], line.strip().split("\t")[2], line.strip().split("\t")[7]
        if accent not in accents:
            continue
        if accent not in files:
            files[accent] = []
        files[accent].append((audio, transcript))

    for accent in accents:
        if accent not in files:
            continue
        random.shuffle(files[accent])
        files[accent] = files[accent][:per_accent]
        
    # print(files)
    print("Loading audio files...")
    data = []
    resampler = torchaudio.transforms.Resample(48000, 16000)
    clips_folder = "/export/common/data/corpora/ASR/commonvoice/en/clips/"
    for accent in files:
        for (audio, transcript) in files[accent]:
            print(f"Loading {audio}...")
            signal, sr = torchaudio.load(os.path.join(clips_folder, audio))
            if sr != 16000:
                # Resample
                signal = resampler(signal)

            signal = signal.squeeze().numpy()
            data.append({"signal": signal, \
                             "lang": accent, \
                                "filename": os.path.join(clips_folder, audio),\
                                    "transcript": transcript})
            
            # signal = language_id.load_audio(os.path.join(clips_folder, audio))
            # K = signal.shape[0] // 16000
            # if signal.shape[0] < 10*16000:
            #     print(f"Signal too short: {signal.shape[0]}")
            #     continue
            
            # for i in range(0, len(signal), K*16000):
            #     if i+K*16000 > len(signal):
            #         break
            #     data.append({"signal": signal[i:i+K*16000], \
            #                  "lang": accent, \
            #                     "filename": os.path.join(clips_folder, audio),\
            #                         "transcript": transcript})
            

    data = {"signal": [f["signal"] for f in data], \
            "lang": [f["lang"] for f in data], \
                "filename": [f["filename"] for f in data],
                "transcript": [f["transcript"] for f in data]}
    return Dataset.from_dict(data)
        
dataset = load_cv(per_accent = 5)


Loading audio files...
Loading common_voice_en_18779909.mp3...
Loading common_voice_en_18645479.mp3...
Loading common_voice_en_18717362.mp3...
Loading common_voice_en_18754910.mp3...
Loading common_voice_en_18838960.mp3...
Loading common_voice_en_516474.mp3...
Loading common_voice_en_18853644.mp3...
Loading common_voice_en_127920.mp3...
Loading common_voice_en_18403277.mp3...
Loading common_voice_en_208920.mp3...
Loading common_voice_en_609959.mp3...
Loading common_voice_en_609955.mp3...
Loading common_voice_en_609949.mp3...
Loading common_voice_en_609943.mp3...
Loading common_voice_en_609946.mp3...
Loading common_voice_en_18852367.mp3...
Loading common_voice_en_18717941.mp3...
Loading common_voice_en_18720025.mp3...
Loading common_voice_en_18411453.mp3...
Loading common_voice_en_18488699.mp3...
Loading common_voice_en_18672396.mp3...
Loading common_voice_en_18834942.mp3...
Loading common_voice_en_18482654.mp3...
Loading common_voice_en_17719008.mp3...
Loading common_voice_en_18910227.

In [19]:
for data in dataset.shuffle().select(range(10)):
    # if data["lang"] != "us":
    #     continue

    print(f'Truth: {data["transcript"]}')
    print(data["filename"])
    # input_values = processor(ds[7]["audio"]["array"], return_tensors="pt").input_values
    input_values = processor(data["signal"], sampling_rate=16000, return_tensors="pt").input_values
    logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
    print(transcription)
    print("\n")
    # print(f"Truth: {ds[7]['text']}")



Truth: Many of the stalls have passed from one generation to the other.
/export/common/data/corpora/ASR/commonvoice/en/clips/common_voice_en_18717362.mp3


['m ɛ n i ʌ v ð ə s t ɑː l z h ɛ v p æ s t f ʌ m w ʌ n dʒ ɛ n ɚ ɹ eɪ ʃ ə n t uː ð i ʌ ð ɚ']


Truth: However, research shows that this crop has the potential to increase in yield.
/export/common/data/corpora/ASR/commonvoice/en/clips/common_voice_en_18713075.mp3
['h aʊ ɛ v ɚ ɹ ɪ s ɚ tʃ ʃ oʊ z ð ɐ t ð ɪ s k ɹ ɑː p h ɪ z ð ə p ə t ɛ n ʃ ə l t ʊ ɪ n k ɹ iː s ɪ n j iː l d']


Truth: An old man plays guitar in front of a red bucket filled with money.
/export/common/data/corpora/ASR/commonvoice/en/clips/common_voice_en_609946.mp3
['ɐ n oʊ l d d m æ n p l eɪ z ɡ ɪ t ɑː ɪ n f ɚ n t ʌ v ɐ ɹ ɛ d b æ k ə t f ɪ l d w ɪ ð m ʌ n i']


Truth: Even the women knew how to be silent.
/export/common/data/corpora/ASR/commonvoice/en/clips/common_voice_en_17719008.mp3
['iː v ə n ð ə w ʊ m ə n n uː h aʊ t ʊ b iː s aɪ l ə n t']


Truth: "He also appears in the second O'Keefe family novel, ""Dragons in the Waters""."
/export/common/data/corpora/ASR/commonvoice/en/clips/common_voice_en_18853644.mp3
['h iː ɐ p ɪ z

In [20]:
# signal, sr = torchaudio.load("/export/common/data/corpora/ASR/commonvoice/en/clips/common_voice_en_18645667.mp3")
signal, sr = torchaudio.load("/exp/nbafna/data/edacc/edacc_v1.0/data/EDACC-C22.wav")
# EDACC-C22.wav

In [21]:
sr

32000

In [24]:
import os, sys
sys.path.append("/home/hltcoe/nbafna/projects/mitigating-accent-bias-in-lid/utils/dataloading")
from vl107 import load_vl107


Loading audio files for en from /exp/jvillalba/corpora/voxlingua107


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [26]:
vl = load_vl107(per_lang=10, lang="en")

for data in vl.shuffle().select(range(10)):
    # if data["lang"] != "us":
    #     continue

    # print(f'Truth: {data["transcript"]}')
    print(data["audio_file"])
    # input_values = processor(ds[7]["audio"]["array"], return_tensors="pt").input_values
    input_values = processor(data["signal"], sampling_rate=16000, return_tensors="pt").input_values
    logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
    print(transcription)
    print("\n")
    # print(f"Truth: {ds[7]['text']}")



/exp/jvillalba/corpora/voxlingua107/en/5Ve0wXtmATI__U__S332---2167.620-2177.410.wav
['ð ə l æ n d h iː ɹ uː l z f ɑː t ə ð ə n oː θ b ʌ t j ɛ t ɪ n h ɚ h ɑː t t ʃ iː n oʊ z ð ə t ʃ']


/exp/jvillalba/corpora/voxlingua107/en/a7Z0SkS8nFs__U__S0---0250.130-0266.530.wav
['aʊ l s oʊ æ z ð ə ɡ ɛ ɹ ɔ k w ɨ ɛ s t ɐ æ n d ɡ ɛ ɹ ɔ m i n z w oː t ɹ æ v əl d ɪ n h ɪ z t aɪ m d ə n ɑː ts i']


/exp/jvillalba/corpora/voxlingua107/en/t4e2CHFMWDQ__U__S0---0394.010-0410.480.wav
['ɪ n m aɪ ɹ ɪ s p ɔ n s ə b ɪ l ɪ t i f oʊ m aɪ n uː j uː z ə k oʊ dʒ ɔ n s m ɪ t oː l ɑː n ɪ t ə d uː ɪ z j uː z m aɪ p ə s ɛ n tʃ ə t s']


/exp/jvillalba/corpora/voxlingua107/en/t4e2CHFMWDQ__U__S0---0394.010-0410.480.wav
['s aɪ n w ɪ tʃ ɪ z ə ɡ eɪ n j uː z f oː m aɪ w aɪ l m aɪ s æ tʃ a n d a k æ n t aɪ p ɪ n']


/exp/jvillalba/corpora/voxlingua107/en/OszbTEVdxjw__U__S11---0112.270-0124.620.wav
['ɐ ɹ ɪ k w ɛ s t k ʌ m ɪ n ɪ n f ɹ ʌ m ð ɪ ɪ n t ɚ n ɛ t ɐ w ɛ b p eɪ dʒ ɹ ɪ k w ɛ s t ə n ɛ f t iː p iː iː m eɪ l iː v ɚ h æ k ɚ z

In [27]:
vl = load_vl107(per_lang=10, lang="hi")

for data in vl.shuffle().select(range(10)):
    # if data["lang"] != "us":
    #     continue

    # print(f'Truth: {data["transcript"]}')
    print(data["audio_file"])
    # input_values = processor(ds[7]["audio"]["array"], return_tensors="pt").input_values
    input_values = processor(data["signal"], sampling_rate=16000, return_tensors="pt").input_values
    logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
    print(transcription)
    print("\n")
    # print(f"Truth: {ds[7]['text']}")



Loading audio files for hi from /exp/jvillalba/corpora/voxlingua107


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

/exp/jvillalba/corpora/voxlingua107/hi/zVdpq5lprLs__U__S1---0018.420-0027.420.wav
['t aɪ ʃ u r u a t m i i ɡ b eː z b ɔ l p l e r t e p i r u n h o n e d o h a z a r d e r a m i d a']


/exp/jvillalba/corpora/voxlingua107/hi/jw5LlzcI2hI__U__S0---2406.810-2424.690.wav
['d i a ɕ a z a s a l o r i v u t n a b i l e l o r d i a ɔ r a p n a dʒ a s t i f i k e ʃ ə n d e n e s a k a r e r e a ʃ i r aɪ m w a u l']


/exp/jvillalba/corpora/voxlingua107/hi/yax1Op0EtW4__U__S152---0361.840-0371.960.wav
['a r k a f i l o p u n ɡ i n a a m m a n k e s a l t e a l s k u n m a dʒ a r n d a s k a d e t e dʒ a p k i u n k u a']


/exp/jvillalba/corpora/voxlingua107/hi/K4bzT5z3DmE__U__S1---0123.010-0137.170.wav
['i n ɡ a r a p e l o m e i s r o k e k u l p a s p e l o r s a m i l t e i k e a l a v a n a s a j u ɾ o p i n i s p e s j a z e n s i o r b a l ɡ']


/exp/jvillalba/corpora/voxlingua107/hi/UogodRBLEWM__U__S24---0187.100-0205.680.wav
['t a l u k a s w a m i b e n a e t a a r e n d r e n i s u a l

In [28]:
vl = load_vl107(per_lang=10, lang="yo")

for data in vl.shuffle().select(range(10)):
    # if data["lang"] != "us":
    #     continue

    # print(f'Truth: {data["transcript"]}')
    print(data["audio_file"])
    # input_values = processor(ds[7]["audio"]["array"], return_tensors="pt").input_values
    input_values = processor(data["signal"], sampling_rate=16000, return_tensors="pt").input_values
    logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
    print(transcription)
    print("\n")
    # print(f"Truth: {ds[7]['text']}")



Loading audio files for yo from /exp/jvillalba/corpora/voxlingua107


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

/exp/jvillalba/corpora/voxlingua107/yo/O0stkM3BTDc__U__S294---2139.640-2152.290.wav
['a b a t u l i a l e ts i k a a m a b a v a m b e l i m e l i a l e p m i w a a s i l e k o i b e r i w a a b o j o n dʒ u']


/exp/jvillalba/corpora/voxlingua107/yo/s6tJ6OqIAcY__U__S104---1656.070-1667.890.wav
['n t b a a b ə d s ʊ l w a a t uː z e s a l o d a w a m ɛ t a l a n i d w a t r iː m']


/exp/jvillalba/corpora/voxlingua107/yo/equuzFg4XzI__U__S279---2126.830-2146.410.wav
['l a k s a k s iː m t ʊ ɹ ɪ t ʊ l ɪ f t h ɪ m s ɛ l f w ɔ ɔ p w ɪ ð ɐ p i l o a n d h iː d s ɛ d ð ə tʃ a w a d r o s']


/exp/jvillalba/corpora/voxlingua107/yo/sF_aTeSig5A__U__S91---0572.800-0586.570.wav
['m ɑ5 p i5 ou5 ɑ5 j i5 t u5 l ɑ5 onɡ5 m ɑ5 j u5 ɑ5']


/exp/jvillalba/corpora/voxlingua107/yo/equuzFg4XzI__U__S279---2126.830-2146.410.wav
['d ə l iː d ts ɛ z ð ə l iː d p o z i ʃ ə n b i k ɔ s j y aɪ ɛ n tʃ a dʒ j uː t ɛ l d e m a n t u t u s l aɪ f']


/exp/jvillalba/corpora/voxlingua107/yo/yERqavpZHnM__U__S184---1138.3

In [29]:
vl = load_vl107(per_lang=10, lang="ka")

for data in vl.shuffle().select(range(10)):
    # if data["lang"] != "us":
    #     continue

    # print(f'Truth: {data["transcript"]}')
    print(data["audio_file"])
    # input_values = processor(ds[7]["audio"]["array"], return_tensors="pt").input_values
    input_values = processor(data["signal"], sampling_rate=16000, return_tensors="pt").input_values
    logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
    print(transcription)
    print("\n")
    # print(f"Truth: {ds[7]['text']}")



Loading audio files for ka from /exp/jvillalba/corpora/voxlingua107


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

/exp/jvillalba/corpora/voxlingua107/ka/nrsc8Zihnug__U__S129---0653.870-0665.950.wav
['o s a p x u l ʃ iː a m iː r o m t e l i d i ɡ i ts u l e b e b iː ə n e m o k a l a k i t a m u l a ts i l o b i s t w a r s a s r e s i t']


/exp/jvillalba/corpora/voxlingua107/ka/GrVb_8Jb2W4__U__S15---0030.900-0043.780.wav
['a r s e v o s p o l o d w a r a d e b i d e s a r m u d e n e v i m i ʃ e s a x e s i n a m b y l e ʃ i r a s t a n ʃ eː z d e b o k o n d e s a k']


/exp/jvillalba/corpora/voxlingua107/ka/W4OKP3pubPg__U__S15---0122.450-0136.140.wav
['f t ə d a n i ɡ a m o ts t iː l e b ɐ m a l s r aʊ s ə m eː ɡ l e d a oː t k iː ʃ v iː l i t a r m ɔ ɡ iː']


/exp/jvillalba/corpora/voxlingua107/ka/JUMPYYVwyVk__U__S150---1909.450-1921.070.wav
['a ts ɪ s aɪ ɪ s t s t i n ŋ tʃ ɪ m ɪ d ɛ s s i a k a l ɔ ð ɔ n oː h ə m ɛ d a tʃ ɛ m i b ɪ ʁ uː ɹ ɚ ɐ b e d a tʃ e m']


/exp/jvillalba/corpora/voxlingua107/ka/KmuC3w2Mmb0__U__S11---0103.950-0113.560.wav
['p r o k u r a t u r a m ɡ a m u z i s p b k a m 

In [30]:
vl = load_vl107(per_lang=10, lang="hi")

input_values = processor(vl["signal"], sampling_rate=16000, return_tensors="pt").input_values

logits = model(input_values).logits
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)
print(transcription)
print("\n")
# print(f"Truth: {ds[7]['text']}")



Loading audio files for hi from /exp/jvillalba/corpora/voxlingua107


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

['k aː ʃ a b m i r e s a d b a n e r ɐ h i n e t i ʃ k u m a r p ʌ r i s a v a l h eː aɪ s a k j uː h u r aɪ k u k j', 'aː p n e t a s v i r d e k h iː h aː p a r n a v a d a k i a m u ŋ ɡ e r tʃ e l t e m u ŋ ɡ e r dʒ o p e l e s eː s a m p r d aɪ k i n s a k i aː ɡ m e t h a dʒ a l r aː t h a', 'v a h k a l s a m p r o d aː k m a h ɔ l ɡ a r m eː r i l v i k i z a m i n p a r h a n u m aː n dʒ i k i m u r t i h a t aɪ dʒ aː n i k e v v i r o', 'v o l t oː r t uː k a s e n i k o p o r k a f i ɡ e h r a s e h u a dʒ o p a r ð i s m eɪ i k n eɪ dʒ a ɡ e p ɔ h o tʃ k aɪ', 'r aɪ n ə dʒ uː m i s a r a r aɪ ʃ p a t ə h a l j a p ɛ oː r j uː ə p dʒ oː l ə r k a t a ð ʊ ɔ b ə l a dʒ a b a n dʒ a p aɪ n', 'e t a r b aː v i uː ɡ a ɡ e r v e a n d e k i k a m i o d e l e ɡ iː t o v ɛ k e d e v o r ɡ o b u n d e n i k l i d e v a k o d uː d e e v e ʃ e r p o o dʒ l e', 'k a h iː n t u r ɡ a t n a h o ɡ a j i t o h a m a r a dʒ iː v ə n v a d i t h o d aː ɡ a h a m a r a n a a m a r e n aː', 'm ɡ 

In [1]:
import os, sys
sys.path.append("/home/hltcoe/nbafna/projects/mitigating-accent-bias-in-lid/utils/dataloading")
from edacc import load_edacc


In [4]:
edacc = load_edacc()

Loaded 76 audio files
Recorded 122 speakers
Loaded 13213 segments
Sample: {'signal': array([-2.5101658e-04, -3.3711875e-04, -4.3454114e-04, ...,
       -8.6554268e-05, -2.1143309e-04, -1.7558667e-04], dtype=float32), 'lang': 'en', 'accent': 'scottish', 'audio_file': 'EDACC-C08'}
Accents: {'romanian', 'icelandic', 'pakistani', 'sinhalese', 'israeli', 'polish', 'catalan', 'lithuanian', 'colombian', 'dutch', 'macedonian', 'chilean', 'indian', 'montenegrin', 'scottish', 'mexican', 'indonesian', 'us', 'tagalog', 'russian', 'ecuadorian', 'jamaican', 'egyptian', 'brazilian', 'french', 'shona', 'irish', 'south african', 'ghanian', 'italian', 'vietnamese', 'spanish', 'nigerian', 'korean', 'chinese', 'american', 'filipino', 'japanese', 'bulgarian', 'uk', 'kenyan'}


In [10]:
edacc = edacc.shuffle()
for data in edacc.select(range(10)):
    # if data["lang"] != "us":
    #     continue

    # print(f'Truth: {data["transcript"]}')
    print("/exp/nbafna/data/edacc/edacc_v1.0/data/"+data["audio_file"]+".wav")
    print(data["accent"])
    # input_values = processor(ds[7]["audio"]["array"], return_tensors="pt").input_values
    input_values = processor(data["signal"], sampling_rate=16000, return_tensors="pt").input_values
    logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
    print(transcription)
    print("\n")
    # print(f"Truth: {ds[7]['text']}")



/exp/nbafna/data/edacc/edacc_v1.0/data/EDACC-C31_P1.wav
pakistani


['s ʌ m t aɪ m ð ə ɔ l d ə s t b ɹ ʌ ð ə ʌ w ɛ n h i ɪ z w ɚ k ɪ ŋ w ɪ ð ð ə æ v i t ɔ l j']


/exp/nbafna/data/edacc/edacc_v1.0/data/EDACC-C30.wav
romanian
['n ɔ ð eɪ ɑː s oʊ t uː p ɚ f ɪ k t uː']


/exp/nbafna/data/edacc/edacc_v1.0/data/EDACC-C29_P2.wav
nigerian
['a oː d ɪ z l aɪ ɪ n f ɹ ɔ n t ʌ v ð ɪ h aʊ s s oʊ æ z aɪ w ɔ z k ɔ m ɪ ŋ w iː d f oʊ s f iː d ɪ n oʊ d w ɔ s t']


/exp/nbafna/data/edacc/edacc_v1.0/data/EDACC-C37_P1.wav
spanish
['t uː l ɛ n ð ɪ s s n uː s a ŋ ɛ t l iː s t n ɑː t b aɪ h a r t b ʌ t d ə b ɛ ɾ ɚ']


/exp/nbafna/data/edacc/edacc_v1.0/data/EDACC-C43_P1.wav
south african
['k w ɪ tʃ ɪ z ð ə f aɪ n ɚ l j eɪ ɹ ʌ v h aɪ s k uː l h ɛ m s oʊ']


/exp/nbafna/data/edacc/edacc_v1.0/data/EDACC-C43_P1.wav
south african
['aɪ θ ɪ ŋ k ɪ t w ʌ z ɐ b ɑː l ɪ t w ʌ z ə b ɑː d ʌ ʌ w aɪ t f æ m l i']


/exp/nbafna/data/edacc/edacc_v1.0/data/EDACC-C61.wav
irish
['t ʌ v v ʌ v j ɚ d eɪ l i l aɪ f j uː n oː æ n d ɪ t ɐ ɡ ɛ n ɪ w ʌ z v ɛ ɹ i m ʌ tʃ j uː k eɪ k']


/exp/nbafna/data/edac

In [11]:
from pydub import AudioSegment
from pydub.playback import play

# Load the sound file
sound = AudioSegment.from_wav('path_to_your_file.wav')

# Play the sound
play(sound)

  m = re.match('([su]([0-9]{1,2})p?) \(([0-9]{1,2}) bit\)$', token)
  m2 = re.match('([su]([0-9]{1,2})p?)( \(default\))?$', token)
  elif re.match('(flt)p?( \(default\))?$', token):
  elif re.match('(dbl)p?( \(default\))?$', token):


FileNotFoundError: [Errno 2] No such file or directory: 'path_to_your_file.wav'