In [1]:
import deepspeech
from datasets import load_dataset
import numpy as np
import pandas as pd
import soundfile as sf
import time


In [4]:
PATH_TRANSCRIPTIONS = "transcriptions.csv"
pre_train_file = 'deepspeech-0.9.3-models.pbmm'
model = deepspeech.Model(pre_train_file)

scorer_file_path = 'deepspeech-0.9.3-models.scorer'
model.enableExternalScorer(scorer_file_path)

lm_alpha = 0.75
lm_beta = 1.85
model.setScorerAlphaBeta(lm_alpha, lm_beta)

beam_width = 500
model.setBeamWidth(beam_width)

0

In [5]:
# define function to read in sound file
def map_to_array(batch):
    speech, _ = sf.read(batch["file"])
    batch["speech"] = speech
    return batch
    
# load dummy dataset and read soundfiles
ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
ds = ds.map(map_to_array)

Reusing dataset librispeech_asr (/home/lucasagrizzi/.cache/huggingface/datasets/patrickvonplaten___librispeech_asr/clean/2.1.0/f2c70a4d03ab4410954901bde48c54b85ca1b7f9bf7d616e7e2a72b5ee6ddbfc)
Loading cached processed dataset at /home/lucasagrizzi/.cache/huggingface/datasets/patrickvonplaten___librispeech_asr/clean/2.1.0/f2c70a4d03ab4410954901bde48c54b85ca1b7f9bf7d616e7e2a72b5ee6ddbfc/cache-361ab9f8e626e5d4.arrow


In [14]:

# initialize constants

df_transcriptions = pd.DataFrame()

len_ds = len(ds["text"])
BATCH_SIZE = 1

for beam_width in [100, 250, 500]:
    model.setBeamWidth(beam_width)
    transcriptions = []
    error = []
    inference_time = []
    texts = []
    
    for i in range(len_ds//BATCH_SIZE):

        # import audio
        texts.append(ds['text'][i])
        audio = ds["speech"][i]
        audio = np.array(audio)
        a = audio + np.abs(audio.min())
        a = (a * 1024).astype(np.int16)
        # count time of inference
        start = time.time()

        transcription = model.stt(a)

        # store time of inference
        inference_time.append(time.time() - start)

        # store transcription
        transcriptions.append(transcription)

    all_transcriptions = list(np.array(transcriptions).reshape(-1))
    
    df_transcriptions[pre_train_file+ "_Beam_" +str(beam_width)] = all_transcriptions
    df_transcriptions[pre_train_file+ "_Beam_" +str(beam_width)] = df_transcriptions[pre_train_file+ "_Beam_" +str(beam_width)].apply(lambda x: x.upper())

    df_transcriptions[pre_train_file+ "_Beam_" +str(beam_width) + "_inf_time"] = inference_time
    df_transcriptions['ground_truth'] = texts


In [15]:
# df_transcriptions = pd.read_csv(PATH_TRANSCRIPTIONS, index_col=0)

# df_transcriptions = pd.DataFrame()

# df_transcriptions[pre_train_file+] = all_transcriptions
# df_transcriptions[pre_train_file] = df_transcriptions[pre_train_file].apply(lambda x: x.upper())

# df_transcriptions[pre_train_file + "_inf_time"] = inference_time
# df_transcriptions['ground_truth'] = texts
df_transcriptions

Unnamed: 0,deepspeech-0.9.3-models.pbmm_Beam_100,deepspeech-0.9.3-models.pbmm_Beam_100_inf_time,ground_truth,deepspeech-0.9.3-models.pbmm_Beam_250,deepspeech-0.9.3-models.pbmm_Beam_250_inf_time,deepspeech-0.9.3-models.pbmm_Beam_500,deepspeech-0.9.3-models.pbmm_Beam_500_inf_time
0,BECAUSE YOU ARE A SLEEPING IN SOME OF CONQUERI...,6.237833,BECAUSE YOU WERE SLEEPING INSTEAD OF CONQUERIN...,BECAUSE YOU ARE A SLEEPING IN SOME OF CONQUERI...,6.233738,BECAUSE YOU ARE A SLEEPING IN SOME OF CONQUERI...,6.453737
1,HE HAS GONE AND GONE FOR GOOD ANSWERED POLYCHR...,6.489106,HE HAS GONE AND GONE FOR GOOD ANSWERED POLYCHR...,HE HAS GONE AND GONE FOR GOOD ANSWERED POLYCHR...,6.472815,HE HAS GONE AND GONE FOR GOOD ANSWERED POLYCHR...,8.568673
2,I HAVE REMAINED A PRISONER ONLY BECAUSE I WISH...,6.531460,I HAVE REMAINED A PRISONER ONLY BECAUSE I WISH...,I HAVE REMAINED A PRISONER ONLY BECAUSE I WISH...,6.656788,I HAVE REMAINED A PRISONER ONLY BECAUSE I WISH...,8.940142
3,THE LITTLE GIRL HAD BEEN ASLEEP BUT SHE HEARD ...,2.736703,THE LITTLE GIRL HAD BEEN ASLEEP BUT SHE HEARD ...,THE LITTLE GIRL HAD BEEN ASLEEP BUT SHE HEARD ...,2.764637,THE LITTLE GIRL HAD BEEN ASLEEP BUT SHE HEARD ...,4.097715
4,THE KING IS BORNE DISGRACE IN YOUR FRIENDS ARE...,2.399246,THE KING HAS FLED IN DISGRACE AND YOUR FRIENDS...,THE KING IS BORNE DISGRACE IN YOUR FRIENDS ARE...,2.430780,THE KING IS BORNE DISGRACE IN YOUR FRIENDS ARE...,2.985064
...,...,...,...,...,...,...,...
68,A ROCKET MAZED AT THE SUDDEN FURY OF THE ATTAC...,3.839273,IROLG LOOKED AMAZED AT THE SUDDEN FURY OF THE ...,I ROCKED THE AMAZED AT THE SUDDEN FURY OF THE ...,3.520508,A ROD LOOKED THE AMAZED AT THE SUDDEN FURY OF ...,3.438776
69,HE OTHERWISE LAST BURST OF ENERGY HE KNEW HOW ...,4.199781,HE THOUGHT IT WAS A LAST BURST OF ENERGY HE KN...,THE OTHER WAS THE LAST BURST OF ENERGY HE KNEW...,5.115086,THE OTHER WAS THE LAST BURST OF ENERGY HE KNEW...,4.559263
70,REUNITING CLOSED PANIC ON HIS OPPONENT'S FACE ...,4.557306,BRION SAW SOMETHING CLOSE TO PANIC ON HIS OPPO...,BE UNSOOTHING CLOSED PANIC ON HIS OPPONENT'S F...,4.428173,BE UNSOOTHING CLOSED PANIC ON HIS OPPONENT'S F...,4.496585
71,A WAVE OF DESPAIR ROLLED OUT FROM IRON RINSED ...,4.777880,A WAVE OF DESPAIR ROLLED OUT FROM IROLG BRION ...,A WAVE OF DESPAIR ROLLED OUT FROM IRON RINSED ...,4.740838,A WAVE OF DESPAIR ROLLED OUT FROM IRO RINSED A...,4.690693


In [16]:
df_transcriptions.to_csv(PATH_TRANSCRIPTIONS)

In [7]:
# import numpy as np
# import wave
# filename = 'example.wav'
# w = wave.open(filename, 'r')
# rate = w.getframerate()
# frames = w.getnframes()
# buffer = w.readframes(frames)
# print(rate)
# print(model.sampleRate())
# type(buffer)
# data16 = np.frombuffer(buffer, dtype=np.int16)
# type(data16)
# print(data16.shape)
# text = model.stt(data16)
# print(text)

In [4]:
context = model.createStream()

In [5]:
buffer_len = len(buffer)
offset = 0
batch_size = 16384
text = ''
while offset < buffer_len:
    end_offset = offset + batch_size
    chunk = buffer[offset:end_offset]
    data16 = np.frombuffer(chunk, dtype=np.int16)
    model.feedAudioContent(context, data16)
    text = model.intermediateDecode(context)
    print(text)
    offset = end_offset

NameError: name 'buffer' is not defined

In [6]:
text = model.finishStream(context)
print(text)

AttributeError: 'Model' object has no attribute 'finishStream'

In [7]:
import pyaudio
text_so_far = ''
def process_audio(in_data, frame_count, time_info, status):
    global text_so_far
    data16 = np.frombuffer(in_data, dtype=np.int16)
    model.feedAudioContent(context, data16)
    text = model.intermediateDecode(context)
    if text != text_so_far:
        print('Interim text = {}'.format(text))
        text_so_far = text
    return (in_data, pyaudio.paContinue)

In [8]:
audio = pyaudio.PyAudio()
stream = audio.open(
    format=pyaudio.paInt16,
    channels=1,
    rate=16000,
    input=True,
    frames_per_buffer=1024,
    stream_callback=process_audio
)
print('Please start speaking, when done press Ctrl-C ...')
stream.start_stream()

Please start speaking, when done press Ctrl-C ...


In [None]:
try: 
    while stream.is_active():
        time.sleep(0.1)
except KeyboardInterrupt:
    # PyAudio
    stream.stop_stream()
    stream.close()
    audio.terminate()
    print('Finished recording.')
    # DeepSpeech
    text = model.finishStream(context)
    print('Final text = {}'.format(text))