In [1]:
import deepspeech
from datasets import load_dataset
import numpy as np
import pandas as pd
import soundfile as sf
import time


In [4]:
PATH_TRANSCRIPTIONS = "transcriptions.csv"
pre_train_file = 'deepspeech-0.9.3-models.pbmm'
model = deepspeech.Model(pre_train_file)

scorer_file_path = 'deepspeech-0.9.3-models.scorer'
model.enableExternalScorer(scorer_file_path)

lm_alpha = 0.75
lm_beta = 1.85
model.setScorerAlphaBeta(lm_alpha, lm_beta)

beam_width = 500
model.setBeamWidth(beam_width)

0

In [5]:
# define function to read in sound file
def map_to_array(batch):
    speech, _ = sf.read(batch["file"])
    batch["speech"] = speech
    return batch
    
# load dummy dataset and read soundfiles
ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
ds = ds.map(map_to_array)

Reusing dataset librispeech_asr (/home/lucasagrizzi/.cache/huggingface/datasets/patrickvonplaten___librispeech_asr/clean/2.1.0/f2c70a4d03ab4410954901bde48c54b85ca1b7f9bf7d616e7e2a72b5ee6ddbfc)
Loading cached processed dataset at /home/lucasagrizzi/.cache/huggingface/datasets/patrickvonplaten___librispeech_asr/clean/2.1.0/f2c70a4d03ab4410954901bde48c54b85ca1b7f9bf7d616e7e2a72b5ee6ddbfc/cache-361ab9f8e626e5d4.arrow


In [17]:

# initialize constants

df_transcriptions = pd.DataFrame()

len_ds = len(ds["text"])
BATCH_SIZE = 1
beam_search = [10, 25, 50, 100, 200, 500]

for beam_width in beam_search:
    model.setBeamWidth(beam_width)
    transcriptions = []
    error = []
    inference_time = []
    texts = []
    
    for i in range(len_ds//BATCH_SIZE):

        # import audio
        texts.append(ds['text'][i])
        audio = ds["speech"][i]
        audio = np.array(audio)
        a = audio + np.abs(audio.min())
        a = (a * 1024).astype(np.int16)
        # count time of inference
        start = time.time()

        transcription = model.stt(a)

        # store time of inference
        inference_time.append(time.time() - start)

        # store transcription
        transcriptions.append(transcription)

    all_transcriptions = list(np.array(transcriptions).reshape(-1))
    
    df_transcriptions[pre_train_file+ "_Beam_" +str(beam_width)] = all_transcriptions
    df_transcriptions[pre_train_file+ "_Beam_" +str(beam_width)] = df_transcriptions[pre_train_file+ "_Beam_" +str(beam_width)].apply(lambda x: x.upper())

    df_transcriptions[pre_train_file+ "_Beam_" +str(beam_width) + "_inf_time"] = inference_time
    df_transcriptions['ground_truth'] = texts


In [18]:
# df_transcriptions = pd.read_csv(PATH_TRANSCRIPTIONS, index_col=0)

# df_transcriptions = pd.DataFrame()

# df_transcriptions[pre_train_file+] = all_transcriptions
# df_transcriptions[pre_train_file] = df_transcriptions[pre_train_file].apply(lambda x: x.upper())

# df_transcriptions[pre_train_file + "_inf_time"] = inference_time
# df_transcriptions['ground_truth'] = texts
df_transcriptions

Unnamed: 0,deepspeech-0.9.3-models.pbmm_Beam_10,deepspeech-0.9.3-models.pbmm_Beam_10_inf_time,ground_truth,deepspeech-0.9.3-models.pbmm_Beam_25,deepspeech-0.9.3-models.pbmm_Beam_25_inf_time,deepspeech-0.9.3-models.pbmm_Beam_50,deepspeech-0.9.3-models.pbmm_Beam_50_inf_time,deepspeech-0.9.3-models.pbmm_Beam_100,deepspeech-0.9.3-models.pbmm_Beam_100_inf_time,deepspeech-0.9.3-models.pbmm_Beam_200,deepspeech-0.9.3-models.pbmm_Beam_200_inf_time,deepspeech-0.9.3-models.pbmm_Beam_500,deepspeech-0.9.3-models.pbmm_Beam_500_inf_time
0,BECAUSE YOU ARE A SLEEPING IN SOME OF CONQUERI...,6.448100,BECAUSE YOU WERE SLEEPING INSTEAD OF CONQUERIN...,BECAUSE YOU ARE A SLEEPING IN SOME OF CONQUERI...,6.069856,BECAUSE YOU ARE A SLEEPING IN SOME OF CONQUERI...,6.122874,BECAUSE YOU ARE A SLEEPING IN SOME OF CONQUERI...,6.152277,BECAUSE YOU ARE A SLEEPING IN SOME OF CONQUERI...,6.761018,BECAUSE YOU ARE A SLEEPING IN SOME OF CONQUERI...,6.529678
1,HE HAS GONE AND GONE FOR GOOD ANSWERED POLYCHR...,6.320176,HE HAS GONE AND GONE FOR GOOD ANSWERED POLYCHR...,HE HAS GONE AND GONE FOR GOOD ANSWERED POLYCHR...,6.286423,HE HAS GONE AND GONE FOR GOOD ANSWERED POLYCHR...,6.339274,HE HAS GONE AND GONE FOR GOOD ANSWERED POLYCHR...,6.365232,HE HAS GONE AND GONE FOR GOOD ANSWERED POLYCHR...,6.418505,HE HAS GONE AND GONE FOR GOOD ANSWERED POLYCHR...,6.707272
2,I HAVE REMAINED A PRISONER ONLY BECAUSE I WISH...,6.492348,I HAVE REMAINED A PRISONER ONLY BECAUSE I WISH...,I HAVE REMAINED A PRISONER ONLY BECAUSE I WISH...,6.436562,I HAVE REMAINED A PRISONER ONLY BECAUSE I WISH...,6.475768,I HAVE REMAINED A PRISONER ONLY BECAUSE I WISH...,6.509874,I HAVE REMAINED A PRISONER ONLY BECAUSE I WISH...,6.625520,I HAVE REMAINED A PRISONER ONLY BECAUSE I WISH...,6.861959
3,THE LITTLE GIRL HAD BEEN ASLEEP BUT SHE HEARD ...,2.689316,THE LITTLE GIRL HAD BEEN ASLEEP BUT SHE HEARD ...,THE LITTLE GIRL HAD BEEN ASLEEP BUT SHE HEARD ...,2.691141,THE LITTLE GIRL HAD BEEN ASLEEP BUT SHE HEARD ...,2.668041,THE LITTLE GIRL HAD BEEN ASLEEP BUT SHE HEARD ...,2.725827,THE LITTLE GIRL HAD BEEN ASLEEP BUT SHE HEARD ...,2.768673,THE LITTLE GIRL HAD BEEN ASLEEP BUT SHE HEARD ...,2.878325
4,THE KING IS FORNICATING YOUR FRIENDS ARE ASKIN...,2.379258,THE KING HAS FLED IN DISGRACE AND YOUR FRIENDS...,THE KING IS FOND DISGRACE IN YOUR FRIENDS ARE ...,2.388801,THE KING IS FOND DISGRACE IN YOUR FRIENDS ARE ...,2.379519,THE KING IS BORNE DISGRACE IN YOUR FRIENDS ARE...,2.410946,THE KING IS BORNE DISGRACE IN YOUR FRIENDS ARE...,2.425442,THE KING IS BORNE DISGRACE IN YOUR FRIENDS ARE...,2.542583
...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,I ROLLICKED AT THE SUDDEN FURY OF THE ATTACK A...,3.338056,IROLG LOOKED AMAZED AT THE SUDDEN FURY OF THE ...,I ROLLICKED AT THE SUDDEN FURY OF THE ATTACK A...,3.200698,I ROCKETMAN AT THE SUDDEN FURY OF THE ATTACK A...,3.223782,A ROCKET MAZED AT THE SUDDEN FURY OF THE ATTAC...,3.403788,I ROCKED THE AMAZED AT THE SUDDEN FURY OF THE ...,3.303399,A ROD LOOKED THE AMAZED AT THE SUDDEN FURY OF ...,3.494637
69,HE ANTHITHESIS BURST OF ENERGY HE KNEW HOW CLO...,3.890311,HE THOUGHT IT WAS A LAST BURST OF ENERGY HE KN...,HE ANTITHESES BURST OF ENERGY HE KNEW HOW CLOS...,3.888568,HE OTHERWISE LAST BURST OF ENERGY HE KNEW HOW ...,3.897436,HE OTHERWISE LAST BURST OF ENERGY HE KNEW HOW ...,4.077797,THE OTHER WAS THE LAST BURST OF ENERGY HE KNEW...,4.106094,THE OTHER WAS THE LAST BURST OF ENERGY HE KNEW...,4.444025
70,BREUNING CLOSED PANIC ON HIS OPPONENT'S FACE W...,4.043184,BRION SAW SOMETHING CLOSE TO PANIC ON HIS OPPO...,BREUNING CLOSED PANIC ON HIS OPPONENT'S FACE W...,4.029787,BREUNING CLOSED PANIC ON HIS OPPONENT'S FACE W...,4.087415,REUNITING CLOSED PANIC ON HIS OPPONENT'S FACE ...,4.553316,BE UNASSUMING CLOSED PANIC ON HIS OPPONENT'S F...,4.153613,BE UNSOOTHING CLOSED PANIC ON HIS OPPONENT'S F...,4.477473
71,A WAVE OF THE PAROLED FROM IRELAND SENSATIONIS...,4.277683,A WAVE OF DESPAIR ROLLED OUT FROM IROLG BRION ...,A WAVE OF DESPAIR ROLLED OUT FROM IRELAND SEST...,4.054312,A WAVE OF DESPAIR ROLLED OUT FROM IRELAND SENS...,4.060201,A WAVE OF DESPAIR ROLLED OUT FROM IRON RINSED ...,4.698914,A WAVE OF DESPAIR ROLLED OUT FROM IRON RINSED ...,4.219502,A WAVE OF DESPAIR ROLLED OUT FROM IRO RINSED A...,4.621700


In [19]:
df_transcriptions.to_csv(PATH_TRANSCRIPTIONS)

In [7]:
# import numpy as np
# import wave
# filename = 'example.wav'
# w = wave.open(filename, 'r')
# rate = w.getframerate()
# frames = w.getnframes()
# buffer = w.readframes(frames)
# print(rate)
# print(model.sampleRate())
# type(buffer)
# data16 = np.frombuffer(buffer, dtype=np.int16)
# type(data16)
# print(data16.shape)
# text = model.stt(data16)
# print(text)

In [4]:
context = model.createStream()

In [5]:
buffer_len = len(buffer)
offset = 0
batch_size = 16384
text = ''
while offset < buffer_len:
    end_offset = offset + batch_size
    chunk = buffer[offset:end_offset]
    data16 = np.frombuffer(chunk, dtype=np.int16)
    model.feedAudioContent(context, data16)
    text = model.intermediateDecode(context)
    print(text)
    offset = end_offset

NameError: name 'buffer' is not defined

In [6]:
text = model.finishStream(context)
print(text)

AttributeError: 'Model' object has no attribute 'finishStream'

In [7]:
import pyaudio
text_so_far = ''
def process_audio(in_data, frame_count, time_info, status):
    global text_so_far
    data16 = np.frombuffer(in_data, dtype=np.int16)
    model.feedAudioContent(context, data16)
    text = model.intermediateDecode(context)
    if text != text_so_far:
        print('Interim text = {}'.format(text))
        text_so_far = text
    return (in_data, pyaudio.paContinue)

In [8]:
audio = pyaudio.PyAudio()
stream = audio.open(
    format=pyaudio.paInt16,
    channels=1,
    rate=16000,
    input=True,
    frames_per_buffer=1024,
    stream_callback=process_audio
)
print('Please start speaking, when done press Ctrl-C ...')
stream.start_stream()

Please start speaking, when done press Ctrl-C ...


In [None]:
try: 
    while stream.is_active():
        time.sleep(0.1)
except KeyboardInterrupt:
    # PyAudio
    stream.stop_stream()
    stream.close()
    audio.terminate()
    print('Finished recording.')
    # DeepSpeech
    text = model.finishStream(context)
    print('Final text = {}'.format(text))