In [81]:
import fnmatch
import json
import os
from os import environ, path, walk

from pocketsphinx import *
from sphinxbase import *
from jiwer import wer

# Relative location of the pocketsphinx data folder (not referenced by the code visible in this notebook).
relative_path = "../ps_data"
# Base prefix for every data path below: the current working directory with its last
# three characters removed.
# NOTE(review): presumably the notebook is launched from a three-letter sub-folder so the
# result ends with a path separator — confirm; started from anywhere else this prefix is wrong.
pwd = os.getcwd()
pwd = pwd[:len(pwd)-3]

In [83]:
def find_files(filename, search_path):
    """Recursively collect files under ``search_path`` whose name contains ``filename``.

    Args:
        filename: Substring to look for in each file name (for example ``".raw"``).
        search_path: Root directory of the walk.

    Returns:
        A list of matching paths, each formed by joining the containing directory
        with the file name, in the order ``os.walk`` yields them. The list is empty
        when nothing matches.
    """
    # Walk top-down from the root and keep every file whose name contains the pattern.
    return [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(search_path)
        for name in names
        if filename in name
    ]


def create_decoder_digits(dic, grammar, rule):
    """Build a decoder that searches with one rule of a JSGF grammar.

    Args:
        dic: File name of the dictionary, looked up under ``ps_data/lex/``.
        grammar: File name of the JSGF grammar, looked up under ``ps_data/jsgf/``.
            The same name is used for the compiled file written to ``../outputs/``
            and for the search registered in the decoder.
        rule: Name of the rule to use, without the ``digits.`` prefix.

    Returns:
        The decoder, with the compiled grammar rule set as its active search.
    """
    cfg = Decoder.default_config()
    cfg.set_string('-hmm', pwd + 'ps_data/model/en-us')  # acoustic model
    cfg.set_string('-dict', pwd + 'ps_data/lex/' + dic)  # lexicon / dictionary
    digit_decoder = Decoder(cfg)

    # Load the grammar file, pick the requested rule and compile it.
    # NOTE(review): 7.5 is the third build_fsg argument, presumably the language weight — confirm.
    grammar_file = Jsgf(pwd + 'ps_data/jsgf/' + grammar)
    selected_rule = grammar_file.get_rule('digits.' + rule)
    compiled = grammar_file.build_fsg(selected_rule, digit_decoder.get_logmath(), 7.5)

    # The compiled rule is written to disk; its path also serves as the search name.
    search_name = '../outputs/' + grammar
    compiled.writefile(search_name)

    # Register the compiled rule in the decoder and make it the active search.
    digit_decoder.set_fsg(search_name, compiled)
    digit_decoder.set_search(search_name)

    return digit_decoder

In [98]:
def rundecoder(file_path, decoder):
    """Decode one raw audio file and score the result against its reference transcript.

    The reference is read from the path obtained by replacing the last three
    characters of ``file_path`` with ``ref``.

    Args:
        file_path: Path of the raw audio file to decode.
        decoder: Decoder on which the utterance is run.

    Returns:
        A dict with the keys ``fileName`` (last ``/``-separated component of
        ``file_path``), ``prediction`` (hypothesis string, ``''`` when the decoder
        has no hypothesis), ``actual`` (first line of the reference file without
        newlines), ``confidence`` (exponential of the hypothesis ``prob`` through the
        decoder's log-math object, ``0.0`` when there is no hypothesis) and ``wer``.

    Raises:
        ValueError: If the audio file is empty.
    """
    # Read the audio before touching the decoder so that a missing or empty file
    # never leaves the (reused) decoder in the middle of an utterance, and make
    # sure the file handle is closed.
    with open(file_path, 'rb') as stream:
        uttbuf = stream.read()

    if not uttbuf:
        # Raise instead of calling exit(): terminating the interpreter from a
        # helper would abort the whole notebook session.
        raise ValueError("Error reading speech data")

    # Run the whole buffer through the decoder as a single utterance.
    # NOTE(review): the flags are presumably no_search=False, full_utt=True — confirm.
    decoder.start_utt()
    decoder.process_raw(uttbuf, False, True)
    decoder.end_utt()

    # Query the hypothesis once; it is None when nothing was recognised, in which
    # case there is no probability to convert either.
    hyp = decoder.hyp()
    if hyp is None:
        best_hypothesis = ''
        confidence = 0.0
    else:
        best_hypothesis = hyp.hypstr
        confidence = decoder.get_logmath().exp(hyp.prob)

    # Read the reference transcript used for the WER computation.
    with open(file_path[:len(file_path) - 3] + "ref") as f:
        actualY = f.readlines()

    wordErrorRate = wer(actualY, best_hypothesis)

    return {
        "fileName": file_path.split("/")[-1],
        "prediction": best_hypothesis,
        "actual": actualY[0].replace("\n", ""),
        "confidence": confidence,
        "wer": wordErrorRate,
    }

In [86]:
# Path of the single example recording to decode
file_path = pwd +'ps_data/example/ex_digits.raw'

# Instantiate the decoder from the digits dictionary and the "rule3" rule of digits.jsgf
# Alternative kept for reference (create_decoder_ngram is not defined in the visible cells):
# decoder = create_decoder_ngram()  # use the N-gram language model
decoder = create_decoder_digits("digits.dict", "digits.jsgf", "rule3")
# Last expression of the cell: the returned result dict is shown as the cell output
rundecoder(file_path, decoder)

INFO: pocketsphinx.c(151): Parsed model-specific feature parameters from /home/shahzaib/Documents/IDMC/Speech Processing/lab_material/ps_data/model/en-us/feat.params
Current configuration:
[NAME]			[DEFLT]		[VALUE]
-agc			none		none
-agcthresh		2.0		2.000000e+00
-allphone				
-allphone_ci		yes		yes
-alpha			0.97		9.700000e-01
-ascale			20.0		2.000000e+01
-aw			1		1
-backtrace		no		no
-beam			1e-48		1.000000e-48
-bestpath		yes		yes
-bestpathlw		9.5		9.500000e+00
-ceplen			13		13
-cmn			live		batch
-cmninit		40,3,-1		41.00,-5.29,-0.12,5.09,2.48,-4.07,-1.37,-1.78,-5.08,-2.05,-6.45,-1.42,1.17
-compallsen		no		no
-dict					/home/shahzaib/Documents/IDMC/Speech Processing/lab_material/ps_data/lex/digits.dict
-dictcase		no		no
-dither			no		no
-doublebw		no		no
-ds			1		1
-fdict					
-feat			1s_c_d_dd	1s_c_d_dd
-featparams				
-fillprob		1e-8		1.000000e-08
-frate			100		100
-fsg					
-fsgusealtpron		yes		yes
-fsgusefiller		yes		yes
-fwdflat		yes		yes
-fwdflatbeam		1e-64		1.000000e-64
-fwdflate

{'fileName': 'ex_digits.raw',
 'prediction': 'oh one eight',
 'actual': 'oh one eight',
 'confidance': 1.0,
 'wer': 0.0}

In [100]:
# Folder holding the recordings to evaluate
folder_path = pwd + "td_corpus_digits/SNR35dB/man/seq3digits_100_files/"
wavFiles = find_files(".raw", folder_path)

# A single decoder is reused for every recording
decoder = create_decoder_digits("digits.dict", "digits.jsgf", "rule3")
results = []
for recording in wavFiles:
    outcome = rundecoder(recording, decoder)
    results.append(outcome)

# Persist the per-file results
with open('results.json', 'w') as out_file:
    json.dump(results, out_file)

# Sum of the per-file WER values (averaged in a later cell)
overallWER = sum(entry["wer"] for entry in results)

# TODO: repeat with different settings and compare the WER of each:
#   rules = [rulen, rule1 | rule3 | rule5]


INFO: pocketsphinx.c(151): Parsed model-specific feature parameters from /home/shahzaib/Documents/IDMC/Speech Processing/lab_material/ps_data/model/en-us/feat.params
Current configuration:
[NAME]			[DEFLT]		[VALUE]
-agc			none		none
-agcthresh		2.0		2.000000e+00
-allphone				
-allphone_ci		yes		yes
-alpha			0.97		9.700000e-01
-ascale			20.0		2.000000e+01
-aw			1		1
-backtrace		no		no
-beam			1e-48		1.000000e-48
-bestpath		yes		yes
-bestpathlw		9.5		9.500000e+00
-ceplen			13		13
-cmn			live		batch
-cmninit		40,3,-1		41.00,-5.29,-0.12,5.09,2.48,-4.07,-1.37,-1.78,-5.08,-2.05,-6.45,-1.42,1.17
-compallsen		no		no
-dict					/home/shahzaib/Documents/IDMC/Speech Processing/lab_material/ps_data/lex/digits.dict
-dictcase		no		no
-dither			no		no
-doublebw		no		no
-ds			1		1
-fdict					
-feat			1s_c_d_dd	1s_c_d_dd
-featparams				
-fillprob		1e-8		1.000000e-08
-frate			100		100
-fsg					
-fsgusealtpron		yes		yes
-fsgusefiller		yes		yes
-fwdflat		yes		yes
-fwdflatbeam		1e-64		1.000000e-64
-fwdflate

In [104]:
# Mean WER over all decoded files
print(overallWER / len(results))

# Show every result whose confidence is below 1
for entry in results:
    if entry["confidence"] < 1:
        print(entry)

0.019999999999999997
