In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

### Indexer

In [2]:
def read_morph_dict(filepath):
    with open (filepath, "r") as myfile:
        data=myfile.readlines()
    morph_dict = {}
    for line in data:
        line_split= line.split()
        morph_dict[line_split[0].lower()]=line_split[1:]
    return morph_dict

In [3]:
def create_index(filepath, average=False, morph=False):
    if (morph==True):
        index_morph = read_morph_dict("../lib/dicts/morph.dct")
        
    with open (filepath, "r") as myfile:
        data=myfile.readlines()

    #create file dictionary to contain data entries for each audio file
    file_dict = {}
    for line in data:
        filename, channel, startTime, duration, token, score_str=line.split()
        score = float(score_str)
        token = token.lower()
        
        #if morph=true, replace by morphological decomposition
        token_list=[]
        if (morph==True and token in index_morph):
            morph_token_list = index_morph[token]
            word_dur = float(duration)/len(morph_token_list)
            #if averaging, score remains same.
            #if multiplying, decomposition score = score^1/n, where n = number of morphemes.
            if (not average):
                score = score ** (1.0/len(morph_token_list))
            for n, morph_token in enumerate(morph_token_list):
                token_list.append([morph_token, int(channel), float(startTime) + (n * word_dur), word_dur, float(score)])
        else:
            token_list = [[token, int(channel), float(startTime), float(duration), float(score)]]
            
            
        if filename not in file_dict:
            file_dict[filename] = []
        file_dict[filename] = file_dict[filename] + token_list
        #print file_dict[filename]

    #index stores words and phrases as keys
    indexed_dict = {}
    for audio_file in file_dict:
        t1=0
        prev_word_end = 0
        phrase_list = []
        for word in file_dict[audio_file]:

            #new word start time
            new_word_start = word[2]

            #if new word start time < previous word end time + 0.5, add current word to all phrases, else clear list
            if (new_word_start < prev_word_end + 0.5):
                for phrase in phrase_list:
                    phrase[0] = "{} {}".format(phrase[0],word[0])
                    
                    if (average):
                        n_words = len(phrase[0].split())
                        phrase[4] = (word[4] + phrase[4]*(n_words-1))/n_words#weighted average of scores
                    else:
                        phrase[4] = phrase[4] * word[4] #multiply probabilities
                        
                    #phrase[4] = max(phrase[4],word[4])#maximum score
                    #phrase[4] = min(1.0,phrase[4]+word[4])#sum and clip
                    
                    phrase[3] = round(word[2] - phrase[2] + word[3],5)#phrase duration = word start time - phrase start time + word duration
                    #print (phrase[3], word[2], phrase[2], word[3])
            else:
                phrase_list = []

            #add current word to list
            phrase_list.append(word)

            #add all phrases to indexed_dict
            for phrase in phrase_list:
                if phrase[0] not in indexed_dict:
                    indexed_dict[phrase[0]] = [[audio_file] + phrase[1:5]]
                else:
                    indexed_dict[phrase[0]].append([audio_file] + phrase[1:5])

            #store current word end time for next comparison
            prev_word_end = word[2] + word[3]
            
            #check, words are sorted by timeStart
            t2 = word[2]
            #if t2<t1:
            #    print prev_audio_file==audio_file
            #    print t1_s, word
            #    print "WARNING: WORDS NOT IN CHRONOLOGICAL ORDER"
            t1 = word[2]
            t1_s = word
            prev_audio_file = audio_file
    return indexed_dict

### XML Parser for Query XML

In [4]:
class query:
    def __init__(self, kwid='', text='', orig_text=''):
        self.kwid=kwid
        self.text=text
        self.orig_text=orig_text
        self.oov=0
        self.search_time=0.0
        self.res_list=[]
        self.system_count = 0
        self.normalized=False
        
        self.T=36000
        self.beta=999.9
    
    def print_variables(self):
        return (self.kwid, self.orig_text, self.oov, self.search_time, self.res_list)
        
    def print_output(self):
        output_xml = []
        output_xml.append("<detected_kwlist kwid=\"{}\" oov_count=\"{}\" search_time=\"{:.10f}\">".format(self.kwid, self.oov, self.search_time))
        for res in self.res_list:
            output_xml.append("<kw file=\"{}\" channel=\"{}\" tbeg=\"{:0.2f}\" dur=\"{:0.3f}\" score=\"{:0.6f}\" decision=\"YES\"/>".format(res[0], res[1], res[2], res[3], res[4]))
        output_xml.append("</detected_kwlist>") 
        return output_xml
    
    def normalize(self, gamma):#ex4
        if(self.normalized):
            print("Already Normalized")
        self.gamma = gamma
        sum_denom=0
        for res in self.res_list:
            sum_denom+=res[4]**self.gamma
        for res in self.res_list:
            res[4] = (res[4]**gamma)/sum_denom
        self.normalized=True
        
    def kst(self):
        N_true = len(self.res_list)
        threshold = self.beta * N_true / (self.T + (self.beta-1) * N_true)
        new_list = []
        for res in self.res_list:
            #print (res[4],self.threshold, res[4]>self.threshold)
            if (res[4]>threshold):
                new_list.append(res)
        self.res_list = new_list
    
    def kst2(self):
        total_score = 0.0
        alpha=1.5
        for res in self.res_list:
            total_score += res[4]
        threshold = self.beta * total_score * alpha / (self.T + (self.beta-1) * total_score * alpha)
        new_list = []
        for res in self.res_list:
            #print (res[4],self.threshold, res[4]>self.threshold)
            if (res[4]>threshold):
                new_list.append(res)
        #print total_score, threshold
        self.res_list = new_list

In [5]:
import xml.etree.ElementTree as ET
def get_queries(filepath, morph=False):
    tree = ET.parse(filepath)
    root = tree.getroot()
    search_list = []
    if (morph==True):
        query_morph = read_morph_dict("../lib/dicts/morph.kwslist.dct")
        for child in root:
            ctext_items = child[0].text.split()
            ctext_morph = []
            for ctext in ctext_items:
                if ctext in query_morph:
                    ctext_morph.append(" ".join(query_morph[ctext]))
                else:
                    ctext_morph.append(ctext)
            search_list.append(query(child.attrib['kwid'], " ".join(ctext_morph), child[0].text))
    else:
        for child in root:
            search_list.append(query(child.attrib['kwid'], child[0].text, child[0].text))
    return search_list

In [6]:
def kst_list(search_list):
    new_list = []
    for q in search_list:
        #print q.print_variables()
        q.kst()
        new_list.append(q)
    return new_list

def kst2_list(search_list):
    new_list = []
    for q in search_list:
        #print q.print_variables()
        q.kst2()
        new_list.append(q)
    return new_list

In [7]:
def normalize_list(search_list, gamma=1):
    new_list = []
    for q in search_list:
        #print q.print_variables()
        new_list.append(q.normalize(gamma))
    return new_list

### Searcher

In [8]:
from timeit import default_timer as timer  

def search_index(search_list, indexed_dict, gamma=0): 
    oov_count=0
    for q in search_list:
        start = timer()
        if q.text in indexed_dict:
            q.res_list = indexed_dict[q.text]
        else:
            oov_count+=1
        end = timer()
        q.oov = 1
        if (gamma>0):
            q.normalize(gamma)
        q.search_time = end - start
    print "oov_count={}".format(oov_count)
    return search_list

def gen_output(search_results, output_file):
    kw_top = "<kwslist kwlist_filename=\"IARPA-babel202b-v1.0d_conv-dev.kwlist.xml\" language=\"swahili\" system_id=\"\">"
    kw_end = "</kwslist>"
    output_xml = []
    output_xml.append(kw_top)
    for search in search_results:
        output_xml = output_xml + search.print_output()
    output_xml.append(kw_end)
    with open(output_file, "w") as text_file:
        text_file.write("\n".join(output_xml))
    return output_xml

#### Easier Scoring Interface Function

In [9]:
import subprocess

def get_scores(res_name, print_latex=False):
    
    subprocess.call(["rm", "-r", "../scoring/{}/".format(res_name)])
    subprocess.call(["../scripts/score.sh", "../output/{}.xml".format(res_name), "../scoring"])
    results = {}
    
    for res_type in ["all", "iv", "oov"]:
        p = subprocess.Popen(["../scripts/termselect.sh", "../lib/terms/ivoov.map", "../output/{}.xml".format(res_name), "../scoring", res_type], stdout=subprocess.PIPE)
        output_all, err = p.communicate()
        results[res_type] = (float(output_all.split(" ")[1].split("=")[1]), float(output_all.split(" ")[2].split("=")[1]), int(output_all.split(" ")[3].split("=")[1].replace("\n","")))
        if (not print_latex):
            print ("{} ({}) - TWV:{:1.4f}".format(res_type, results[res_type][2], results[res_type][0]))
    for res_type in ["short", "long"]:
        p = subprocess.Popen(["../scripts/termselect.sh", "../lib/terms/longshort.map", "../output/{}.xml".format(res_name), "../scoring", res_type], stdout=subprocess.PIPE)
        output_all, err = p.communicate()
        results[res_type] = (float(output_all.split(" ")[1].split("=")[1]), float(output_all.split(" ")[2].split("=")[1]), int(output_all.split(" ")[3].split("=")[1].replace("\n","")))
        if (not print_latex):
            print ("{} ({}) - TWV:{:1.4f}".format(res_type, results[res_type][2], results[res_type][0]))
    for res_type in ["word", "phrase"]:
        p = subprocess.Popen(["../scripts/termselect.sh", "../lib/terms/phraseword.map", "../output/{}.xml".format(res_name), "../scoring", res_type], stdout=subprocess.PIPE)
        output_all, err = p.communicate()
        results[res_type] = (float(output_all.split(" ")[1].split("=")[1]), float(output_all.split(" ")[2].split("=")[1]), int(output_all.split(" ")[3].split("=")[1].replace("\n","")))
        if (not print_latex):
            print ("{} ({}) - TWV:{:1.4f}".format(res_type, results[res_type][2], results[res_type][0]))
        
    p=subprocess.Popen(["grep", "Summary", "../scoring/{}/Full-Occur-MITLLFA3-AppenWordSeg.bsum.txt".format(res_name)], stdout= subprocess.PIPE)
    if (not print_latex):
        print ("Threshold: {}".format(results['all'][1]))
    
    output, err = p.communicate()
    Targ = int(output.split("|")[4])
    Corr = int(output.split("|")[5])
    FA = int(output.split("|")[6])
    Miss = int(output.split("|")[7])
    results["targets"]=Targ
    results["correct"]=Corr
    results["false_alarm"]=FA
    results["missed"]=Miss
    if (not print_latex):
        print ("Targets: {} Correct: {} False Alarms: {} Miss: {}".format(Targ,Corr,FA,Miss))
    
    #latex printing
    if (print_latex):
        print ("{:1.3f} & {:1.3f} & {:1.3f} & {:1.3f} & {:1.3f} & {:1.3f} & {:1.3f} & {:1.3f}\\\\\hline".format(results['all'][0],results['iv'][0],results['oov'][0],results['short'][0],results['long'][0],results['phrase'][0],results['word'][0],results['all'][1]))
        print ("{} & {} & {} & {}\\\\\hline".format(Targ,Corr,FA,Miss))
    
    return results

#### Generating Length Map - >10 char = long

In [10]:
query_list = get_queries("../lib/kws/queries.xml")
phrase_ct = 1
word_ct = 1
print_list = []
for q in query_list:
    if (" " in q.text):
        print_list.append("phrase {} {:04d}".format(q.kwid[6:],phrase_ct))
        phrase_ct+=1
    else:
        print_list.append("word {} {:04d}".format(q.kwid[6:],word_ct))
        word_ct+=1
#print "\n".join(print_list)

In [11]:
query_list = get_queries("../lib/kws/queries.xml")
long_ct = 1
short_ct = 1
print_list = []
for q in query_list:
    if (len(q.text)>10):
        print_list.append("long {} {:04d}".format(q.kwid[6:],long_ct))
        long_ct+=1
    else:
        print_list.append("short {} {:04d}".format(q.kwid[6:],short_ct))
        short_ct+=1
#print "\n".join(print_list)

## Ex1

#### Reference KWS

In [12]:
start = timer()
kw_idx_ref = create_index("../lib/ctms/reference.ctm", False, False)
end = timer()
print("Index Entries: {} | Index Build Time: {}".format(len(kw_idx_ref.keys()),end - start))

query_list = get_queries("../lib/kws/queries.xml")

start = timer()
reference_results = search_index(query_list, kw_idx_ref)
end = timer()

_ = gen_output(reference_results, "../output/reference.xml")
_ = get_scores("reference")
print ("Time Taken: {}".format(end-start))

Index Entries: 315726 | Index Build Time: 4.66487288475
oov_count=8
all (488) - TWV:1.0000
iv (388) - TWV:1.0000
oov (100) - TWV:1.0000
short (321) - TWV:1.0000
long (167) - TWV:1.0000
word (288) - TWV:1.0000
phrase (200) - TWV:1.0000
Threshold: 1.0
Targets: 963 Correct: 963 False Alarms: 0 Miss: 0
Time Taken: 0.000944852828979


## Ex2

#### Word-based KWS

In [25]:
start = timer()
kw_idx_word = create_index("../lib/ctms/decode.ctm", False, False)
end = timer()
print("Index Entries: {} | Index Build Time: {}".format(len(kw_idx_word.keys()),end - start))

query_list = get_queries("../lib/kws/queries.xml")

start = timer()
word_results = search_index(query_list, kw_idx_word)
end = timer()

_ = gen_output(word_results, "../output/decode.xml")
_ = get_scores("decode")
print ("Time Taken: {}".format(end-start))

Index Entries: 263707 | Index Build Time: 2.51817297935
oov_count=246
all (488) - TWV:0.3189
iv (388) - TWV:0.4011
oov (100) - TWV:0.0000
short (321) - TWV:0.3123
long (167) - TWV:0.3317
word (288) - TWV:0.3221
phrase (200) - TWV:0.3145
Threshold: 0.043
Targets: 963 Correct: 405 False Alarms: 320 Miss: 558
Time Taken: 0.00333786010742


## Ex3

#### Pre-Decomposed Decode 

In [27]:
start = timer()
kw_idx_morph = create_index("../lib/ctms/decode-morph.ctm", True, False)
end = timer()
print("Index Entries: {} | Index Build Time: {}".format(len(kw_idx_morph.keys()),end - start))

query_list_morph = get_queries("../lib/kws/queries.xml", True)

start = timer()
query_results_morph = search_index(query_list_morph, kw_idx_morph)
end = timer()

_ = gen_output(query_results_morph, "../output/decode_morph.xml")
_ = get_scores("decode_morph")
print ("Time Taken: {}".format(end-start))

Index Entries: 785968 | Index Build Time: 9.23704099655
oov_count=251
all (488) - TWV:0.3183
iv (388) - TWV:0.3828
oov (100) - TWV:0.0678
short (321) - TWV:0.3265
long (167) - TWV:0.3025
word (288) - TWV:0.3397
phrase (200) - TWV:0.2875
Threshold: 0.301
Targets: 963 Correct: 410 False Alarms: 543 Miss: 553
Time Taken: 0.00819802284241


#### Dict-based Decomposition Decode 

In [28]:
start = timer()
kw_idx_morph_manual = create_index("../lib/ctms/decode.ctm", True, True)
end = timer()
print("Index Entries: {} | Index Build Time: {}".format(len(kw_idx_morph_manual.keys()),end - start))

query_list_morph = get_queries("../lib/kws/queries.xml", True)

start = timer()
query_results_morph_manual = search_index(query_list_morph, kw_idx_morph_manual)
end = timer()

_ = gen_output(query_results_morph_manual, "../output/decode_morph_manual.xml")
_ = get_scores("decode_morph_manual")
print ("Time Taken: {}".format(end-start))

Index Entries: 790910 | Index Build Time: 13.8114390373
oov_count=238
all (488) - TWV:0.3156
iv (388) - TWV:0.3921
oov (100) - TWV:0.0189
short (321) - TWV:0.3064
long (167) - TWV:0.3333
word (288) - TWV:0.3152
phrase (200) - TWV:0.3163
Threshold: 0.205
Targets: 963 Correct: 417 False Alarms: 602 Miss: 546
Time Taken: 0.00692296028137


## Ex4

#### Word-Based Decode with STO

In [25]:
kw_idx_word = create_index("../lib/ctms/decode.ctm", True, False)
query_list = get_queries("../lib/kws/queries.xml")

start = timer()
query_results_sto = search_index(query_list, kw_idx_word)
#kst2_list(query_results_sto)
end = timer()

_ = gen_output(query_results_sto, "../output/decode_sto.xml")
_ = get_scores("decode_sto", True)
print ("Time Taken: {}".format(end-start))

oov_count=246
0.320 & 0.402 & 0.000 & 0.313 & 0.332 & 0.315 & 0.323 & 0.167\\\hline
963 & 405 & 320 & 558\\\hline
Time Taken: 0.00280213356018


#### Pre-Decomposed Decode with STO

In [14]:
keyword_index_morph = create_index("../lib/ctms/decode-morph.ctm", True, False)
query_list_morph = get_queries("../lib/kws/queries.xml", True)

start = timer()
query_results_morph_sto = search_index(query_list_morph, keyword_index_morph)
normalize_list(query_results_morph_sto,1)
kst2_list(query_results_morph_sto)
end = timer()

_ = gen_output(query_results_morph_sto, "../output/decode_morph_sto.xml")
_ = get_scores("decode_morph_sto", True)
print ("Time Taken: {}".format(end-start))

oov_count=251
0.326 & 0.392 & 0.068 & 0.335 & 0.307 & 0.291 & 0.350 & 0.048\\\hline
963 & 395 & 294 & 568\\\hline
Time Taken: 0.00640797615051


#### Dict-based Decomposition with STO

In [27]:
keyword_index_morph_manual = create_index("../lib/ctms/decode.ctm", True, True)
query_list_morph = get_queries("../lib/kws/queries.xml", True)

start = timer()
query_results_morph_manual_sto = search_index(query_list_morph, keyword_index_morph_manual)
#kst2_list(query_results_morph_manual_sto)
end = timer()

_ = gen_output(query_results_morph_manual_sto, "../output/decode_morph_manual_sto.xml")
_ = get_scores("decode_morph_manual_sto", True)
print ("Time Taken: {}".format(end-start))

oov_count=238
0.316 & 0.392 & 0.019 & 0.306 & 0.333 & 0.316 & 0.315 & 0.205\\\hline
963 & 417 & 602 & 546\\\hline
Time Taken: 0.0039119720459


In [None]:
#Gamma Experimentation
gamma_list = [(x)/10.0 for x in range (0,51,5)]
res_types=['all','iv','oov','long','short','phrase','word']

#Word Avg
res_dict_word_mult = {}
for res_type in res_types:
    res_dict_word_mult[res_type] = []
res_dict_word_mult['threshold']=[]

for i,gamma in enumerate(gamma_list):
    kw_idx_word = create_index("../lib/ctms/decode.ctm", False, False)
    query_list = get_queries("../lib/kws/queries.xml")
    query_results_sto = search_index(query_list, kw_idx_word)
    print gamma
    normalize_list(query_results_sto,gamma)
    _ = gen_output(query_results_sto, "../output/decode_sto.xml")
    res = get_scores("decode_sto")
    
    for res_type in res_types:
        res_dict_word_mult[res_type].append(res[res_type][0])
    res_dict_word_mult['threshold'].append(res['all'][1])
        
#Word Mult
res_dict_word_avg = {}
for res_type in res_types:
    res_dict_word_avg[res_type] = []
res_dict_word_avg['threshold']=[]
    
for i,gamma in enumerate(gamma_list):
    kw_idx_word = create_index("../lib/ctms/decode.ctm", True, False)
    query_list = get_queries("../lib/kws/queries.xml")
    query_results_sto = search_index(query_list, kw_idx_word)
    print gamma
    normalize_list(query_results_sto,gamma)
    _ = gen_output(query_results_sto, "../output/decode_sto.xml")
    res = get_scores("decode_sto")
    
    for res_type in res_types:
        res_dict_word_avg[res_type].append(res[res_type][0])
    res_dict_word_avg['threshold'].append(res['all'][1])
        
#Pre-decomposed Morph Mult
res_dict_morph_mult = {}
for res_type in res_types:
    res_dict_morph_mult[res_type] = []
res_dict_morph_mult['threshold']=[]

for i,gamma in enumerate(gamma_list):
    print gamma
    keyword_index_morph = create_index("../lib/ctms/decode-morph.ctm", False, False)
    query_list_morph = get_queries("../lib/kws/queries.xml", True)
    query_results_morph_sto = search_index(query_list_morph, keyword_index_morph)
    normalize_list(query_results_morph_sto,gamma)
    _ = gen_output(query_results_morph_sto, "../output/decode_morph_sto.xml")
    res = get_scores("decode_morph_sto")
    
    for res_type in res_types:
        res_dict_morph_mult[res_type].append(res[res_type][0])
    res_dict_morph_mult['threshold'].append(res['all'][1])
        
#Pre-decomposed Morph Avg
res_dict_morph_avg = {}
for res_type in res_types:
    res_dict_morph_avg[res_type] = []
res_dict_morph_avg['threshold']=[]

for i,gamma in enumerate(gamma_list):
    print gamma
    keyword_index_morph = create_index("../lib/ctms/decode-morph.ctm", True, False)
    query_list_morph = get_queries("../lib/kws/queries.xml", True)
    query_results_morph_sto = search_index(query_list_morph, keyword_index_morph)
    normalize_list(query_results_morph_sto,gamma)
    _ = gen_output(query_results_morph_sto, "../output/decode_morph_sto.xml")
    res = get_scores("decode_morph_sto")
    
    for res_type in res_types:
        res_dict_morph_avg[res_type].append(res[res_type][0])
    res_dict_morph_avg['threshold'].append(res['all'][1])
        
#Manual Morph Mult
res_dict_morph_manual_mult = {}
for res_type in res_types:
    res_dict_morph_manual_mult[res_type] = []
res_dict_morph_manual_mult['threshold']=[]

for i,gamma in enumerate(gamma_list):
    print gamma
    keyword_index_morph_manual = create_index("../lib/ctms/decode.ctm", False, True)
    query_list_morph = get_queries("../lib/kws/queries.xml", True)
    query_results_morph_manual_sto = search_index(query_list_morph, keyword_index_morph_manual)
    normalize_list(query_results_morph_manual_sto,gamma)
    _ = gen_output(query_results_morph_manual_sto, "../output/decode_morph_manual_sto.xml")
    res = get_scores("decode_morph_manual_sto")
    
    for res_type in res_types:
        res_dict_morph_manual_mult[res_type].append(res[res_type][0])
    res_dict_morph_manual_mult['threshold'].append(res['all'][1])
        
#Manual Morph Avg
res_dict_morph_manual_avg = {}
for res_type in res_types:
    res_dict_morph_manual_avg[res_type] = []
res_dict_morph_manual_avg['threshold']=[]

for i,gamma in enumerate(gamma_list):
    print gamma
    keyword_index_morph_manual = create_index("../lib/ctms/decode.ctm", True, True)
    query_list_morph = get_queries("../lib/kws/queries.xml", True)
    query_results_morph_manual_sto = search_index(query_list_morph, keyword_index_morph_manual)
    normalize_list(query_results_morph_manual_sto,gamma)
    _ = gen_output(query_results_morph_manual_sto, "../output/decode_morph_manual_sto.xml")
    res = get_scores("decode_morph_manual_sto")
    
    for res_type in res_types:
        res_dict_morph_manual_avg[res_type].append(res[res_type][0])
    res_dict_morph_manual_avg['threshold'].append(res['all'][1])

In [85]:
print_types=['all','iv','oov','threshold']
hd_text = "morph_manual_mult"
nres = res_dict_morph_manual_mult
for tp in print_types:
    print "{}_{} = [{}];".format(hd_text, tp, ", ".join([str(round(x,4)) for x in nres[tp]]))

morph_manual_mult_all = [0.3206, 0.3197, 0.3189, 0.3194, 0.3182, 0.3165, 0.3163, 0.3157, 0.3149, 0.3136, 0.3141];
morph_manual_mult_iv = [0.3986, 0.3975, 0.3965, 0.3971, 0.3956, 0.3934, 0.3931, 0.3924, 0.3914, 0.3898, 0.3904];
morph_manual_mult_oov = [0.018, 0.018, 0.018, 0.018, 0.018, 0.018, 0.018, 0.018, 0.018, 0.018, 0.018];
morph_manual_mult_threshold = [0.038, 0.04, 0.043, 0.049, 0.038, 0.01, 0.007, 0.005, 0.001, 0.001, 0.0];


In [42]:
##No normalization
kw_idx_word = create_index("../lib/ctms/decode.ctm", False, False)
query_list = get_queries("../lib/kws/queries.xml")
query_results_sto = search_index(query_list, kw_idx_word)
_ = gen_output(query_results_sto, "../output/decode_sto.xml")
n1m = get_scores("decode_sto",True)
kw_idx_word = create_index("../lib/ctms/decode.ctm", True, False)
query_list = get_queries("../lib/kws/queries.xml")
query_results_sto = search_index(query_list, kw_idx_word)
_ = gen_output(query_results_sto, "../output/decode_sto.xml")
n1a = get_scores("decode_sto",True)

keyword_index_morph = create_index("../lib/ctms/decode-morph.ctm", False, False)
query_list_morph = get_queries("../lib/kws/queries.xml", True)
query_results_morph_sto = search_index(query_list_morph, keyword_index_morph)
_ = gen_output(query_results_morph_sto, "../output/decode_morph_sto.xml")
n2m = get_scores("decode_morph_sto",True)
keyword_index_morph = create_index("../lib/ctms/decode-morph.ctm", True, False)
query_list_morph = get_queries("../lib/kws/queries.xml", True)
query_results_morph_sto = search_index(query_list_morph, keyword_index_morph)
_ = gen_output(query_results_morph_sto, "../output/decode_morph_sto.xml")
n2a = get_scores("decode_morph_sto",True)

keyword_index_morph_manual = create_index("../lib/ctms/decode.ctm", False, True)
query_list_morph = get_queries("../lib/kws/queries.xml", True)
query_results_morph_manual_sto = search_index(query_list_morph, keyword_index_morph_manual)
_ = gen_output(query_results_morph_manual_sto, "../output/decode_morph_manual_sto.xml")
n3m = get_scores("decode_morph_manual_sto",True)
keyword_index_morph_manual = create_index("../lib/ctms/decode.ctm", True, True)
query_list_morph = get_queries("../lib/kws/queries.xml", True)
query_results_morph_manual_sto = search_index(query_list_morph, keyword_index_morph_manual)
_ = gen_output(query_results_morph_manual_sto, "../output/decode_morph_manual_sto.xml")
n3a = get_scores("decode_morph_manual_sto",True)

##KST

kw_idx_word = create_index("../lib/ctms/decode.ctm", False, False)
query_list = get_queries("../lib/kws/queries.xml")
query_results_sto = search_index(query_list, kw_idx_word)
kst_list(query_results_sto)
_ = gen_output(query_results_sto, "../output/decode_sto.xml")
k1m = get_scores("decode_sto",True)
kw_idx_word = create_index("../lib/ctms/decode.ctm", True, False)
query_list = get_queries("../lib/kws/queries.xml")
query_results_sto = search_index(query_list, kw_idx_word)
kst_list(query_results_sto)
_ = gen_output(query_results_sto, "../output/decode_sto.xml")
k1a = get_scores("decode_sto",True)

keyword_index_morph = create_index("../lib/ctms/decode-morph.ctm", False, False)
query_list_morph = get_queries("../lib/kws/queries.xml", True)
query_results_morph_sto = search_index(query_list_morph, keyword_index_morph)
kst_list(query_results_morph_sto)
_ = gen_output(query_results_morph_sto, "../output/decode_morph_sto.xml")
k2m = get_scores("decode_morph_sto",True)
keyword_index_morph = create_index("../lib/ctms/decode-morph.ctm", True, False)
query_list_morph = get_queries("../lib/kws/queries.xml", True)
query_results_morph_sto = search_index(query_list_morph, keyword_index_morph)
kst_list(query_results_morph_sto)
_ = gen_output(query_results_morph_sto, "../output/decode_morph_sto.xml")
k2a = get_scores("decode_morph_sto",True)

keyword_index_morph_manual = create_index("../lib/ctms/decode.ctm", False, True)
query_list_morph = get_queries("../lib/kws/queries.xml", True)
query_results_morph_manual_sto = search_index(query_list_morph, keyword_index_morph_manual)
kst_list(query_results_morph_manual_sto)
_ = gen_output(query_results_morph_manual_sto, "../output/decode_morph_manual_sto.xml")
k3m = get_scores("decode_morph_manual_sto",True)
keyword_index_morph_manual = create_index("../lib/ctms/decode.ctm", True, True)
query_list_morph = get_queries("../lib/kws/queries.xml", True)
query_results_morph_manual_sto = search_index(query_list_morph, keyword_index_morph_manual)
kst_list(query_results_morph_manual_sto)
_ = gen_output(query_results_morph_manual_sto, "../output/decode_morph_manual_sto.xml")
k3a = get_scores("decode_morph_manual_sto",True)

##STO

kw_idx_word = create_index("../lib/ctms/decode.ctm", False, False)
query_list = get_queries("../lib/kws/queries.xml")
query_results_sto = search_index(query_list, kw_idx_word)
normalize_list(query_results_sto,1)
_ = gen_output(query_results_sto, "../output/decode_sto.xml")
s1m = get_scores("decode_sto",True)
kw_idx_word = create_index("../lib/ctms/decode.ctm", True, False)
query_list = get_queries("../lib/kws/queries.xml")
query_results_sto = search_index(query_list, kw_idx_word)
normalize_list(query_results_sto,1)
_ = gen_output(query_results_sto, "../output/decode_sto.xml")
s1a = get_scores("decode_sto",True)

keyword_index_morph = create_index("../lib/ctms/decode-morph.ctm", False, False)
query_list_morph = get_queries("../lib/kws/queries.xml", True)
query_results_morph_sto = search_index(query_list_morph, keyword_index_morph)
normalize_list(query_results_morph_sto,1)
_ = gen_output(query_results_morph_sto, "../output/decode_morph_sto.xml")
s2m = get_scores("decode_morph_sto",True)
keyword_index_morph = create_index("../lib/ctms/decode-morph.ctm", True, False)
query_list_morph = get_queries("../lib/kws/queries.xml", True)
query_results_morph_sto = search_index(query_list_morph, keyword_index_morph)
normalize_list(query_results_morph_sto,1)
_ = gen_output(query_results_morph_sto, "../output/decode_morph_sto.xml")
s2a = get_scores("decode_morph_sto",True)

keyword_index_morph_manual = create_index("../lib/ctms/decode.ctm", False, True)
query_list_morph = get_queries("../lib/kws/queries.xml", True)
query_results_morph_manual_sto = search_index(query_list_morph, keyword_index_morph_manual)
normalize_list(query_results_morph_manual_sto,1)
_ = gen_output(query_results_morph_manual_sto, "../output/decode_morph_manual_sto.xml")
s3m = get_scores("decode_morph_manual_sto",True)
keyword_index_morph_manual = create_index("../lib/ctms/decode.ctm", True, True)
query_list_morph = get_queries("../lib/kws/queries.xml", True)
query_results_morph_manual_sto = search_index(query_list_morph, keyword_index_morph_manual)
normalize_list(query_results_morph_manual_sto,1)
_ = gen_output(query_results_morph_manual_sto, "../output/decode_morph_manual_sto.xml")
s3a = get_scores("decode_morph_manual_sto",True)

##KST + STO

kw_idx_word = create_index("../lib/ctms/decode.ctm", False, False)
query_list = get_queries("../lib/kws/queries.xml")
query_results_sto = search_index(query_list, kw_idx_word)
kst_list(query_results_sto)
normalize_list(query_results_sto,1)
_ = gen_output(query_results_sto, "../output/decode_sto.xml")
ks1m = get_scores("decode_sto",True)
kw_idx_word = create_index("../lib/ctms/decode.ctm", True, False)
query_list = get_queries("../lib/kws/queries.xml")
query_results_sto = search_index(query_list, kw_idx_word)
kst_list(query_results_sto)
normalize_list(query_results_sto,1)
_ = gen_output(query_results_sto, "../output/decode_sto.xml")
ks1a = get_scores("decode_sto",True)

keyword_index_morph = create_index("../lib/ctms/decode-morph.ctm", False, False)
query_list_morph = get_queries("../lib/kws/queries.xml", True)
query_results_morph_sto = search_index(query_list_morph, keyword_index_morph)
kst_list(query_results_morph_sto)
normalize_list(query_results_morph_sto,1)
_ = gen_output(query_results_morph_sto, "../output/decode_morph_sto.xml")
ks2m = get_scores("decode_morph_sto",True)
keyword_index_morph = create_index("../lib/ctms/decode-morph.ctm", True, False)
query_list_morph = get_queries("../lib/kws/queries.xml", True)
query_results_morph_sto = search_index(query_list_morph, keyword_index_morph)
kst_list(query_results_morph_sto)
normalize_list(query_results_morph_sto,1)
_ = gen_output(query_results_morph_sto, "../output/decode_morph_sto.xml")
ks2a = get_scores("decode_morph_sto",True)

keyword_index_morph_manual = create_index("../lib/ctms/decode.ctm", False, True)
query_list_morph = get_queries("../lib/kws/queries.xml", True)
query_results_morph_manual_sto = search_index(query_list_morph, keyword_index_morph_manual)
kst_list(query_results_morph_manual_sto)
normalize_list(query_results_morph_manual_sto,1)
_ = gen_output(query_results_morph_manual_sto, "../output/decode_morph_manual_sto.xml")
ks3m = get_scores("decode_morph_manual_sto",True)
keyword_index_morph_manual = create_index("../lib/ctms/decode.ctm", True, True)
query_list_morph = get_queries("../lib/kws/queries.xml", True)
query_results_morph_manual_sto = search_index(query_list_morph, keyword_index_morph_manual)
kst_list(query_results_morph_manual_sto)
normalize_list(query_results_morph_manual_sto,1)
_ = gen_output(query_results_morph_manual_sto, "../output/decode_morph_manual_sto.xml")
ks3a = get_scores("decode_morph_manual_sto",True)

## STO + KST

kw_idx_word = create_index("../lib/ctms/decode.ctm", False, False)
query_list = get_queries("../lib/kws/queries.xml")
query_results_sto = search_index(query_list, kw_idx_word)
normalize_list(query_results_sto,1)
kst_list(query_results_sto)
_ = gen_output(query_results_sto, "../output/decode_sto.xml")
sk1m = get_scores("decode_sto",True)
kw_idx_word = create_index("../lib/ctms/decode.ctm", True, False)
query_list = get_queries("../lib/kws/queries.xml")
query_results_sto = search_index(query_list, kw_idx_word)
normalize_list(query_results_sto,1)
kst_list(query_results_sto)
_ = gen_output(query_results_sto, "../output/decode_sto.xml")
sk1a = get_scores("decode_sto",True)

keyword_index_morph = create_index("../lib/ctms/decode-morph.ctm", False, False)
query_list_morph = get_queries("../lib/kws/queries.xml", True)
query_results_morph_sto = search_index(query_list_morph, keyword_index_morph)
normalize_list(query_results_morph_sto,1)
kst_list(query_results_morph_sto)
_ = gen_output(query_results_morph_sto, "../output/decode_morph_sto.xml")
sk2m = get_scores("decode_morph_sto",True)
keyword_index_morph = create_index("../lib/ctms/decode-morph.ctm", True, False)
query_list_morph = get_queries("../lib/kws/queries.xml", True)
query_results_morph_sto = search_index(query_list_morph, keyword_index_morph)
normalize_list(query_results_morph_sto,1)
kst_list(query_results_morph_sto)
_ = gen_output(query_results_morph_sto, "../output/decode_morph_sto.xml")
sk2a = get_scores("decode_morph_sto",True)

keyword_index_morph_manual = create_index("../lib/ctms/decode.ctm", False, True)
query_list_morph = get_queries("../lib/kws/queries.xml", True)
query_results_morph_manual_sto = search_index(query_list_morph, keyword_index_morph_manual)
normalize_list(query_results_morph_manual_sto,1)
kst_list(query_results_morph_manual_sto)
_ = gen_output(query_results_morph_manual_sto, "../output/decode_morph_manual_sto.xml")
sk3m = get_scores("decode_morph_manual_sto",True)
keyword_index_morph_manual = create_index("../lib/ctms/decode.ctm", True, True)
query_list_morph = get_queries("../lib/kws/queries.xml", True)
query_results_morph_manual_sto = search_index(query_list_morph, keyword_index_morph_manual)
normalize_list(query_results_morph_manual_sto,1)
kst_list(query_results_morph_manual_sto)
_ = gen_output(query_results_morph_manual_sto, "../output/decode_morph_manual_sto.xml")
sk3a = get_scores("decode_morph_manual_sto",True)

oov_count=246
0.319 & 0.401 & 0.000 & 0.312 & 0.332 & 0.314 & 0.322 & 0.043\\\hline
963 & 405 & 320 & 558\\\hline
oov_count=246
0.320 & 0.402 & 0.000 & 0.313 & 0.332 & 0.315 & 0.323 & 0.167\\\hline
963 & 405 & 320 & 558\\\hline
oov_count=251
0.318 & 0.383 & 0.068 & 0.327 & 0.303 & 0.287 & 0.340 & 0.061\\\hline
963 & 410 & 543 & 553\\\hline
oov_count=251
0.318 & 0.383 & 0.068 & 0.326 & 0.303 & 0.287 & 0.340 & 0.301\\\hline
963 & 410 & 543 & 553\\\hline
oov_count=238
0.314 & 0.390 & 0.018 & 0.304 & 0.333 & 0.316 & 0.313 & 0.043\\\hline
963 & 417 & 602 & 546\\\hline
oov_count=238
0.316 & 0.392 & 0.019 & 0.306 & 0.333 & 0.316 & 0.315 & 0.205\\\hline
963 & 417 & 602 & 546\\\hline
oov_count=246
0.320 & 0.403 & 0.000 & 0.314 & 0.331 & 0.314 & 0.324 & 0.114\\\hline
963 & 397 & 275 & 566\\\hline
oov_count=246
0.320 & 0.403 & 0.000 & 0.315 & 0.332 & 0.314 & 0.325 & 0.167\\\hline
963 & 399 & 280 & 564\\\hline
oov_count=251
0.321 & 0.386 & 0.068 & 0.330 & 0.303 & 0.288 & 0.343 & 0.061\\\hline
963 

In [48]:
print "Word & Average & {:1.3f} & {:1.3f} & {:1.3f} & {:1.3f} & {:1.3f}& {:1.3f}\\\\\hline".format(n1a['all'][0],k1a['all'][0],s1a['all'][0],ks1a['all'][0],sk1a['all'][0])
print "Word & Multiply & {:1.3f} & {:1.3f} & {:1.3f} & {:1.3f} & {:1.3f}& {:1.3f}\\\\\hline".format(n1m['all'][0],k1m['all'][0],s1m['all'][0],ks1m['all'][0],sk1m['all'][0])
print "Morph-Based ASR & Average & {:1.3f} & {:1.3f} & {:1.3f} & {:1.3f} & {:1.3f}& {:1.3f}\\\\\hline".format(n2a['all'][0],k2a['all'][0],s2a['all'][0],ks2a['all'][0],sk2a['all'][0])
print "Morph-Based ASR & Multiply & {:1.3f} & {:1.3f} & {:1.3f} & {:1.3f} & {:1.3f}& {:1.3f}\\\\\hline".format(n2m['all'][0],k2m['all'][0],s2m['all'][0],ks2m['all'][0],sk2m['all'][0])
print "ASR + Morph-Dict & Average & {:1.3f} & {:1.3f} & {:1.3f} & {:1.3f} & {:1.3f}& {:1.3f}\\\\\hline".format(n3a['all'][0],k3a['all'][0],s3a['all'][0],ks3a['all'][0],sk3a['all'][0])
print "ASR + Morph-Dict & Multiply & {:1.3f} & {:1.3f} & {:1.3f} & {:1.3f} & {:1.3f}& {:1.3f}\\\\\hline".format(n3m['all'][0],k3m['all'][0],s3m['all'][0],ks3m['all'][0],sk3m['all'][0])

Word & Average & 0.320 & 0.320 & 0.320 & 0.320 & 0.295& 0.295\\\hline
Word & Multiply & 0.319 & 0.320 & 0.320 & 0.320 & 0.296& 0.296\\\hline
Morph-Based ASR & Average & 0.318 & 0.320 & 0.326 & 0.326 & 0.302& 0.302\\\hline
Morph-Based ASR & Multiply & 0.318 & 0.321 & 0.326 & 0.326 & 0.299& 0.299\\\hline
ASR + Morph-Dict & Average & 0.316 & 0.317 & 0.320 & 0.320 & 0.298& 0.298\\\hline
ASR + Morph-Dict & Multiply & 0.314 & 0.317 & 0.319 & 0.318 & 0.298& 0.298\\\hline


## Ex5

#### Create Grapheme Dict and Save

In [15]:
from math import log

class grapheme_dict:
    def __init__(self):
        self.dict = [[0 for x in range(27)] for y in range(27)] 
        self.total = [0 for x in range(27)]
        
    def update(self,hyp,ref,score_str):
        h = self.get_index(hyp)
        r = self.get_index(ref)
        score = (float(score_str))
        self.dict[h][r] = score 
        
    def normalize(self):
        self.total = [0 for x in range(27)]
        for i in range(27):
            for j in range(27):
                self.total[i] = self.total[i] + self.dict[i][j]
        for i in range(27):
            for j in range(27):
                val = self.dict[i][j]/self.total[i]
                if (val!=0):
                     self.dict[i][j] = -log(val)
                
    def get_dist(self,hyp,ref,oov_first=True):
        if hyp==' ':
            hyp='sil'
        if ref==' ':
            ref='sil'
        if(oov_first):
            h = self.get_index(hyp)
            r = self.get_index(ref)
        else:
            r = self.get_index(hyp)
            h = self.get_index(ref)
        
        return self.dict[h][r]
                
    def create_fst(self):
        fst_compiler_list = []
        for i in range(27):
            for j in range(27):
                if (self.dict[i][j] != 0):
                    c_i = self.get_char(i)
                    c_j = self.get_char(j)
                    if c_i == 'sil':
                        c_i = '<eps>'
                    if c_j == 'sil':
                        c_j = '<eps>'
                    fst_compiler_list.append("{} {} {} {} {}".format(0,0,c_i,c_j,(self.dict[i][j])))
        fst_compiler_list.append(str(0))
        return fst_compiler_list
    
    def symbol_list(self):
        symbol_list = []
        for i in range(27):
            symbol_list.append(self.get_char(i))
        return symbol_list
    
    def get_char(self, x):
        if x==0:
            return 'sil'
        else:
            return chr(x+96)
        
    def get_index(self, x):
        if x=='sil':
            return 0
        else:
            return ord(x)-96  
    
    def print_file(self,filename):
        arr = list("abcdefghijklmnopqrstuvwxyz ")
        with open(filename, 'w') as myfile:
            for ch1 in arr:
                for ch2 in arr:
                    myfile.write("{} {} {}\n".format('{' if ch1 == " " else ch1, '{' if ch2 == " " else ch2, self.get_dist(ch1,ch2)))

In [16]:
with open ("../lib/kws/grapheme.map", "r") as myfile:
    data=myfile.readlines()

g_dict = grapheme_dict()
for line in data:
    ref, hyp, score_str = line.split()
    g_dict.update(hyp,ref,score_str)
g_dict.normalize()
g_dict.print_file("./C/gmap.txt")

#### Find IV replacements for OOV words using edit distance

In [33]:
def find_replacements_final(oov_word_unfiltered, iv_list_unfiltered, g_dict, threshold, allowed_len_diff):
    #threshold=15
    #allowed_len_diff=5
    oov_word = "".join([x for x in oov_word_unfiltered if str.isalpha(x) or x==' '])
    iv_list = [x for x in iv_list_unfiltered if abs(len(x)-len(oov_word_unfiltered))<=allowed_len_diff]
    min_dist_word=""
    min_dist_score=1000

    for hyp_word_unfiltered in iv_list:
        hyp_word = "".join([x for x in hyp_word_unfiltered if str.isalpha(x) or x==' '])
        space_score=[]
        #spaces = []
        #scores = []
        if (len(hyp_word)>len(oov_word)):
            maxlen=len(hyp_word)
            minlen=len(oov_word)
            pad_word=oov_word
            org_word=hyp_word
            iv_first=True;
        else:
            maxlen=len(oov_word)
            minlen=len(hyp_word)
            pad_word=hyp_word
            org_word=oov_word
            iv_first=False;
            #(org_word_char, pad_word_char, iv_first)

        len_diff = maxlen-minlen
        space_score.append((len_diff,0))
        #repl_counter=0
        #print len_diff
        
        search = True
        char_idx=0
        while(search):
        #for char_idx in range(0,maxlen):
            #print char_idx, len(space_score)
            min_score=[1000 for x in range(0,30)]
            new_space_score = []
            
            #print "=== {}".format(char_idx)
            #print space_score

            for repl_word in space_score:
                #substitution
                pad_word_pos = char_idx - (len_diff - repl_word[0])
                if(pad_word_pos<minlen):
                    #print pad_word_pos
                    new_score = repl_word[1]+g_dict.get_dist(pad_word[pad_word_pos],org_word[char_idx],iv_first)
                    new_space_score.append((repl_word[0],new_score))
                    #print (org_word[char_idx],pad_word[pad_word_pos],new_score)
                    if min_score[repl_word[0]]>new_score:
                        min_score[repl_word[0]]=new_score


                #padding
                if(repl_word[0]>0):
                    new_score = repl_word[1]+g_dict.get_dist(' ',org_word[char_idx],iv_first)
                    new_space_score.append((repl_word[0]-1,new_score))
                    if min_score[repl_word[0]-1]>new_score:
                        min_score[repl_word[0]-1]=new_score
            #print new_space_score
            space_score=[]
            
            #print "==========={}".format(len(new_space_score))
            for repl_word in new_space_score:
                if (repl_word[1]<=min(min_score[repl_word[0]], min_dist_score, threshold)):
                    space_score.append(repl_word)
            char_idx+=1
            
            if (char_idx==maxlen or len(space_score)==0):
                search=False
            #print "==========={}".format(len(space_score))
            
        #extract lowest distance word
        min_word_sc=1000
        for repl_word in space_score:
            if repl_word[1]<min_word_sc:
                min_word_sc=repl_word[1]
        #print (hyp_word_unfiltered, pad_word, org_word, min_word_sc)
        if min_word_sc<min_dist_score:
            min_dist_score=min_word_sc
            min_dist_word=hyp_word_unfiltered
    return min_dist_word, min_dist_score

#### Updated Index Searching Functions for OOV

In [34]:
import subprocess

def save_oov_list(oov_list_unfiltered, filename):
    maxlen=28
    oov_list = [x for x in oov_list_unfiltered if len(x)<maxlen]
    with open(filename, 'w') as myfile:
        myfile.write("{} {}\n".format(maxlen+2, len(oov_list)))
        for word in oov_list:
            word_filtered = "".join([ch for ch in word if (str.isalpha(ch) or ch==' ')])
            myfile.write("{}\n".format(word_filtered.replace(" ","{")))
    return

def read_results(filename):
    with open(filename, "r") as data:
        lines = data.readlines()
    return lines;

def save_iv_list(kw_idx, filename):
    maxlen=28
    iv_list = [x for x in kw_idx.keys() if len(x)<maxlen]
    with open(filename, 'w') as myfile:
        myfile.write("{} {}\n".format(maxlen+2, len(iv_list)))
        for word in iv_list:
            word_filtered = "".join([ch for ch in word if (str.isalpha(ch) or ch==' ')])
            myfile.write("{}\n".format(word_filtered.replace(" ","{")))
    return iv_list

def search_index_oov(search_list, indexed_dict, grapheme_dict, threshold, allowed_len_diff, gamma=0): 
    maxlen=28
    iv_list = [x for x in indexed_dict.keys() if len(x)<maxlen]
    total_search_time = 0
    oov_list = []
    for q in search_list:
        oov_count = 0
        if q.text in indexed_dict:
            q.res_list = indexed_dict[q.text]
        else:
            #oov_list.append(q.text)
            start_t = timer()
            iv_term, iv_score = find_replacements_final(q.text, iv_list, grapheme_dict, threshold, allowed_len_diff)
            end_t = timer()
            print (q.text, iv_term, iv_score, end_t-start_t)
            total_search_time += end_t-start_t
            if iv_score<=threshold:
                q.res_list=indexed_dict[iv_term]
        q.oov = oov_count
        if (gamma>0):
            q.normalize(gamma)
        #q.search_time = end - start
    return search_list#, oov_list


def search_index_oov_c(search_list, indexed_dict, grapheme_dict, threshold, allowed_len_diff, gamma=0): 
    total_search_time = 0
    oov_text_list = []
    oov_idx_list = []
    start = timer()
    oov_count = 0
    for i,q in enumerate(search_list):
        if q.text in indexed_dict:
            q.res_list = indexed_dict[q.text]
        else:
            oov_text_list.append(q.text)
            oov_idx_list.append(i)
            oov_count +=1
        q.oov = 1
    print "oov_count={}".format(oov_count)
    iv_list = save_iv_list(indexed_dict, "./C/iv_list.txt")
    subprocess.call(["./C/oov_search",str(allowed_len_diff), str(threshold)])
    oov_results = parallel_oov_replacements(8, oov_text_list, threshold, allowed_len_diff)
    #save_oov_list(oov_text_list, "./C/oov_list.txt")
    #subprocess.call(["./C/oov_search",str(allowed_len_diff), str(threshold)])
    #oov_results = read_results("./C/oov_hyps.txt")
    for i,oov_res in enumerate(oov_results):
        #print (i,oov_res)
        idx=int(oov_res.split()[0])
        if (idx)>0:
            q=search_list[oov_idx_list[i]]
            iv_list_idx=int(oov_res.split()[0])
            #print (idx,iv_list_idx)
            q.res_list=indexed_dict[iv_list[iv_list_idx]]
            #print(q.text,iv_list[iv_list_idx])
    end = timer()
    #print (end - start)
    return search_list

def parallel_oov_replacements(num_proc, oov_list, threshold, allowed_len_diff):
    list_len = len(oov_list)
    limits=[]
    #save_iv_list(keyword_index, "./C/iv_list.txt")
    for i in range(0,list_len+1,list_len/num_proc):
        limits.append(i)
    limits[-1]=list_len
    #print limits
    oov_files=[]
    op_files=[]
    processes=set()
    for i in range(0,num_proc):
        save_oov_list(oov_list[limits[i]:limits[i+1]],"./C/oov_list_{}.txt".format(i))
        #print "./C/oov_list_{}.txt".format(i)
        #print (oov_list[limits[i]:limits[i+1]])
        oov_files.append("oov_list_{}.txt".format(i))
        op_files.append("oov_hyps_{}.txt".format(i))
    for i in range(0,num_proc):
        #print " ".join(["./oov_search",str(allowed_len_diff), str(threshold), oov_files[i], op_files[i]])
        processes.add(subprocess.Popen(["./oov_search",str(allowed_len_diff), str(threshold), oov_files[i], op_files[i]], cwd="./C"))
    for p in processes:
        if p.poll() is None:
            p.wait()
    oov_results=[]
    for i in range(0,num_proc):
        oov_results = oov_results + read_results("./C/{}".format(op_files[i]))
    subprocess.call(["rm","iv_list.txt"] + oov_files + op_files, cwd="./C")
    return oov_results

#### Word-Based Decode with OOV Replacements

In [42]:
keyword_index = create_index("../lib/ctms/decode.ctm", True, False)
query_list = get_queries("../lib/kws/queries.xml")

start = timer()
query_results_oov_c = search_index_oov_c(query_list, keyword_index, g_dict, 30, 7)
#kst_list(query_results_oov_c)
#normalize_list(query_results_oov_c,1)
end = timer()

_ = gen_output(query_results_oov_c, "../output/decode_oov.xml")
_ = get_scores("decode_oov")
print ("Time Taken: {}".format(end-start))

oov_count=246
all (488) - TWV:0.3735
iv (388) - TWV:0.4411
oov (100) - TWV:0.1112
short (321) - TWV:0.3324
long (167) - TWV:0.4526
word (288) - TWV:0.3614
phrase (200) - TWV:0.3909
Threshold: 0.167
Targets: 963 Correct: 451 False Alarms: 837 Miss: 512
Time Taken: 4.10971617699


#### Pre-decomposed Morphemes with OOV Replacements

In [102]:
keyword_index_morph = create_index("../lib/ctms/decode-morph.ctm", True, False)
query_list_morph = get_queries("../lib/kws/queries.xml",True)

start = timer()
query_results_oov_morph_c = search_index_oov_c(query_list_morph, keyword_index_morph, g_dict, 30, 5)
kst_list(query_results_oov_morph_c)
normalize_list(query_results_oov_morph_c,1)
end = timer()

_ = gen_output(query_results_oov_morph_c, "../output/decode_morph_oov.xml")
_ = get_scores("decode_morph_oov")
print ("Time Taken: {}".format(end-start))

oov_count=251
all (488) - TWV:0.3763
iv (388) - TWV:0.4327
oov (100) - TWV:0.1574
short (321) - TWV:0.3376
long (167) - TWV:0.4507
word (288) - TWV:0.3601
phrase (200) - TWV:0.3995
Threshold: 0.055
Targets: 963 Correct: 446 False Alarms: 1273 Miss: 517
Time Taken: 13.5082330704


#### Manual Decomposed Morphemes with OOV Replacements

In [101]:
kw_idx_morph_manual = create_index("../lib/ctms/decode.ctm", True, True)
query_list_morph = get_queries("../lib/kws/queries.xml", True)

start = timer()
query_results_morph_manual_oov = search_index_oov_c(query_list_morph, kw_idx_morph_manual, g_dict, 30, 5)
#kst_list(query_results_morph_manual_oov)
#normalize_list(query_results_morph_manual_oov,1)
end = timer()

_ = gen_output(query_results_morph_manual_oov, "../output/decode_morph_manual_oov.xml")
_ = get_scores("decode_morph_manual_oov")
print ("Time Taken: {}".format(end-start))

oov_count=238
all (488) - TWV:0.3571
iv (388) - TWV:0.4164
oov (100) - TWV:0.1270
short (321) - TWV:0.3069
long (167) - TWV:0.4537
word (288) - TWV:0.3427
phrase (200) - TWV:0.3778
Threshold: 0.205
Targets: 963 Correct: 457 False Alarms: 1194 Miss: 506
Time Taken: 12.2436859608


In [86]:
_ = get_scores("decode_oov",True)
_ = get_scores("decode_morph_oov", True)
_ = get_scores("decode_morph_manual_oov", True)

0.379 & 0.447 & 0.116 & 0.341 & 0.453 & 0.393 & 0.369 & 0.045\\\hline
963 & 444 & 733 & 519\\\hline
0.374 & 0.435 & 0.137 & 0.327 & 0.463 & 0.399 & 0.357 & 0.064\\\hline
963 & 443 & 890 & 520\\\hline
0.376 & 0.433 & 0.157 & 0.338 & 0.451 & 0.400 & 0.360 & 0.055\\\hline
963 & 446 & 1273 & 517\\\hline


  ## Experiment: System Combination

In [27]:
keyword_index = create_index("../lib/ctms/decode.ctm")
keyword_index_morph = create_index("../lib/ctms/decode-morph.ctm")

In [17]:
import xml.etree.ElementTree as ET
def read_posting_list(filepath):
    tree = ET.parse(filepath)
    root = tree.getroot()
    search_dict = {}
    search_list = []
    for child in root:
        #print child.tag, child.attrib
        kwid = child.attrib['kwid']
        new_res = query(kwid, "unk", "unk")
        for res in child:
            score = float(res.attrib['score'])
            dur = float(res.attrib['dur'])
            filename = res.attrib['file']
            channel = int(res.attrib['channel'])
            start_time = float(res.attrib['tbeg'])
            new_res.res_list.append([filename, channel, start_time, dur, score])
        search_dict[kwid]=new_res
        #search_list.append(new_res)
    with open("../lib/kws/kwlist") as myfile:
        kwlist = myfile.read().splitlines()
    for kw in kwlist:
        if kw in search_dict.keys():
            search_list.append(search_dict[kw])
        else:
            search_list.append(query(kw, "unk", "unk"))
    return search_list

def sys_comb(query_results_q1, query_results_q2, w1=0.5, w2=0.5, use_max=False):
    if (use_max):
        w1=1.0
        w2=1.0
    search_list = []
    for q1, q2 in zip(query_results_q1, query_results_q2):
        c2 = query(q1.kwid, q1.text)
        #print q1.kwid, q2.kwid
        q1_res_list = [] 
        for hit in q1.res_list:
            q1_res_list.append(hit + [False])

        q2_res_list = []
        for hit in q2.res_list:
            q2_res_list.append(hit+[False])
        #print (q1.text, q2.text, len(q1.res_list),len(q2.res_list))
        new_res_list=[]

        #file, int(channel), float(startTime), float(duration), float(score)

        for q1_hit in q1_res_list:
            q1_start_time = q1_hit[2]
            q1_dur = q1_hit[3]
            q1_score = q1_hit[4]# * w1#Scoring Averages WCombSum - W1

            for q2_hit in q2_res_list:
                q2_start_time = q2_hit[2]
                q2_dur = q2_hit[3]
                q2_score = q2_hit[4]# * w2#Scoring W2

                #if abs(q1_start_time - q2_start_time) < 0.2:#Merge
                if (q1_start_time<=q2_start_time and q1_start_time+q1_dur>=q2_start_time) or (q2_start_time<=q1_start_time and q2_start_time+q2_dur>=q1_start_time):
                    #print q1_res_list
                    q1_hit[5]=True
                    q2_hit[5]=True
                    #print q1_res_list
                    new_score = q1_score * w1 + q2_score * w2
                    if(use_max):
                        new_score = max(q1_score,q2_score)
                    if q1_score>q2_score:
                        new_start_time = q1_start_time
                        new_duration = q1_dur
                    else:
                        new_start_time = q2_start_time
                        new_duration = q2_dur
                    #new_start_time = min(q1_start_time, q2_start_time)
                    #if q1_start_time<q2_start_time
                    #new_duration = max(q1_dur,q2_dur)
                    new_res = q1_hit[0:2] + [new_start_time, new_duration, new_score]
                    new_res_list.append(new_res)
            if(q1_hit[5]==False):
                new_res = q1_hit[0:4] + [q1_score * w1]
                new_res_list.append(new_res)

        for q2_hit in q2_res_list:
            if(q2_hit[5]==False):
                q2_score = q2_hit[4]# * w2
                new_res = q2_hit[0:4] + [q2_score *w2]
                new_res_list.append(new_res)

        c2.res_list = new_res_list
        search_list.append(c2)
    return search_list

In [18]:
morph_best_list = read_posting_list("../lib/kws/morph.xml")
normalize_list(morph_best_list,1)
_ = gen_output(morph_best_list, "../output/ref_morph.xml")
_ = get_scores("ref_morph", True)
word_best_list = read_posting_list("../lib/kws/word.xml")
normalize_list(word_best_list,1)
_ = gen_output(word_best_list, "../output/ref_word.xml")
_ = get_scores("ref_word", True)
word_sys2_best_list = read_posting_list("../lib/kws/word-sys2.xml")
normalize_list(word_sys2_best_list,1)
_ = gen_output(word_sys2_best_list, "../output/ref_word-sys2.xml")
_ = get_scores("ref_word-sys2", True)

0.520 & 0.559 & 0.367 & 0.496 & 0.565 & 0.512 & 0.526 & 0.039\\\hline
963 & 691 & 9463 & 272\\\hline
0.460 & 0.579 & 0.000 & 0.439 & 0.501 & 0.484 & 0.444 & 0.036\\\hline
963 & 715 & 13817 & 248\\\hline
0.465 & 0.585 & 0.000 & 0.450 & 0.495 & 0.487 & 0.450 & 0.030\\\hline
963 & 698 & 12649 & 265\\\hline


In [19]:
kw_idx_word = create_index("../lib/ctms/decode.ctm", True, False)
query_list = get_queries("../lib/kws/queries.xml")
query_results_sto = search_index(query_list, kw_idx_word)
normalize_list(query_results_sto,1)

keyword_index_morph = create_index("../lib/ctms/decode-morph.ctm", True, False)
query_list_morph = get_queries("../lib/kws/queries.xml", True)
query_results_morph_sto = search_index(query_list_morph, keyword_index_morph)
normalize_list(query_results_morph_sto, 1)

word_morph_list = sys_comb(query_results_morph_sto, query_results_sto, 0.5,0.5)

_ = gen_output(word_morph_list, "../output/decode_morph_sto.xml")
_ = get_scores("decode_morph_sto")

oov_count=246
oov_count=251
all (488) - TWV:0.3616
iv (388) - TWV:0.4372
oov (100) - TWV:0.0680
short (321) - TWV:0.3602
long (167) - TWV:0.3643
word (288) - TWV:0.3721
phrase (200) - TWV:0.3464
Threshold: 0.038
Targets: 963 Correct: 450 False Alarms: 708 Miss: 513


By combining results from diverse ASR systems, we show good robustness across
a wide variety of talkers, channels, environments, and target terms.
Second, we compare score normalization approaches for STD. The
score normalization is relevant to data fusion since those scores provided
by the different systems are not comparable. Therefore, score
normalization is often performed as a preliminary step to data fusion.
http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=6639278&tag=1

In [45]:
morph_twv = 0.359
word_twv=0.398
sys2_twv=0.403

word_morph_list = sys_comb(morph_best_list, word_best_list, 0.5, 0.5)#morph_twv/(morph_twv+word_twv), word_twv/(morph_twv+word_twv)
_ = gen_output(word_morph_list, "../output/ref_word_morph.xml")
_ = get_scores("ref_word_morph")

all (488) - TWV:0.4969
iv (388) - TWV:0.5315
oov (100) - TWV:0.3627
short (321) - TWV:0.4526
long (167) - TWV:0.5820
word (288) - TWV:0.4782
phrase (200) - TWV:0.5238
Threshold: 0.031
Targets: 963 Correct: 761 False Alarms: 22148 Miss: 202


In [46]:
word_morph_sys2 = sys_comb(word_morph_list, word_sys2_best_list, 1, 0.5)
_ = gen_output(word_morph_sys2, "../output/ref_word_morph_sys2.xml")
_ = get_scores("ref_word_morph_sys2")

all (488) - TWV:0.4828
iv (388) - TWV:0.5127
oov (100) - TWV:0.3669
short (321) - TWV:0.4434
long (167) - TWV:0.5585
word (288) - TWV:0.4661
phrase (200) - TWV:0.5068
Threshold: 0.056
Targets: 963 Correct: 779 False Alarms: 30364 Miss: 184


In [35]:
#SysComb Experiments
#Combine system 1-2, system 2-3, system 1-3, system 1,2,3
#WCombAvg, WCombMax, WcombTWVWeight
#4x3x2=24 experiments
import copy


#T1
m_w =0.5
w_w=0.5
s2_w=0.5
use_max=False;
"""
#T2
use_max=True

#T3

m_w = 0.52
w_w=0.46
s2_w=0.47
use_max=False;
"""

print_latex=True

wl = read_posting_list("../lib/kws/word.xml")
normalize_list(wl,1)
_ = gen_output(wl, "../output/ref_word.xml")
_ = get_scores("ref_word", print_latex)

ml = read_posting_list("../lib/kws/morph.xml")
normalize_list(ml,1)
_ = gen_output(ml, "../output/ref_morph.xml")
_ = get_scores("ref_morph", print_latex)

s2l = read_posting_list("../lib/kws/word-sys2.xml")
normalize_list(s2l,1)
_ = gen_output(s2l, "../output/ref_sys2.xml")
_ = get_scores("ref_sys2",print_latex)

wml = sys_comb(copy.deepcopy(wl), copy.deepcopy(ml), w_w/(m_w+w_w), m_w/(m_w+w_w), use_max)
_ = gen_output(wml, "../output/ref_word_morph.xml")
_ = get_scores("ref_word_morph",print_latex)

ms2l = sys_comb(copy.deepcopy(ml), copy.deepcopy(s2l), m_w/(m_w+s2_w), s2_w/(m_w+s2_w), use_max)
_ = gen_output(ms2l, "../output/ref_morph_sys2.xml")
_ = get_scores("ref_morph_sys2",print_latex)

ws2l = sys_comb(copy.deepcopy(wl), copy.deepcopy(s2l), w_w/(w_w+s2_w), s2_w/(w_w+s2_w), use_max)
_ = gen_output(ws2l, "../output/ref_word_sys2.xml")
_ = get_scores("ref_word_sys2",print_latex)

wms2l = sys_comb(copy.deepcopy(wml), copy.deepcopy(s2l), (m_w+w_w)/(m_w+w_w+s2_w), s2_w/(m_w+w_w+s2_w), use_max)
_ = gen_output(wms2l, "../output/ref_word_morph_sys2.xml")
_ = get_scores("ref_word_morph_sys2",print_latex)

0.330 & 0.415 & 0.000 & 0.302 & 0.383 & 0.371 & 0.301 & 0.045\\\hline
963 & 368 & 101 & 595\\\hline
0.335 & 0.394 & 0.108 & 0.313 & 0.378 & 0.353 & 0.323 & 0.067\\\hline
963 & 378 & 274 & 585\\\hline
0.346 & 0.435 & 0.000 & 0.326 & 0.384 & 0.388 & 0.317 & 0.054\\\hline
963 & 378 & 114 & 585\\\hline
0.375 & 0.443 & 0.108 & 0.346 & 0.429 & 0.398 & 0.358 & 0.035\\\hline
963 & 416 & 319 & 547\\\hline
0.386 & 0.457 & 0.108 & 0.367 & 0.421 & 0.409 & 0.369 & 0.060\\\hline
963 & 427 & 334 & 536\\\hline
0.363 & 0.457 & 0.000 & 0.339 & 0.410 & 0.407 & 0.332 & 0.036\\\hline
963 & 402 & 156 & 561\\\hline
0.399 & 0.474 & 0.108 & 0.374 & 0.447 & 0.427 & 0.380 & 0.039\\\hline
963 & 438 & 376 & 525\\\hline


In [37]:
#SysComb Experiments
#Combine system 1-2, system 2-3, system 1-3, system 1,2,3
#WCombAvg, WCombMax, WcombTWVWeight
#4x3x2=24 experiments
import copy


#T1
m_w =0.5
w_w=0.5
s2_w=0.5
use_max=False;
"""
#T2
use_max=True

#T3

m_w = 0.3790
w_w=0.3763
s2_w=0.3739
use_max=False;
"""

print_latex=False
"""
keyword_index = create_index("../lib/ctms/decode.ctm", True, False)
query_list = get_queries("../lib/kws/queries.xml")
query_results_oov_c = search_index_oov_c(query_list, keyword_index, g_dict, 30, 7)
kst_list(query_results_oov_c)
normalize_list(query_results_oov_c,1)
_ = gen_output(query_results_oov_c, "../output/decode_oov.xml")
_ = get_scores("decode_oov")

keyword_index_morph = create_index("../lib/ctms/decode-morph.ctm", True, False)
query_list_morph = get_queries("../lib/kws/queries.xml",True)
query_results_oov_morph_c = search_index_oov_c(query_list_morph, keyword_index_morph, g_dict, 30, 5)
kst_list(query_results_oov_morph_c)
normalize_list(query_results_oov_morph_c,1)
_ = gen_output(query_results_oov_morph_c, "../output/decode_morph_oov.xml")
_ = get_scores("decode_morph_oov")

kw_idx_morph_manual = create_index("../lib/ctms/decode.ctm", True, True)
query_list_morph = get_queries("../lib/kws/queries.xml", True)
query_results_morph_manual_oov = search_index_oov_c(query_list_morph, kw_idx_morph_manual, g_dict, 30, 5)
kst_list(query_results_morph_manual_oov)
normalize_list(query_results_morph_manual_oov,1)
_ = gen_output(query_results_morph_manual_oov, "../output/decode_morph_manual_oov.xml")
_ = get_scores("decode_morph_manual_oov")

wl = query_results_oov_c
ml = query_results_oov_morph_c
s2l = query_results_morph_manual_oov
"""
wml = sys_comb(copy.deepcopy(wl), copy.deepcopy(ml), w_w/(m_w+w_w), m_w/(m_w+w_w), use_max)
_ = gen_output(wml, "../output/ref_word_morph.xml")
_ = get_scores("ref_word_morph",print_latex)

ms2l = sys_comb(copy.deepcopy(ml), copy.deepcopy(s2l), m_w/(m_w+s2_w), s2_w/(m_w+s2_w), use_max)
_ = gen_output(ms2l, "../output/ref_morph_sys2.xml")
_ = get_scores("ref_morph_sys2",print_latex)

ws2l = sys_comb(copy.deepcopy(wl), copy.deepcopy(s2l), w_w/(w_w+s2_w), s2_w/(w_w+s2_w), use_max)
_ = gen_output(ws2l, "../output/ref_word_sys2.xml")
_ = get_scores("ref_word_sys2",print_latex)

wms2l = sys_comb(copy.deepcopy(wml), copy.deepcopy(s2l), (m_w+w_w)/(m_w+w_w+s2_w), s2_w/(m_w+w_w+s2_w), use_max)
_ = gen_output(wms2l, "../output/ref_word_morph_sys2.xml")
_ = get_scores("ref_word_morph_sys2",print_latex)

all (488) - TWV:0.4105
iv (388) - TWV:0.4684
oov (100) - TWV:0.1859
short (321) - TWV:0.3757
long (167) - TWV:0.4773
word (288) - TWV:0.4027
phrase (200) - TWV:0.4216
Threshold: 0.054
Targets: 963 Correct: 492 False Alarms: 1761 Miss: 471
all (488) - TWV:0.4049
iv (388) - TWV:0.4599
oov (100) - TWV:0.1912
short (321) - TWV:0.3567
long (167) - TWV:0.4974
word (288) - TWV:0.3853
phrase (200) - TWV:0.4330
Threshold: 0.055
Targets: 963 Correct: 487 False Alarms: 1713 Miss: 476
all (488) - TWV:0.3881
iv (388) - TWV:0.4486
oov (100) - TWV:0.1534
short (321) - TWV:0.3423
long (167) - TWV:0.4760
word (288) - TWV:0.3748
phrase (200) - TWV:0.4072
Threshold: 0.066
Targets: 963 Correct: 460 False Alarms: 1185 Miss: 503
all (488) - TWV:0.4126
iv (388) - TWV:0.4688
oov (100) - TWV:0.1944
short (321) - TWV:0.3690
long (167) - TWV:0.4964
word (288) - TWV:0.4013
phrase (200) - TWV:0.4288
Threshold: 0.04
Targets: 963 Correct: 498 False Alarms: 2002 Miss: 465


average

0.410 & 0.468 & 0.186 & 0.376 & 0.477 & 0.422 & 0.403 & 0.054\\\hline
963 & 492 & 1761 & 471\\\hline
0.405 & 0.460 & 0.191 & 0.357 & 0.497 & 0.433 & 0.385 & 0.055\\\hline
963 & 487 & 1713 & 476\\\hline
0.388 & 0.449 & 0.153 & 0.342 & 0.476 & 0.407 & 0.375 & 0.066\\\hline
963 & 460 & 1185 & 503\\\hline
0.413 & 0.469 & 0.194 & 0.369 & 0.496 & 0.429 & 0.401 & 0.040\\\hline
963 & 498 & 2002 & 465\\\hline

max

0.406 & 0.463 & 0.186 & 0.369 & 0.477 & 0.421 & 0.396 & 0.105\\\hline
963 & 492 & 1761 & 471\\\hline
0.404 & 0.457 & 0.198 & 0.355 & 0.498 & 0.434 & 0.384 & 0.063\\\hline
963 & 487 & 1713 & 476\\\hline
0.383 & 0.445 & 0.143 & 0.336 & 0.475 & 0.404 & 0.369 & 0.079\\\hline
963 & 460 & 1185 & 503\\\hline
0.409 & 0.462 & 0.204 & 0.365 & 0.495 & 0.428 & 0.397 & 0.105\\\hline
963 & 498 & 2002 & 465\\\hline