# Error Visualization

- Quantifying Errors
	- From Determined Test Cases,  
For each word, measure the number of times it is *correctly* and *incorrectly* interpreted
- Classifying Errors
	- Misinterpretation of the words  
For each word, log the transcription, i.e. what the word was misinterpreted as  
→ calculate the occurrence  
→ most common error for a given word


In [1]:
import os
import numpy
import glob
import jiwer
import copy
import re, string
from normalise import normalise, tokenize_basic


def idx_to_file(idx):
    """Turn a dash-separated id into a slash-separated path without its last field.

    Everything after the final "-" is dropped and the remaining dashes become
    "/". An id that contains no "-" yields an empty string.
    """
    prefix, _, _ = idx.rpartition("-")
    return prefix.replace("-", "/")


def read_transcription(fpath):
    """Return the first line of the file at ``fpath``.

    The line is returned exactly as ``readline`` yields it, so a trailing
    newline (if present) is kept. An empty file yields an empty string.
    """
    # The context manager closes the handle even when reading raises.
    with open(fpath) as file:
        return file.readline()


def remove_hex(text):
    r"""Replace every literal ``\x`` escape in ``text`` with a single space.

    A backslash followed by ``x`` is consumed together with the (up to) two
    characters after it, whatever they are, and one space is emitted in
    their place. All other characters are copied through unchanged.

    Example input (literal backslashes):
    ``\xe3\x80\x90Hello \xe3\x80\x91 World!``
    """
    # DOTALL so that the two consumed characters may also be newlines,
    # mirroring a plain position-based skip.
    return re.sub(r"\\x.{0,2}", " ", text, flags=re.DOTALL)


def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # Mapping a code point to None in a translation table deletes it.
    deletion_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletion_table)


def remove_multiple_whitespace(text):
    """
    Collapse whitespace runs into single spaces.

    Newlines and tabs are first turned into spaces, then every run of
    spaces is squeezed to one, and finally leading/trailing whitespace is
    stripped.
    """
    spaced = text.translate({ord('\n'): ' ', ord('\t'): ' '})
    squeezed = re.sub(' {2,}', ' ', spaced)
    return squeezed.strip()


def normalize_text(text):
    """Run ``normalise`` over ``text`` and join its output with single spaces.

    ``tokenize_basic`` is used as the tokenizer and verbose output is off.
    """
    normalised_tokens = normalise(text, tokenizer=tokenize_basic, verbose=False)
    return " ".join(normalised_tokens)


def substitute_word(text):
    """
    Replace spelled-out titles with their abbreviations for consistency.

    The text is split on single spaces; "mister" becomes "mr" and "missus"
    becomes "mrs". Every other token (including empty ones produced by
    consecutive spaces) is kept as-is.
    """
    replacements = {"mister": "mr", "missus": "mrs"}
    return " ".join(replacements.get(token, token) for token in text.split(" "))

def preprocess_text(text):
    """Normalise ``text`` so references and transcriptions can be compared.

    Steps, in order: lower-case, replace literal hex escapes, strip
    punctuation, run ``normalize_text``, strip punctuation again, apply
    ``substitute_word``, then the jiwer transforms (collapse multiple
    spaces, expand common English contractions, replace whitespace by
    spaces, strip).

    If ``normalize_text`` raises, the text is replaced by an empty string
    (best effort) and the remaining steps still run.
    """
    text = text.lower()
    text = remove_hex(text)
    text = remove_punctuation(text)
    try:
        text = normalize_text(text)
    except Exception:
        # Deliberate best effort: a failing normaliser yields empty text.
        # Only Exception is caught so that KeyboardInterrupt / SystemExit
        # still propagate instead of silently blanking the text.
        text = ""
    # Second pass: runs on the normaliser's output as well.
    text = remove_punctuation(text)
    text = substitute_word(text)
    text = jiwer.RemoveMultipleSpaces()(text)
    text = jiwer.ExpandCommonEnglishContractions()(text)
    text = jiwer.RemoveWhiteSpace(replace_by_space=True)(
        text)  # must remove trailing space after it
    text = jiwer.Strip()(text)
    return text

# TODO: remove warning, put text preprocessing as helper



In [2]:
class Data:
    """Named collection of preprocessed reference/transcription text pairs.

    ``reference`` and ``transcription`` are parallel lists: entry ``i`` of
    one corresponds to entry ``i`` of the other.
    """

    def __init__(self, name):
        self.name = name
        self.reference = []
        self.transcription = []

    def preprocess_text(self, text):
        """Lower-case ``text`` and run it through the jiwer clean-up transforms."""
        transforms = (
            jiwer.RemoveMultipleSpaces(),
            jiwer.ExpandCommonEnglishContractions(),
            # Whitespace is replaced by spaces here, so a final Strip is
            # needed to drop any trailing space it leaves behind.
            jiwer.RemoveWhiteSpace(replace_by_space=True),
            jiwer.Strip(),
        )
        cleaned = text.lower()
        for transform in transforms:
            cleaned = transform(cleaned)
        return cleaned

    def get_reference(self):
        """Return the list of stored reference texts."""
        return self.reference

    def get_transcription(self):
        """Return the list of stored transcription texts."""
        return self.transcription

    def add_reference(self, reference):
        """Preprocess ``reference`` and append it to the reference list."""
        cleaned = self.preprocess_text(reference)
        self.reference.append(cleaned)

    def add_transcription(self, transcription):
        """Preprocess ``transcription`` and append it to the transcription list."""
        cleaned = self.preprocess_text(transcription)
        self.transcription.append(cleaned)

    def add_reference_transcription(self, reference, transcription):
        """Store one reference together with its transcription."""
        self.add_reference(reference)
        self.add_transcription(transcription)

    def length(self):
        """Return the number of stored pairs; both lists must be equally long."""
        pair_count = len(self.reference)
        assert pair_count == len(self.transcription)
        return pair_count

    def print_reference_transcription(self, i):
        """Print pair ``i``; an index outside the reference list prints nothing."""
        if not 0 <= i < len(self.reference):
            return
        print("Reference:   \t: ", self.reference[i])
        print("Transcription: \t: ", self.transcription[i])

    def print_head(self):
        """Print the first stored pair."""
        self.print_reference_transcription(i=0)

    def print_tail(self):
        """Print the last stored pair."""
        last_index = self.length() - 1
        self.print_reference_transcription(i=last_index)

In [3]:
def read_librispeech_data():
    """Load LibriSpeech test-clean references with their DeepSpeech transcriptions.

    Every ``*.trans.txt`` file below ``LibriSpeech/test-clean/`` is read.
    Each non-blank line is split on whitespace: the first field is the
    utterance id, the rest is the reference text. The matching
    transcription is read from
    ``<root>/<id path>/<id>.deepspeech.transcription.txt``.

    Returns:
        A ``Data`` instance named "librispeech" holding all pairs.

    Raises:
        ValueError: if a transcription file for an utterance is missing.
    """
    data = Data("librispeech")
    root_dir = "LibriSpeech/test-clean/"
    model_dir = "deepspeech"

    for filename in glob.iglob(root_dir + '**/*.trans.txt', recursive=True):
        # The context manager closes the file even when ValueError is raised.
        with open(filename) as file:
            for line in file:
                fields = line.split()
                if not fields:
                    # A blank line carries no utterance id; skip it rather
                    # than crash on fields[0].
                    continue
                idx = fields[0]
                reference_text = " ".join(fields[1:])

                fname = os.path.join(root_dir, idx_to_file(idx), idx)
                transcription_path = fname + "." + model_dir + ".transcription.txt"
                if not os.path.exists(transcription_path):
                    raise ValueError("missing transcription: " + transcription_path)

                transcription = read_transcription(transcription_path)
                data.add_reference_transcription(reference_text, transcription)

    return data


In [4]:
# Load the LibriSpeech pairs and print the first and the last pair,
# separated by an empty line.
data = read_librispeech_data()
data.print_head()
print()
data.print_tail()

Reference:   	:  he knew the silver fleece his and zora is must be ruined
Transcription: 	:  he knew the silver fleece his enormous be ruined

Reference:   	:  the pain produced by an act of hasty and angry violence to which a father subjects his son may soon pass away but the memory of it does not pass away with the pain
Transcription: 	:  the pain produced by an act of hasty and angry violence to which a father subjects his son may soon pass away but the memory of it does not pass away with the pain


In [5]:
def read_corpus(corpus_fpath: str):
    """Read a corpus file and return its lines without the line terminator.

    Args:
        corpus_fpath: path of a text file holding one text per line.

    Returns:
        A list with one string per line, in file order. Only the trailing
        newline is removed, so a final line that lacks one keeps all of its
        characters.
    """
    # The context manager guarantees the file is closed after reading.
    with open(corpus_fpath) as file:
        return [line.rstrip("\n") for line in file]

def read_crossasr_data():
    """Load the CrossASR europarl references with their transcriptions.

    References come from the europarl corpus file, one per line. The
    transcription of the i-th reference (0-based) is read from
    ``<transcription dir>/rv/deepspeech/<i+1>.txt``.

    Returns:
        A ``Data`` instance named "crossasr" holding all pairs.
    """
    data = Data("crossasr")

    tts_name = "rv"
    asr_name = "deepspeech"
    transcription_dir = os.path.join(
        "CrossASR/europarl-seed2021/data/transcription", tts_name, asr_name)

    references = read_corpus("CrossASR/europarl-seed2021/corpus/europarl-20000.txt")

    # Transcription files are numbered from 1, hence start=1.
    for file_number, reference in enumerate(references, start=1):
        transcription_path = os.path.join(transcription_dir, f"{file_number}.txt")
        data.add_reference_transcription(
            reference, read_transcription(transcription_path))

    return data



In [6]:
# Load the CrossASR pairs and print the first and the last pair,
# separated by an empty line.
data = read_crossasr_data()
data.print_head()
print()
data.print_tail()

Reference:   	:  in the european year for intercultural dialogue we should also recognize the importance of cultural industries in creating awareness and understanding of other cultures and therefore their importance for social cohesion
Transcription: 	:  in the european year for inter cultural dialogue we should also recognize the importance of cultural industries in creating awareness and understanding of other cultures and therefore their importance for social cohesion

Reference:   	:  it must be organised by a single body responsible for ensuring that it is applied comprehensively consistently and effectively
Transcription: 	:  it must be organized by a single body responsible for ensuring that it is applied comprehensively consistently and effectively


In [7]:
from asr_evaluation.asr_evaluation import asr_evaluation

class Analyzer:
    """Collect and rank the word substitutions an ASR system makes."""

    def analyze(self, data: "Data"):
        """Gather the confusions of every pair whose word error rate is non-zero.

        Returns:
            A list of dicts with the keys "confusion" (the result of
            ``ASREvaluation.get_confusions()``), "reference" and
            "transcription". Pairs with a WER of 0 are left out.
        """
        infos = []
        for reference, transcription in zip(data.get_reference(), data.get_transcription()):
            if jiwer.wer(reference, transcription) == 0:
                continue
            evaluation = asr_evaluation.ASREvaluation()
            evaluation.detect_word_error(reference, transcription)
            infos.append({
                "confusion": evaluation.get_confusions(),
                "reference": reference,
                "transcription": transcription,
            })
        return infos

    def get_most_common_errors(self, data: "Data"):
        """Count, per reference word, how often each substitution occurred.

        Returns:
            A dict mapping a reference word to a dict
            ``{substituted word: total count}``. Each inner dict is ordered
            by descending count, and the words themselves are ordered by
            the count of their most frequent substitution, descending.
        """
        common_errors = {}
        for info in self.analyze(data):
            for entry in info["confusion"]["substitution"]:
                substitutions = common_errors.setdefault(entry["word_reference"], {})
                word_substitution = entry["word_substitution"]
                substitutions[word_substitution] = (
                    substitutions.get(word_substitution, 0) + entry["count"])

        return self._sort_common_errors(common_errors)

    def _sort_common_errors(self, common_errors):
        """Return a copy of ``common_errors`` ordered by descending counts."""
        # Build new inner dicts: rebinding a loop variable would leave the
        # originals unsorted and make the outer ranking below meaningless.
        ranked = {
            word: dict(sorted(substitutions.items(),
                              key=lambda item: item[1], reverse=True))
            for word, substitutions in common_errors.items()
        }
        # Each inner dict is now sorted, so its first value is its highest
        # count; inner dicts are never empty by construction.
        return dict(sorted(ranked.items(),
                           key=lambda item: next(iter(item[1].values())),
                           reverse=True))

    def print_common_error(self, common_errors, limit=2):
        """Print each word followed by at most ``limit`` of its substitutions.

        Substitutions are printed in the iteration order of the inner dict.
        """
        for word, common in common_errors.items():
            print("Word: ", word)
            print("Substituion: ")
            # max(..., 0) keeps a negative limit printing nothing.
            for substitution, count in list(common.items())[:max(limit, 0)]:
                print(f"\t{substitution:10s} count: {count}")


In [8]:
# Create the analyzer, then print the most common substitution errors
# found in the LibriSpeech pairs.
analyzer = Analyzer()

data = read_librispeech_data()
common_errors = analyzer.get_most_common_errors(data)
analyzer.print_common_error(common_errors)

Word:  in
Substituion: 
	and        count: 44
	living     count: 1
Word:  a
Substituion: 
	the        count: 34
	and        count: 3
Word:  this
Substituion: 
	the        count: 21
	guess      count: 1
Word:  an
Substituion: 
	and        count: 13
	on         count: 1
Word:  too
Substituion: 
	to         count: 10
	two        count: 3
Word:  two
Substituion: 
	to         count: 8
	lotto      count: 1
Word:  the
Substituion: 
	a          count: 7
	me         count: 2
Word:  men
Substituion: 
	man        count: 6
	then       count: 1
Word:  boolooroo
Substituion: 
	bolero     count: 6
	booleroo   count: 4
Word:  anyone
Substituion: 
	one        count: 6
Word:  uncas
Substituion: 
	once       count: 6
	on         count: 1
Word:  color
Substituion: 
	colour     count: 5
Word:  has
Substituion: 
	had        count: 5
	is         count: 2
Word:  it
Substituion: 
	i          count: 5
	was        count: 1
Word:  o
Substituion: 
	of         count: 5
	oh         count: 4
Word:  his
Substituion: 


In [9]:
# Print the most common substitution errors found in the CrossASR pairs;
# `analyzer` must already exist (it is created in an earlier cell).
data = read_crossasr_data()
common_errors = analyzer.get_most_common_errors(data)
analyzer.print_common_error(common_errors)
