# Exercise 4: Speech Recognition System

## Importing and setting up system

### Setting up system

In [1]:
# Automatically reload imported modules before each cell is executed
%load_ext autoreload
%autoreload 2

### Importing required libraries 

In [2]:
# To access files 
# Offers classes representing filesystem paths for different Operating System
from pathlib import Path
# Operating-system interfaces (used below to replace files on disk)
import os

# Mathematical and data analytics libraries
import pandas as pd
import re
import numpy as np

# Provides runtime support for type hints
from typing import Dict, List, Tuple
# Display progress bars
from tqdm import tqdm
# Provides access to unicode character database
import unicodedata
# Provide infrastructure for defining custom abstract base classes
from abc import ABC, abstractclassmethod

# Dealing with Audio data
# Read and write audio files
import soundfile as soundfile
# To play sound given file name or resource
from playsound import playsound
# To read audio files
from scipy.io import wavfile
# Music and Audio analysis
import librosa as librosa
#  Noise reduction algorithm to reduce noise
import noisereduce as nr
# Audio management libraries
from pydub import AudioSegment
from pydub.playback import play


# ASR algorithms 
# For deep neural networks
import torch
# Deep speech model 
import deepspeech as deepspeech
# Speech brain pretrained model
from speechbrain.pretrained import EncoderDecoderASR
# Automatic Speech Recognition - self-supervised training
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

# Approximate the Word Error Rate (WER)
from jiwer import wer

  from .autonotebook import tqdm as notebook_tqdm
The torchaudio backend is switched to 'soundfile'. Note that 'sox_io' is not supported on Windows.
torchvision is not available - cannot save figures
The torchaudio backend is switched to 'soundfile'. Note that 'sox_io' is not supported on Windows.


## File Path

In [3]:
from pathlib import Path

# Folder passed to DeepSpeechModel; the .pbmm model and .scorer files are looked up in it.
modelPath = Path("./Models")
# Root folder of the data; the filenames from the transcription table are joined onto it.
dataPath = Path("./required_files")
# CSV that is read into df_text (its locale, filename and y_true columns are used below).
transcriptionsPath = dataPath / "Ex4_audio_files_transcriptions.csv"

# Text normalisation


In [4]:
def Text_Normalize(word):
    """Normalise a transcription so reference and hypothesis can be compared.

    Steps: strip diacritics (NFD-decompose, then drop combining marks), delete
    every character that is not an ASCII letter, digit or whitespace, collapse
    each run of whitespace into one space, lower-case and trim.

    Args:
        word: Text to normalise (a whole sentence is fine).

    Returns:
        The normalised, lower-cased string.
    """
    # Decompose accented characters and drop the combining marks ("è" -> "e").
    word = ''.join(i for i in unicodedata.normalize("NFD", word) if unicodedata.category(i) != "Mn")
    # Raw strings avoid the invalid-escape warnings of "\ " and "\s".
    # All whitespace (not only the space) survives this step, so tabs and
    # newlines still separate words instead of being deleted and gluing the
    # neighbouring words together.
    word = re.sub(r"[^A-Za-z0-9\s]+", "", word)
    word = re.sub(r"\s+", " ", word)
    return word.lower().strip()

## Creating dataframe containing the normalized text

In [5]:
# Load the transcription table and normalise the reference text in y_true with
# Text_Normalize, so it is in the same form as the text it is scored against.
df_text = pd.read_csv(transcriptionsPath , header = 0)
df_text['y_true'] = df_text.y_true.map(Text_Normalize)
df_text

Unnamed: 0,locale,filename,y_true
0,en,Ex4_audio_files/EN/checkin.wav,where is the checkin desk
1,en,Ex4_audio_files/EN/parents.wav,i have lost my parents
2,en,Ex4_audio_files/EN/suitcase.wav,please i have lost my suitcase
3,en,Ex4_audio_files/EN/what_time.wav,what time is my plane
4,en,Ex4_audio_files/EN/where.wav,where are the restaurants and shops
5,en,Ex4_audio_files/EN/your_sentence1.wav,where can i find the departure hall
6,en,Ex4_audio_files/EN/your_sentence2.wav,my luckage is missing from the baggage area
7,it,Ex4_audio_files/IT/checkin_it.wav,dove e il bancone
8,it,Ex4_audio_files/IT/parents_it.wav,ho perso i miei genitori
9,it,Ex4_audio_files/IT/suitcase_it.wav,per favore ho perso la mia valigia


## Remove background noise

## English audio

In [6]:
def remove_noise_en(aud_file, folder=None):
    """Denoise one English recording and overwrite the file with the result.

    Args:
        aud_file: File name of the recording inside ``folder``
            (e.g. ``"checkin.wav"``).
        folder: Directory that holds the recording. Defaults to
            ``dataPath / "Ex4_audio_files" / "EN"`` so the notebook no longer
            depends on a machine-specific absolute path or on the current
            working directory being a particular folder.

    Note:
        The recording is replaced in place, so calling this again on the same
        file denoises audio that has already been denoised.
    """
    if folder is None:
        folder = dataPath / "Ex4_audio_files" / "EN"
    target = Path(folder) / aud_file

    # Load the audio samples together with their sample rate.
    sample_rate, aud_data = wavfile.read(str(target))

    # Reduce noise using the reduce_noise method of the noisereduce library.
    noise_reduced = nr.reduce_noise(y=aud_data, sr=sample_rate)

    # Write to a sibling temporary file first and then swap it in, so a failed
    # write never leaves a truncated recording behind.
    tmp_path = target.with_name(target.name + ".tmp")
    wavfile.write(str(tmp_path), sample_rate, noise_reduced)
    os.replace(tmp_path, target)

In [7]:
# Denoise every English recording, in the same order as before.
for english_clip in (
    "checkin.wav",
    "parents.wav",
    "suitcase.wav",
    "what_time.wav",
    "where.wav",
    "your_sentence1.wav",
    "your_sentence2.wav",
):
    remove_noise_en(english_clip)

# Abstract Base Classes


In [8]:
from abc import ABC, abstractmethod

class AbstractASR(ABC):
    """Common interface of the speech-recognition engines used in this notebook.

    Subclasses must implement ``transcribe``; because the class derives from
    ``ABC``, instantiating a subclass that does not (or this class itself)
    raises ``TypeError``.
    """

    @abstractmethod
    def transcribe(self, filepath: Path) -> str:
        """Return the transcription of the audio file at ``filepath``.

        Raises:
            NotImplementedError: If a subclass delegates here via ``super()``.
        """
        raise NotImplementedError("Transcribe() method has not been implemented")

## DeepSpeech Model


In [9]:
import deepspeech as deepspeech
import librosa as librosa

class DeepSpeechModel(AbstractASR):
    """AbstractASR implementation backed by a DeepSpeech model file."""

    # Loaded DeepSpeech model used for inference.
    model: deepspeech.Model

    def __init__(self, model: str, folder: Path, scorer: str = None):
        """Load ``<folder>/<model>.pbmm`` and, when available, an external scorer.

        Args:
            model: Base name (without extension) of the ``.pbmm`` model file.
            folder: Directory that contains the model and scorer files.
            scorer: Base name (without extension) of the ``.scorer`` file.
                Defaults to ``model``.
        """
        super().__init__()
        modelFilePath = folder / f"{model}.pbmm"
        self.model = deepspeech.Model(str(modelFilePath))

        scorerFilePath = folder / f"{scorer or model}.scorer"
        if scorerFilePath.is_file():
            self.model.enableExternalScorer(str(scorerFilePath))
        elif scorer:
            # A scorer that was asked for by name but is missing should not be
            # skipped silently; the implicit default stays best-effort.
            import warnings
            warnings.warn(
                f"Scorer file not found: {scorerFilePath}; "
                "transcribing without an external scorer."
            )

    def transcribe(self, filepath: Path) -> str:
        """Transcribe the audio file at ``filepath`` and return the text."""
        # Load the audio at the sample rate reported by the model.
        audiofile = librosa.load(filepath, sr=self.model.sampleRate())[0]
        # Clip before scaling so samples outside [-1, 1] saturate instead of
        # wrapping around when they are cast to 16-bit integers.
        audiofile = (np.clip(audiofile, -1.0, 1.0) * 32767).astype(np.int16)
        return self.model.stt(audiofile)


# One DeepSpeech engine per language, all loaded from modelPath.
# No scorer name is given for English and Italian, so DeepSpeechModel looks for
# a .scorer file that shares the model's base name.
english_deepSpeech = DeepSpeechModel(model="deepspeech-0.9.3-models", folder=modelPath )
italian_deepSpeech = DeepSpeechModel(model="output_graph_it", folder=modelPath)
# The Spanish scorer file has a different base name than the model file.
spanish_deepSpeech = DeepSpeechModel(model="output_graph_es", scorer="kenlm_es", folder=modelPath)


# Transcription


In [10]:
def getResults(df: pd.DataFrame, folder: Path, asr_engine: "AbstractASR") -> pd.DataFrame:
    """Transcribe every file listed in ``df`` and return the results.

    Args:
        df: Frame with a ``filename`` column holding paths relative to ``folder``.
        folder: Directory the ``filename`` entries are resolved against.
        asr_engine: Object exposing ``transcribe(filepath=...)``.

    Returns:
        A copy of ``df`` (same index) with the transcriptions in a new
        ``y_pred`` column; the input frame is left untouched.
    """
    df = df.copy()
    # One column assignment instead of Series.iteritems() (removed in pandas 2.0)
    # plus cell-by-cell df.at writes into a column that does not exist yet.
    df["y_pred"] = [
        asr_engine.transcribe(filepath=folder / filename)
        for filename in df.filename
    ]
    return df

## English Word Error Rate Function

In [11]:
from jiwer import wer

def englishWER():
    """Build the per-file word error rate table for the English recordings.

    The rows are selected with ``df_text.locale == "en"`` instead of a
    hard-coded row range, and the reference text and the transcribed file are
    taken from the same row, so they cannot get out of step.

    Returns:
        DataFrame with the columns ``File`` (file name without folders),
        ``y_true``, ``y_pred`` and ``WER%`` (percentage rounded to two
        decimals, stored as a string).
    """
    english_rows = df_text[df_text.locale == "en"]
    records = []
    for filename, reference in zip(english_rows.filename, english_rows.y_true):
        truth = Text_Normalize(reference)
        pred = english_deepSpeech.transcribe(filepath=dataPath / filename)
        error = str(round(wer(truth, pred) * 100, 2))
        # Path(...).name drops the folder part without relying on its length.
        records.append((Path(filename).name, truth, pred, error))

    return pd.DataFrame(records, columns=['File', 'y_true', 'y_pred', 'WER%'])

## Italian Word Error Rate Function 

In [12]:
def italianWER():
    """Build the per-file word error rate table for the Italian recordings.

    The rows are selected with ``df_text.locale == "it"`` instead of two
    separate hard-coded ranges, and the reference text and the transcribed
    file are taken from the same row, so they cannot get out of step.

    Returns:
        DataFrame with the columns ``File`` (file name without folders),
        ``y_true``, ``y_pred`` and ``WER%`` (percentage rounded to two
        decimals, stored as a string).
    """
    italian_rows = df_text[df_text.locale == "it"]
    records = []
    for filename, reference in zip(italian_rows.filename, italian_rows.y_true):
        truth = Text_Normalize(reference)
        pred = italian_deepSpeech.transcribe(filepath=dataPath / filename)
        error = str(round(wer(truth, pred) * 100, 2))
        # Path(...).name drops the folder part without relying on its length.
        records.append((Path(filename).name, truth, pred, error))

    return pd.DataFrame(records, columns=['File', 'y_true', 'y_pred', 'WER%'])

## Spanish Word Error Rate Function 

In [13]:
def spanishWER():
    """Build the per-file word error rate table for the Spanish recordings.

    The rows are selected with ``df_text.locale == "es"`` instead of two
    separate hard-coded ranges, and the reference text and the transcribed
    file are taken from the same row, so they cannot get out of step.

    Returns:
        DataFrame with the columns ``File`` (file name without folders),
        ``y_true``, ``y_pred`` and ``WER%`` (percentage rounded to two
        decimals, stored as a string).
    """
    spanish_rows = df_text[df_text.locale == "es"]
    records = []
    for filename, reference in zip(spanish_rows.filename, spanish_rows.y_true):
        truth = Text_Normalize(reference)
        pred = spanish_deepSpeech.transcribe(filepath=dataPath / filename)
        error = str(round(wer(truth, pred) * 100, 2))
        # Path(...).name drops the folder part without relying on its length.
        records.append((Path(filename).name, truth, pred, error))

    return pd.DataFrame(records, columns=['File', 'y_true', 'y_pred', 'WER%'])

## English ASR results

In [15]:
# Transcribe the rows whose locale is "en" with the English DeepSpeech engine.
englishResults = getResults(df_text[df_text.locale == "en"], dataPath, english_deepSpeech)
englishResults

Unnamed: 0,locale,filename,y_true,y_pred
0,en,Ex4_audio_files/EN/checkin.wav,where is the checkin desk,but
1,en,Ex4_audio_files/EN/parents.wav,i have lost my parents,
2,en,Ex4_audio_files/EN/suitcase.wav,please i have lost my suitcase,
3,en,Ex4_audio_files/EN/what_time.wav,what time is my plane,
4,en,Ex4_audio_files/EN/where.wav,where are the restaurants and shops,
5,en,Ex4_audio_files/EN/your_sentence1.wav,where can i find the departure hall,i
6,en,Ex4_audio_files/EN/your_sentence2.wav,my luckage is missing from the baggage area,


## English WER

In [16]:
# Per-file WER table (in percent) for the English recordings.
englishWER()

Unnamed: 0,File,y_true,y_pred,WER%
0,checkin.wav,where is the checkin desk,but,100.0
1,parents.wav,i have lost my parents,,100.0
2,suitcase.wav,please i have lost my suitcase,,100.0
3,what_time.wav,what time is my plane,,100.0
4,where.wav,where are the restaurants and shops,,100.0
5,your_sentence1.wav,where can i find the departure hall,i,85.71
6,your_sentence2.wav,my luckage is missing from the baggage area,,100.0


## Italian ASR results

In [17]:
# Transcribe the rows whose locale is "it" with the Italian DeepSpeech engine.
italianResults = getResults(df_text[df_text.locale == "it"], dataPath, italian_deepSpeech)
italianResults

Unnamed: 0,locale,filename,y_true,y_pred
7,it,Ex4_audio_files/IT/checkin_it.wav,dove e il bancone,i i iiiiiieeeeeeeeeeeeeeeeeeeeaeeeeeaeeadove ...
8,it,Ex4_audio_files/IT/parents_it.wav,ho perso i miei genitori,i i iiiiiieeeeeeeeeeeeeeeeeeeeaeeeeeaeeaoerso...
9,it,Ex4_audio_files/IT/suitcase_it.wav,per favore ho perso la mia valigia,i i iiiiiieeeeeeeeeeeeeeeeeeeeaeeeeeaeeeeaoes...
10,it,Ex4_audio_files/IT/what_time_it.wav,a che ora e il mio aereo,i i iiiiiieeeeeeeeeeeeeeeeeeeeaeeeeeaeeaa rag...
11,it,Ex4_audio_files/IT/where_it.wav,dove sono i ristoranti e i negozi,i i iiiiiieeeeeeeeeeeeeeeeeeeeaeeeeeaeeaadove...


## Italian WER

In [18]:
# Per-file WER table (in percent) for the Italian recordings.
italianWER()

Unnamed: 0,File,y_true,y_pred,WER%
0,checkin_it.wav,dove e il bancone,i i iiiiiieeeeeeeeeeeeeeeeeeeeaeeeeeaeeadove ...,125.0
1,parents_it.wav,ho perso i miei genitori,i i iiiiiieeeeeeeeeeeeeeeeeeeeaeeeeeaeeaoerso...,80.0
2,suitcase_it.wav,per favore ho perso la mia valigia,i i iiiiiieeeeeeeeeeeeeeeeeeeeaeeeeeaeeeeaoes...,71.43
3,what_time_it.wav,a che ora e il mio aereo,i i iiiiiieeeeeeeeeeeeeeeeeeeeaeeeeeaeeaa rag...,100.0
4,where_it.wav,dove sono i ristoranti e i negozi,i i iiiiiieeeeeeeeeeeeeeeeeeeeaeeeeeaeeaadove...,85.71


## Spanish ASR results

In [19]:
# Transcribe the rows whose locale is "es" with the Spanish DeepSpeech engine.
spanishResults = getResults(df_text[df_text.locale == "es"], dataPath, spanish_deepSpeech)
spanishResults

Unnamed: 0,locale,filename,y_true,y_pred
12,es,Ex4_audio_files/ES/checkin_es.wav,donde estan los mostradores,
13,es,Ex4_audio_files/ES/parents_es.wav,he perdido a mis padres,
14,es,Ex4_audio_files/ES/suitcase_es.wav,por favor he perdido mi maleta,
15,es,Ex4_audio_files/ES/what_time_es.wav,a que hora es mi avion,
16,es,Ex4_audio_files/ES/where_es.wav,donde estan los restaurantes y las tiendas,


## Spanish WER

In [20]:
# Per-file WER table (in percent) for the Spanish recordings.
spanishWER()

Unnamed: 0,File,y_true,y_pred,WER%
0,checkin_es.wav,donde estan los mostradores,,100.0
1,parents_es.wav,he perdido a mis padres,,100.0
2,suitcase_es.wav,por favor he perdido mi maleta,,100.0
3,what_time_es.wav,a que hora es mi avion,,100.0
4,where_es.wav,donde estan los restaurantes y las tiendas,,100.0


# Average Word Error Rate

In [21]:
import jiwer

class AverageWER:
    """Average of the per-row word error rates of a results frame."""

    # Frame with a ``y_true`` column and at least one ``y_pred`` / ``y_pred_<name>`` column.
    df: pd.DataFrame

    def __init__(self, df: pd.DataFrame):
        """Keep a reference to the results frame to score."""
        self.df = df

    def getWER(self, asr_name: str = None) -> float:
        """Return the mean per-row WER as a percentage.

        Args:
            asr_name: When given, predictions are read from the column
                ``y_pred_<asr_name>``; otherwise from ``y_pred``.

        Returns:
            ``100 * mean(wer(y_true, y_pred))`` over all rows. Every row has
            the same weight regardless of how many words it contains.

        Raises:
            ValueError: If the frame has no rows, because the average is
                undefined (previously an opaque ZeroDivisionError).
        """
        y_pred_column = "y_pred"
        if asr_name:
            y_pred_column = f"{y_pred_column}_{asr_name}"
        if len(self.df) == 0:
            raise ValueError("Cannot average the WER of an empty results frame")
        wers = [
            jiwer.wer(y_true, y_pred)
            for y_true, y_pred in zip(self.df.y_true, self.df[y_pred_column])
        ]
        return 100. * sum(wers) / len(wers)

## Deep Speech Average WER For Languages


In [22]:
# Mean WER (in percent) over the English results, read from the y_pred column.
AverageWER(englishResults).getWER()

97.95918367346938

In [23]:
# Mean WER (in percent) over the Italian results, read from the y_pred column.
AverageWER(italianResults).getWER()

92.42857142857142

In [24]:
# Mean WER (in percent) over the Spanish results, read from the y_pred column.
AverageWER(spanishResults).getWER()

100.0