In [None]:
# ! pip install --user google.cloud
# ! pip install --user google.cloud.speech

In [23]:
import os
import io
import pickle
import time

import pandas as pd
import numpy as np

from pydub import AudioSegment
from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types
from multiprocessing.dummy import Pool

In [2]:
with open("list_of_roads.txt", "rb") as fp:
    list_of_roads = pickle.load(fp)

# Speech Recognition & Transcription

Before we can attempt to extract location we need to transcribe our audio. Here we used [Google Cloud Speech-to-Text](https://cloud.google.com/speech-to-text/) on our speech samples. Google's is a highly reputed and easy-to-use transcription API. It is also about as "black box" as it comes. The Speech-to-Text is a completely proprietary neural network that we only know by the output it gives us and the inputs it affords us. 

Notably, we are able to provide a vocabulary list. This list is used as what are known as "contextual embeddings". Contextual embeddings essentially skew the weight of the terms provided favorably over those that the speech recognition client would otherwise predict as the word spoken. Given we know the names of all roads we expect to hear, we use our scraped list as embeddings, thereby giving the otherwise "universal" speech recognition client a context. The embeddings do not deactivate the training of the neural network however, so we will see the impact that our embeddings have on the confidence score that the Google Cloud Speech-to-Text also provides.

In [9]:
pool = Pool(12) 
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "./api_key.json"   
client = speech.SpeechClient()

### With Contextual Embeddings

Below we define two functions, one which retrieves files, and the other which transcribes files and expects `speech_contexts` to be provided. We then run the two functions using `pool` multiprocessing which increases the speed of the call. Our speech samples are transcribed and returned as a pandas dataframe. Finally we take the average confidence score that Google gives in order to score this run with the following.

In [27]:
def get_desired_files(directory_name):
    desired_files = []
    for filename in os.listdir(directory_name):
        if (os.path.getsize(directory_name + '/' + filename) < 2_600_000) & (filename.endswith('.wav')):
            desired_files.append(directory_name + '/' + filename)
    return desired_files

def transcribe_embeddings(file_name):
    transcript = ''
    conf = 0
    # Loads the audio into memory
    with io.open(file_name, 'rb') as audio_file:
        content = audio_file.read()
        audio = types.RecognitionAudio(content=content)

    #speech_to_text
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        language_code='en-US',
        model="video",
        speech_contexts = [{
                        "phrases": np.random.choice(list_of_roads, 5000)
                         }]
    )

    # Detects speech in the audio file
    response = client.recognize(config, audio)

    for result in response.results:
        transcript = result.alternatives[0].transcript
        conf = result.alternatives[0].confidence

    time.sleep(1)
    
    return transcript, conf, file_name

pool = Pool(12) 

list_of_transcripts_context = pool.map(transcribe_embeddings, get_desired_files('./samples_1'))
pool.close()
pool.join()

transcripts_context = [a[0] for a in list_of_transcripts_context if a[0] != '']
confs_context = [a[1] for a in list_of_transcripts_context if a[0] != '']
names_context = [a[2] for a in list_of_transcripts_context if a[0] != '']

data_context = {'transcripts': transcripts_context, 
                'confidence': confs_context}
df_context = pd.DataFrame(data_context)
df_context.to_csv('./datasets/radio_context.csv')
df_context.head(25)

Unnamed: 0,transcripts,confidence
0,Stetson,0.542913
1,Isabelle never to mobile crisis or crisis in G...,0.741057
2,66 is on it,0.770346
3,silver foxes,0.444745
4,Pratt for theis wonderful,0.744394


In [34]:
np.mean(df_context['confidence'])

0.6713830406546055

### Without Contextual Embeddings

As above, we define one more function. This time we are doing transcription without including context. We again score it by taking the average of the confidence scores given by Google.

In [None]:
def transcribe(file_name):
    transcript = ''
    conf = 0
    # Loads the audio into memory
    with io.open(file_name, 'rb') as audio_file:
        content = audio_file.read()
        audio = types.RecognitionAudio(content=content)

    #speech_to_text
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        language_code='en-US',
        model="video"
    )

    # Detects speech in the audio file
    response = client.recognize(config, audio)

    for result in response.results:
        transcript = result.alternatives[0].transcript
        conf = result.alternatives[0].confidence

    time.sleep(1)
    
    return transcript, conf, file_name

pool = Pool(12) 

list_of_transcripts = pool.map(transcribe, get_desired_files('./samples_1'))
pool.close()
pool.join()

transcripts = [a[0] for a in list_of_transcripts if a[0] != '']
confs = [a[1] for a in list_of_transcripts if a[0] != '']
names = [a[2] for a in list_of_transcripts if a[0] != '']

data = {'transcripts': transcripts, 
        'confidence': confs}
df = pd.DataFrame(data)
df.to_csv('./datasets/radio.csv')
df.head(25)

In [35]:
np.mean(df['confidence'])

0.7757142536223881