# Speech to Text
The purpose of this notebook is to prototype some code that can perform the following tasks

*Part 1: Listening for Speech and Storing speech in FIFO buffer*
- Detect and open the microphone on a MacBook Pro computer
- Listen for any spoken words.
- Create a list of strings that are all the spoken words 
- Add the heard words to a FIFO buuffer of heard words
- include information about the part of speech for each word in the FIFO buffer

*Part 2: Using FIFO buffer to construct prompts for Stable Diffusion*
- Generate or use predefined prompt structures and the FIFO of spoken words to generate Stable Diffusion prompts

*Part 3: Generate Images*
- Feed generated prompts into a stable diffusion network to create images based on recent conversations that occur in the proximity of the laptop

This notebook serves as a POC for an installation I am working on that passively listens to the environment it is installed and uses words spoken in the location and machine learning to create images based on what people are talking about =)


In [2]:
import pyaudio as audio
import speech_recognition as sr
import nltk
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.tag import pos_tag
from nltk import RegexpParser

##############################
only_unique_words = True
###############

# Download the required datafiles for the NLTK pos_tag function
nltk.download('averaged_perceptron_tagger')

# create a list of stopwords to ignore...
stopwords = set(['shan', 'same', "wasn't", "she's", 
                 'they', 'off', "needn't", "weren't", 
                 'as', 'some', 'and', 'from', 'other', 
                 "shouldn't", "shan't", 'to', 'does', 
                 'was', 'has', 'so', 'himself', 'do', 
                 'below', "doesn't", "that'll", 'its', 
                 'these', 'are', 'more', 'aren', 'all', 
                 'whom', 'shouldn', 'too', 'over', "you've", 
                 'him', 'o', 'his', 'be', "you'll", 'out', 
                 'against', 'most', 'if', 'hasn', 'own', 
                 's', 'what', 'theirs', 'or', "it's", 
                 'will', "don't", 'is', 'been', 'who', 
                 'yourselves', 'her', 'did', 'the', 'up', 
                 'there', 'ourselves', 'during', 'mightn', 
                 "you'd", 'further', 'very', 'those', 'for', 
                 'but', 'an', 'in', 'nor', "mightn't", 've', 
                 'both', 'until', 'isn', 'ain', "didn't", 
                 'than', 'themselves', 'myself', "couldn't", 
                 'now', 'herself', 'any', 'by', "wouldn't", 
                 'about', 'after', 'here', 'doesn', 'a', 
                 'which', 'd', 'y', 'were', 'couldn', 
                 "aren't", 'i', 'then', 'being', 'just', 
                 'our', "haven't", 't', 'wouldn', 're', 
                 "mustn't", 'while', 'with', 'only', 
                 'under', 'ma', 'again', 'can', 'ours', 
                 'through', "hadn't", 'when', 'hers', 
                 "isn't", 'of', 'few', 'my', 'had', 
                 'before', 'where', 'wasn', "should've", 
                 'she', 'your', 'haven', 'weren', 'on', 
                 'have', 'he', 'between', 'me', 'down', 
                 'should', 'mustn', 'their', 'am', 'above', 
                 'll', 'such', 'why', 'no', 'you', 'it', 
                 'because', 'into', 'm', "you're", 'that', 
                 'itself', 'not', 'hadn', "won't", 'we', 
                 'don', 'doing', 'won', 'them', 'this', 
                 "hasn't", 'how', 'at', 'needn', 'once', 
                 'having', 'yours', 'each', 'yourself', 'didn'])

print("stop_words: ", stopwords)

stemmer = PorterStemmer()

# create a queue of the last 100 words identified by the program
# recent_text_q is a list of dicts with three values: word, type, and freq
recent_text_q = []
max_q_len = 100

# activate macbook microphone stream
# create a speech recognition object
recognizer = sr.Recognizer()


stop_words:  {'about', 'whom', 'very', 'each', "you'd", "that'll", 'd', 'of', 'under', 'should', "shan't", 'won', 'between', 'their', 'no', "couldn't", 'just', 'now', "won't", 'with', 'hadn', 'than', 'our', "shouldn't", 'o', 'some', "doesn't", 'not', 'herself', 'further', 'i', "hasn't", 'which', 'shouldn', 'aren', 'being', "haven't", 'did', 'hasn', 'is', 'from', 've', 'be', 'again', "she's", 'wasn', 'been', 'mightn', 'nor', 'to', 'ourselves', 'below', 'shan', 'wouldn', 'against', 'her', 'will', "mightn't", "isn't", 'needn', 'off', 'while', 'most', 'does', 'you', 'so', 'him', 'the', 'how', 'there', 'too', 'any', "wouldn't", 'myself', 'and', 'your', 'they', 'above', "needn't", 'both', 'itself', 'it', 'own', 'on', 'in', 'was', 'such', 'had', 'himself', 're', 'y', 'doesn', 'before', 'an', 'over', 'more', "you'll", 'haven', "you're", 'because', 'yours', "it's", 'why', "mustn't", 'as', 'who', 'same', 'themselves', 'has', 'up', 'ain', "wasn't", 'what', 'll', 'all', 'through', 'ours', 'have', 

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/nathan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:


################################################################
## figure out what audio device our microphone is
def getMacbookProMic():
    print(sr.Microphone.list_microphone_names())
    internal_macbook_name = "MacBook Pro Microphone"
    index = sr.Microphone.list_microphone_names().index(internal_macbook_name)

    print("should be the internal microphone: ", sr.Microphone.list_microphone_names()[index])

    # create a microphone object
    internal_mic = sr.Microphone(device_index = index)
    return internal_mic

def fifoInDict(lst, val, tag, max_len):
    # check if item needs to be removed
    if len(lst) >= max_len:
        lst.pop(0)
    # update the dictionary
    temp_dict = {'word': val, 'type': tag, 'freq': 1}
    lst.append(temp_dict)
    return lst

def fifoInLst(lst, val, max_len):
    # check if item needs to be removed
    if len(lst) >= max_len:
        lst.pop(0)
    # update the dictionary
    lst.append(val)
    return lst

def speechToText(mic, recognizer):
    """
    Input : mic = st.Microphone() object where an audio stream can be read
            recognizer = sr.Recognizer() object that takes in a audio clip and returns a list of words
    Output: words = list of words that are identified from words spoken into the microphone
    """
    # capture audio from the microphone
    with mic as source:
        # adjust for background noise to increase success rate
        recognizer.adjust_for_ambient_noise(source)
        # identify any spoken words in the audio
        print("Speech Recognizer Enabled");
        audio = recognizer.listen(source)
        print("audio exported from source")
        raw_string = "the the "
        try:
            raw_string = recognizer.recognize_google(audio)
        except:
            print(" .")

        print("{} of tokenized words returned from google: {}".format(type(raw_string), raw_string))
        # remove words from string

        return_str = raw_string.split()
        wn = len(return_str)
        # remove any words in the stopwords
        words = [i for i in return_str if i not in stopwords]
        print("{} words removed from stopwords".format(wn - len(words)))
        words = [i for i in return_str if i not in recent_text_q]
        print("{} words removed from priorwords".format(wn - len(words)))
        return words
    
def tagWords(words):
    """
    Use NLTK to tag a list of words and return a tuple (str, type)
    This function should be run before storing the words into memory so the program
    knows what part of speech the words belong and can construct sentences from those
    words accordingly
    """
    words_tags = pos_tag(words)
    print("words tagged: {}".format(words_tags))
    return words_tags

def addWordsToMemory(words, tags):
    """

    """
    # create a dict to place in the list of heard words
    # append spoken words to the running FIFO of all words
    for i in range(words):
        recent_text_q = fifoInDict(recent_text_q, words[i], tags[i], max_q_len)
        # if append to buffer according to type of grammer

    print("{} identified words: ".format(len(recent_text_q)),
            recent_text_q)
    return recent_text_q

# classify words and add them to FIFO buffers

# print current FIFO buffers

def getStrFromTuple(lst):
    r = ""
    for l in lst:
        r.join(l[0]).join(" ")
    return r

def getStrFromList(lst):
    print(lst)
    r = ""
    for i in range(len(lst)):
        print(i)
        r.join(lst[i]).join(" ")
        print(r)
    return r

def createWordDict(word_tags):
    word_dict = {}
    for word, pos in word_tags:
        if word in word_dict:
            word_dict[word]["freq"] += 1
        else:
            word_dict[word] = {"word": word, "type": pos, "freq": 1}

    consolidated_list = list(word_dict.values())
    return consolidated_list

# Testing the Speech to Text Portion of the Program
Okay great, now we have all the functions we need to detect the MacBook Pro microphone, open it,
listen for a while, and then extract the spoken text. We also have functions to remove stop words,
and tag the words with what part of speech they belong to. 

In [4]:
# keep listening until 50 words are heard and stored in memory
def listenForWords(min_words, max_words):
    new_words = []
    macbook_mic = getMacbookProMic()
    while len(new_words) < min_words:
        results = speechToText(macbook_mic, recognizer)
        if results is not []:
            new_words.extend(results)
            print("List of {} words includes: {}".format(len(new_words), new_words[:-5]))
        else:
            print("No words detected")
    return new_words[:max_words]

words = listenForWords(50, 100)
print('we found a total of {} words: {}'.format(len(words), words))

['LG HDR 4K', 'BlackHole 16ch', 'MacBook Pro Microphone', 'MacBook Pro Speakers', 'Microsoft Teams Audio', 'ZoomAudioDevice']
should be the internal microphone:  MacBook Pro Microphone
Speech Recognizer Enabled
audio exported from source
result2:
{   'alternative': [{'confidence': 0.9009071, 'transcript': 'some of them'}],
    'final': True}
<class 'str'> of tokenized words returned from google: some of them
3 words removed from stopwords
0 words removed from priorwords
List of 3 words includes: []
Speech Recognizer Enabled
audio exported from source
result2:
{   'alternative': [   {   'confidence': 0.82352448,
                           'transcript': 'that we learned of is that Taylor '
                                         'rents a number of his forces the '
                                         'production'},
                       {   'transcript': 'that we learned of is that Taylor '
                                         'rents a number of his courses the '
              

In [5]:
word_tags = tagWords(words)
recent_text_q = createWordDict(word_tags)
print(recent_text_q)

words tagged: [('some', 'DT'), ('of', 'IN'), ('them', 'PRP'), ('that', 'IN'), ('we', 'PRP'), ('learned', 'VBD'), ('of', 'IN'), ('is', 'VBZ'), ('that', 'IN'), ('Taylor', 'NNP'), ('rents', 'VBZ'), ('a', 'DT'), ('number', 'NN'), ('of', 'IN'), ('his', 'PRP$'), ('forces', 'NNS'), ('the', 'DT'), ('production', 'NN'), ('on', 'IN'), ('top', 'NN'), ('of', 'IN'), ('the', 'DT'), ('the', 'DT'), ('production', 'NN'), ('is', 'VBZ'), ('making', 'VBG'), ('cost-effective', 'JJ'), ('decisions', 'NNS'), ('who', 'WP'), ('is', 'VBZ'), ('he', 'PRP'), ('is', 'VBZ'), ('a', 'DT'), ('man', 'NN'), ('who', 'WP'), ('was', 'VBD'), ('born', 'VBN'), ('in', 'IN'), ('North', 'NNP'), ('Carolina', 'NNP'), ('and', 'CC'), ('soon', 'RB'), ('after', 'IN'), ('moved', 'VBN'), ('with', 'IN'), ('his', 'PRP$'), ('family', 'NN'), ('to', 'TO'), ('rural', 'JJ'), ('Texas', 'NNP'), ('and', 'CC'), ('lived', 'VBD'), ('on', 'IN'), ('a', 'DT'), ('ranch', 'NN'), ('he', 'PRP'), ('interviews', 'VBZ'), ('that', 'IN'), ('he', 'PRP'), ('was', '

In [6]:
nouns = []
verbs = []
adjectives = []

def populateGrammarLists(recent_text_q, nouns, verbs, adjectives):
    for word in recent_text_q:
        print("word : {}".format(word))
        if word['type'].startswith("NN"):
            for i in range(word['freq']):
                nouns = fifoInLst(nouns, word['word'], max_q_len)
        elif word['type'].startswith("VB"):
            for i in range(word['freq']):
                verbs = fifoInLst(verbs, word['word'], max_q_len)
        elif word['type'].startswith("JJ"):
            for i in range(word['freq']):
                adjectives = fifoInLst(adjectives, word['word'], max_q_len)
        prompt_string = ""
    return nouns, verbs, adjectives

nounds, verbs, adjectives = populateGrammarLists(recent_text_q, nouns, verbs, adjectives)
print("{} nouns are saved: {}".format(len(nouns), nouns))
print("{} verbs are saved: {}".format(len(verbs), verbs))
print("{} adjectives are saved: {}".format(len(adjectives), adjectives))

word : {'word': 'some', 'type': 'DT', 'freq': 1}
word : {'word': 'of', 'type': 'IN', 'freq': 5}
word : {'word': 'them', 'type': 'PRP', 'freq': 1}
word : {'word': 'that', 'type': 'IN', 'freq': 3}
word : {'word': 'we', 'type': 'PRP', 'freq': 1}
word : {'word': 'learned', 'type': 'VBD', 'freq': 1}
word : {'word': 'is', 'type': 'VBZ', 'freq': 4}
word : {'word': 'Taylor', 'type': 'NNP', 'freq': 1}
word : {'word': 'rents', 'type': 'VBZ', 'freq': 1}
word : {'word': 'a', 'type': 'DT', 'freq': 5}
word : {'word': 'number', 'type': 'NN', 'freq': 1}
word : {'word': 'his', 'type': 'PRP$', 'freq': 2}
word : {'word': 'forces', 'type': 'NNS', 'freq': 1}
word : {'word': 'the', 'type': 'DT', 'freq': 3}
word : {'word': 'production', 'type': 'NN', 'freq': 2}
word : {'word': 'on', 'type': 'IN', 'freq': 2}
word : {'word': 'top', 'type': 'NN', 'freq': 1}
word : {'word': 'making', 'type': 'VBG', 'freq': 1}
word : {'word': 'cost-effective', 'type': 'JJ', 'freq': 1}
word : {'word': 'decisions', 'type': 'NNS', '

## Text to Speech using espeak in OS

In [7]:
import os
import random

# okay, now it is time to construct our string

def addSimplePhraise():
    # generate multiple phraises, then determine best one using nltk
    return "{} {} {} {} {}".format(randomNoun(), randomAdj(), randomNoun(), randomVerb(), randomNoun())

def randomNoun():
    return nouns[random.randint(0, len(nouns) - 1)]

def randomVerb():
    return verbs[random.randint(0, len(verbs) - 1)]

def randomAdj():
    return adjectives[random.randint(0, len(adjectives) - 1)]

prompt_string = "{}, {}, {}".format(addSimplePhraise(), addSimplePhraise(), addSimplePhraise())
print(prompt_string)


Texas ne'er-do-well production is decisions, ranch cost-effective decisions rents production, forces rural bit learned top


In [8]:
gender = 'm' # as oppose to  'f'
vnum = str(random.randint(1, 5))
voice = '{}{}'.format(gender, vnum)
pitch = str(random.randint(10, 90))
command = 'espeak -s 180 -v {} -p {} "{}"\n'.format(voice, pitch, prompt_string)
print("using espeak to say the following command {}".format(prompt_string))
os.system(command)

using espeak to say the following command Texas ne'er-do-well production is decisions, ranch cost-effective decisions rents production, forces rural bit learned top


0