In [1]:
from vosk import Model, KaldiRecognizer, SetLogLevel
from typing import List, Set, Dict, Tuple
import sys
import os, json
import wave
import subprocess
from pydub import AudioSegment

# Configure the log level of the vosk library (level 0) before any model is loaded.
SetLogLevel(0)

In [54]:
# Fail early, with a download hint, when the Vosk model directory is missing.
# sys.exit is used instead of the bare `exit` helper: it is always available
# and raises SystemExit rather than depending on the interactive shell.
if not os.path.exists("model"):
    print("Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder.")
    sys.exit(1)

sample_rate=16000
model = Model("model")
rec = KaldiRecognizer(model, sample_rate)
# Ask the recogniser for per-word entries ('result' list) in addition to the text.
rec.SetWords(True)

json_array = []

# converts audio file to appropriate format (ffmpeg needs to be installed):
# mono, `sample_rate` Hz, raw signed 16-bit little-endian PCM written to stdout.
# Using Popen as a context manager closes the pipe and waits for ffmpeg to
# finish, so no child process or file descriptor is left behind.
with subprocess.Popen(['ffmpeg', '-loglevel', 'quiet', '-i',
                       "audio/short_speech.wav",
                       '-ar', str(sample_rate), '-ac', '1', '-f', 's16le', '-'],
                      stdout=subprocess.PIPE) as process:
    while True:
        data = process.stdout.read(4000)
        if len(data) == 0:
            # End of the ffmpeg output stream.
            break
        # AcceptWaveform returning true means a complete utterance is ready.
        if rec.AcceptWaveform(data):
            res = json.loads(rec.Result())
            if 'result' in res:
                json_array.append(res)

# Flush the recogniser: whatever was still buffered when the stream ended
# (typically the last utterance) is only returned by FinalResult().
res = json.loads(rec.FinalResult())
if 'result' in res:
    json_array.append(res)

# ffmpeg runs with '-loglevel quiet', so a failed conversion would otherwise
# go unnoticed and just produce an empty or truncated transcript.
if process.returncode != 0:
    print(f"ffmpeg exited with status {process.returncode}; the transcript may be empty or incomplete.",
          file=sys.stderr)

with open('results.json', 'w') as json_res:
    json.dump(json_array, json_res)


[{'result': [{'conf': 1.0, 'end': 0.75, 'start': 0.06, 'word': 'people'}, {'conf': 1.0, 'end': 1.74, 'start': 1.32, 'word': 'older'}, {'conf': 1.0, 'end': 1.98, 'start': 1.74, 'word': 'than'}, {'conf': 1.0, 'end': 2.16, 'start': 1.98, 'word': 'me'}, {'conf': 1.0, 'end': 2.31, 'start': 2.16, 'word': 'as'}, {'conf': 1.0, 'end': 2.52, 'start': 2.31, 'word': 'much'}, {'conf': 1.0, 'end': 2.64, 'start': 2.52, 'word': 'as'}, {'conf': 1.0, 'end': 3.27, 'start': 2.64, 'word': 'possible'}], 'text': 'people older than me as much as possible'}, {'result': [{'conf': 1.0, 'end': 10.26, 'start': 9.99, 'word': 'always'}, {'conf': 1.0, 'end': 10.53, 'start': 10.26, 'word': 'good'}, {'conf': 1.0, 'end': 10.8, 'start': 10.53, 'word': 'for'}, {'conf': 0.792936, 'end': 11.1, 'start': 10.8, 'word': 'a'}, {'conf': 0.604535, 'end': 11.310753, 'start': 11.246321, 'word': 'or'}, {'conf': 1.0, 'end': 11.385457, 'start': 11.310753, 'word': 'a'}, {'conf': 1.0, 'end': 11.79, 'start': 11.46, 'word': 'pep'}, {'conf'

In [2]:
def word_statistics(path_to_json: str) -> dict:
    """
    Count how often each word appears in a saved recognition result.

    The file at ``path_to_json`` must hold a JSON list. Every entry that has a
    'result' key contributes the 'word' field of each element of that list;
    entries without a 'result' key are ignored.

    Returns a dict mapping word -> count. Keys are inserted in ascending order
    of count (words with equal counts keep their first-seen order), so the
    last key belongs to a most frequently spoken word.
    """
    with open(path_to_json, 'r') as handle:
        entries = json.load(handle)

    tally = {}
    for entry in entries:
        if 'result' not in entry:
            continue
        for token in entry['result']:
            spoken = token['word']
            tally[spoken] = tally.get(spoken, 0) + 1

    # sorted() is stable, so ties stay in first-seen order.
    return dict(sorted(tally.items(), key=lambda pair: pair[1]))
                    
def get_word_occurrences(word_stat: dict, path_to_json: str) -> List[Dict[str, object]]:
    """
    List, for every word in ``word_stat``, the time spans at which it was spoken.

    Args:
        word_stat: mapping word -> count, e.g. the dict returned by
            word_statistics. The output follows its key order.
        path_to_json: path of a JSON file holding a list of entries; each entry
            may carry a 'result' list whose elements have 'word', 'start' and
            'end' fields (times in seconds).

    Returns:
        One dict per word of the form
        ``{<word>: <count from word_stat>, 'occurrences': [(start, end), ...]}``
        with the spans in file order. A word that never appears in the file
        gets an empty 'occurrences' list. When ``word_stat`` is ordered by
        ascending count, the last item is the most spoken word.

    Note:
        The word itself is used as a key next to 'occurrences', so a spoken
        word that is literally "occurrences" would have its count overwritten
        by the list of spans.
    """
    with open(path_to_json, 'r') as json_res:
        json_array = json.load(json_res)

    # Group all spans by word in a single pass over the transcript instead of
    # rescanning the whole file once per word.
    spans_by_word = {}
    for result in json_array:
        # Entries without per-word details are skipped, as in word_statistics.
        for occurrence in result.get('result', []):
            spans_by_word.setdefault(occurrence['word'], []).append(
                (occurrence['start'], occurrence['end'])
            )

    return [
        {word: word_stat[word], 'occurrences': spans_by_word.get(word, [])}
        for word in word_stat
    ]
    
# Both helpers read the same saved transcript: count the words first, then
# collect the time spans of each counted word.
results_path = "results.json"
word_stat = word_statistics(results_path)
word_occurrences = get_word_occurrences(word_stat, results_path)

In [107]:
def segment_words_from_audio(audioFileName: str, word_occurrences: dict):
    """
    Cut every occurrence of one word out of a WAV file and export them joined.

    Args:
        audioFileName: path of the WAV file to cut from.
        word_occurrences: one entry of the list built by get_word_occurrences:
            its first key is the word, and 'occurrences' holds (start, end)
            pairs in seconds.

    Side effects:
        Writes ``<audioFileName without extension>_word-<WORD>.wav`` where
        <WORD> is the upper-cased first key of ``word_occurrences``.

    Raises:
        KeyError: if ``word_occurrences`` has no 'occurrences' key.
        IndexError: if 'occurrences' is empty (there is nothing to export).
    """
    occurrences = word_occurrences['occurrences']
    if not occurrences:
        # Same exception type the unguarded occurrences[0] lookup would raise,
        # but raised before the audio is loaded and with an explanation.
        raise IndexError("word_occurrences['occurrences'] is empty: nothing to segment")

    audio = AudioSegment.from_wav(audioFileName)

    # Time spans are given in seconds, hence the factor 1000 for slicing.
    first_start, first_end = occurrences[0][0], occurrences[0][1]
    segment = audio[first_start * 1000 : first_end * 1000]
    for span in occurrences[1:]:
        segment = segment + audio[span[0] * 1000 : span[1] * 1000]

    # splitext drops whatever extension the file has, rather than assuming it
    # is exactly four characters long.
    stem = os.path.splitext(audioFileName)[0]
    word = next(iter(word_occurrences)).upper()
    segment.export(f'{stem}_word-{word}.wav', format='wav')

# Export the clip for the last entry of the list, then show the full list.
most_spoken = word_occurrences[-1]
segment_words_from_audio("audio/short_speech.wav", most_spoken)
print(word_occurrences)

[{'older': 1, 'occurrences': [(1.32, 1.74)]}, {'than': 1, 'occurrences': [(1.74, 1.98)]}, {'much': 1, 'occurrences': [(2.31, 2.52)]}, {'possible': 1, 'occurrences': [(2.64, 3.27)]}, {'for': 1, 'occurrences': [(10.53, 10.8)]}, {'or': 1, 'occurrences': [(11.246321, 11.310753)]}, {'pep': 1, 'occurrences': [(11.46, 11.79)]}, {'talk': 1, 'occurrences': [(11.79, 12.36)]}, {'but': 1, 'occurrences': [(12.36, 13.17)]}, {'with': 1, 'occurrences': [(13.26, 13.41)]}, {'dose': 1, 'occurrences': [(13.5, 13.92)]}, {'reality': 1, 'occurrences': [(14.04, 14.64)]}, {'he': 1, 'occurrences': [(14.64, 14.73)]}, {"doesn't": 1, 'occurrences': [(14.73, 14.97)]}, {'sugarcoat': 1, 'occurrences': [(14.97, 15.57)]}, {'anything': 1, 'occurrences': [(15.57, 16.08)]}, {'so': 1, 'occurrences': [(16.08, 16.26)]}, {'talking': 1, 'occurrences': [(16.262523, 16.65)]}, {'him': 1, 'occurrences': [(16.71, 16.89)]}, {'helps': 1, 'occurrences': [(17.19, 17.43)]}, {'give': 1, 'occurrences': [(17.43, 17.61)]}, {'perspective': 1