# Model predictions
+ Speech-to-Text
+ GUI implementation
+ Audio file conversion

In [1]:
import joblib
import keras
import speech_recognition as sr
from pydub import AudioSegment
import librosa
import numpy as np
import pandas as pd
import librosa.display
import simpleaudio as sa
import pickle

## Load in models, encoders and other objects needed

In [2]:
# Load models
# loaded_clf: text classifier; loaded_keras: the audio model currently used
# for predictions (rebound by the GUI when the user switches models).
loaded_clf = joblib.load("Models/TextModel.joblib")
loaded_keras = keras.models.load_model('Models/AudioModel')
# The default audio model is the one that starts out active, so keep a second
# reference to it instead of reading the same model from disk twice.
dkeras = loaded_keras
gkeras = keras.models.load_model('Models/GenderedAudioModel')

# NOTE(review): pickle.load executes arbitrary code from the file it reads.
# Only load these .obj files if they come from a trusted source.

# Load vectorizer for text model
with open("Objects/vectorizer.obj", "rb") as vectorizer_file:
    vect = pickle.load(vectorizer_file)

# Load OneHotEncoder for audio signal model
with open("Objects/encoder.obj", "rb") as encoder_file:
    enc = pickle.load(encoder_file)
# Encoder paired with the default audio model (same object as enc at start-up).
denc = enc

# Encoder paired with the gendered audio model
with open("Objects/encoderGendered.obj", "rb") as gendered_encoder_file:
    genderenc = pickle.load(gendered_encoder_file)

### Go through the list of files and convert them all to .wav

In [12]:
def convert_to_wav(directory="./TestAudio/"):
    """Convert every non-.wav file in *directory* to .wav, in place.

    Each regular file whose extension is not ``.wav`` is decoded with pydub,
    exported as ``<stem>.wav`` next to it, and the source file is removed.
    Files that cannot be decoded are left untouched (best effort), and
    sub-directories are ignored.

    A file whose extension is ``.wav`` in a different letter case (for
    example ``.WAV``) is only renamed: re-encoding it to ``<stem>.wav`` and
    deleting the source would destroy the file on case-insensitive
    filesystems, where both names refer to the same file.

    Args:
        directory: Folder to scan. Defaults to the application's audio folder.
    """
    import os

    for filename in os.listdir(directory):
        fullfile = os.path.join(directory, filename)
        if not os.path.isfile(fullfile):
            continue

        # splitext copes with names that have no dot or several dots.
        stem, extension = os.path.splitext(filename)
        if extension == ".wav":
            continue

        output = os.path.join(directory, stem + ".wav")
        try:
            if extension.lower() == ".wav":
                # Already WAV data; just normalise the extension's case.
                os.replace(fullfile, output)
            else:
                AudioSegment.from_file(fullfile).export(output, format="wav")
                os.remove(fullfile)
        except Exception:
            # Deliberately best effort: a file that cannot be converted is
            # skipped so that one bad file does not abort the whole refresh.
            continue

### Define the prediction functions for both models

In [4]:
def predict_signal(filename, loaded_keras, enc):
    """Predict a label for an audio file from its MFCC features.

    Args:
        filename: Path of the audio file to analyse.
        loaded_keras: Model whose ``predict`` method receives the features.
        enc: Encoder whose ``inverse_transform`` maps the model output back
            to labels.

    Returns:
        The first element of the first decoded prediction row.
    """
    # Load at most 6 seconds of audio, skipping the first half second.
    samples, sample_rate = librosa.load(filename, duration=6, offset=0.5)

    # 40 MFCCs, averaged over the time axis into a single vector.
    coefficients = librosa.feature.mfcc(y=samples, sr=sample_rate, n_mfcc=40)
    mean_mfcc = np.mean(coefficients.T, axis=0)

    # Lay the vector out as a single row, then append a trailing axis.
    # Presumably this yields the (1, 40, 1) input the model expects; confirm
    # against the model definition.
    feature_row = pd.DataFrame(data=mean_mfcc).stack().to_frame().T
    model_input = np.expand_dims(feature_row, axis=2)

    # Predict using keras audio model, then decode the output into labels.
    raw_output = loaded_keras.predict(model_input, batch_size=32, verbose=1)
    decoded = enc.inverse_transform(raw_output)
    return decoded[0][0]

In [5]:
def predict_text(filename):
    """Transcribe an audio file and predict a label from its words.

    The file is transcribed with the Google Web Speech recogniser of the
    ``speech_recognition`` package; the transcript is vectorised with the
    module-level ``vect`` and classified with the module-level ``loaded_clf``.

    Args:
        filename: Path of the audio file to transcribe.

    Returns:
        A ``(prediction, text)`` tuple: the first predicted label and the
        recognised transcript.
    """
    recognizer = sr.Recognizer()

    # Audio to text: read the whole file into memory, then transcribe it.
    with sr.AudioFile(filename) as source:
        recording = recognizer.record(source)
        text = recognizer.recognize_google(recording)

    # Predict using the text model on the vectorised transcript.
    text_features = vect.transform([text])
    predicted_labels = loaded_clf.predict(text_features)
    return predicted_labels[0], text

## Other functions needed

In [6]:
def getSoundDuration(path):
    """Return the duration of an audio file as whole minutes and seconds.

    The duration is rounded to the nearest second *before* it is split, so
    the seconds part is always in ``0..59`` (59.6 s is reported as ``(1, 0)``
    rather than ``(0, 60)``).

    Args:
        path: Path of the audio file to measure.

    Returns:
        A ``(minutes, seconds)`` tuple of ints.
    """
    sound = AudioSegment.from_file(path)
    total_seconds = round(sound.duration_seconds)
    return divmod(total_seconds, 60)

In [7]:
def switch_model(boolean):
    """Return the audio model and encoder pair selected by *boolean*.

    Args:
        boolean: Truthy selects the gendered model, falsy the default model.

    Returns:
        ``(gkeras, genderenc)`` when *boolean* is truthy, otherwise
        ``(dkeras, denc)``.
    """
    if boolean:
        return gkeras, genderenc
    return dkeras, denc

In [8]:
import matplotlib.pyplot as plt
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg

def waveplot(path):
    """Plot the waveform of an audio file on the current pyplot figure.

    Args:
        path: Path of the audio file to plot.

    Returns:
        The current matplotlib figure, which holds the plot.
    """
    samples, sample_rate = librosa.load(path)

    plt.title("Sound Wave")
    plt.ylabel('dB')
    # NOTE(review): newer librosa releases replace waveplot with waveshow;
    # confirm which librosa version this notebook is pinned to.
    librosa.display.waveplot(samples, sample_rate)

    current_figure = plt.gcf()
    return current_figure

### GUI (PySimpleGUI)

In [13]:
import PySimpleGUI as sg
import os.path

# Module-level state shared with the GUI event loop.

# Argument for the next switch_model() call: True means the next "Switch"
# click selects the gendered model, False means it selects the default model.
boolean = True
# Absolute path of the folder that is scanned for audio files.
abspath = os.path.abspath("./TestAudio/")
# Path of the most recently selected audio file; set by the "FileList" handler.
currentFile = ""
# Handle of the current simpleaudio playback; None until "Play" is first used.
play_obj = None

def get_list():
    """Return the names of the .wav files in ``abspath`` that can be shown.

    A file is listed when its extension is ``.wav`` (any letter case) and its
    rounded duration is under one minute with more than 1 and fewer than 12
    seconds. Files whose duration cannot be read are left out, so that one
    unreadable file does not prevent the list from being built.

    Returns:
        A list of file names (not full paths).
    """
    wav_list = []
    for f in os.listdir(abspath):
        # splitext copes with names that have no dot or several dots.
        if os.path.splitext(f)[1].lower() != ".wav":
            continue
        try:
            mins, secs = getSoundDuration(os.path.join(abspath, f))
        except Exception:
            # Unreadable or corrupt audio: leave it out of the list.
            continue
        if mins == 0 and 1 < secs < 12:
            wav_list.append(f)
    return wav_list

def draw_figure(canvas, figure):
    """Embed a matplotlib figure in a Tk canvas.

    Args:
        canvas: Tk widget that becomes the parent of the figure widget.
        figure: Matplotlib figure to render.

    Returns:
        The ``FigureCanvasTkAgg`` wrapping the figure; pass it to
        ``delete_figure`` to take the figure off screen again.
    """
    tk_figure = FigureCanvasTkAgg(figure, canvas)
    tk_figure.draw()

    figure_widget = tk_figure.get_tk_widget()
    figure_widget.pack(side='top', fill='both', expand=1)

    # Close the current pyplot figure now that it has been drawn.
    plt.close()
    return tk_figure

def delete_figure(figure_canvas_agg):
    """Take an embedded figure off screen and close all pyplot figures.

    Args:
        figure_canvas_agg: The object returned by ``draw_figure``.
    """
    figure_widget = figure_canvas_agg.get_tk_widget()
    figure_widget.forget()
    plt.close('all')

# Tooltip shown on the "Refresh" button.
# NOTE(review): the text says "between 1 and 13 seconds", but get_list() only
# keeps files with mins == 0 and 1 < secs < 12. Confirm which one is intended.
refresh_tooltip = "After adding or removing a file from the path, click here to refresh the list.\n\u2022 If the file added is an M4A, MP4 or other compatible audio file, it will also be converted to .wav first \n\u2022 If the file is not compatible and/or can't be converted, it won't be displayed in the list\n \u2022 Compatible audio files must be between 1 and 13 seconds in duration"

# Left column: audio folder path and its buttons, the file list, and the
# model switch button.
left_column = [
    [
        sg.Text("Speech file path: "),
        sg.Text(abspath),
        sg.Button("Go to the path", key="PathButton", tooltip="Add or remove audio files to process"),
        sg.Button("Refresh", key="Refresh", tooltip=refresh_tooltip)
    ],
    [
        # enable_events=True so that selecting an entry raises a "FileList" event.
        sg.Listbox(
        values=get_list(), enable_events=True, size=(40,20), key="FileList"
        )
    ],
    [
        sg.Button("Switch model to gendered model", key="Switch", tooltip="Model will change the next time a file is chosen for processing"),
    ]
]

# Middle column: playback buttons plus the text elements that the event loop
# fills in by key (transcript, both predictions and playback errors).
mid_column = [
    [sg.Text("Choose a file from the list")],
    [sg.Button("Play"), sg.Button("Stop")],
    [sg.Text("Words in the audio file: ")],
    [sg.Text(key="SpeechText")],
    [sg.Text("Prediction based on audio signals:")],
    [sg.Text(key="AudioPred")],
    [sg.Text("Prediction based on words spoken:")],
    [sg.Text(key="WordsPred")],
    # Never updated by the event loop; presumably an empty spacer row.
    [sg.Text(key="gap")],
    [sg.Text(key="Error")],
    [sg.Text(key="Error2")],
]

# Right column: caption and canvas for the waveform plot.
right_column = [
    [sg.Text("Select an audio file to display the graph...", key="GraphText")],
    [sg.Canvas(key='Canvas')],
]

# Single row: the three columns, with a separator after the left one.
layout = [
    [
        sg.Column(left_column),
        sg.VSeparator(),
        sg.Column(mid_column),
        sg.Column(right_column),
    ]
]

# Build the main application window from the layout above.
window = sg.Window(title="Speech emotion predictor", layout=layout, margins=(100,100), finalize=True,
                   icon="icon.ico")

# Wrapper of the figure currently embedded in the canvas (None when empty).
figure_canvas_agg = None

# Main event loop: blocks on window.read() and dispatches on the event key.
while True:
    event, values = window.read()

    # Leave first: once the window is closed there is nothing left to update.
    if event == sg.WIN_CLOSED:
        break

    if event == "PathButton":
        # NOTE: os.startfile exists on Windows only.
        os.startfile(abspath)

    elif event == "Refresh":
        convert_to_wav()
        window["FileList"].update(get_list())

    elif event == "Switch":
        # The new model takes effect the next time a file is selected.
        loaded_keras, enc = switch_model(boolean)
        boolean = not boolean
        if boolean:
            window["Switch"].update("Switch model to gendered model")
        else:
            window["Switch"].update("Switch model to default model")

    elif event == "Play":
        try:
            if play_obj is not None:
                play_obj.stop()
            if not currentFile:
                raise ValueError("No file selected")
            wave_obj = sa.WaveObject.from_wave_file(currentFile)
            play_obj = wave_obj.play()
        except Exception as e:
            window["Error"].update("Error with playback:", text_color="red", font="bold", visible=True)
            window["Error2"].update(str(e), text_color="red", font="bold", visible=True)

    elif event == "Stop":
        # Nothing to stop until something has been played.
        if play_obj is not None:
            play_obj.stop()

    elif event == "FileList":
        # Clicking an empty list raises the event with no selection.
        if not values["FileList"]:
            continue

        currentFile = os.path.join(abspath, values["FileList"][0])
        display_name = os.path.basename(currentFile)
        window["Error"].update(visible=False)
        window["Error2"].update(visible=False)

        try:
            pred_signal_out = predict_signal(currentFile, loaded_keras, enc)
            pred_text_out, text_to_predict = predict_text(currentFile)

            window["SpeechText"].update(text_to_predict, text_color="pink")
            window["AudioPred"].update(pred_signal_out, text_color="pink")
            window["WordsPred"].update(pred_text_out, text_color="pink")

            if figure_canvas_agg:
                delete_figure(figure_canvas_agg)

            fig = waveplot(currentFile)
            figure_canvas_agg = draw_figure(window['Canvas'].TKCanvas, fig)
            window["GraphText"].update("Graph of " + display_name)
        except Exception:
            # Any failure (decoding, transcription, prediction, plotting) is
            # reported in the output fields instead of ending the GUI.
            error = "Error with this file"
            if figure_canvas_agg:
                delete_figure(figure_canvas_agg)
            # Replace the previous plot with an empty figure.
            figure_canvas_agg = draw_figure(window['Canvas'].TKCanvas, None)
            window["SpeechText"].update(error, text_color="pink")
            window["AudioPred"].update(error, text_color="pink")
            window["WordsPred"].update(error, text_color="pink")
            window["GraphText"].update("Error with " + display_name)

window.close()

