# Speech Libraries

## Librosa

In [None]:
!pip install librosa

In [None]:
# 1. read .wav with librosa and plot the waveform
import librosa
import matplotlib.pyplot as plt
import numpy as np

# librosa.example returns the path of one of librosa's example recordings.
filename = librosa.example('nutcracker')
# Load the audio as a waveform `audio`
# Store the sampling rate as `sr`
audio, sr = librosa.load(filename)

# Sample k is taken at k / sr seconds. np.linspace(0, N / sr, num=N) would
# include the endpoint N / sr and therefore stretch the spacing to
# N / ((N - 1) * sr) instead of exactly 1 / sr.
time = np.arange(audio.shape[0]) / sr

plt.plot(time, audio, color="blue")
plt.xlabel("Time (s)")
plt.ylabel("Amplitude (quantized)")
plt.title("Wav file visualization")
plt.axhline(y=0, color='r', linestyle='-')  # zero-amplitude reference line
plt.show()

In [None]:
# 2. mel spectrogram
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np  # imported here so the cell does not depend on an earlier cell

filename = librosa.example('nutcracker')
# Load the audio as a waveform `audio`
# Store the sampling rate as `sr`
audio, sr = librosa.load(filename)

# Upper frequency bound (Hz) of the mel scale. The same value must be given to
# both the filter bank and the plot; otherwise the mel-axis tick labels do not
# correspond to the frequencies the spectrogram rows actually cover.
fmax = 8000
S = librosa.feature.melspectrogram(y=audio, sr=sr, fmax=fmax)

# Convert power to decibels relative to the loudest bin.
S_dB = librosa.power_to_db(S, ref=np.max)

fig, ax = plt.subplots()
img = librosa.display.specshow(S_dB, x_axis='time',
                               y_axis='mel', sr=sr,
                               fmax=fmax, ax=ax)
fig.colorbar(img, ax=ax, format='%+2.0f dB')
ax.set(title='Mel-frequency spectrogram')
plt.show()  # render the figure instead of echoing the repr of ax.set(...)

## Scipy

In [None]:
!pip install scipy

In [None]:
# 3. read wav with scipy and plot the waveform
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import wavfile

wav_filepath = 'data.wav'
# wavfile.read returns the sample rate followed by the sample array.
sampling_rate, data = wavfile.read(wav_filepath)

# Sample k is taken at k / sampling_rate seconds. np.linspace(0, N / rate, num=N)
# would include the endpoint and stretch the spacing to N / ((N - 1) * rate).
# Only the first axis is used for the length, so this does not depend on
# whether `data` has a second (channel) axis.
time = np.arange(data.shape[0]) / sampling_rate

plt.plot(time, data, color="green")
plt.xlabel("Time (s)")
plt.ylabel("Amplitude (quantized)")
plt.title("Wav file visualization")
plt.axhline(y=0, color='orange', linestyle='-')  # zero-amplitude reference line
plt.show()

## SpeechRecognition

In [None]:
!pip install SpeechRecognition

In [1]:
# automatic speech recognition: transcribe a local audio file
import speech_recognition as sr

audio_filename = 'data.wav'
r = sr.Recognizer()

# Open the file as an audio source and record its contents into `audio`.
my_audio = sr.AudioFile(audio_filename)
with my_audio as source:
    audio = r.record(source)
print(type(audio))

# Send the recorded audio to Google's recognizer, requesting the vi-VN language.
your_speech = r.recognize_google(audio, language="vi-VN")
print("Audio transcription: ", your_speech)

<class 'speech_recognition.AudioData'>
Audio transcription:  hiện ra tận phía xa


## gtts

In [None]:
!pip install gtts

In [None]:
# 4. text to speech (colab)
import IPython
import librosa
import numpy as np
from gtts import gTTS

content = "xin chào mọi người"
lang = 'vi'
output_filename = 'record.mp3'

# Build the gTTS request for the text and write the resulting audio to disk.
output = gTTS(content, lang=lang, slow=False)
output.save(output_filename)

# Read the saved file back with librosa: samples in `data`, sampling rate in `sr`.
data, sr = librosa.load(output_filename)

# Show an audio player for the samples in the cell output.
IPython.display.display(
    IPython.display.Audio(np.transpose(data), rate=sr)
)