In [None]:
from IPython.display import Audio

# Path of the utterance used throughout the notebook (relative to the
# notebook's working directory).
filename = "data/KsponSpeech_E00001.wav"

# Build an inline audio player for the file; playback starts only when the
# reader presses play. The widget is the cell's final expression so that it
# is rendered as the cell output.
audio_player = Audio(filename, autoplay=False)
audio_player

In [None]:
import librosa

# sr=None keeps the file's native sampling rate instead of resampling to
# librosa's default rate.
samples, sampling_rate = librosa.load(path=filename, sr=None)

# Show the number of samples and the sampling rate as the cell output.
(len(samples), sampling_rate)

In [None]:
import matplotlib.pyplot as plt
%matplotlib notebook
%matplotlib inline
plt.rcParams["figure.figsize"]=12,5
from librosa import display
import numpy as np
plt.figure()
librosa.display.waveshow(y = samples, sr = sampling_rate)
plt.show()

In [None]:
# STFT analysis parameters, expressed in samples: window/FFT size and the
# step between consecutive frames.
n_fft = 320
hop_length = 160

# Complex STFT -> power spectrogram (squared magnitude) -> decibel scale
# referenced to the loudest bin, so the maximum value maps to 0 dB.
stft = librosa.stft(y=samples, n_fft=n_fft, hop_length=hop_length)
spectrogram = np.abs(stft) ** 2
log_spectrogram = librosa.power_to_db(spectrogram, ref=np.max)

# Linear-frequency (Hz) spectrogram against time.
plt.figure()
librosa.display.specshow(
    log_spectrogram,
    sr=sampling_rate,
    hop_length=hop_length,
    y_axis="hz",
    x_axis="time",
)


In [None]:
# Analysis parameters for the mel filterbank features.
n_mels = 40
n_fft = 320
hop_length = 160

# Mel-scaled power spectrogram, then converted to dB relative to its maximum.
S = librosa.feature.melspectrogram(
    y=samples,
    sr=sampling_rate,
    n_fft=n_fft,
    hop_length=hop_length,
    n_mels=n_mels,
    fmin=0.0,
    fmax=None,
)
S_dB = librosa.power_to_db(S, ref=np.max)

# Rows are mel bins, columns are frames.
dim_feature, len_feature = S_dB.shape

# Draw the features against raw indices (no physical axis units), then label
# the axes with mel-bin and frame indices.
fig = plt.figure()
librosa.display.specshow(S_dB, y_axis=None, x_axis=None)
ax = fig.gca()
ax.set_yticks(range(0, dim_feature + 1, 10))
ax.set_xticks(range(0, len_feature, 100))
ax.set_ylabel("Mel-freq. Index")
ax.set_xlabel("Frame Index")


In [None]:
from espnet2.asr.specaug.specaug import SpecAug
import torch

# SpecAugment draws random warp points and mask positions; fix the seed so
# the figure below is reproducible across "Restart & Run All".
torch.manual_seed(0)

# Augmentation policy: time warping plus frequency and time masking.
specaug = SpecAug(apply_time_warp=True,
                  time_warp_window=100,
                  time_mask_width_range=(0,40),
                  freq_mask_width_range=(0,3),
                  apply_freq_mask=True,
                  apply_time_mask=True)
print(specaug)

# S_dB is (n_mels, frames); transpose and add a leading batch axis so the
# tensor passed to SpecAug is (1, frames, n_mels). The copy keeps the
# original S_dB array untouched.
S_dB_in = torch.unsqueeze(torch.from_numpy(S_dB.copy().T), 0)

# Use the tensor that SpecAug *returns*. Previously the return value was
# discarded and the plot was built from the input tensor, which only shows
# the augmentation if the library happens to modify its input in place.
S_dB_aug, _ = specaug(S_dB_in)

# Drop the batch axis and transpose back to (n_mels, frames) for plotting.
S_dB_out = torch.squeeze(S_dB_aug, 0).numpy().T

plt.figure()
librosa.display.specshow(S_dB_out, sr=sampling_rate, hop_length=hop_length, y_axis="mel", x_axis="time")

