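"""Preprocessing for the CMU ARCTIC corpus.

For every utterance this script writes four .npy files: the trimmed and
padded raw waveform, a mel-scale spectrogram, per-frame estimates of the
glottal source signal, and the corresponding per-frame vocal tract filter
coefficients. The glottal / vocal tract decomposition follows the structure
of iterative adaptive inverse filtering (IAIF); the block comments in
`_process_utterance` are carried over from a MATLAB reference implementation.
"""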
from concurrent.futures import ProcessPoolExecutor
from functools import partial
import numpy as np
import os
import audio
from nnmnkwii.datasets import cmu_arctic
from nnmnkwii.io import hts
from nnmnkwii import preprocessing as P
from hparams import hparams
from os.path import exists
import librosa
import scipy.signal as dsp
from scipy.linalg import solve_toeplitz as lpcsolve
from wavenet_vocoder.util import is_mulaw_quantize, is_mulaw, is_raw


def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
    executor = ProcessPoolExecutor(max_workers=num_workers)
    futures = []

    speakers = cmu_arctic.available_speakers
    wd = cmu_arctic.WavFileDataSource(in_dir, speakers=speakers)
    wav_paths = wd.collect_files()
    speaker_ids = wd.labels

    # Transcripts are not used by this pipeline, so the text field is filled
    # with a placeholder.
    for index, (speaker_id, wav_path) in enumerate(
            zip(speaker_ids, wav_paths)):
        futures.append(executor.submit(
            partial(_process_utterance, out_dir, index + 1, speaker_id, wav_path, "N/A")))
    return [future.result() for future in tqdm(futures)]
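

# hts.load() yields (start_time, end_time, phoneme) entries with times in HTS
# 100 ns units (hence the 1e-7 scaling below); start_at/end_at locate the
# first and last non-"pau" (non-silence) boundaries for silence trimming.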
def start_at(labels):
    has_silence = labels[0][-1] == "pau"
    if not has_silence:
        return labels[0][0]
    for i in range(1, len(labels)):
        if labels[i][-1] != "pau":
            return labels[i][0]
    assert False


def end_at(labels):
    has_silence = labels[-1][-1] == "pau"
    if not has_silence:
        return labels[-1][1]
    for i in range(len(labels) - 2, 0, -1):
        if labels[i][-1] != "pau":
            return labels[i][1]
    assert False
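
# Example with hypothetical label data (times in 100 ns units):
#   labels = [(0, 2000000, "pau"), (2000000, 5000000, "ae"), (5000000, 7000000, "pau")]
#   start_at(labels) == 2000000 and end_at(labels) == 5000000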


def _process_utterance(out_dir, index, speaker_id, wav_path, text):
    fs = hparams.sample_rate

    # Load the audio to a numpy array, resampled if needed.
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab")

    # Trim silence using hts labels if available.
    # TODO: label-based trimming is disabled for now (note the `and False`).
    if exists(lab_path) and False:
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * fs)
        e = int(end_at(labels) * 1e-7 * fs)
        wav = wav[b:e]
        wav, _ = librosa.effects.trim(wav, top_db=20)
    else:
        wav, _ = librosa.effects.trim(wav, top_db=20)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    out = wav
    constant_values = 0.0
    out_dtype = np.float32

    # IAIF parameters
    p_vt = 5        # vocal tract LPC order
    p_gl = 5        # glottal source LPC order
    d = 0.99        # leaky-integrator coefficient (lip radiation cancellation)
    hpfilt = 2      # number of high-pass filtering passes
    preflt = p_vt + 1  # length of the pre-frame ramp

    # High-pass filter speech in order to remove possible low frequency
    # fluctuations (linear-phase FIR, Fc = 70 Hz)
    Fstop = 40                              # stopband frequency (Hz)
    Fpass = 70                              # passband frequency (Hz)
    Nfir = int(np.round(300 / 16000 * fs))  # FIR order, scaled with fs
    if Nfir % 2 == 0:
        Nfir = Nfir + 1  # firls needs an odd number of taps (type I FIR)
    # NOTE: designing the firls filter is very expensive, but it depends only
    # on fs; as long as fs does not change, the same filter could be computed
    # once and reused instead of being redesigned for every utterance.
    B = dsp.firls(Nfir, [0, Fstop / (fs / 2), Fpass / (fs / 2), 1],
                  [0, 0, 1, 1], [1, 1])
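
    # Frame-by-frame glottal inverse filtering: for each hop_size-sample
    # frame, alternate low-order LPC fits and inverse filtering to separate
    # the glottal excitation from the vocal tract filter (IAIF-style).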

    # Estimate the combined effect of the glottal flow and the lip radiation
    # (Hg1) and cancel it out through inverse filtering. Note that before
    # filtering, a mean-normalized pre-frame ramp is appended in order to
    # diminish ripple at the beginning of the frame. The ramp is removed
    # after filtering.
    le = len(wav) // hparams.hop_size
    # Each high-pass pass below shortens the frame by one sample, so frames
    # end up hop_size - hpfilt samples long (254 for hop_size = 256).
    glot = np.zeros([le, hparams.hop_size - hpfilt])
    vtfilter = np.zeros([le, p_vt])
    for j in range(le):
        w = wav[hparams.hop_size * j:hparams.hop_size * (j + 1)]
        for _ in range(hpfilt):
            w = dsp.lfilter(B, 1, np.concatenate((w, np.zeros(int(len(B) / 2) - 1))))
            w = w[int(len(B) / 2):]
        win = np.hanning(len(w))
        a = np.array([1])
        b = np.array([1, -d])  # leaky integrator 1 / (1 - d z^-1)
        signal = np.concatenate((np.linspace(-w[0], w[0], preflt), w))
        windowed = w * win
        t = dsp.correlate(windowed, windowed)
        l = len(win)  # full autocorrelation: the zero lag sits at index l - 1
        Hg1 = lpcsolve(t[l - 1:l], t[l:l + 1])  # order-1 LPC predictor
        # Inverse filter with the prediction-error polynomial [1, -Hg1]
        y = dsp.lfilter(np.concatenate(([1.0], -Hg1)), 1, signal)
        y = y[preflt:]

        # Estimate the effect of the vocal tract (Hvt1) and cancel it out
        # through inverse filtering. The effect of the lip radiation is
        # canceled through integration. Signal g1 is the first estimate of
        # the glottal flow.
        windowedy = y * win
        r = dsp.correlate(windowedy, windowedy)
        Hvt1 = lpcsolve(r[l - 1:l - 1 + p_vt], r[l:l + p_vt])
        g1 = dsp.lfilter(np.concatenate(([1.0], -Hvt1)), 1, signal)
        g1 = dsp.lfilter(a, b, g1)  # integrate to cancel the lip radiation
        g1 = g1[preflt:]

        # Re-estimate the effect of the glottal flow (Hg2). Cancel the
        # contribution of the glottis and the lip radiation through inverse
        # filtering and integration, respectively.
        windowedg1 = win * g1
        u = dsp.correlate(windowedg1, windowedg1)
        Hg2 = lpcsolve(u[l - 1:l - 1 + p_gl], u[l:l + p_gl])
        y = dsp.lfilter(np.concatenate(([1.0], -Hg2)), 1, signal)
        y = dsp.lfilter(a, b, y)  # integrate to cancel the lip radiation
        y = y[preflt:]

        # Estimate the model for the vocal tract (Hvt2) and cancel it out
        # through inverse filtering. The final estimate of the glottal flow
        # is obtained by canceling the effect of the lip radiation.
        windowedynew = y * win
        t = dsp.correlate(windowedynew, windowedynew)
        Hvt2 = lpcsolve(t[l - 1:l - 1 + p_vt], t[l:l + p_vt])
        dg = dsp.lfilter(np.concatenate(([1.0], -Hvt2)), 1, signal)
        g = dsp.lfilter(a, b, dg)  # cancel lip radiation through integration
        g = g[preflt:]
        dg = dg[preflt:]

        # Set vocal tract model to 'a' and glottal source spectral model to 'ag'
        a = Hvt2
        ag = Hg2

        glot[j] = g
        vtfilter[j] = a
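
    # After the loop, glot[j] holds frame j's estimated glottal excitation and
    # vtfilter[j] the corresponding order-p_vt vocal tract LPC predictor.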

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    # lws pads zeros internally before performing stft;
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is a multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0
    timesteps = len(out)

    # Write the spectrograms to disk:
    audio_filename = 'cmu_arctic-audio-%05d.npy' % index
    glot_filename = 'cmu_arctic-glot-%05d.npy' % index
    vt_filename = 'cmu_arctic-vt-%05d.npy' % index
    mel_filename = 'cmu_arctic-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, audio_filename),
            out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(out_dir, glot_filename),
            glot.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(out_dir, vt_filename),
            vtfilter.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, glot_filename, vt_filename, timesteps, text, speaker_id)
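

if __name__ == "__main__":
    # Minimal driver sketch (an assumption, not part of the original
    # pipeline): in this family of repos, build_from_path() is normally
    # invoked from a separate preprocess entry point. Paths are taken from
    # the command line as placeholders.
    import sys
    from tqdm import tqdm

    in_dir, out_dir = sys.argv[1], sys.argv[2]
    os.makedirs(out_dir, exist_ok=True)
    metadata = build_from_path(in_dir, out_dir, num_workers=4, tqdm=tqdm)
    for row in metadata:
        print("|".join(map(str, row)))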