# generate_waveform_from_code.py
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import json
import logging
from pathlib import Path
import random
import soundfile as sf
import torch
from tqdm import tqdm
from fairseq import utils
from fairseq.models.text_to_speech.vocoder import CodeHiFiGANVocoder
# from examples.speech_to_speech.generate_waveform_from_code import cli_main
from examples.speech_to_speech.preprocessing.data_utils import process_units
import os, glob
import numpy as np
# Make sure the root logger has a handler and emits INFO-level records.
# basicConfig() does nothing when the root logger already has a handler, so
# the level is also set explicitly on the root logger afterwards.
logging.basicConfig(level=logging.INFO)
logging.root.setLevel(logging.INFO)
logger = logging.getLogger(__name__)
def dump_result(args, unit_path, pred_wav, suffix=""):
    """Write a synthesized waveform as a WAV file mirroring the input layout.

    The output path is derived from ``unit_path`` by swapping the first
    occurrence of ``args.in_code_file`` for ``args.results_path`` and
    replacing the file extension with ``.wav``. Missing parent directories
    are created. The file is written with a sample rate of 16000.

    Args:
        args: Parsed CLI namespace; ``in_code_file`` and ``results_path`` are
            read.
        unit_path: Path of the unit file the waveform was generated from.
            Must contain ``args.in_code_file``.
        pred_wav: Waveform tensor; it is detached, moved to CPU and converted
            to a NumPy array before being written.
        suffix: Accepted for call compatibility; it is not applied to the
            output file name.

    Raises:
        AssertionError: If ``args.in_code_file`` does not occur in
            ``unit_path``.
    """
    assert args.in_code_file in unit_path, (
        f"{unit_path!r} does not contain the input root {args.in_code_file!r}"
    )

    # Swap only the first occurrence: a directory name that shows up again
    # deeper in the path (e.g. "units/spk/units/a.unit") must stay untouched,
    # otherwise the mirrored tree under results_path is corrupted.
    mirrored = unit_path.replace(args.in_code_file, args.results_path, 1)
    out_path = os.path.splitext(mirrored)[0] + ".wav"

    # os.makedirs("") raises, so skip it when the output has no directory part.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    sf.write(
        out_path,
        pred_wav.detach().cpu().numpy(),
        16000,
    )
def load_code(in_file):
    """Yield ``(unit_path, units)`` for every ``*.unit`` file under ``in_file``.

    The directory is searched recursively. Each file is read with
    ``torch.load`` and passed through ``process_units(..., reduce=True)``
    before being yielded.

    Args:
        in_file: Root directory that is searched for ``*.unit`` files.

    Yields:
        Tuples of the matched file path and the processed unit sequence.
        Nothing is yielded when no file matches.
    """
    # Escape glob metacharacters so a directory name containing "[", "*" or
    # "?" is matched literally instead of being read as a pattern.
    pattern = f"{glob.escape(in_file)}/**/*.unit"

    # glob order is filesystem-dependent; sorting keeps runs reproducible
    # (the order matters when speakers are sampled randomly per file).
    for unit_path in sorted(glob.glob(pattern, recursive=True)):
        # NOTE(security): torch.load unpickles the file contents; only point
        # this at unit files from a trusted source.
        # map_location="cpu" lets tensors that were saved from a GPU be read
        # on CPU-only hosts; inputs are moved to the GPU later if requested.
        unit = torch.load(unit_path, map_location="cpu")
        yield unit_path, process_units(unit, reduce=True)
def main(args):
    """Synthesize a waveform for every unit file and write it to disk.

    Loads the vocoder config (JSON) from ``args.vocoder_cfg``, builds a
    ``CodeHiFiGANVocoder`` from ``args.vocoder``, then runs the vocoder on
    each unit sequence produced by ``load_code(args.in_code_file)`` and hands
    the result to ``dump_result``.

    Args:
        args: Parsed CLI namespace. Reads ``cpu``, ``vocoder``,
            ``vocoder_cfg``, ``speaker_id``, ``in_code_file``,
            ``results_path`` and ``dur_prediction``.

    Raises:
        AssertionError: If the vocoder is multi-speaker and
            ``args.speaker_id`` is neither -1 nor a valid speaker index.
    """
    logger.info(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    with open(args.vocoder_cfg) as f:
        vocoder_cfg = json.load(f)
    vocoder = CodeHiFiGANVocoder(args.vocoder, vocoder_cfg)
    if use_cuda:
        vocoder = vocoder.cuda()

    multispkr = vocoder.model.multispkr
    if multispkr:
        logger.info("multi-speaker vocoder")
        num_speakers = vocoder_cfg.get(
            "num_speakers", 200
        )  # following the default in codehifigan to set to 200
        # -1 means "sample a random speaker"; any other negative value would
        # only fail later as an out-of-range speaker index.
        assert (
            -1 <= args.speaker_id < num_speakers
        ), f"invalid --speaker-id ({args.speaker_id}) with total #speakers = {num_speakers}"

    Path(args.results_path).mkdir(exist_ok=True, parents=True)

    # Inference only: no autograd graph is needed for the vocoder forward.
    with torch.no_grad():
        for unit_path, units in tqdm(load_code(args.in_code_file)):
            x = {
                "code": torch.LongTensor(units).view(1, -1),
            }
            suffix = ""
            if multispkr:
                spk = (
                    random.randint(0, num_speakers - 1)
                    if args.speaker_id == -1
                    else args.speaker_id
                )
                suffix = f"_spk{spk}"
                x["spkr"] = torch.LongTensor([spk]).view(1, 1)

            x = utils.move_to_cuda(x) if use_cuda else x
            wav = vocoder(x, args.dur_prediction)
            dump_result(args, unit_path, wav, suffix=suffix)
def cli_main():
    """Parse the command-line arguments and run ``main`` with them."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--in-code-file",
        type=str,
        required=True,
        # The value is globbed recursively for *.unit files and each file is
        # read with torch.load, so it is a directory, not a text file.
        help="directory searched recursively for *.unit files (loaded with torch.load)",
    )
    parser.add_argument(
        "--vocoder", type=str, required=True, help="path to the CodeHiFiGAN vocoder"
    )
    parser.add_argument(
        "--vocoder-cfg",
        type=str,
        required=True,
        help="path to the CodeHiFiGAN vocoder config",
    )
    parser.add_argument(
        "--results-path",
        type=str,
        required=True,
        help="output directory; .wav files mirror the layout under --in-code-file",
    )
    parser.add_argument(
        "--dur-prediction",
        action="store_true",
        help="enable duration prediction (for reduced/unique code sequences)",
    )
    parser.add_argument(
        "--speaker-id",
        type=int,
        default=-1,
        help="Speaker id (for vocoder that supports multispeaker). Set to -1 to randomly sample speakers.",
    )
    parser.add_argument("--cpu", action="store_true", help="run on CPU")

    args = parser.parse_args()

    main(args)
# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
    cli_main()