In [1]:
import os

import librosa
import numpy as np
from scipy.io import wavfile
from tqdm import tqdm

from text import _clean_text


def prepare_align():
    wav_dir = '/data/ddsp_data/LibriTTS_R/wavs'
    raw_text_dir = '/data/ddsp_data/LibriTTS_R/txts'
    out_dir = '/data/nzxyin/LibriTTS_R_preprocessed'
    sampling_rate = 16000
    max_wav_value = 32768.0
    cleaners = ["english_cleaners"]
    all_filenames = {filename[:-4] for filename in os.listdir(wav_dir) if filename[-4:] == '.wav'} & {filename[:-4] for filename in os.listdir(raw_text_dir) if filename[-4:] == '.txt'}
    for filename in tqdm(all_filenames):
        text_path = os.path.join(raw_text_dir, f"{filename}.txt")
        wav_path = os.path.join(wav_dir, f"{filename}.wav")
        speaker = filename.split('_')[0]
        with open(text_path) as f:
            text = f.readline().strip("\n")
        text = _clean_text(text, cleaners)

        os.makedirs(os.path.join(out_dir, speaker), exist_ok=True)
        wav, _ = librosa.load(wav_path, sampling_rate)
        wav = wav / max(abs(wav)) * max_wav_value
        wavfile.write(
            os.path.join(out_dir, speaker, f"{filename}.wav"),
            sampling_rate,
            wav.astype(np.int16),
        )
        with open(
            os.path.join(out_dir, speaker, f"{filename}.lab"),
            "w",
        ) as f1:
            f1.write(text)

In [2]:
prepare_align()

  0%|          | 0/373331 [00:01<?, ?it/s]


TypeError: load() takes 1 positional argument but 2 were given

In [14]:
import numpy as np
from tqdm import tqdm
import librosa
import os
wav_dir = '/data/ddsp_data/LibriTTS_R/wavs'
max_val = -float('inf')
min_val = float('inf')
for filename in os.listdir(wav_dir)[:10]:
    wav, _ = librosa.load(os.path.join(wav_dir, filename), sr=16000)
    max_val = max(max(wav), max_val)
    min_val = min(min(wav), min_val)

In [1]:
print(max_val)
print(min_val)

NameError: name 'max_val' is not defined

In [6]:
import os
import random
import json

import tgt
import librosa
import numpy as np
import pyworld as pw
from scipy.interpolate import interp1d
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

import audio as Audio

def process_utterance(basename):
    text_path = os.path.join('/data/nzxyin/LibriTTS_R_aligned_ARPA/wavs_txts', basename.split('_')[0], "{}.lab".format(basename))
    tg_path = os.path.join(
        '/data/nzxyin/LibriTTS_R_aligned_ARPA/textgrids', basename.split('_')[0], f"{basename}.TextGrid"
    )

    # Get alignments
    textgrid = tgt.io.read_textgrid(tg_path)
    phone, duration = get_alignment(
        textgrid
    )
    text = "{" + " ".join(phone) + "}"
    # if start >= end:
    #     return None
    
    # Read raw text
    with open(text_path, "r") as f:
        raw_text = f.readline().strip("\n")

    # Save files
    dur_filename = f"{basename}.npy"
    np.save(os.path.join('/data/nzxyin/LibriTTS_R_aligned_ARPA/duration', dur_filename), duration)

    return "|".join([basename, text, raw_text])
    

def get_alignment(textgrid):
    
    tier = textgrid.get_tier_by_name("phones")
    # sil_phones = ["sil", "sp", "spn"]

    phones = []
    durations = []
    s, e, p = None, None, None
    for i, t in enumerate(tier._objects):
        s, e, p = t.start_time, t.end_time, t.text
        if i == 0:
            phones.append("sil")
            durations.append(int(np.ceil(s * 16000 / 80)))
        # Trim leading silences
        # if phones == []:
        #     if p in sil_phones:
        #         continue
        #     else:
        start_time = s

        # if p not in sil_phones:
        #     # For ordinary phones
        phones.append(p)
        # end_time = e
        # end_idx = len(phones)
        # else:
        #     # For silent phones
        #     phones.append(p)

        durations.append(
            int(np.ceil((e - s) * 16000 / 80))
        )

    # Trim tailing silences
    # phones = phones[:end_idx]
    # durations = durations[:end_idx]
    phones.append("sil")
    durations.append(int(np.ceil((textgrid.end_time - e) * 16000 / 80)))

    return phones, durations

  from .autonotebook import tqdm as notebook_tqdm


In [128]:
def build():
    # os.makedirs((os.path.join(self.out_dir, "mel")), exist_ok=True)
    # os.makedirs((os.path.join(self.out_dir, "pitch")), exist_ok=True)
    # os.makedirs((os.path.join(self.out_dir, "energy")), exist_ok=True)
    os.makedirs((os.path.join('/data/nzxyin/LibriTTS_R_aligned_ARPA', "duration")), exist_ok=True)
    os.makedirs((os.path.join('/data/nzxyin/LibriTTS_R_aligned_ARPA', "splits")), exist_ok=True)

    print("Processing Data ...")
    out = list()

    speakers = {}
    for i, speaker in enumerate(tqdm(os.listdir('/data/nzxyin/LibriTTS_R_aligned_ARPA/textgrids'))):
        speakers[speaker] = i
        for gridname in os.listdir(os.path.join('/data/nzxyin/LibriTTS_R_aligned_ARPA/textgrids', speaker)):

            basename = gridname.split(".")[0]
            tg_path = os.path.join(
                '/data/nzxyin/LibriTTS_R_aligned_ARPA/textgrids', speaker, f"{basename}.TextGrid"
            )
            if os.path.exists(tg_path):
                ret = process_utterance(basename)
                if ret is None:
                    continue
                else:
                    info = ret
                out.append(info)
    
    with open(os.path.join('/data/nzxyin/LibriTTS_R_aligned_ARPA', "splits", "speakers.json"), "w") as f:
        f.write(json.dumps(speakers))

    random.shuffle(out)
    out = [r for r in out if r is not None]

    total = len(out)
    train = out[:int(np.round(0.90*total))]
    val = out[int(np.round(0.90*total)):int(np.round(0.95*total))]
    test = out[int(np.round(0.95*total)):]

    # Write metadata
    with open(os.path.join('/data/nzxyin/LibriTTS_R_aligned_ARPA', "splits", "train.txt"), "w", encoding="utf-8") as f:
        for m in train:
            f.write(m + "\n")
    with open(os.path.join('/data/nzxyin/LibriTTS_R_aligned_ARPA', "splits", "val.txt"), "w", encoding="utf-8") as f:
        for m in val:
            f.write(m + "\n")

    with open(os.path.join('/data/nzxyin/LibriTTS_R_aligned_ARPA', "splits", "test.txt"), "w", encoding="utf-8") as f:
        for m in test:
            f.write(m + "\n")

    return out

In [129]:
build()

Processing Data ...


100%|██████████| 2456/2456 [04:14<00:00,  9.65it/s]


['1171_134968_000020_000002|{sil B L EH1 S M IY1 S EH1 D DH IY0 OW1 L D W UH1 M AH0 N EH1 Z W AH1 N TH IH1 NG AE1 F T ER0 AH0 N AH1 DH ER0 AH0 P IH1 R D AE1 N D SH IY1 W AO1 N T IH0 D T IH0 N OW1 W EH1 R HH ER0 HH AH1 Z B AH0 N D HH AE1 D G AA1 T DH AH0 M IH1 L F R AH1 M B AH1 T HH IY1 W UH1 D N AA1 T T EH1 L HH ER1 DH AE1 T sil}|"bless me!" said the old woman as one thing after another appeared; and she wanted to know where her husband had got the mill from, but he would not tell her that.',
 '8183_118128_000006_000009|{sil IH1 T W AH0 Z K W AY1 T M AE1 N AH0 F EH2 S T DH AH0 T SH IY1 HH AE1 D N AA1 T L EH1 T AH0 S IH1 NG G AH0 L AA2 P ER0 T UW1 N AH0 T IY0 S L AY1 D B AH1 T S IY1 Z D DH AH0 F ER1 S T CH AE1 N S AE1 N D ER0 AY1 V D P AA1 R T L IY0 B AY1 M IY1 N Z AH1 V DH IY0 AH1 N D ER0 G R AW2 N D R EY1 L R OW1 D AE1 N D P AA1 R T L IY0 B AY1 DH AH0 R EH1 G Y AH0 L ER0 T R EY1 N M EH1 N IY0 D IH1 F AH0 K AH0 L T IY0 Z W ER0 S ER0 M AW1 N T IH0 D B IH0 F AO1 R AE1 N D AE1 F T ER0 L I

In [11]:
textgrid = tgt.io.read_textgrid("/data/nzxyin/LibriTTS_R_aligned_ARPA/textgrids/7672/7672_96509_000005_000001.TextGrid")
print(textgrid.get_tier_by_name('phones'))
durations = np.load("/data/nzxyin/LibriTTS_R_aligned_ARPA/duration/7672_96509_000005_000001.npy")
print(sum(durations))
durations

IntervalTier(start_time=0.0, end_time=5.64, name="phones", objects=[Interval(0.05, 0.13, "N"), Interval(0.13, 0.25, "AA1"), Interval(0.25, 0.38, "T"), Interval(0.38, 0.46, "DH"), Interval(0.46, 0.55, "AH0"), Interval(0.55, 0.62, "T"), Interval(0.85, 1.08, "AY1"), Interval(1.08, 1.17, "DH"), Interval(1.17, 1.34, "ER0"), Interval(1.34, 1.46, "W"), Interval(1.46, 1.51, "AH0"), Interval(1.51, 1.63, "Z"), Interval(1.69, 1.79, "AE1"), Interval(1.79, 1.83, "T"), Interval(1.83, 1.87, "AH0"), Interval(1.87, 1.98, "L"), Interval(1.98, 2.1, "AO1"), Interval(2.1, 2.21, "S"), Interval(2.21, 2.35, "F"), Interval(2.35, 2.39, "R"), Interval(2.39, 2.45, "ER0"), Interval(2.45, 2.62, "W"), Interval(2.62, 2.89, "ER1"), Interval(2.89, 3.01, "D"), Interval(3.01, 3.16, "Z"), Interval(4.03, 4.17, "IY1"), Interval(4.17, 4.37, "CH"), Interval(4.37, 4.42, "W"), Interval(4.42, 4.5, "AH1"), Interval(4.5, 4.73, "Z"), Interval(4.73, 4.81, "R"), Interval(4.81, 4.85, "EH1"), Interval(4.85, 4.89, "D"), Interval(4.89, 4

array([10, 16, 24, 26, 16, 18, 14, 46, 18, 34, 24, 10, 24, 20,  8,  8, 22,
       24, 22, 28,  8, 12, 34, 54, 24, 30, 28, 40, 10, 16, 46, 16,  8,  8,
       16, 10, 10, 20, 42, 30, 16,  6])

In [13]:
import soundfile as sf
from IPython.display import Audio

wav, sr = sf.read("/data/nzxyin/LibriTTS_R_aligned_ARPA/wavs_txts/7672/7672_96509_000005_000001.wav")
print(len(wav) / sr)
print(len(wav))
Audio(wav, rate=sr)

5.64
90240


In [14]:
ema = np.load("/data/ddsp_data/LibriTTS_R/ema/7672_96509_000005_000001.npy")
ema.shape[1]

1124

In [15]:
len(wav) / 80

1128.0

In [134]:
16000 / 50

320.0