In [148]:
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import os
import argparse
import random
import torchaudio
import pyloudnorm as pyln
import glob

# Sample rate shared by the loudness meter, the silence gaps and playback.
rate = 16000
# One meter, built for that rate, is reused for every clip.
meter = pyln.Meter(rate)

# First TIMIT training shard, loaded as a pandas DataFrame.
table = pq.read_table(
    os.path.join('..', 'data', 'TIMIT', 'train-00000-of-00002.parquet')
).to_pandas()
# Every .flac file found anywhere under the parent directory, in sorted order.
paths = sorted(glob.glob('../**/*.flac', recursive=True))


In [79]:
# Target loudness handed to pyln.normalize.loudness by extract_timit and
# extract_lj (pyloudnorm documents this value as dB LUFS).
level = -20.0

In [174]:
def extract_timit(n):
    """Return TIMIT utterance `n` normalized to the shared target loudness.

    Reads the sample array stored under table['audio'][n]['array'], measures
    its integrated loudness with the module-level `meter`, and rescales it to
    the module-level `level`.
    """
    samples = table['audio'][n]['array']
    # Each clip is normalized on its own, before any concatenation happens.
    measured = meter.integrated_loudness(samples)
    normalized = pyln.normalize.loudness(samples, measured, level)
    return normalized

def extract_lj(path):
    """Load the audio file at `path` and normalize it to the shared loudness.

    Only the first channel (row 0 of the loaded tensor) is kept. If the file's
    sample rate differs from the module-level `rate`, the audio is resampled
    to `rate` first, so that the shared `meter` (built for `rate`) measures it
    correctly and the result can be concatenated with the other clips.

    Returns the normalized samples as a 1-D numpy array.
    """
    # Do not name the second value `rate`: that would shadow the module-level
    # sample rate and silently discard the file's real one.
    waveform, file_rate = torchaudio.load(path)
    if file_rate != rate:
        waveform = torchaudio.functional.resample(waveform, file_rate, rate)
    data = waveform.numpy()[0]
    loudness = meter.integrated_loudness(data)
    return pyln.normalize.loudness(data, loudness, level)

def extracts(duration, seed=1234, gap=0.5):
    """Concatenate randomly chosen clips until `duration` seconds are reached.

    Clips are drawn without replacement from the TIMIT rows of `table` and the
    files in `paths`, in an order fixed by `seed`. Each clip is followed by
    `gap` seconds of silence (including the last one).

    Parameters:
        duration: minimum length of the result in seconds.
        seed: seed for the shuffle that decides the clip order.
        gap: seconds of silence appended after every clip.

    Returns:
        A 1-D float64 numpy array sampled at the module-level `rate`. It is
        shorter than `duration` only if every available clip has been used,
        and empty when `duration` is not positive.

    Raises:
        ValueError: if an index entry has an unknown source kind.
    """
    silence = np.zeros((round(gap * rate), ))
    target_len = duration * rate

    index = [('TIMIT', i) for i in range(table.shape[0])]
    index += [('LJ', path) for path in sorted(paths)]
    # A private generator yields the same order as seeding the module-level
    # RNG with `seed`, but leaves the global random state untouched.
    random.Random(seed).shuffle(index)

    # Collect the pieces and concatenate once at the end; concatenating inside
    # the loop would re-copy the whole result on every iteration. The empty
    # float64 seed keeps the result dtype and the no-clip case well defined.
    pieces = [np.zeros((0,))]
    total = 0
    while total < target_len and index:
        kind, i = index.pop()
        if kind == 'TIMIT':
            print(f'Extracting TIMIT sentence {i}')
            samples = extract_timit(i)
        elif kind == 'LJ':
            print(f'Extracting LJ sentence {i}')
            samples = extract_lj(i)
        else:
            raise ValueError(f'Unknown clip source: {kind!r}')
        pieces.append(samples)
        pieces.append(silence)
        total += samples.shape[0] + silence.shape[0]
    return np.concatenate(pieces)


In [177]:
from IPython.display import Audio

# Build a mix of at least 20 seconds using the default seed and gap.
waveform = extracts(20.0)
# Last expression of the cell, so the notebook renders an audio player.
Audio(waveform, rate=rate)

Extracting LJ sentence ../data/LibriSpeech/dev-clean/3000/15664/3000-15664-0031.flac
Extracting TIMIT sentence 957
Extracting TIMIT sentence 61
Extracting TIMIT sentence 742
Extracting LJ sentence ../data/LibriSpeech/dev-clean/652/130726/652-130726-0024.flac




In [125]:
# NOTE(review): this bare expression is not the last line of the cell, so its
# value is never displayed.
waveform.shape
# NOTE(review): `torch` is imported here rather than in the top import cell;
# the torchaudio.save cell relies on this cell having been run.
import torch
# Shape of the waveform after reshaping it to a single row.
torch.Tensor(waveform.reshape([1, -1])).shape

torch.Size([1, 1654741])

In [147]:
# Write the mix to test.mp3 in the working directory as a single-row tensor,
# using the ffmpeg backend. Needs `torch` to be imported already.
torchaudio.save('test.mp3', torch.Tensor(waveform.reshape([1, -1])), sample_rate=rate, backend='ffmpeg')

In [86]:
# Scratch cell: a small list used to try out random.shuffle.
x = list(range(10))
x

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [88]:
# Scratch cell: shuffles `x` in place with the module-level RNG, so the order
# shown depends on whatever state that RNG is in when the cell runs.
random.shuffle(x)
x

[6, 9, 0, 8, 4, 1, 3, 2, 7, 5]

In [52]:
# This cell used to call `extract(100)`, a helper that no cell in this
# notebook defines, so it raised NameError on a fresh kernel.
# NOTE(review): assumed `extract` returned the raw, un-normalized samples of
# TIMIT row 100 -- the loudness recorded for this cell (about -43.7) is far
# from the -20.0 target that extract_timit normalizes to. Confirm.
data = table['audio'][100]['array']
# Integrated loudness of the clip, measured with the shared meter.
loudness = meter.integrated_loudness(data)

In [53]:
# Display the integrated loudness measured for `data`.
loudness

np.float64(-43.69258096947796)

In [55]:
# Rescale `data` from its measured loudness to a target of -18.0.
# NOTE(review): this target differs from the module-level `level` (-20.0).
loudness_normalized_audio = pyln.normalize.loudness(data, loudness, -18.0)

In [56]:
# Render an audio player for the normalized clip at the shared sample rate.
Audio(loudness_normalized_audio, rate=rate)

In [1]:
# NOTE(review): `viseme_labels` is not defined in any visible cell and the
# recorded output is a NameError -- this looks like a stale cell; confirm
# whether it should be removed or given a definition.
viseme_labels.index('Neutral')

NameError: name 'viseme_labels' is not defined