In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import random

import IPython.display as ipd
import librosa
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import scipy
import torch

import promonet

In [None]:
def plot_formants(audio, gpu=0):
    """Plot the STFT of an utterance with pitch and formant tracks overlaid.

    The pitch and every formant track are hidden (set to NaN) on frames whose
    periodicity falls below promonet.VOICING_THRESHOLD. An audio player for
    the utterance is displayed alongside the figure.

    Args:
        audio: The audio to analyze; it is passed unchanged to
            promonet.preprocess.from_audio, promonet.formants.from_audio,
            promonet.formants.stft and IPython's Audio widget.
        gpu: Device index forwarded to promonet.preprocess.from_audio.
            Defaults to 0, matching the previous hard-coded value.

    Returns:
        None. The figure is drawn on a new matplotlib figure.
    """
    plt.figure(figsize=(12, 12))

    # Compute pitch and periodicity, then bring both back to the CPU
    pitch, periodicity = promonet.preprocess.from_audio(
        audio,
        gpu=gpu,
        features=['pitch', 'periodicity'])
    pitch, periodicity = pitch.cpu(), periodicity.cpu()

    # Compute formants
    # NOTE(review): this binds the result to a single name, while a later
    # cell in this notebook unpacks `formants, stages` from the same call
    # with the same arguments - confirm which return shape is current.
    formants = promonet.formants.from_audio(
        audio,
        promonet.SAMPLE_RATE,
        pitch=pitch,
        features='stft',
        decoder='viterbi')

    # Compute STFT
    features, frequencies = promonet.formants.stft(
        audio,
        promonet.SAMPLE_RATE)
    features, frequencies = features.numpy(), frequencies.numpy()

    # Compute frame times: one evenly spaced time per row of the STFT
    times = np.linspace(
        0,
        promonet.convert.samples_to_seconds(audio.shape[-1]),
        features.shape[0])

    # Plot STFT. The last row and column are dropped so the color grid has
    # one fewer cell than there are time/frequency coordinates per axis.
    plt.pcolormesh(times, frequencies, features[:-1, :-1].T)
    plt.yscale('symlog')

    # Overlay pitch, hiding frames below the voicing threshold
    unvoiced = periodicity[0] < promonet.VOICING_THRESHOLD
    x = torch.clone(pitch).squeeze()
    x[unvoiced] = float('nan')
    plt.plot(times, x, linewidth=2, color='black', label='Pitch')

    # Overlay formants; clone so masking never touches the computed tracks
    colors = ['orange', 'green', 'red', 'gray', 'yellow', 'teal']
    for i, formant in enumerate(formants):
        x = torch.clone(formant)
        x[unvoiced] = float('nan')
        plt.plot(
            times,
            x,
            linewidth=2,
            # Cycle the palette so extra tracks cannot raise IndexError
            color=colors[i % len(colors)],
            linestyle='--',
            label=f'F{i}')

    # The curves above carry labels; without a legend they are never shown
    plt.xlabel('Time (seconds)')
    plt.ylabel('Frequency')
    plt.legend()

    ipd.display(ipd.Audio(audio, rate=promonet.SAMPLE_RATE))

In [None]:
# Pick one training stem at random and load its cached '-100' audio file
dataset = 'vctk'
partition = promonet.load.partition(dataset)
stems = partition['train']
stem = random.choice(stems)
directory = promonet.CACHE_DIR / dataset
file = directory / f'{stem}-100.wav'
audio = promonet.load.audio(file)

In [None]:
# Draw the STFT with pitch and formant overlays for the loaded audio
plot_formants(audio)

In [None]:
# Pitch and periodicity are requested with gpu=0, then moved to the CPU
pitch, periodicity = promonet.preprocess.from_audio(
    audio, gpu=0, features=['pitch', 'periodicity'])
pitch = pitch.cpu()
periodicity = periodicity.cpu()

# Formants from STFT features with the Viterbi decoder; the second returned
# value is kept as `stages` and plotted item by item in a later cell
formants, stages = promonet.formants.from_audio(
    audio,
    promonet.SAMPLE_RATE,
    pitch=pitch,
    features='stft',
    decoder='viterbi')

# Mean of the formants along dimension 1
print(torch.mean(formants, dim=1))

# One curve per entry of `formants`
for formant in formants:
    plt.plot(formant)

In [None]:
# Compute STFT (only its frame count and frequency axis are used below)
features, frequencies = promonet.formants.stft(
    audio,
    promonet.SAMPLE_RATE)
features, frequencies = features.numpy(), frequencies.numpy()

# Frame times, the voicing mask and the masked pitch do not depend on the
# stage being drawn, so compute them once instead of once per figure
times = np.linspace(
    0,
    promonet.convert.samples_to_seconds(audio.shape[-1]),
    features.shape[0])
unvoiced = periodicity[0] < promonet.VOICING_THRESHOLD
x = torch.clone(pitch).squeeze()
x[unvoiced] = float('nan')

# One figure per stage
# NOTE(review): assumes every stage has the same shape as `features` so that
# `times` and `frequencies` line up with it - confirm.
for stage in stages:
    plt.figure()

    # Plot the stage, dropping the last row and column as for the STFT plot
    plt.pcolormesh(times, frequencies, stage[:-1, :-1].T)
    plt.yscale('symlog')

    # Overlay pitch and show its label
    plt.plot(times, x, linewidth=2, color='black', label='Pitch')
    plt.legend()

In [None]:
# Recompute pitch and periodicity for the loaded audio
pitch, periodicity = promonet.preprocess.from_audio(
    audio, gpu=0, features=['pitch', 'periodicity'])

# Bring each result to the CPU and cast it to 32-bit floats
pitch = pitch.cpu().to(torch.float32)
periodicity = periodicity.cpu().to(torch.float32)

In [None]:
# Run the WORLD baseline analysis, passing the frame count derived from the
# audio length, then draw its pitch next to the previously computed pitch
waveform = audio.numpy().squeeze()
frames = promonet.convert.samples_to_frames(audio.shape[-1])
world_pitch, spectrogram, aperiodicity = promonet.baseline.world.analyze(
    waveform, frames)
plt.plot(world_pitch)
plt.plot(pitch.squeeze())

In [None]:
# Original: load the reference recording and extract its formants
original_audio = promonet.load.audio('original.wav')
original_formants = promonet.formants.from_audio(
    original_audio, promonet.SAMPLE_RATE, features='stft', decoder='viterbi')

# Mean along dimension 1, then one curve per entry
print(torch.mean(original_formants, dim=1))
for formant in original_formants:
    plt.plot(formant)

# Last expression: audio player shown as the cell output
ipd.Audio(original_audio, rate=promonet.SAMPLE_RATE)

In [None]:
# Down: load 'formant-071.wav' and extract its formants
down_audio = promonet.load.audio('formant-071.wav')
down_formants = promonet.formants.from_audio(
    down_audio, promonet.SAMPLE_RATE, features='stft', decoder='viterbi')

# Mean along dimension 1, then one curve per entry
print(torch.mean(down_formants, dim=1))
for formant in down_formants:
    plt.plot(formant)

# Last expression: audio player shown as the cell output
ipd.Audio(down_audio, rate=promonet.SAMPLE_RATE)

In [None]:
# Up: load 'formant-141.wav' and extract its formants
up_audio = promonet.load.audio('formant-141.wav')
up_formants = promonet.formants.from_audio(
    up_audio,
    promonet.SAMPLE_RATE,
    features='stft',
    decoder='viterbi')

# Print only after the formants exist: printing first raises NameError on a
# fresh kernel and shows stale values from a previous run otherwise
print(up_formants.mean(dim=1))
for formant in up_formants:
    plt.plot(formant)

# Last expression: audio player shown as the cell output
ipd.Audio(up_audio, rate=promonet.SAMPLE_RATE)

In [None]:
# Visualize all formants: original (solid), down (dotted) and up (dashed)
# share one color per formant index, with the pitch drawn in black
colors = ['blue', 'orange', 'green', 'red']
plt.figure(figsize=(18, 18))
chunk_size = None  # None plots every frame; an int plots only the first N

# Pitch with frames below the voicing threshold hidden
voiced = torch.clone(pitch)
voiced[periodicity < promonet.VOICING_THRESHOLD] = float('nan')
plt.plot(voiced[0, :chunk_size], color='black')

# Formants are hidden below twice the voicing threshold. The mask does not
# depend on the formant being drawn, so build it once.
# NOTE(review): assumes `periodicity` (computed from `audio` in an earlier
# cell) has the same number of frames as the three formant sets - confirm.
formant_mask = periodicity < 2.0 * promonet.VOICING_THRESHOLD

for color, original, down, up in zip(
    colors,
    original_formants,
    down_formants,
    up_formants
):
    for track, linestyle in (
        (original, 'solid'),
        (down, 'dotted'),
        (up, 'dashed')
    ):
        # Clone before masking: `track[None]` is a view sharing storage with
        # the formant tensor, so masking it directly would write NaN into
        # original_formants / down_formants / up_formants and turn every
        # later `.mean(dim=1)` on them into NaN
        masked = torch.clone(track)[None]
        masked[formant_mask] = float('nan')
        plt.plot(
            masked[0, :chunk_size],
            color=color,
            linestyle=linestyle,
            # Legend entry is the minimum of the frames left visible
            label=f'{torch.min(masked[~torch.isnan(masked)]):.02f}')
plt.legend()

In [None]:
# Means along dimension 1, printed in the order down, original, up
for tracks in (down_formants, original_formants, up_formants):
    print(tracks.mean(dim=1))