In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json
from pathlib import Path

import IPython.display as ipd
import torch

import promonet

  from .autonotebook import tqdm as notebook_tqdm


In [70]:
# Conditions to consider
conditions = [
    'base',
    # 'ablate-augment',
    # 'ablate-multiloud',
    # 'ablate-sppg',
    # 'ablate-variable-pitch',
    # 'ablate-viterbi',
    # 'mels',
    # 'mels-ours',
    'psola',
    'world'
]
edits = [
    # 'reconstructed-100',
    # 'scaled-071',
    # 'scaled-141',
    'shifted-071',
    'shifted-141',
    # 'stretched-071',
    # 'stretched-141'
]
metrics = [
    'pitch',
    'periodicity',
    'loudness',
    'ppg',
    'wer',
    'speaker_similarity',
    'formant-average',
]

## Parse objective results on a set of conditions

In [71]:
def parse_results(conditions, edits, metric, dataset):
    results = {condition: {} for condition in conditions}
    for condition in conditions:
        with open(f'/repos/promonet/results/{condition}/{dataset}/results.json') as file:
            for edit, metrics in json.load(file).items():
                if edit not in edits:
                    continue
                # print(edit, json.dumps(metrics, indent=4, sort_keys=True))
                try:
                    results[condition][edit] = metrics[metric]
                except KeyError:
                    pass
    for condition in conditions:
        values = list(results[condition].values())
        results[condition]['average'] = sum(values) / len(values)
    print(
        json.dumps(
            {condition: results[condition]['average'] for condition in conditions},
            indent=4,
            sort_keys=True))


In [72]:
for metric in metrics:
    print(metric)
    parse_results(conditions, edits, metric, 'vctk')

pitch
{
    "base": 22.47694954276085,
    "psola": 21.637411415576935,
    "world": 16.645088978111744
}
periodicity
{
    "base": 0.08970813108900742,
    "psola": 0.11457670857423691,
    "world": 0.13748958280400075
}
loudness
{
    "base": 2.1688455637299335,
    "psola": 1.664705951320713,
    "world": 1.932305153301618
}
ppg
{
    "base": 0.13720866292715073,
    "psola": 0.10912970080971718,
    "world": 0.2705831900238991
}
wer
{
    "base": 0.023517733439803123,
    "psola": 0.005624999990686774,
    "world": 0.005543831503018737
}
speaker_similarity
{
    "base": 0.7259168028831482,
    "psola": 0.7796521186828613,
    "world": 0.6680735647678375
}
formant-average


ZeroDivisionError: division by zero

## File-level inspection of objective results

In [None]:
# Load fine-grained objective results
condition = 'sppg-percentile-085'
results = {}
for file in Path(f'/repos/promonet/results/{condition}/vctk').glob('0*.json'):
    with open(file) as file:
        results |= json.load(file)['objective']['raw']

In [None]:
# Sort files by a specific metric
metric = 'wer'
metric_results = {}
for key, edit_metrics in results.items():
    edit = list(edit_metrics.keys())[0]
    if 'original' not in key:
        continue
    metric_results[key] = edit_metrics[edit][metric]
metric_results = dict(sorted(metric_results.items(), key=lambda item: item[1], reverse=True))

In [None]:
subjective_directory = Path('/repos/promonet/eval/subjective')
objective_directory = Path('/repos/promonet/eval/objective')
for i, stem in enumerate(metric_results):

    if i > 10:
        break
    print(stem, metric_results[stem])
    predicted = promonet.load.audio(subjective_directory / condition / f'{stem}.wav')
    ipd.display(ipd.Audio(predicted, rate=promonet.SAMPLE_RATE))
    parts = stem.split('-')
    file = subjective_directory / 'original' / f'{"-".join(parts[:3])}-original-100.wav'
    print(file)
    ipd.display(ipd.Audio(file))
    print(promonet.load.text(objective_directory / condition / f'{stem}.txt'))
    print(promonet.load.text(objective_directory / 'original' / f'{stem}.txt'))
    frames = promonet.convert.samples_to_frames(predicted.shape[-1])
    figure = promonet.plot.from_features(
        predicted,
        torch.load(objective_directory / condition / f'{stem}-viterbi-pitch.pt'),
        torch.load(objective_directory / condition / f'{stem}-viterbi-periodicity.pt'),
        promonet.loudness.band_average(torch.load(objective_directory / condition / f'{stem}-loudness.pt'), 1),
        promonet.load.ppg(objective_directory / condition / f'{stem}-ppg.pt', frames),
        torch.load(objective_directory / 'original' / f'{stem}-viterbi-pitch.pt'),
        torch.load(objective_directory / 'original' / f'{stem}-viterbi-periodicity.pt'),
        promonet.loudness.band_average(torch.load(objective_directory / 'original' / f'{stem}-loudness.pt'), 1),
        promonet.load.ppg(objective_directory / 'original' / f'{stem}-ppg.pt', frames))
    figure.show()