In [44]:
import torch
import ppgs
from pathlib import Path
import promonet
import pysodic
import numpy as np
import json
import torchaudio
import penn

In [3]:
jsd = ppgs.evaluate.metrics.jensenShannonDivergence

In [5]:
similarity_matrix = torch.load('balanced_similarity.pt')
def norm(ppg):
    return torch.mm(similarity_matrix.T ** 1, ppg)

def ppg_distance(ppg0, ppg1):
    return jsd(norm(ppg0), norm(ppg1), reduction=None).sum()

In [8]:
model = 'w2v2fb-ppg'
subjective_dir = Path('/repos/promonet/data/cache/ppgs-subjective')
assert subjective_dir.exists()

In [26]:
all_representations = [
    'w2v2fb',
    'w2v2fc',
    'bottleneck',
    'mel',
    'encodec'
]
all_models = []
all_models += [f'{rep}-ppg' for rep in all_representations]
all_models += [f'{rep}-latents' for rep in all_representations]

In [27]:
stems = [p.stem for p in list((subjective_dir / 'original').glob('*-100.wav'))]
len(stems)

100

In [29]:
SHIFT_CENTS = [-200, 200]
ratios = [2 ** (cents / 1200) for cents in SHIFT_CENTS]
ratio_strings = [f'{int(ratio * 100):03d}' for ratio in ratios]
ratio_strings

['089', '112']

In [33]:
for model in all_models:
    total = 0
    count = 0
    
    for stem in stems:
        for ratio in ratio_strings:
            shifted_stem = stem[:stem.rindex('-')] + f'-{ratio}'
            original_ppg = torch.load(subjective_dir / 'original' / (stem + '-w2v2fb-ppg.pt'))
            other_ppg = torch.load(subjective_dir / model / (shifted_stem + '-w2v2fb-ppg.pt'))
            if original_ppg.shape[-1] - other_ppg.shape[-1] <= 2:
                original_ppg = original_ppg[..., :other_ppg.shape[-1]]
            assert original_ppg.shape == other_ppg.shape, f'{original_ppg.shape}, {other_ppg.shape}'

        total += ppg_distance(original_ppg, other_ppg)
        count += other_ppg.shape[-1]
    print(f'{model}: {total/count}')

w2v2fb-ppg: 0.04583841562271118
w2v2fc-ppg: 0.07784371823072433
bottleneck-ppg: 0.07088759541511536
mel-ppg: 0.05224919691681862
encodec-ppg: 0.05325319990515709
w2v2fb-latents: 0.0397956520318985
w2v2fc-latents: 0.04161763936281204
bottleneck-latents: 0.12780499458312988
mel-latents: 0.03635498881340027
encodec-latents: 0.039325352758169174


In [39]:
all_representations = [
    'w2v2fb',
    'w2v2fc',
    'bottleneck',
    'mel',
    'encodec'
]
all_models = []
all_models += [f'{rep}-ppg' for rep in all_representations]
# all_models += [f'{rep}-latents' for rep in all_representations]
results = {}
for model in all_models:
    
    for stem in stems:
        key = f'{model}-{stem[:-4]}'
        total = 0
        count = 0
        for ratio in ratio_strings:
            shifted_stem = stem[:stem.rindex('-')] + f'-{ratio}'
            original_ppg = torch.load(subjective_dir / 'original' / (stem + '-w2v2fb-ppg.pt'))
            other_ppg = torch.load(subjective_dir / model / (shifted_stem + '-w2v2fb-ppg.pt'))
            if original_ppg.shape[-1] - other_ppg.shape[-1] <= 2:
                original_ppg = original_ppg[..., :other_ppg.shape[-1]]
            assert original_ppg.shape == other_ppg.shape, f'{original_ppg.shape}, {other_ppg.shape}'

            total += ppg_distance(original_ppg, other_ppg)
            count += other_ppg.shape[-1]
        results[key] = (total/count).item()
with open('jsd.json', 'w+') as f:
    json.dump(results, f)

In [40]:
def audio_to_pitch(audio, sample_rate):
    return penn.from_audio(
        audio,
        sample_rate,
        hopsize=promonet.convert.samples_to_seconds(promonet.HOPSIZE),
        fmin=promonet.FMIN,
        fmax=promonet.FMAX,
        pad=True,
        interp_unvoiced_at=0.1625,
        gpu=0
    )

In [49]:
all_representations = [
    'w2v2fb',
    'w2v2fc',
    'bottleneck',
    'mel',
    'encodec'
]
all_models = []
all_models += [f'{rep}-ppg' for rep in all_representations]
# all_models += [f'{rep}-latents' for rep in all_representations]
results = {}
for model in all_models:
    
    for stem in stems:
        key = f'{model}-{stem[:-4]}'
        total = 0
        count = 0
        metrics = pysodic.metrics.Pitch()
        for ratio_val, ratio in zip(ratios, ratio_strings):
            shifted_stem = stem[:stem.rindex('-')] + f'-{ratio}'
            original_audio, original_sr = torchaudio.load(subjective_dir / 'original' / (stem + '.wav'))
            other_audio, other_sr = torchaudio.load(subjective_dir / model / (shifted_stem + '.wav'))
            
            original_pitch, original_periodicity = audio_to_pitch(original_audio, original_sr)
            original_pitch = original_pitch * ratio_val
            other_pitch, other_periodicity = audio_to_pitch(other_audio, other_sr)
            
            if original_pitch.shape[-1] - other_pitch.shape[-1] <= 2:
                original_pitch = original_pitch[..., :other_pitch.shape[-1]]
                original_periodicity = original_periodicity[..., :other_periodicity.shape[-1]]
            assert original_pitch.shape == other_pitch.shape, f'{original_pitch.shape}, {other_pitch.shape}'
            assert original_periodicity.shape == other_periodicity.shape, f'{original_periodicity.shape}, {other_periodicity.shape}'
            
            original_voicing = original_periodicity > 0.1625
            other_voicing = other_periodicity > 0.1625

            metrics.update(original_pitch, original_voicing, other_pitch, other_voicing)
        results[key] = metrics()
with open('pitch.json', 'w+') as f:
    json.dump(results, f)

In [61]:
all_representations = [
    'w2v2fb',
    'w2v2fc',
    'bottleneck',
    'mel',
    'encodec'
]
all_models = []
all_models += [f'{rep}-ppg' for rep in all_representations]
# all_models += [f'{rep}-latents' for rep in all_representations]
results = {}
for model in all_models:

    for stem in stems:
        key = f'{model}-{stem[:-4]}'
        total = 0
        count = 0
        metrics = promonet.evaluate.metrics.WER(gpu=0)
        for ratio in ratio_strings:
            shifted_stem = stem[:stem.rindex('-')] + f'-{ratio}'
            speaker = stem.split('-')[0]
            stem_sans_speaker = stem[stem.index('-')+1:][:-4]
            text = promonet.load.text(promonet.CACHE_DIR / 'vctk' / speaker / (stem_sans_speaker + '.txt'))
            metrics.update(text, str(subjective_dir / model / (shifted_stem + '.wav')))
        results[key] = metrics()
with open('wer.json', 'w+') as f:
    json.dump(results, f)