In [None]:
# Switch the kernel's working directory to the project checkout; the relative
# paths used in later cells (`test/audios/...`, `voiceclonar/...`) resolve against it.
%cd ~/tesis/VoiceClonAR

In [None]:
from pathlib import Path

from voiceclonar.quality_assessment.evaluation import SyntheticSpeechQA

In [None]:
# One folder of audio per synthesis system under evaluation.
tacotron2_path, fastpitch_path, fastpitch_rvc_path = map(
    Path,
    (
        "test/audios/tacotron2",
        "test/audios/fastpitch",
        "test/audios/fastpitch_rvc",
    ),
)

# A single evaluator is shared by all folders; each folder is evaluated in the
# order tacotron2 -> fastpitch -> fastpitch_rvc with the same
# reference_suffix="reference" argument.
evaluator = SyntheticSpeechQA()
tacotron2_results, fastpitch_results, fastpitch_rvc_results = (
    evaluator.evaluate_folder(folder, reference_suffix="reference")
    for folder in (tacotron2_path, fastpitch_path, fastpitch_rvc_path)
)

In [None]:
# Render the three per-system result objects one after another.
display(tacotron2_results, fastpitch_results, fastpitch_rvc_results)

In [None]:
import pandas as pd

# Stack the per-system results into one table with a fresh 0..n-1 index.
per_system_results = [tacotron2_results, fastpitch_results, fastpitch_rvc_results]
pd.concat(per_system_results, ignore_index=True)

In [None]:
import nemo.collections.asr as nemo_asr
# Identifier of the pretrained speaker-label checkpoint to fetch.
titanet_checkpoint = "nvidia/speakerverification_en_titanet_large"
speaker_model = nemo_asr.models.EncDecSpeakerLabelModel.from_pretrained(
    titanet_checkpoint
)


In [None]:
from voiceclonar.quality_assessment.feature_extraction import TitaNetEmbeddingExtractor

# Audio clips to embed: two synthetic variants plus the reference for
# `arf_00295`, and one synthetic variant plus the reference for `arm_00610`.
arf_00295_fastpitch_rvc = "test/audios/arf_00295_fastpitch_rvc.wav"
arf_00295_fastpitch = "test/audios/arf_00295_fastpitch.wav"
arf_00295_reference = "test/audios/arf_00295_reference.wav"
arm_00610_fastpitch_rvc = "test/audios/arm_00610_fastpitch_rvc.wav"
arm_00610_reference = "test/audios/arm_00610_reference.wav"

feature_extractor = TitaNetEmbeddingExtractor()

# Run the extractor over every clip, in the order listed, and bind each result
# to its own name.
(
    embed_arf_fp_rvc,
    embed_arf_fp,
    embed_arf_ref,
    embed_arm_fp_rvc,
    embed_arm_ref,
) = (
    feature_extractor.process_audio(wav_file)
    for wav_file in (
        arf_00295_fastpitch_rvc,
        arf_00295_fastpitch,
        arf_00295_reference,
        arm_00610_fastpitch_rvc,
        arm_00610_reference,
    )
)

In [None]:
# Compare the `arm_00610` reference embedding with the embedding of its
# fastpitch_rvc counterpart; the value returned by `measure_similarity` is the
# cell's output.
evaluator = SyntheticSpeechQA()
arm_similarity = evaluator.measure_similarity(embed_arm_ref, embed_arm_fp_rvc)
arm_similarity

In [None]:
from voiceclonar.utils import load_config

# Location of the quality-assessment YAML config, relative to the working directory.
CFG_PATH = Path("voiceclonar/quality_assessment/cfg.yaml")


In [None]:
# Load the config at CFG_PATH; later cells read `cfg.metrics.nisqa` from it.
cfg = load_config(CFG_PATH)

In [None]:
from voiceclonar.quality_assessment.nisqa.NISQA_model import nisqaModel

# Audio clips available for scoring (only `arf_00295_fastpitch_rvc` is used below).
arf_00295_fastpitch_rvc = "test/audios/arf_00295_fastpitch_rvc.wav"
arf_00295_fastpitch = "test/audios/arf_00295_fastpitch.wav"
arf_00295_reference = "test/audios/arf_00295_reference.wav"
arm_00610_fastpitch_rvc = "test/audios/arm_00610_fastpitch_rvc.wav"
arm_00610_reference = "test/audios/arm_00610_reference.wav"

model_name = "nisqa"

# Work on a *copy* of the config section's attributes. Binding `__dict__`
# directly would alias the live config object, so the assignments below (and
# anything the model may write into its args) would silently modify `cfg`.
model_args = dict(vars(cfg.metrics.nisqa))

# Checkpoint path: "<configured weights entry>/<model_name>.tar".
model_args["pretrained_model"] = f"{model_args['weights']}/{model_name}.tar"
# Clip to be scored. NOTE(review): `deg` is presumably NISQA's "degraded"
# input file argument — confirm against the model's argument list.
model_args["deg"] = arf_00295_fastpitch_rvc

# Build the model from the argument dict and run prediction; the returned
# object is the cell's output.
nisqaModel(model_args).predict()

In [None]:
from voiceclonar.quality_assessment.nisqa.NISQA_model import nisqaModel

# Re-run the prediction with the `model_args` prepared in the previous cell and
# keep the result (indexed with `.iloc` in the next cell) for inspection.
df = nisqaModel(model_args).predict()

In [None]:
# Value at first row, second column of the prediction table.
# NOTE(review): presumably the predicted score for the single input clip —
# selecting by column name would be more robust than position; confirm the schema.
df.iloc[0,1]

### Fréchet Audio Distance

In [None]:
# Make sure the working directory is the project checkout so the relative
# `test/audios/...` paths in this section resolve.
%cd ~/tesis/VoiceClonAR

In [None]:
from pathlib import Path
import torchaudio

# Sample rate every clip is converted to before further processing.
TARGET_SAMPLE_RATE = 16_000

# Clip name -> path of the audio file on disk.
audio_paths = {
    "arf_00295_fastpitch_rvc": "test/audios/arf_00295_fastpitch_rvc.wav",
    "arf_00295_fastpitch": "test/audios/arf_00295_fastpitch.wav",
    "arf_00295_reference": "test/audios/arf_00295_reference.wav",
    "arm_00610_fastpitch_rvc": "test/audios/arm_00610_fastpitch_rvc.wav",
    "arm_00610_reference": "test/audios/arm_00610_reference.wav",
}

# Clip name -> resampled waveform as a NumPy array. The waveforms go into their
# own mapping rather than overwriting the path entries, so the paths stay
# available and no dict is mutated while it is being iterated.
audios_dict = {}
for name, path in audio_paths.items():
    signal_array, sample_rate = torchaudio.load(path)
    # The resampler is built per file because each file may have its own native rate.
    resampler = torchaudio.transforms.Resample(sample_rate, TARGET_SAMPLE_RATE)
    # `squeeze()` drops size-1 dimensions.
    # NOTE(review): assumes single-channel clips so the result is 1-D — confirm.
    audios_dict[name] = resampler(signal_array).squeeze().numpy()

In [None]:
from voiceclonar.quality_assessment.frechet_audio_distance import FrechetAudioDistance

# Scorer configured to use the `vggish` model at 16 kHz, with PCA, activation
# and verbose output all switched off.
fad_settings = dict(
    model_name="vggish",
    sample_rate=16000,
    use_pca=False,
    use_activation=False,
    verbose=False,
)
frechet = FrechetAudioDistance(**fad_settings)

In [None]:
# Score the two audio *directories* against each other (fastpitch vs. fastpitch_rvc
# for `arf_00295`); the returned value is the cell's output.
frechet.score("test/audios/arf_00295_fastpitch/", "test/audios/arf_00295_fastpitch_rvc/")

In [None]:
# Same comparison, but passing the individual .wav files instead of their folders.
# NOTE(review): the previous call passes directories — confirm that `score`
# also accepts single-file paths.
frechet.score("test/audios/arf_00295_fastpitch/arf_00295_fastpitch.wav", "test/audios/arf_00295_fastpitch_rvc/arf_00295_fastpitch_rvc.wav")