In [None]:
import json
import librosa
import pandas as pd
import re
import soundfile as sf
import torch

from IPython.display import display, Audio
from transformers import VitsModel, AutoTokenizer

from utils import modulate_spectrum, plot_spectrum, read_audio_spectrum, spectrum_to_audio

### PT: Portuguese poems

In [None]:
# Load the Portuguese corpus. The encoding is passed explicitly because the
# default used by open() depends on the platform locale, and the companion
# output file in this notebook is written as UTF-8.
with open("./txts/f_pessoa.json", "r", encoding="utf-8") as json_data:
    fpq = json.load(json_data)

# Keep only entries whose "texto_poesia" is non-empty and has len() below 65.
# NOTE(review): whether len() counts characters or lines depends on whether
# "texto_poesia" is a str or a list in the JSON — confirm against the data.
poemas_pt = [t["texto_poesia"] for t in fpq if 0 < len(t["texto_poesia"]) < 65]

In [None]:
# Sanity check: number of Portuguese entries kept, and the largest len() among them.
len(poemas_pt), max(map(len, poemas_pt))

In [None]:
# Persist at most the first 512 Portuguese entries as indented JSON,
# leaving non-ASCII characters unescaped in the UTF-8 output.
with open("./txts/pt.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(poemas_pt[:512], ensure_ascii=False, indent=2))

### SP: Spanish poems

In [None]:
# Read the Spanish poem table and pull out the "content" column's underlying array.
p_df = pd.read_csv("./txts/poemas.csv")
poemas_txt_sp = p_df.loc[:, "content"].values

In [None]:
# Turn each raw poem into a list of lines and keep the ones with fewer than 65 lines.
# Non-string entries are skipped. For the rest: doubled newlines are collapsed
# in a single pass, one leading newline is stripped, then the text is split on "\n".
poemas_sp = [
    lines
    for lines in (
        re.sub(r"^\n", "", p.replace("\n\n", "\n")).split("\n")
        for p in poemas_txt_sp
        if isinstance(p, str)
    )
    if len(lines) < 65
]

In [None]:
# Sanity check: number of Spanish poems kept, and the largest line count among them.
len(poemas_sp), max(map(len, poemas_sp))

In [None]:
# Persist at most the first 512 Spanish poems as indented JSON,
# leaving non-ASCII characters unescaped in the UTF-8 output.
with open("./txts/sp.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(poemas_sp[:512], ensure_ascii=False, indent=2))

### TTS: text-to-speech synthesis

In [None]:
# Hub identifiers of the Portuguese and Spanish MMS-TTS checkpoints.
model_url_pt, model_url_sp = "facebook/mms-tts-por", "facebook/mms-tts-spa"

# Load model then tokenizer for each language (Portuguese first, then Spanish).
model_pt, tokenizer_pt = (
    VitsModel.from_pretrained(model_url_pt),
    AutoTokenizer.from_pretrained(model_url_pt),
)
model_sp, tokenizer_sp = (
    VitsModel.from_pretrained(model_url_sp),
    AutoTokenizer.from_pretrained(model_url_sp),
)

In [None]:
# Synthesise a short Portuguese sample, play it, and save it to ./tmp.wav.
TTS_SEED = 555      # arbitrary fixed seed so re-running the cell gives the same audio
TARGET_SR = 22050   # sample rate used for resampling, playback and the saved WAV

# First five items of the first Portuguese entry, joined with " ''' ".
txt = " ''' ".join(poemas_pt[0][:5])
print(txt)

inputs = tokenizer_pt(txt, return_tensors="pt")

# VITS inference is stochastic; seed right before the forward pass so the
# cell is reproducible regardless of what ran earlier in the kernel.
torch.manual_seed(TTS_SEED)
with torch.no_grad():
    output_pt = model_pt(**inputs).waveform

# Convert to a NumPy array with size-1 dimensions removed, then resample
# from the model's native rate.
output_pt = output_pt.detach().cpu().numpy().squeeze()
output_pt = librosa.resample(output_pt, orig_sr=model_pt.config.sampling_rate, target_sr=TARGET_SR)

display(Audio(output_pt, rate=TARGET_SR))
# NOTE: "./tmp.wav" is also written by the Spanish synthesis cell and read by
# the style-modulation cell, so whichever synthesis cell ran last wins.
sf.write('./tmp.wav', output_pt, TARGET_SR)

In [None]:
# Synthesise a short Spanish sample, play it, and save it to ./tmp.wav.
TTS_SEED = 555      # arbitrary fixed seed so re-running the cell gives the same audio
TARGET_SR = 22050   # sample rate used for resampling, playback and the saved WAV

# First five lines of the first Spanish poem, joined with single spaces.
txt = " ".join(poemas_sp[0][:5])
print(txt)

inputs = tokenizer_sp(txt, return_tensors="pt")

# VITS inference is stochastic; seed right before the forward pass so the
# cell is reproducible regardless of what ran earlier in the kernel.
torch.manual_seed(TTS_SEED)
with torch.no_grad():
    output_sp = model_sp(**inputs).waveform

# Convert to a NumPy array with size-1 dimensions removed, then resample
# from the model's native rate.
output_sp = output_sp.detach().cpu().numpy().squeeze()
output_sp = librosa.resample(output_sp, orig_sr=model_sp.config.sampling_rate, target_sr=TARGET_SR)

display(Audio(output_sp, rate=TARGET_SR))
# NOTE: "./tmp.wav" is also written by the Portuguese synthesis cell and read
# by the style-modulation cell, so whichever synthesis cell ran last wins.
sf.write('./tmp.wav', output_sp, TARGET_SR)

In [None]:
# Bird recording passed to modulate_spectrum as the second ("style") input.
# Alternative recording (previously assigned and immediately overwritten):
#   "wavs/birds/MEX_ALTAMIRA_ORIOLE/MEX_Altamira_Oriole-ACelisM_01.mp3"
STYLE_PATH = "wavs/birds/BR_ALAGOAS_FOLIAGE/BR_AL_XC181063-PHINOV36_0101_LIMPO.mp3"

# Combine the most recently synthesised speech (./tmp.wav) with the style file.
# NOTE(review): presumably returns (spectrum, sample rate, phase/extra) — the
# meaning of mod_p is not visible here; confirm in utils.modulate_spectrum.
mod_s, mod_sr, mod_p = modulate_spectrum("./tmp.wav", STYLE_PATH)

In [None]:
# Show the modulated spectrum first, then convert it back to a waveform
# and render an audio player at the rate returned by modulate_spectrum.
plot_spectrum(mod_s)

mod_wav = spectrum_to_audio(mod_s)
display(Audio(data=mod_wav, rate=mod_sr))