In [None]:
import json
import librosa
import pandas as pd
import re
import torch

from IPython.display import display, Audio
from transformers import VitsModel, AutoTokenizer

from model import RandomCNN, run_transfer
from impulse import ImpulsePT, ImpulseSP, average_spectrum_frequencies, modulate_spectrum
from utils import audio_to_spectrum, plot_spectrum, read_audio_spectrum, spectrum_to_audio

### PT

In [None]:
# Load the Fernando Pessoa corpus. The encoding is given explicitly because the
# poems contain accented characters and the platform default is not always UTF-8
# (the JSON written further down in this notebook is UTF-8 as well).
with open("./txts/f_pessoa.json", "r", encoding="utf-8") as json_data:
    fpq = json.load(json_data)

# Keep only entries whose "texto_poesia" field is non-empty and has a len() below 65.
poemas_pt = [t["texto_poesia"] for t in fpq if 0 < len(t["texto_poesia"]) < 65]

In [None]:
# Sanity check: number of selected Portuguese poems and the largest len() among them.
len(poemas_pt), max(map(len, poemas_pt))

In [None]:
# Persist at most the first 512 Portuguese poems as indented UTF-8 JSON,
# keeping non-ASCII characters unescaped.
with open("./txts/pt.json", mode="w", encoding="utf-8") as f:
    json.dump(poemas_pt[:512], f, indent=2, ensure_ascii=False)

### SP

In [None]:
# Read the Spanish poem table and keep the raw values of its "content" column.
p_df = pd.read_csv(filepath_or_buffer="./txts/poemas.csv")
poemas_txt_sp = p_df["content"].values

In [None]:
# Split every Spanish poem into a list of verses.
# Any run of two or more newlines is collapsed to one, and leading/trailing
# newlines are stripped, so stanza breaks of any length never produce empty
# verses. Non-string entries (e.g. NaN for missing CSV cells) are skipped.
poemas_sp = [
    re.sub(r"\n{2,}", "\n", p).strip("\n").split("\n")
    for p in poemas_txt_sp
    if isinstance(p, str)
]

# Keep only poems with fewer than 65 verses.
poemas_sp = [p for p in poemas_sp if len(p) < 65]

In [None]:
# Sanity check: number of selected Spanish poems and the largest verse count among them.
len(poemas_sp), max(map(len, poemas_sp))

In [None]:
# Persist at most the first 512 Spanish poems as indented UTF-8 JSON,
# keeping non-ASCII characters unescaped.
with open("./txts/sp.json", mode="w", encoding="utf-8") as f:
    json.dump(poemas_sp[:512], f, indent=2, ensure_ascii=False)

### TTS

In [None]:
# Checkpoint identifiers for the Portuguese and Spanish TTS models.
model_url_pt, model_url_sp = "facebook/mms-tts-por", "facebook/mms-tts-spa"

# Portuguese: model first, then its tokenizer.
model_pt = VitsModel.from_pretrained(model_url_pt)
tokenizer_pt = AutoTokenizer.from_pretrained(model_url_pt)

# Spanish: same loading order.
model_sp = VitsModel.from_pretrained(model_url_sp)
tokenizer_sp = AutoTokenizer.from_pretrained(model_url_sp)

In [None]:
# Text to speak: the first five items of the first Portuguese poem, joined with " ''' ".
txt = " ''' ".join(poemas_pt[0][:5])
print(txt)

inputs = tokenizer_pt(txt, return_tensors="pt")

# VITS synthesis is stochastic; seed torch right before inference so that
# re-running this cell (or the whole notebook) reproduces the same waveform.
torch.manual_seed(555)
with torch.no_grad():
    output_pt = model_pt(**inputs).waveform

# Tensor -> NumPy array with singleton dimensions removed.
output_pt = output_pt.detach().cpu().numpy().squeeze()
# Bring the model's native sampling rate to the 22050 Hz used for playback below.
output_pt = librosa.resample(output_pt, orig_sr=model_pt.config.sampling_rate, target_sr=22050)

display(Audio(output_pt, rate=22050))

In [None]:
# Text to speak: the first five verses of the first Spanish poem, joined with spaces.
txt = " ".join(poemas_sp[0][:5])
print(txt)

inputs = tokenizer_sp(txt, return_tensors="pt")

# VITS synthesis is stochastic; seed torch right before inference so that
# re-running this cell (or the whole notebook) reproduces the same waveform.
torch.manual_seed(555)
with torch.no_grad():
    output_sp = model_sp(**inputs).waveform

# Tensor -> NumPy array with singleton dimensions removed.
output_sp = output_sp.detach().cpu().numpy().squeeze()
# Bring the model's native sampling rate to the 22050 Hz used for playback below.
output_sp = librosa.resample(output_sp, orig_sr=model_sp.config.sampling_rate, target_sr=22050)

display(Audio(output_sp, rate=22050))

In [None]:
# Style recording. The alternative is kept commented out instead of being
# assigned and immediately overwritten.
#STYLE_PATH = "wavs/birds/MEX_ALTAMIRA_ORIOLE/MEX_Altamira_Oriole-ACelisM_01.mp3"
STYLE_PATH = "wavs/birds/BR_ALAGOAS_FOLIAGE/BR_AL_XC181063-PHINOV36_0101_LIMPO.mp3"

# output_sp was resampled to 22050 Hz in the Spanish TTS cell, so that is the content rate.
content_sr = 22050
content_s, content_p = audio_to_spectrum(output_sp, content_sr)
style_s, style_p, style_sr = read_audio_spectrum(STYLE_PATH)

# Combine the spoken-content spectrum with the bird-song spectrum.
mod_s = modulate_spectrum(content_s, style_s)

In [None]:
# Plot the modulated spectrum, turn it back into a waveform and play it
# at the content sample rate.
plot_spectrum(mod_s)
mod_wav = spectrum_to_audio(mod_s)
display(Audio(mod_wav, rate=content_sr))

### Impulse

In [None]:
# Instantiate the ImpulseSP helper (from impulse.py); get_impulse() is called on it below.
sp = ImpulseSP()

In [None]:
# Fetch an impulse as (waveform, sample rate) and compute its spectrum.
# NOTE(review): what the argument 1717 selects is defined by ImpulseSP.get_impulse — confirm there.
content_wav, content_sr = sp.get_impulse(1717)
content_s, content_p = audio_to_spectrum(content_wav, content_sr)

# content_wav is overwritten with the waveform rebuilt from the spectrum, so the
# player below lets you hear the spectrum round trip rather than the raw impulse.
content_wav = spectrum_to_audio(content_s)
display(Audio(content_wav, rate=content_sr))

In [None]:
# Style recording for the impulse experiments; the commented line is the alternative recording.
STYLE_PATH = "wavs/birds/BR_ALAGOAS_FOLIAGE/BR_AL_XC181063-PHINOV36_0101_LIMPO.mp3"
#STYLE_PATH = "wavs/birds/MEX_ALTAMIRA_ORIOLE/MEX_Altamira_Oriole-ACelisM_01.mp3"

# read_audio_spectrum returns three values, unpacked here as spectrum, a second
# array (named style_p) and the sample rate.
style_s, style_p, style_sr = read_audio_spectrum(STYLE_PATH)

In [None]:
# Content: plot the spectrum, rebuild the waveform from it and play it.
plot_spectrum(content_s)
content_wav = spectrum_to_audio(content_s)
display(Audio(content_wav, rate=content_sr))

# Style: same treatment, played at the style recording's own sample rate.
plot_spectrum(style_s)
style_wav = spectrum_to_audio(style_s)
display(Audio(style_wav, rate=style_sr))

In [None]:
%%time
# Style transfer from the bird recording onto the impulse spectrum.
# (%%time has to remain the first line of the cell.)
# Kernel size of the random CNN; the stride below is derived from it as (kx - 2, ky - 2).
kx = 17
ky = 17
mcnn = RandomCNN(out_channels=400, kernel=(kx, ky), stride=(kx - 2, ky - 2))
# 1000 optimisation steps; style is weighted 1e14 against a content weight of 1.
result = run_transfer(mcnn, content_s, style_s, num_steps=1000, content_weight=1, style_weight=1e14)
# Move to CPU, convert to NumPy, drop singleton dimensions and clamp values to [0, 1000].
result_s = result.cpu().data.numpy().squeeze().clip(0, 1000)

In [None]:
# Plot the transferred spectrum, rebuild the waveform and play it.
plot_spectrum(result_s)
result_wav = spectrum_to_audio(result_s)
# result_s was optimised starting from content_s, so it is played at the content
# sample rate, the same rate the averaged-content result cell uses.
display(Audio(result_wav, rate=content_sr))

In [None]:
# Frequency-averaged version of the content spectrum, scaled down by 1e-4.
# NOTE(review): the reason for the 1e-4 factor is not visible in this notebook — confirm.
content_s_avg = average_spectrum_frequencies(content_s)*1e-4

# Plot it, rebuild the waveform and play it at the content sample rate.
plot_spectrum(content_s_avg)
content_avg_wav = spectrum_to_audio(content_s_avg)
display(Audio(content_avg_wav, rate=content_sr))

In [None]:
%%time
# Same transfer set-up as the previous run, but the content is the averaged
# spectrum and the style weight is 1e11 instead of 1e14.
# (%%time has to remain the first line of the cell.)
kx = 17
ky = 17
mcnn = RandomCNN(out_channels=400, kernel=(kx, ky), stride=(kx - 2, ky - 2))
result = run_transfer(mcnn, content_s_avg, style_s, num_steps=1000, content_weight=1, style_weight=1e11)
# Move to CPU, convert to NumPy, drop singleton dimensions and clamp values to [0, 1000].
result_avg_s = result.cpu().data.numpy().squeeze().clip(0, 1000)

In [None]:
# Plot the result of the averaged-content transfer, rebuild the waveform and
# play it at the content sample rate.
plot_spectrum(result_avg_s)
result_avg_wav = spectrum_to_audio(result_avg_s)
display(Audio(result_avg_wav, rate=content_sr))

In [None]:
# Direct spectrum modulation (no CNN optimisation) of the impulse content by the style recording.
mod_s = modulate_spectrum(content_s, style_s)

# Plot the modulated spectrum, rebuild the waveform and play it at the content sample rate.
plot_spectrum(mod_s)
mod_wav = spectrum_to_audio(mod_s)
display(Audio(mod_wav, rate=content_sr))

### Experimental

In [None]:
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Recordings used by the waveform-convolution experiments below:
# one bird recording as "content", another as "style".
CONTENT_PATH = "wavs/birds/MEX_ALTAMIRA_ORIOLE/MEX_Altamira_Oriole-ACelisM_01.mp3"
STYLE_PATH = "wavs/birds/BR_ALAGOAS_FOLIAGE/BR_AL_XC181063-PHINOV36_0101_LIMPO.mp3"

In [None]:
# Load both recordings as waveforms, each requested at 22050 Hz.
# cw/csr: content waveform and its rate; sw/ssr: style waveform and its rate.
cw, csr = librosa.load(CONTENT_PATH, sr=22050)
sw, ssr = librosa.load(STYLE_PATH, sr=22050)

In [None]:
# Time-domain convolution of the two recordings; mode="same" keeps the centred
# part with the length of the longer input.
# NOTE(review): np.convolve is a direct (non-FFT) convolution, which may be slow
# on full-length recordings — consider an FFT-based convolution if this cell is a bottleneck.
ww = np.convolve(cw, sw, mode="same")

In [None]:
# Plot and play, in order: the content waveform, the style waveform and their
# convolution (the convolution is played at the content sample rate).
for wave, rate in ((cw, csr), (sw, ssr), (ww, csr)):
    plt.plot(wave)
    plt.show()
    display(Audio(wave, rate=rate))

In [None]:
def convconv(wave):
    """Convolve a signal with its own central third and rescale the result to [-1, 1].

    The kernel is the slice of ``wave`` centred on its midpoint and extending
    ``len(wave) // 6`` samples to either side. Only the fully overlapping part
    of the convolution is kept (``mode="valid"``).

    Args:
        wave: 1-D sequence of samples with at least 6 elements.

    Returns:
        1-D float array, min-max scaled to [-1, 1]. If the convolution result is
        constant (zero range) an all-zero array is returned instead of NaN.

    Raises:
        ValueError: if ``wave`` has fewer than 6 samples, so the kernel would be empty.
    """
    lw = len(wave)
    half = lw // 6
    if half == 0:
        raise ValueError("convconv needs at least 6 samples to build a non-empty kernel")
    mid = lw // 2
    cnv = np.convolve(wave, wave[mid - half:mid + half], mode="valid")
    # np.ptp() rather than the ndarray.ptp() method, which was removed in NumPy 2.0.
    span = np.ptp(cnv)
    if span == 0:
        # Constant signal: avoid 0/0 and return a silent, well-defined result.
        return np.zeros_like(cnv, dtype=float)
    return 2.0 * ((cnv - cnv.min()) / span) - 1.0

In [None]:
# Convolve the style recording with its own central third (see convconv).
ww = convconv(sw)

plt.plot(ww)
plt.show()
# ww is derived from sw alone, so it is played at sw's sample rate.
display(Audio(ww, rate=ssr))