## VO updated

* Does takes instead of a single generation
* Each take is subject to analysis
* Regenerate if failed analysis
* Analysis done:
  * Whisper: Can I understand it
  * Noise: Is it noisy?
  * Silence: Is there lots of silences
  * Voice similarity
  * Does it start and end with a bit of silence

In [1]:
from tqdm.notebook import trange, tqdm
from bark import SAMPLE_RATE

from bark import api
from IPython.display import Audio, display

### Generate audio to analyze

In [None]:
text_temp = 0.7  # Controls how much the text can be changed? 1 means no change, 0.01 is basically noise
waveform_temp = 0.7  # Not sure it controls anything specific value of 1 had lots of background? 0.01 is basically noise
output_full = True
sentence = "I'm tired of life, I just want to die [laughs]"
prompt_name = "/d/code/aicp/speakers/heather.npz"
audio = api.generate_audio(
    sentence,
    silent=True,
    text_temp=text_temp,
    waveform_temp=waveform_temp,
    history_prompt=prompt_name,
    output_full=False,
)

In [None]:
Audio(audio, rate=SAMPLE_RATE)

#### Speech2Text Whisper/Vosk

In [2]:
import whisper
import torch

whisper_model = whisper.load_model("base.en")

In [None]:
from vosk import Model, KaldiRecognizer, SetLogLevel

# You can set log level to -1 to disable debug messages
SetLogLevel(0)

vosk_model = Model(
    "/f/models/speech2text/vosk-model-en-us-0.42-gigaspeech", lang="en-us"
)

# You can also init model by name or with a folder path
# model = Model(model_name="vosk-model-en-us-0.21")
# model = Model("models/en")

In [None]:
whisper_model.transcribe(torch.Tensor(audio))

In [None]:
chunk_size = SAMPLE_RATE


# Process the audio data in chunks.
for i in range(0, len(audio), chunk_size):
    chunk = audio[i : i + chunk_size]
    # Vosk expects the audio data in bytes, so we convert the integers to bytes.
    data = chunk.astype(np.int16).tobytes()

    if rec.AcceptWaveform(data):
        print(rec.Result())

# Get the final results.
print(rec.FinalResult())

#### Noise

* An SNR of 20-25 dB is considered acceptable.
* An SNR of 25-30 dB is considered good.
* An SNR of 30-35 dB is considered excellent.

In [None]:
import numpy as np

In [None]:
def compute_snr(audio_signal):
    """
    Compute the Signal-to-Noise Ratio (SNR) of an audio signal.

    Parameters:
    audio_signal (np.array): The audio signal.

    Returns:
    float: The SNR of the audio signal in decibels.
    """

    # Compute the Fast Fourier Transform (FFT)
    fft = np.fft.fft(audio_signal)

    # Compute the Power Spectral Density (PSD)
    psd = np.abs(fft) ** 2

    # Define the threshold as the mean power
    threshold = np.mean(psd)

    # Separate the signal and the noise
    signal_psd = psd[psd > threshold]
    noise_psd = psd[psd <= threshold]

    # Compute the Signal-to-Noise Ratio (SNR)
    snr = 10 * np.log10(np.mean(signal_psd) / np.mean(noise_psd))

    return snr

In [None]:
compute_snr(audio)

### Silence

Detecting silence in clips

In [None]:
import librosa

In [None]:
def detect_silence(audio_signal, sampling_rate, window_length=0.1):
    normalized_signal = audio_signal / np.max(np.abs(audio_signal))
    samples_per_window = int(window_length * sampling_rate)
    rms_energy = librosa.feature.rms(
        y=normalized_signal,
        frame_length=samples_per_window,
        hop_length=samples_per_window,
        center=True,
    )[0]

    # Estimate the silence threshold as the minimum non-zero RMS energy
    silence_threshold = np.min(rms_energy[rms_energy > 0])

    silence = rms_energy < silence_threshold
    return silence


def compute_silent_periods(silence, sampling_rate, window_length=0.1):
    change_indices = np.where(np.diff(silence))[0]
    change_indices = np.concatenate(([0], change_indices, [len(silence) - 1]))
    periods = [
        (i * window_length, (i + 1) * window_length, silence[i]) for i in change_indices
    ]
    print(f"Silence at the beginning: {periods[0][1] if periods[0][2] else 0} seconds")
    print(
        f"Silence in the middle: {sum((end - start) for start, end, is_silent in periods if is_silent)} seconds"
    )
    print(
        f"Silence at the end: {periods[-1][1] - periods[-1][0] if periods[-1][2] else 0} seconds"
    )
    return periods

In [None]:
# Detect silence
silence = detect_silence(audio, SAMPLE_RATE)

# Compute the silent periods
silent_periods = compute_silent_periods(audio, SAMPLE_RATE)

### Enhance 

In [3]:
from vocos import Vocos
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vocos = Vocos.from_pretrained("charactr/vocos-encodec-24khz").to(device)

In [4]:
from typing import Optional, Union, Dict

import numpy as np
from bark.generation import generate_coarse, generate_fine


def semantic_to_audio_tokens(
    semantic_tokens: np.ndarray,
    history_prompt: Optional[Union[Dict, str]] = None,
    temp: float = 0.7,
    silent: bool = False,
    output_full: bool = False,
):
    coarse_tokens = generate_coarse(
        semantic_tokens,
        history_prompt=history_prompt,
        temp=temp,
        silent=silent,
        use_kv_caching=True,
    )
    fine_tokens = generate_fine(coarse_tokens, history_prompt=history_prompt, temp=0.5)

    if output_full:
        full_generation = {
            "semantic_prompt": semantic_tokens,
            "coarse_prompt": coarse_tokens,
            "fine_prompt": fine_tokens,
        }
        return full_generation
    return fine_tokens

In [5]:
from bark import text_to_semantic

history_prompt = "/d/code/aicp/speakers/heather.npz"
sentence = "I'm tired of life, I just want to die [laughs]"
semantic_tokens = text_to_semantic(
    sentence,
    history_prompt=history_prompt,
    temp=0.7,
    silent=False,
)
audio_tokens = semantic_to_audio_tokens(
    semantic_tokens,
    history_prompt=history_prompt,
    temp=0.7,
    silent=False,
    output_full=False,
)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:03<00:00, 29.36it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19/19 [00:10<00:00,  1.88it/s]


In [6]:
from bark.generation import codec_decode

encodec_output = codec_decode(audio_tokens)

import torchaudio

# Upsample to 44100 Hz for better reproduction on audio hardware
encodec_output = torchaudio.functional.resample(
    torch.from_numpy(encodec_output), orig_freq=24000, new_freq=44100
)
Audio(encodec_output, rate=44100)

In [7]:
audio_tokens_torch = torch.from_numpy(audio_tokens).to(device)
features = vocos.codes_to_features(audio_tokens_torch)
vocos_output = vocos.decode(
    features, bandwidth_id=torch.tensor([2], device=device)
)  # 6 kbps
# Upsample to 44100 Hz for better reproduction on audio hardware
vocos_output = torchaudio.functional.resample(
    vocos_output, orig_freq=24000, new_freq=44100
).cpu()
Audio(vocos_output.numpy(), rate=44100)

In [10]:
whisper_model.transcribe(vocos_output)

AttributeError: 'list' object has no attribute 'compression_ratio'

## Now let's combine the analysis and take generation

In [None]:
from tqdm.notebook import trange, tqdm
from bark import SAMPLE_RATE, api, preload_models
from IPython.display import Audio, display
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import librosa
import numpy as np
import whisper
import torch

In [None]:
## Preload models
model = whisper.load_model("small.en")
preload_models(
    text_use_gpu=True,
    text_use_small=False,
    coarse_use_gpu=True,
    coarse_use_small=False,
    fine_use_gpu=True,
    fine_use_small=False,
    codec_use_gpu=True,
    force_reload=False,
)

In [None]:
def compute_snr(audio_signal):
    """
    Compute the Signal-to-Noise Ratio (SNR) of an audio signal.

    Parameters:
    audio_signal (np.array): The audio signal.

    Returns:
    float: The SNR of the audio signal in decibels.
    """

    # Compute the Fast Fourier Transform (FFT)
    fft = np.fft.fft(audio_signal)

    # Compute the Power Spectral Density (PSD)
    psd = np.abs(fft) ** 2

    # Define the threshold as the mean power
    threshold = np.mean(psd)

    # Separate the signal and the noise
    signal_psd = psd[psd > threshold]
    noise_psd = psd[psd <= threshold]

    # Compute the Signal-to-Noise Ratio (SNR)
    snr = 10 * np.log10(np.mean(signal_psd) / np.mean(noise_psd))

    return snr


def compute_similarity(sentence1, sentence2):
    # Load the BERT model
    model = SentenceTransformer("bert-base-nli-mean-tokens")

    # Compute the embeddings for the sentences
    embeddings = model.encode([sentence1, sentence2])

    # Compute the cosine similarity between the embeddings
    similarity = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]

    return similarity * 100

In [None]:
def generate_speech(sentence, history_prompt):
    audio = api.generate_audio(
        sentence,
        silent=True,
        text_temp=0.7,
        waveform_temp=0.7,
        history_prompt=history_prompt,
        output_full=False,
    )

    transcribed = model.transcribe(torch.Tensor(audio))["text"]
    print(f"Transcribed: {transcribed}")
    ## Test similarity
    transcribed_similarity = compute_similarity(
        transcribed, sentence.replace("[laughs]", "")
    )
    snr = compute_snr(audio)
    print(f"snr: {snr} similarity: {transcribed_similarity}")
    if (snr < 20) or (transcribed_similarity < 85):
        # Audio is noisy and or not clear
        print("We think it's bad, but here it is anyway")
        display(Audio(audio, rate=SAMPLE_RATE))
        return audio, True

    return audio, False

In [None]:
retries = 20
while retries > 0:
    audio, is_bad = generate_speech("Hey there Heather [laughs]", None)
    if not is_bad:
        break
    print("Doing another take")
    retries -= 1
if (retries == 0) and is_bad:
    print("Still bad but exited anyway")
print("Finally got something")
Audio(audio, rate=SAMPLE_RATE)