In [56]:
import os
import math
import numpy as np
import torch
import torchaudio
from torch.nn.functional import softmax

from IPython.display import Audio
from scipy.io.wavfile import write as write_wav
# from playsound import playsound

from transformers import BertTokenizer

import bark
from bark import SAMPLE_RATE, generate_audio
from bark.generation import (load_codec_model, generate_text_semantic, generate_coarse, generate_fine, codec_decode,
    load_model)
from encodec.utils import convert_audio

def hear_fine(fine_tokens):
    from bark.generation import SAMPLE_RATE
    write_wav("/dev/shm/autoplayme.wav", SAMPLE_RATE, codec_decode(fine_tokens))

def hear_coarse(coarse_tokens):
    hear_fine(generate_fine(coarse_tokens))

def hear_semantic(semantic_tokens):
    hear_coarse(generate_coarse(semantic_tokens))

In [17]:
def hear_text(text, history_prompt=None, sample_count=1):
    audio_array = np.array([], dtype=float)
    for n in range(sample_count):
        audio_array = np.hstack([audio_array, generate_audio(text, history_prompt=history_prompt)])
    write_wav(f"/dev/shm/autoplayme.wav", SAMPLE_RATE, audio_array)

In [3]:
def save_history(semantic, codes, file_path):
    if codes.shape[0] == 8: # Have fine codes
        coarse_codes = codes[:2, :]
        fine_codes = codes
    elif codes.shape[0] == 2: # Have coarse codes only
        coarse_codes = codes
        fine_codes = generate_fine(coarse_codes)
    else:
        raise Exception("Must provide a set of coarse or fine audio tokens to save")
    np.savez(file_path, fine_prompt=fine_codes, coarse_prompt=coarse_codes, semantic_prompt=semantic)

In [68]:
def custom_gen_semantic(text, duration, sem_history=np.array([]), logit_mods=None, top_k=50, temp=1.0, use_gpu=True):
    """ Custom inference method that can generate semantic tokens of a fixed duration and also return logits. """
    batch_size = 1 # TODO: Implement batch sizes larger than 1
    from bark.generation import (TEXT_ENCODING_OFFSET, TEXT_PAD_TOKEN, SEMANTIC_PAD_TOKEN, SEMANTIC_INFER_TOKEN,
        SEMANTIC_VOCAB_SIZE, SEMANTIC_RATE_HZ)
    
    # Sets an np array to a given length by either inserting padding or truncating
    def set_len(arr, length, pad_value):
        if len(arr) >= length:
            return arr[:length]
        else:
            return np.pad(arr, (0, length - len(arr)), mode="constant", constant_values=pad_value)
    
    text = bark.generation._normalize_whitespace(text)
    assert len(text.strip()) > 0
    
    model_container = load_model(use_gpu=use_gpu, model_type="text")
    model = model_container["model"]
    tokenizer = model_container["tokenizer"]
    device = next(model.parameters()).device
    
    tokenized = np.array(bark.generation._tokenize(tokenizer, text)) + TEXT_ENCODING_OFFSET
    x = np.hstack([set_len(tokenized, 256, TEXT_PAD_TOKEN),
                   set_len(sem_history, 256, SEMANTIC_PAD_TOKEN), # Semantic history
                   np.array([SEMANTIC_INFER_TOKEN])])
    x = torch.stack([torch.from_numpy(x.astype(np.int64))] * batch_size)
    
    # x is a batch of 1. Its contents are 256 text tokens, then 256 semantic tokens (history), then the semantic
    # infer token
    prompt_len = 256 + 256 + 1
    assert x.shape == (batch_size, prompt_len)
    
    kv_cache = None
    with torch.inference_mode(), bark.generation.autocast():
        x = x.to(device)
        sem_toks_to_gen = max(1, math.ceil(duration * SEMANTIC_RATE_HZ))
        assert (logit_mods is None) or (len(logit_mods) == sem_toks_to_gen)
        logits_per_tok = []
        for n in range(sem_toks_to_gen):
            # print(f"Sem token {n+1}/{sem_toks_to_gen}")
            
            x_in = x[:, [-1]] if kv_cache is not None else x
            logits, kv_cache = model(x_in, merge_context=True, use_cache=True, past_kv=kv_cache)
            sem_logits = logits[0, 0, :SEMANTIC_VOCAB_SIZE]
            logits_per_tok.append(sem_logits)
            
            if logit_mods is not None:
                sem_logits += logit_mods[n]

            _, top_indices = torch.topk(sem_logits, top_k)
            top_k_mask = torch.full(sem_logits.shape, -math.inf).to(device)
            top_k_mask[top_indices] = 0
            probs = softmax((sem_logits + top_k_mask) / temp, dim=-1)
            y = torch.multinomial(probs, num_samples=1)
            x = torch.cat((x, y.unsqueeze(0)), dim=1)
        
        sem = x.detach().cpu().numpy().squeeze()[prompt_len:]
        
    return sem, logits_per_tok

def custom_gen_coarse(semantic, use_gpu=True):
    sliding_window_len = 60
    
    from bark.generation import (COARSE_RATE_HZ, SEMANTIC_RATE_HZ, N_COARSE_CODEBOOKS)
    assert semantic.min() >= 0 and semantic.max() < SEMANTIC_VOCAB_SIZE
    
    # No. of coarse tokens generated per semantic token input
    coarse_per_semantic_tok = COARSE_RATE_HZ / SEMANTIC_RATE_HZ * N_COARSE_CODEBOOKS
    
    # Load model
    model = load_model(use_gpu=use_gpu, model_type="coarse")
    device = next(model.parameters()).device
    
    # Placeholder history data for now
    semantic_history = np.array([], dtype=np.int32)
    coarse_history = np.array([], dtype=np.int32)
    
    # Number of coarse tokens to generate
    count_to_gen = int(np.floor(len(x) * coarse_per_semantic_tok / N_COARSE_CODEBOOKS) * N_COARSE_CODEBOOKS)
    assert count_to_gen > 0
    
    count_gened = 0 # Number generated
    
    x_semantic = np.hstack([semantic_history, semantic]).astype(np.int32)
    first_new_sem_index = len(semantic_history) # Index of first token of new semantic input, after history
    x_coarse = coarse_history.astype(np.int32)
    with torch.inference_mode(), bark.generation.autocast():
        x_semantic = torch.from_numpy(x_semantic).unsqueeze(0).to(device)
        x_coarse   = torch.from_numpy(x_coarse)  .unsqueeze(0).to(device)
        count_window_steps = math.ceil(count_to_gen / sliding_window_len)
        for _ in range(count_window_steps):
            sem_index = first_new_sem_index + round(count_gened / coarse_per_semantic_tok)
            
            x_in = x_semantic[:, max(0, sem_index - max_semantic_history) :]
            x_in = x_in[:, :256]
        
            logits, _ = model(x)

class CoarseToSemantic:
    def __init__(self, text, target_coarse, duration, use_gpu=True):
        self.text = text
        self.target_coarse = target_coarse
        self.duration = duration
        self.use_gpu = use_gpu
        
        self.logit_mod_decay = 0.99
        self.learning_rate = 100
        
        self.sems = []
        self.logit_mods = None
        self.sim_ema = None
        
        self.best_sem = None
        self.best_coarse = None
        self.best_sem_sim = -math.inf

    def run(self, step_count, silent=False):
        for _ in range(step_count):
            
            # Sample semantic and coarse tokens
            sem, logits_per_tok = custom_gen_semantic(self.text, self.duration, logit_mods=self.logit_mods, use_gpu=self.use_gpu)
            sampled_coarse = generate_coarse(sem, use_gpu=self.use_gpu, silent=True)
            assert sampled_coarse.shape == self.target_coarse.shape
            
            # Compute similarity and update EMA
            similarity = (sampled_coarse == self.target_coarse).sum() / (sampled_coarse.shape[0] * sampled_coarse.shape[1])
            if self.sim_ema is None:
                self.sim_ema = similarity
            else:
                self.sim_ema = self.sim_ema * 0.9 + similarity * 0.1
            if similarity > self.best_sem_sim:
                self.best_sem = sem
                self.best_coarse = sampled_coarse
                self.best_sem_sim = similarity
            
            # Update logit mods
            if self.logit_mods is None:
                self.logit_mods = [torch.zeros_like(logits_per_tok[0]) for n in range(len(logits_per_tok))]
            for sem_pos in range(len(self.logit_mods)):
                sem_tok = sem[sem_pos]
                self.logit_mods[sem_pos] *= self.logit_mod_decay
                self.logit_mods[sem_pos][sem_tok] += self.learning_rate * (similarity - self.sim_ema)
            
            if not silent:
                print(f"{len(self.sems)+1}. {round(100 * similarity, 3)}%")
            
            self.sems.append(sem)
        
        return self.sems


In [67]:
from bark.generation import (COARSE_RATE_HZ, SEMANTIC_RATE_HZ, N_COARSE_CODEBOOKS)
coarse_per_semantic_tok = COARSE_RATE_HZ / SEMANTIC_RATE_HZ * N_COARSE_CODEBOOKS
count_to_gen = int(np.floor(55 * coarse_per_semantic_tok / N_COARSE_CODEBOOKS) * N_COARSE_CODEBOOKS)
for n in range(15):
    print(round(n / coarse_per_semantic_tok))

0
0
1
1
1
2
2
2
3
3
3
4
4
4
5


In [48]:
hear_fine(en_speaker_3["fine_prompt"])

In [49]:
hear_text("Do not be alarmed, this is just a test.", "en_speaker_3")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:05<00:00, 18.29it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19/19 [00:15<00:00,  1.21it/s]


In [18]:
hear_text("Testing 1 2 3")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:05<00:00, 16.90it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:17<00:00,  1.17it/s]


In [136]:
sem, logits_per_tok = custom_gen_semantic("This is a test.", 1.5)
test_coarse = generate_coarse(sem)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.38it/s]


In [None]:
# c2s = CoarseToSemantic(sample_text, sample_coarse, sample_duration)
sems = c2s.run(150)

In [26]:
len(c2s.sems)

511

In [24]:
sem, _ = custom_gen_semantic("This is a test.", 1.5)
hear_semantic(sem)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.38it/s]


In [10]:
save_history(c2s.best_sem, sample_codes, "/dev/shm/lara-test.npz")

In [28]:
hear_text("Testing 1 2 3", history_prompt="/dev/shm/lara-test.npz", sample_count=1)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:03<00:00, 31.71it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:11<00:00,  1.03it/s]


In [253]:
def sample_sem(logits):
    probs = softmax(logits, dim=-1)
    return torch.multinomial(probs, num_samples=1)

# logits0 = torch.ones((SEMANTIC_VOCAB_SIZE,)) / SEMANTIC_VOCAB_SIZE

x0 = x0.to(device)

# x0 = ...
logits0 = model(x0, merge_context=True)[0, 0, :SEMANTIC_VOCAB_SIZE]
logits0.retain_grad()
x1 = torch.cat((x0, sample_sem(logits0).unsqueeze(0)), dim=1)
logits1 = model(x1, merge_context=True)[0, 0, :SEMANTIC_VOCAB_SIZE]
logits1.retain_grad()
x2 = torch.cat((x1, sample_sem(logits1).unsqueeze(0)), dim=1)
x2.requires_grad = True
x2.retain_grad()
logits2 = model(x2, merge_context=True)[0, 0, :SEMANTIC_VOCAB_SIZE]
logits2.retain_grad()
# and so forth...

RuntimeError: only Tensors of floating point and complex dtype can require gradients

In [125]:
arr = generate_audio("What would you do with that kind of power?", history_prompt="en_speaker_3")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:04<00:00, 20.08it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:16<00:00,  1.20it/s]


In [151]:
arr2 = generate_audio("Do not be alarmed. This is only a test.", history_prompt="en_speaker_3")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:09<00:00, 10.32it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 37/37 [00:32<00:00,  1.15it/s]


In [94]:
arr3 = generate_audio("Are you hearing me? Like literally, are do you hear what I'm saying?", history_prompt="en_speaker_3")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:02<00:00, 34.13it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:09<00:00,  1.19it/s]


In [103]:
arr4 = generate_audio("I don't think this will work but I'll give it one last try.", history_prompt="en_speaker_3")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:04<00:00, 23.23it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17/17 [00:14<00:00,  1.20it/s]


In [152]:
write_wav("/dev/shm/autoplayme.wav", SAMPLE_RATE, arr2)

In [69]:
import os
en_speaker_3 = np.load(os.path.join(os.getcwd(), "bark", "assets", "prompts", "en_speaker_3.npz"))
hear_fine(en_speaker_3["fine_prompt"])

In [71]:
codec_model = load_codec_model(use_gpu=True)
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

In [72]:
audio_filepath = os.path.join(os.getcwd(), "misc/lara-silver-box.wav")
sample_text = "Dominguez said that he would use the silver box of Ix Chel to remake the world."
sample_voice_name = "nu-lara-v6"

device = "cuda"
wav, sr = torchaudio.load(audio_filepath)
wav = convert_audio(wav, sr, codec_model.sample_rate, codec_model.channels)
wav = wav.unsqueeze(0).to(device)

# Encode audio sample with EnCodec
with torch.no_grad():
    encoded_frames = codec_model.encode(wav)
sample_codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze()  # [n_q, T]
sample_codes = sample_codes.cpu().numpy()
sample_coarse = sample_codes[:2, :]
sample_duration = wav.shape[-1] / codec_model.sample_rate

## First try using genetic algorithms. Doesn't work.

In [20]:
# generate semantic tokens
semantic_tokens = generate_text_semantic(transcription, max_gen_duration_s=seconds, top_k=40, top_p=0.8, temp=1.0)
# INSTEAD:
# semantic_tokens = inverse_generate_semantic_tokens(model, coarse, transcription)

output_path = "bark/assets/prompts/" + voice_name + ".npz"
np.savez(output_path, fine_prompt=codes, coarse_prompt=coarse, semantic_prompt=semantic_tokens)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:03<00:00, 32.57it/s]


In [None]:
def inverse_generate_semantic_tokens(x_coarse):
    """Generate semantic tokens from coarse audio codes given some text."""
    # TODO: This is what would be needed for proper voice cloning
    pass

In [65]:
from bark.generation import load_model, generate_coarse, generate_fine

In [57]:
model_container = load_model(use_gpu=True, model_type="text")
sem_model = model_container["model"]
tokenizer = model_container["tokenizer"]

In [70]:
coarse_model = load_model(use_gpu=True, model_type="coarse")

In [401]:
def fit_coarse_and_semantic_len(text, coarse, duration):
    """ Fits together the lengths of a text prompt, a set of coarse tokens, and an audio duration in secs.
    Returns a length for a semantic token seq that will produce a coarse seq of the appropriate length and the
    set of coarse tokens, possibly truncated by one step in length. """
    from math import ceil
    from bark.generation import SEMANTIC_RATE_HZ, COARSE_RATE_HZ, N_COARSE_CODEBOOKS
    semantic_to_coarse_ratio = COARSE_RATE_HZ / SEMANTIC_RATE_HZ * N_COARSE_CODEBOOKS
    target_sem_len = ceil(duration * SEMANTIC_RATE_HZ) # Req number of semantic tokens
    target_coarse_width = int(np.floor(target_sem_len * semantic_to_coarse_ratio / N_COARSE_CODEBOOKS)) # No. of cols in coarse translation of target_sem_len semantic tokens
    if coarse.shape[1] > target_coarse_width:
        coarse = coarse[:,:-1] # Drop last column
    assert target_coarse_width == coarse.shape[1]    
    return target_sem_len, coarse

def generate_semantic_variations(text, target_coarse, duration, count, history_prompt=None, silent=False):
    """ Randomly generates sets of semantic tokens from a text prompt with a "target" coarse encoding. The sets
    will be length-matched to the target and compared with it. Returns a list of "count" (semantic, coarse,
    similarity) triples. """
    target_sem_len, target_coarse = fit_coarse_and_semantic_len(text, target_coarse, duration)
    tr = []
    attempt = 1
    while len(tr) < count:
        if not silent:
            print(f"Attempt {attempt}/?; Have {len(tr)}/{count} variations")
        sem = generate_text_semantic(text, history_prompt, max_gen_duration_s=duration, top_k=75, top_p=None, temp=1.5, silent=True)
        if sem.shape[0] == target_sem_len:
            coarse = generate_coarse(sem, history_prompt, silent=True)
            assert coarse.shape == target_coarse.shape
            sim = (coarse == target_coarse).sum() # Similarity = sum of matching tokens
            tr.append((sem, coarse, sim))
        elif not silent:
            print(f"\tSem length {sem.shape[0]} unusable, need {target_sem_len}")
        attempt += 1
    if not silent:
        print("Done")
    return tr

In [325]:
voices = generate_semantic_variations(transcription, coarse, seconds, 2)

Attempt 1/?; Have 0/2 variations
Attempt 2/?; Have 1/2 variations
Done


In [132]:
with torch.no_grad():
    encoded_frames = model.encode(wav)
codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze()  # [n_q, T]

coarse = codes[:2, :]

round(seconds * SEMANTIC_RATE_HZ) # This is how many semantic tokens we want

350

In [135]:
coarse = coarse[:, :526]
coarse = coarse.cpu().numpy()

In [183]:
pop = [(voices[n][0], voices[n][1], sims[n]) for n in range(len(voices))]
pop.sort(key = lambda p: -p[2]) # Sort most -> least similar

In [171]:
hear_coarse(more_voices[1][1])

In [403]:
def evolve(text, target_coarse, duration, initial_pop=[], target_sim=0.5, history_prompt=None, max_gen_count=100):
    pop_size = 50
    new_fraction = 0.70 # Fraction of the population that is new each generation
    
    _, target_coarse = fit_coarse_and_semantic_len(text, target_coarse, duration)
    coarse_count = target_coarse.shape[0] * target_coarse.shape[1]
    
    def get_sim_percent(voice):
        return round(100 * voice[2] / coarse_count, 2)
    
    def get_avg_sim_percent(voice_list):
        sims = [get_sim_percent(v) for v in voice_list]
        return round(sum(sims) / len(sims), 2)
    
    pop = initial_pop
    
    # Generate initial population
    needed_initial = pop_size - len(initial_pop)
    if needed_initial > 0:
        print(f"Generating {needed_initial} voices for initial population")
        pop.extend(generate_semantic_variations(text, target_coarse, duration, needed_initial, history_prompt, silent=True))
                   
    pop.sort(key = lambda p: -p[2])
    best = pop[0]
    print(f"Initial pop avg similarity: {get_avg_sim_percent(pop)}%, best similarity: {get_sim_percent(best)}%")
    
    gen_count = 1
    while (best[2] / coarse_count < target_sim) and (gen_count <= max_gen_count):
        print(f"Generation {gen_count}:")
        
        # print("\tCombining old voices...")
        weights = np.array([p[2] for p in pop], dtype=float)
        weights /= weights.sum()
        new_pop = [best]
        while len(new_pop) < round(pop_size * (1.0 - new_fraction)):
            # Randomly combine two voices to produce a new set of semantic tokens
            (i_dad, i_mom) = np.random.choice(np.arange(pop_size), size=2, replace=False, p=weights)
            dad = pop[int(i_dad)]
            mom = pop[int(i_mom)]
            prob = dad[2] / (dad[2] + mom[2])
            combo = (np.random.rand(*dad[0].shape) < prob).astype(float)
            new_sem = dad[0] * combo + mom[0] * (1.0 - combo)
            
            new_coarse = generate_coarse(new_sem, silent=True)
            assert new_coarse.shape == target_coarse.shape
            new_sim = (new_coarse == target_coarse).sum()
            new_pop.append((new_sem, new_coarse, new_sim))
                
        print(f"\tCombinations have avg of {get_avg_sim_percent(pop[1:])}% similarity")
        
        # print("\tGenerating new voices...")
        new_voices = generate_semantic_variations(text, target_coarse, duration, pop_size - len(new_pop), history_prompt, silent=True)
        print(f"\tNew voices have avg of {get_avg_sim_percent(new_voices)}% similarity")
        new_pop.extend(new_voices)
        
        pop = new_pop
        pop.sort(key = lambda p: -p[2])
        best = pop[0]
        print(f"\tBest voice this gen has {get_sim_percent(best)}% similarity")
        
        gen_count += 1
    
    return pop

In [404]:
the_his = "bark/assets/prompts/lara-the.npz"
pop = evolve(transcription, coarse, seconds, pop, target_sim=0.20, history_prompt=the_his, max_gen_count=50)

Initial pop avg similarity: 3.5%, best similarity: 13.24%
Generation 1:
	Combinations have avg of 3.3% similarity
	New voices have avg of 4.03% similarity
	Best voice this gen has 13.24% similarity
Generation 2:
	Combinations have avg of 3.81% similarity
	New voices have avg of 2.52% similarity
	Best voice this gen has 13.24% similarity
Generation 3:
	Combinations have avg of 2.76% similarity
	New voices have avg of 3.99% similarity
	Best voice this gen has 13.24% similarity
Generation 4:
	Combinations have avg of 3.72% similarity
	New voices have avg of 3.95% similarity
	Best voice this gen has 13.24% similarity
Generation 5:
	Combinations have avg of 3.48% similarity
	New voices have avg of 3.7% similarity
	Best voice this gen has 16.18% similarity
Generation 6:
	Combinations have avg of 3.48% similarity
	New voices have avg of 4.16% similarity
	Best voice this gen has 16.18% similarity
Generation 7:
	Combinations have avg of 3.48% similarity
	New voices have avg of 2.56% similarity


In [344]:
output_path = "bark/assets/prompts/lara-the.npz"
np.savez(output_path, fine_prompt=codes, coarse_prompt=coarse, semantic_prompt=pop[0][0])

In [346]:
the_best = pop[0]
the_pop = pop

In [433]:
semantic_to_coarse_ratio

3.006012024048096