In [1]:
import os

# Expose only GPU index 2 to this kernel. This is set in the first cell,
# before torch is imported in the next one.
os.environ.update({'CUDA_VISIBLE_DEVICES': '2'})

In [2]:
from model import LlamaTTS
from transformers.models.llama.modeling_llama import LlamaForCausalLM
from transformers import AutoConfig, AutoTokenizer
from transformers import DacModel, AutoProcessor
from datasets import Audio
from glob import glob
import torch
import torch.nn as nn


# Hub checkpoint ids, named once so the paired loads below cannot drift apart.
DAC_CHECKPOINT = "descript/dac_44khz"
LM_CHECKPOINT = 'HuggingFaceTB/SmolLM2-135M-Instruct'

# Audio codec and its matching processor.
dac = DacModel.from_pretrained(DAC_CHECKPOINT)
processor = AutoProcessor.from_pretrained(DAC_CHECKPOINT)

# `datasets` Audio feature configured with the processor's sampling rate.
audio = Audio(processor.sampling_rate)

# Config and tokenizer of the base language model.
config = AutoConfig.from_pretrained(LM_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(LM_CHECKPOINT)

In [3]:
# Initialise the TTS wrapper from the SmolLM2 checkpoint in bfloat16, then move
# it to the GPU. The load warning below reports `codebook_heads.weight` as newly
# initialised, i.e. it is not part of the base checkpoint.
model = LlamaTTS.from_pretrained(
    'HuggingFaceTB/SmolLM2-135M-Instruct',
    torch_dtype=torch.bfloat16,
)
model = model.cuda()

Some weights of LlamaTTS were not initialized from the model checkpoint at HuggingFaceTB/SmolLM2-135M-Instruct and are newly initialized: ['codebook_heads.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# New embedding-table size: the text vocabulary, plus one id per codec code,
# plus 2 extra ids (len(tokenizer) and len(tokenizer) + 1 are used later in
# this notebook as the speech start / end markers).
extended_vocab_size = len(tokenizer) + model.config.codebook_size + 2
model.resize_token_embeddings(extended_vocab_size, mean_resizing = False)

Embedding(50178, 576, padding_idx=2)

In [5]:
def get_audio(f):
    """Round-trip `f` through the notebook-level `audio` feature and return its samples.

    `f` is passed to `audio.encode_example`, the result is passed to
    `audio.decode_example`, and the `'array'` entry of the decoded example
    is returned.

    NOTE(review): `f` is presumably a path to an audio file (it is called with
    entries of the globbed `files` list) -- confirm other input types if needed.
    """
    encoded = audio.encode_example(f)
    decoded = audio.decode_example(encoded)
    return decoded['array']

# Collect every .mp3 under the processed podcast directory (recursively).
# glob() returns paths in arbitrary, filesystem-dependent order, so sort them:
# files[0] is used as the sample clip below and must be the same on every run.
files = sorted(glob('/home/husein/ssd3/sg-podcast_processed/**/*.mp3', recursive = True))
len(files)

61180

In [6]:
# Decode the first globbed file; it serves as the sample clip for the cells below.
sample_path = files[0]
a = get_audio(sample_path)

In [7]:
# Run the sample clip through the DAC processor to get PyTorch tensors; the
# sampling rate passed in is the processor's own.
inputs = processor(
    raw_audio=a,
    sampling_rate=processor.sampling_rate,
    return_tensors="pt",
)

In [8]:
# Id layout used in this cell:
#   [0, len(tokenizer))      text tokens
#   len(tokenizer)           speech-start marker
#   len(tokenizer) + 1       speech-end marker
#   len(tokenizer) + 2 ...   codec codes (raw DAC code + offset)
text_vocab_size = len(tokenizer)
n_codebooks = model.config.num_codebooks

# Encode the clip with DAC and shift its codes past the text vocab and the two markers.
with torch.no_grad():
    audio_codes = dac.encode(inputs["input_values"]).audio_codes
    encoder_outputs = audio_codes + text_vocab_size + 2

# Tokenise the transcript once, then copy it onto every codebook row.
text_ids = tokenizer('helo testing', return_tensors = 'pt').input_ids
text_ids = text_ids.unsqueeze(1).repeat((1, n_codebooks, 1))

speech_start = torch.full((1, n_codebooks, 1), text_vocab_size)
# NOTE(review): speech_end is built here but is not part of the concat below.
speech_end = torch.full((1, n_codebooks, 1), text_vocab_size + 1)

# Sequence along the last axis: audio codes, text, speech-start, audio codes again.
input_ids = torch.concat(
    [encoder_outputs, text_ids, speech_start, encoder_outputs], dim = -1
)
input_ids.shape

torch.Size([1, 9, 666])

In [9]:
from parler_tts import build_delay_pattern_mask, apply_delay_pattern_mask

In [10]:
# Marker ids: len(tokenizer) is used as BOS, len(tokenizer) + 1 as padding.
bos_id = len(tokenizer)
pad_id = len(tokenizer) + 1
unpadded_len = input_ids.shape[-1]

# Only the mask is used afterwards; the first return value is discarded.
_unused_ids, delay_pattern_mask = build_delay_pattern_mask(
    input_ids[0],
    bos_token_id=bos_id,
    pad_token_id=pad_id,
    max_length=unpadded_len + model.num_codebooks,
    num_codebooks=model.num_codebooks,
)

# Replace the -1 entries of the mask with the pad id.
filled_ids = torch.where(delay_pattern_mask == -1, pad_id, delay_pattern_mask)

# Drop the first column, restore the leading batch axis, and move to the GPU.
input_ids = filled_ids[:, 1:].unsqueeze(0).cuda()
input_ids

tensor([[[49852, 49484, 49957,  ..., 49153, 49153, 49153],
         [49152, 49641, 49788,  ..., 49153, 49153, 49153],
         [49152, 49152, 49918,  ..., 49153, 49153, 49153],
         ...,
         [49152, 49152, 49152,  ..., 49153, 49153, 49153],
         [49152, 49152, 49152,  ..., 50000, 49153, 49153],
         [49152, 49152, 49152,  ..., 49241, 49471, 49153]]], device='cuda:0')

In [11]:
# Labels are a copy of input_ids with every BOS / speech-start id
# (len(tokenizer)) replaced by -100.
# NOTE(review): -100 is presumably the ignore index of the loss inside
# LlamaTTS.forward -- confirm against the model code.
bos_positions = input_ids.eq(len(tokenizer))
labels = input_ids.masked_fill(bos_positions, -100)
labels

tensor([[[49852, 49484, 49957,  ..., 49153, 49153, 49153],
         [ -100, 49641, 49788,  ..., 49153, 49153, 49153],
         [ -100,  -100, 49918,  ..., 49153, 49153, 49153],
         ...,
         [ -100,  -100,  -100,  ..., 49153, 49153, 49153],
         [ -100,  -100,  -100,  ..., 50000, 49153, 49153],
         [ -100,  -100,  -100,  ..., 49241, 49471, 49153]]], device='cuda:0')

In [12]:
# Inspect the raw mask returned by build_delay_pattern_mask. Entries still equal
# to -1 are the ones that the torch.where call in the earlier cell replaced with
# len(tokenizer) + 1 when building input_ids.
delay_pattern_mask

tensor([[49152, 49852, 49484,  ..., 49153, 49153, 49153],
        [49152, 49152, 49641,  ..., 49153, 49153, 49153],
        [49152, 49152, 49152,  ..., 49153, 49153, 49153],
        ...,
        [49152, 49152, 49152,  ...,    -1, 49153, 49153],
        [49152, 49152, 49152,  ..., 50000,    -1, 49153],
        [49152, 49152, 49152,  ..., 49241, 49471,    -1]])

In [13]:
# Sanity-check forward pass: with `labels` supplied, the call returns a scalar
# loss tensor that carries a grad_fn (see the recorded output below).
model(input_ids, labels = labels)

tensor(3.1851, device='cuda:0', grad_fn=<DivBackward0>)