Load the model

In [10]:
import torch
from transformers import pipeline

model_id = "meta-llama/Llama-3.2-3B-Instruct"
pipe = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype=torch.bfloat16,
    device_map="mps",
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:12<00:00,  6.31s/it]


Load the trained CNN

In [25]:
import torch
from cnn import Cnn
import torchvision.transforms as transforms

model = Cnn(targets=2, in_size=(32, 32, 1))
model.load_state_dict(torch.load("./models_og/cnn.pt", weights_only=False))
model.to("mps")
model.eval()
pass

FileNotFoundError: [Errno 2] No such file or directory: './models_og/cnn.pt'

This is just loading an example input img

In [12]:
import pickle as pkl
import numpy as np
import random


#change path to data to dataog
with open("./data_og/raw/cfms.pkl", "rb") as file_:
    cfms = pkl.load(file_)

with open("./data_og/raw/labels.pkl", "rb") as file_:
    labels = pkl.load(file_)

i = random.randint(0, cfms.shape[0])
cfms_img = cfms[i].astype(np.float32)
label = labels[i]  # Valence first then Arausal

Load the transforms (must be the same as in cnn.py)

In [13]:
transform = transforms.Compose(
    [
        transforms.ToTensor(),  # Convert images to tensor
        transforms.Resize(
            (32, 32)
        ),  # Make the CFMS a bit larger (maybe check this as a hparam)
        # transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), May reenable to improve results
    ]  # Normalize the images
)

Run the cfms through the model to get the val and the arousal

In [14]:
img = transform(cfms_img).unsqueeze(0).to("mps")  # Unsqueeze bc we want b,c,w,h
pred = model(img)

In [15]:
pred

tensor([[2.2871, 3.5091]], device='mps:0', grad_fn=<LinearBackward0>)

Do the prompt engineering

In [16]:
messages = [
    {
        "role": "system",
        "content": "You are a poem writer that writes poems based on the values level of Valence and Arousal. You will interpret these values and tie an emotion to them, based on that emotion you will write a poem with the emotion in the title.",
        # "content": "You are a poem writer that writes poems based on the emotion given by EEG analysis in the form of valence and arousal values. Set the emotion as the title. Dont mention the valence and arousal values.",
        # "content": "I have the valence and arousal of a person obtained by EEG data in order to detect the emotion of the person. I will give it to you and you are going to create a personalized poem based on the emotion.",
        # "content": "You are a poem writer that writes poems based on the valence and arousal values of the analysis of EEG data. Start by reporting the emotion of the",
    },
    {
        "role": "user",
        "content": f"The Valence: {pred[0][0].item()}, The Arousal: {pred[0][1].item()}",
    },
]

Run it through the pipeline and write the poem

In [17]:
outputs = pipe(
    messages,
    max_new_tokens=512,
)
with open("./poem.txt", "w") as file_:
    file_.write(outputs[0]["generated_text"][-1]["content"])

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


## TTS Algo

In [18]:
from transformers import AutoProcessor, AutoModel

processor = AutoProcessor.from_pretrained("suno/bark-small")
model = AutoModel.from_pretrained("suno/bark-small")

inputs = processor(
    text=[
        "[clears throat] **'Serenity's Warmth'**\n\nIn tranquil twilight, where shadows play,\nI find my peace, my heart's sweet way.\nA sense of calm, a feeling so divine,\nWashes over me, like a gentle wine.\n\nThe world may be loud, with chaos and strife,\nBut in this moment, I am free from life.\nMy soul is soothed, my heart is light,\nAs I bask in serenity's warm, golden light.\n\nIn this peaceful state, I am complete,\nMy spirit soars, my heart skips a beat.\nI am one with the world, yet apart,\n"
    ],
    return_tensors="pt",
)

speech_values = model.generate(**inputs, do_sample=True)

  self.register_buffer("padding_total", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [19]:
from IPython.display import Audio

sampling_rate = model.generation_config.sample_rate
Audio(speech_values.cpu().numpy().squeeze(), rate=sampling_rate)

Test the other version of the algo (might be quicker)

In [20]:
from bark import SAMPLE_RATE, generate_audio, preload_models
from IPython.display import Audio
import os

os.environ["SUNO_ENABLE_MPS"] = "True"
preload_models()

No GPU being used. Careful, inference might be very slow!
  checkpoint = torch.load(ckpt_path, map_location=device)
  WeightNorm.apply(module, name, dim)


In [21]:
text_prompt = "[clears throat] **'Serenity's Warmth'**\n\nIn tranquil twilight, where shadows play,\nI find my peace, my heart's sweet way.\nA sense of calm, a feeling so divine,\nWashes over me, like a gentle wine.\n\nThe world may be loud, with chaos and strife,\nBut in this moment, I am free from life.\nMy soul is soothed, my heart is light,\nAs I bask in serenity's warm, golden light.\n\nIn this peaceful state, I am complete,\nMy spirit soars, my heart skips a beat.\nI am one with the world, yet apart,\n"

In [22]:
from IPython.display import Audio

audio_array = generate_audio(text_prompt)
Audio(audio_array, rate=SAMPLE_RATE)

100%|██████████| 670/670 [00:47<00:00, 13.99it/s]
100%|██████████| 34/34 [02:42<00:00,  4.79s/it]


Way to write the sample to disk

In [23]:
import scipy

sampling_rate = model.generation_config.sample_rate
scipy.io.wavfile.write(
    "bark_out.wav", rate=sampling_rate, data=speech_values.cpu().numpy().squeeze()
)