In [10]:
import ast
import pickle

import torch
from datasets import Dataset
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM

In [2]:
# All models are moved to this device; torch.device('cuda') assumes a CUDA GPU is present.
device = torch.device('cuda')

# Tokenizer + encoder used for chord sequences.
chord_tokenizer = AutoTokenizer.from_pretrained("jammai/chocolm-modernbert-base")
chord_model = AutoModel.from_pretrained("jammai/chocolm-modernbert-base")
chord_model.to(device)

# Tokenizer + encoder used for lyrics; the attention implementation is explicitly set to "eager".
text_tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
text_model = AutoModel.from_pretrained("xlm-roberta-base", attn_implementation="eager")
# As the cell's last expression, this call also prints the full model repr as cell output.
text_model.to(device)

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


XLMRobertaModel(
  (embeddings): XLMRobertaEmbeddings(
    (word_embeddings): Embedding(250002, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): XLMRobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x XLMRobertaLayer(
        (attention): XLMRobertaAttention(
          (self): XLMRobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): XLMRobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=Tru

In [3]:
# Load the jammai/chords_and_lyrics dataset (a DatasetDict; only a "train" split is used below).
chords_lyrics = load_dataset("jammai/chords_and_lyrics")

In [7]:
# Show the dataset summary: available splits, feature names and row counts.
chords_lyrics

DatasetDict({
    train: Dataset({
        features: ['artist_name', 'song_name', 'verse_to_harte_chords', 'verse_to_lyrics'],
        num_rows: 135783
    })
})

In [8]:
def extract_text_chords_pairs(dataset):
    """Flatten songs into aligned per-verse (lyrics, chords, metadata) lists.

    Args:
        dataset: Iterable of song records. Each record must provide the keys
            "artist_name", "song_name", "verse_to_lyrics" and
            "verse_to_harte_chords"; the last two are strings holding Python
            literals (they appear to be mappings keyed by integer verse
            index -- confirm against the dataset card).

    Returns:
        A tuple ``(result_lyrics, result_chords, artist_song)`` of three
        equally long lists: the verse lyrics, the verse chords joined with
        single spaces, and a ``{"artist": ..., "song": ...}`` dict per verse.

    Raises:
        ValueError / SyntaxError: if a verse field is not a plain Python
            literal.
    """
    result_chords = []
    result_lyrics = []
    artist_song = []
    for song in tqdm(dataset):
        # literal_eval only parses literals, so text coming from the downloaded
        # dataset can never execute code (which eval() would allow).
        lyrics = ast.literal_eval(song["verse_to_lyrics"])
        chords = ast.literal_eval(song["verse_to_harte_chords"])
        for verse in chords:
            # Chord verse `v` is paired with lyrics verse `v + 1`; verses whose
            # lyrics are missing or whitespace-only are skipped.
            if verse + 1 in lyrics and lyrics[verse + 1].rstrip():
                result_chords.append(" ".join(chords[verse]))
                result_lyrics.append(lyrics[verse + 1])
                artist_song.append({"artist": song["artist_name"], "song": song["song_name"]})
    return result_lyrics, result_chords, artist_song


def compute_embeddings_ls_hs(data_loader, tokenizer, model):
    """Embed every batch of a loader with gradients disabled.

    Each batch is passed to ``compute_embedding`` with
    ``output_embending_from_hidden_states=True``, which yields a pair of
    embeddings (last hidden state, hidden-states average).

    Returns:
        Two lists with one entry per batch: the last-hidden-state embeddings
        and the hidden-states embeddings, in loader order.
    """
    per_batch = []
    with torch.no_grad():
        for batch in tqdm(data_loader):
            per_batch.append(
                compute_embedding(tokenizer, model, batch, output_embending_from_hidden_states=True)
            )
    # Split the collected (ls, hs) pairs into two parallel lists.
    embeddings_ls = [pair[0] for pair in per_batch]
    embeddings_hs = [pair[1] for pair in per_batch]
    return embeddings_ls, embeddings_hs


def apply_attention(attention_mask, model_state):
    """Sum-pool token states over the positions selected by the attention mask.

    Args:
        attention_mask: Tensor of shape (batch, seq_len); each value weights
            the corresponding token (0 drops it, 1 keeps it).
        model_state: Tensor of shape (batch, seq_len, hidden) with one state
            vector per token.

    Returns:
        Tensor of shape (batch, hidden): for every sequence, the
        mask-weighted *sum* (not mean) of its token states, in the dtype of
        ``model_state``.
    """
    # Batched (B, 1, S) @ (B, S, H) -> (B, 1, H). This computes each row's
    # masked sum directly instead of building the full (B, B, H) cross-product
    # and keeping only its diagonal. Casting to the state's dtype (rather than
    # a fixed float32) keeps the matmul valid for half/double precision states.
    weights = attention_mask.to(model_state.dtype).unsqueeze(1)
    return (weights @ model_state).squeeze(1)

def compute_embedding(tokenizer, model, input, device="cuda", output_embending_from_hidden_states=False):
    """Embed a batch of texts by mask-weighted pooling of the model's token states.

    Args:
        tokenizer: Callable tokenizer returning a batch with an
            ``attention_mask`` and a ``.to(device)`` method.
        model: Encoder called as ``model(**tokenized, output_hidden_states=...)``.
        input: The batch of texts handed to the tokenizer.
        device: Device the tokenized batch is moved to before the forward pass.
        output_embending_from_hidden_states: When True, additionally return an
            embedding averaged over every entry of ``model_output.hidden_states``.

    Returns:
        The pooled last-hidden-state embedding on CPU, or -- when
        ``output_embending_from_hidden_states`` is True -- a tuple
        ``(embedding, embedding_hs)`` where ``embedding_hs`` is the mean over
        layers of the pooled hidden states, also on CPU.
    """
    # truncation=True caps inputs at the tokenizer's maximum length so an
    # over-long text cannot overflow the model's position embeddings.
    tokenized = tokenizer(input, return_tensors="pt", padding=True, truncation=True)
    tokenized = tokenized.to(device)
    # Only ask the model for per-layer states when they are actually used.
    model_output = model(**tokenized, output_hidden_states=output_embending_from_hidden_states)

    mask = tokenized.attention_mask
    embedding = apply_attention(mask, model_output.last_hidden_state).to("cpu")
    if not output_embending_from_hidden_states:
        return embedding

    # Pool each layer separately, then average the pooled vectors across layers.
    pooled_layers = torch.stack(
        [apply_attention(mask, layer_state) for layer_state in model_output.hidden_states]
    )
    embedding_hs = pooled_layers.mean(dim=0).to("cpu")
    return embedding, embedding_hs

In [9]:
# Flatten the whole "train" split into three parallel per-verse lists:
# lyrics text, space-joined chords, and {"artist", "song"} metadata.
lyrics, chords, artist_song = extract_text_chords_pairs(chords_lyrics["train"])

100%|██████████| 135783/135783 [00:36<00:00, 3728.17it/s]


In [11]:
# Wrap both checkpoints as SentenceTransformer encoders. Neither ships a
# sentence-transformers config, so (per the load message) mean pooling is added.
sbert_roberta_model = SentenceTransformer('xlm-roberta-base')
sbert_chocolm_model = SentenceTransformer('jammai/chocolm-modernbert-base')

No sentence-transformers model found with name xlm-roberta-base. Creating a new one with mean pooling.
No sentence-transformers model found with name jammai/chocolm-modernbert-base. Creating a new one with mean pooling.


In [13]:
# Encode every verse's chord string with the chord model.
chords_embeddings_sbert_chocolm = sbert_chocolm_model.encode(chords, show_progress_bar=True)

Batches: 100%|██████████| 109914/109914 [11:57<00:00, 153.22it/s]


In [14]:
# Encode every verse's lyrics with the text model.
lyrics_embeddings_sbert_roberta = sbert_roberta_model.encode(lyrics, show_progress_bar=True)

Batches: 100%|██████████| 109914/109914 [14:53<00:00, 123.06it/s]


In [16]:
# torch.as_tensor accepts ndarrays and tensors alike, so re-running this cell is
# a no-op instead of failing the way torch.from_numpy does on an existing tensor.
chords_embeddings_sbert_chocolm = torch.as_tensor(chords_embeddings_sbert_chocolm)
lyrics_embeddings_sbert_roberta = torch.as_tensor(lyrics_embeddings_sbert_roberta)

In [17]:
# Persist the embeddings together with the texts/metadata they were built from.
# Each file is opened in a `with` block so the handle is flushed and closed
# right after the dump instead of being left open.
for payload, target_path in (
    (chords_embeddings_sbert_chocolm, "data/chords_embeddings_sbert_chocolm.pkl"),
    (lyrics_embeddings_sbert_roberta, "data/lyrics_embeddings_sbert_roberta.pkl"),
    (lyrics, "data/lyrics.pkl"),
    (chords, "data/chords.pkl"),
    (artist_song, "data/artist_song.pkl"),
):
    with open(target_path, "wb") as fh:
        pickle.dump(payload, fh)

In [7]:
# Fix the shuffle seed so the 80/20 song-level split (and every artifact derived
# from it) is identical after a kernel restart.
ds = chords_lyrics["train"].train_test_split(test_size=0.2, seed=42)

In [8]:
# extract_text_chords_pairs returns three lists (lyrics, chords, artist/song
# metadata); unpacking only two names raises "too many values to unpack".
train_lyrics, train_chords, train_artist_song = extract_text_chords_pairs(ds["train"])
test_lyrics, test_chords, test_artist_song = extract_text_chords_pairs(ds["test"])

100%|██████████| 108626/108626 [00:31<00:00, 3411.72it/s]
100%|██████████| 27157/27157 [00:07<00:00, 3397.27it/s]


In [9]:
# Batch the raw strings (256 per batch, default order, i.e. no shuffling) so the
# resulting embeddings stay aligned with the text lists.
data_loader_train_lyrics = DataLoader(train_lyrics, batch_size=256)
data_loader_test_lyrics = DataLoader(test_lyrics, batch_size=256)

data_loader_train_chords = DataLoader(train_chords, batch_size=256)
data_loader_test_chords = DataLoader(test_chords, batch_size=256)

# Chord embeddings: "_ls" = pooled last hidden state, "_hs" = pooled states averaged over layers.
# Each result is a list with one tensor per batch.
train_chords_embeddings_ls, train_chords_embeddings_hs = compute_embeddings_ls_hs(data_loader_train_chords, chord_tokenizer, chord_model)
test_chords_embeddings_ls, test_chords_embeddings_hs = compute_embeddings_ls_hs(data_loader_test_chords, chord_tokenizer, chord_model)

# Lyrics embeddings, same two variants, computed with the text model.
train_lyrics_embeddings_ls, train_lyrics_embeddings_hs = compute_embeddings_ls_hs(data_loader_train_lyrics, text_tokenizer, text_model)
test_lyrics_embeddings_ls, test_lyrics_embeddings_hs = compute_embeddings_ls_hs(data_loader_test_lyrics, text_tokenizer, text_model)

100%|██████████| 10995/10995 [02:08<00:00, 85.46it/s]
100%|██████████| 2745/2745 [00:32<00:00, 85.29it/s]
100%|██████████| 10995/10995 [14:44<00:00, 12.43it/s]
100%|██████████| 2745/2745 [03:39<00:00, 12.50it/s]


In [10]:
# Per-batch embedding lists, keyed by the file stem they are saved under.
embedding_batches_by_name = {
    "train_chords_embeddings_ls": train_chords_embeddings_ls,
    "train_chords_embeddings_hs": train_chords_embeddings_hs,
    "test_chords_embeddings_ls": test_chords_embeddings_ls,
    "test_chords_embeddings_hs": test_chords_embeddings_hs,
    "train_lyrics_embeddings_ls": train_lyrics_embeddings_ls,
    "train_lyrics_embeddings_hs": train_lyrics_embeddings_hs,
    "test_lyrics_embeddings_ls": test_lyrics_embeddings_ls,
    "test_lyrics_embeddings_hs": test_lyrics_embeddings_hs,
}
for stem, batches in embedding_batches_by_name.items():
    # Stack the per-batch tensors into one (n_samples, hidden) tensor before saving;
    # the `with` block guarantees the file is flushed and closed.
    with open(f"experimental_data/embeddings/{stem}.pkl", "wb") as fh:
        pickle.dump(torch.vstack(batches), fh)

# The raw texts the embeddings were computed from, saved alongside them.
texts_by_name = {
    "train_lyrics": train_lyrics,
    "train_chords": train_chords,
    "test_lyrics": test_lyrics,
    "test_chords": test_chords,
}
for stem, texts in texts_by_name.items():
    with open(f"experimental_data/{stem}.pkl", "wb") as fh:
        pickle.dump(texts, fh)

In [11]:
# NOTE(review): SentenceTransformer is already imported in the notebook's import cell;
# this re-import is redundant.
from sentence_transformers import SentenceTransformer
# Multilingual sentence-embedding model used for the lyrics "sbert" embeddings below.
# NOTE(review): the generic name `model` is rebound to a different encoder in a later cell.
model = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v1')

In [12]:
# Encode the train/test lyrics with whichever encoder `model` currently refers to.
train_lyrics_sbert_embeddings = model.encode(train_lyrics, show_progress_bar=True)
test_lyrics_sbert_embeddings = model.encode(test_lyrics, show_progress_bar=True)

Batches: 100%|██████████| 87955/87955 [07:04<00:00, 207.02it/s]
Batches: 100%|██████████| 21959/21959 [01:43<00:00, 212.59it/s]


In [14]:
# torch.as_tensor accepts ndarrays and tensors alike, so re-running this cell is
# a no-op instead of failing the way torch.from_numpy does on an existing tensor.
train_lyrics_sbert_embeddings = torch.as_tensor(train_lyrics_sbert_embeddings)
test_lyrics_sbert_embeddings = torch.as_tensor(test_lyrics_sbert_embeddings)

In [15]:
# Save the lyrics sentence embeddings; `with` closes each file right after the dump.
with open("experimental_data/embeddings/train_lyrics_sbert_embeddings.pkl", "wb") as fh:
    pickle.dump(train_lyrics_sbert_embeddings, fh)
with open("experimental_data/embeddings/test_lyrics_sbert_embeddings.pkl", "wb") as fh:
    pickle.dump(test_lyrics_sbert_embeddings, fh)

In [17]:
# Rebind `model` to xlm-roberta-base wrapped as a SentenceTransformer
# (mean pooling is added automatically, per the load message).
model = SentenceTransformer('xlm-roberta-base')

No sentence-transformers model found with name xlm-roberta-base. Creating a new one with mean pooling.


In [18]:
# Encode the train/test lyrics with whichever encoder `model` currently refers to.
train_lyrics_roberta_sbert_embeddings = model.encode(train_lyrics, show_progress_bar=True)
test_lyrics_roberta_sbert_embeddings = model.encode(test_lyrics, show_progress_bar=True)

Batches: 100%|██████████| 87955/87955 [11:49<00:00, 123.98it/s]
Batches: 100%|██████████| 21959/21959 [02:56<00:00, 124.28it/s]


In [19]:
# torch.as_tensor accepts ndarrays and tensors alike, so re-running this cell is
# a no-op instead of failing the way torch.from_numpy does on an existing tensor.
train_lyrics_roberta_sbert_embeddings = torch.as_tensor(train_lyrics_roberta_sbert_embeddings)
test_lyrics_roberta_sbert_embeddings = torch.as_tensor(test_lyrics_roberta_sbert_embeddings)

In [20]:
# Save the XLM-R lyrics embeddings; `with` closes each file right after the dump.
with open("experimental_data/embeddings/train_lyrics_roberta_sbert_embeddings.pkl", "wb") as fh:
    pickle.dump(train_lyrics_roberta_sbert_embeddings, fh)
with open("experimental_data/embeddings/test_lyrics_roberta_sbert_embeddings.pkl", "wb") as fh:
    pickle.dump(test_lyrics_roberta_sbert_embeddings, fh)

In [21]:
# Chord encoder wrapped as a SentenceTransformer (mean pooling added, per the load message).
# NOTE(review): the same name is already bound to this checkpoint in an earlier cell.
sbert_chocolm_model = SentenceTransformer('jammai/chocolm-modernbert-base')

No sentence-transformers model found with name jammai/chocolm-modernbert-base. Creating a new one with mean pooling.


In [23]:
# Encode the train/test chord strings with the chord SentenceTransformer.
sbert_chocolm_train = sbert_chocolm_model.encode(train_chords, show_progress_bar=True)
sbert_chocolm_test = sbert_chocolm_model.encode(test_chords, show_progress_bar=True)

Batches: 100%|██████████| 87955/87955 [10:08<00:00, 144.55it/s]
Batches: 100%|██████████| 21959/21959 [02:19<00:00, 157.23it/s]


In [None]:
# torch.as_tensor accepts ndarrays and tensors alike, so this cell can be re-run
# safely (torch.from_numpy raises once the names already hold tensors).
sbert_chocolm_train = torch.as_tensor(sbert_chocolm_train)
sbert_chocolm_test = torch.as_tensor(sbert_chocolm_test)
# `with` closes each file right after the dump instead of leaking the handle.
with open("experimental_data/embeddings/train_chords_chocolm_sbert_embeddings.pkl", "wb") as fh:
    pickle.dump(sbert_chocolm_train, fh)
with open("experimental_data/embeddings/test_chords_chocolm_sbert_embeddings.pkl", "wb") as fh:
    pickle.dump(sbert_chocolm_test, fh)

: 