In [2]:
import logging
import os
import pickle
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

In [5]:
df = pd.read_parquet(r'../cleaned dataset/track_cleaned.parquet')
df

Unnamed: 0,track_idx,track_uri,album_name,artist_name,track_name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,lyrics,duration
61588,0,spotify:track:000DfZJww8KiixTKuk9usJ,The Change I'm Seeking,Mike Love,Earthlings,0.631,0.513,2.0,-6.376,1.0,0.0293,0.366000,0.000004,0.1090,0.307,120.365,4.0,I just can't take no more\n I gotta get out of...,357
125100,1,spotify:track:000GjfnQc7ggBayDiy1sLW,Y las Mariposas,El Poder De Zacatecas,Abeja Miope,0.913,0.748,9.0,-3.274,1.0,0.0428,0.074500,0.000956,0.0403,0.864,114.143,4.0,,140
208027,2,spotify:track:000JCyEkMFumqCZQJAORiQ,Enough Is Enough,Nipsey Hussle,California Water,0.795,0.874,0.0,-4.523,1.0,0.2100,0.064600,0.000000,0.3410,0.483,132.966,4.0,,207
102801,3,spotify:track:000VZqvXwT0YNqKk7iG2GS,Dear Youth,The Ghost Inside,Mercy,0.444,0.991,7.0,-4.167,1.0,0.1330,0.000085,0.000084,0.1200,0.106,124.016,4.0,For whom the bell tolls\n There's a hurricane ...,256
69142,4,spotify:track:000uWezkHfg6DbUPf2eDFO,Dancehall Days,The Beautiful Girls,Me I Disconnect From You,0.714,0.635,1.0,-10.769,1.0,0.0299,0.001940,0.259000,0.0839,0.360,134.007,4.0,,321
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46540,252231,spotify:track:7zywhdkPljk4fOyWkh3EqI,Live & Solo At the Artists Den,Ben Kweller,Lizzy,0.575,0.269,7.0,-8.274,1.0,0.0365,0.779000,0.000000,0.6670,0.403,110.171,4.0,Sign me up I volunteer\nVotes are in for lifeg...,233
117614,252232,spotify:track:7zzBEZBTJejWeL6EqWmCD9,All This Bad Blood,Bastille,Get Home,0.599,0.525,9.0,-6.745,0.0,0.0397,0.729000,0.000046,0.0909,0.186,115.665,4.0,"\nHow am I gonna get myself back home?\nI, I, ...",191
195994,252233,spotify:track:7zzLt6Z9y7jMvXnEg00n58,The Sunny Album (Deluxe Edition),Hippie Sabotage,Quit Wastin Time,0.744,0.581,8.0,-10.225,0.0,0.1460,0.593000,0.958000,0.2040,0.679,126.910,4.0,No Lyrics,69
189208,252234,spotify:track:7zzbfi8fvHe6hm342GcNYl,Ace,Bob Weir,Black-Throated Wind,0.533,0.547,9.0,-9.290,1.0,0.0326,0.029900,0.011300,0.0723,0.669,72.506,4.0,"Youre bringing me down, Im running aground,\n ...",342


In [6]:
# -------------------------------
# 1️⃣ Setup Logging
# -------------------------------
logging.basicConfig(
    filename="embedding_generation.log",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

logging.info("Starting embedding generation...")

# -------------------------------
# 2️⃣ Load Dataset & Model
# -------------------------------
df = pd.read_parquet(r'../cleaned dataset/track_cleaned.parquet')

# Ensure lyrics column exists
if 'lyrics' not in df.columns:
    raise ValueError("Dataset must contain a 'lyrics' column.")

df = df.dropna(subset=['lyrics'])  # Drop missing lyrics
df = df[['track_idx','lyrics']]
df = df[:1000]  # Optional: limit data for faster processing

# Load multilingual BERT model
multi_bert_embedder = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
logging.info("Model loaded successfully!")

# -------------------------------
# 3️⃣ Resume Progress if Available
# -------------------------------
save_path = "lyrics_embeddings_checkpoint.pkl"

if os.path.exists(save_path):
    with open(save_path, "rb") as f:
        saved_data = pickle.load(f)
        lyrics_embeddings = saved_data["embeddings"]
        last_processed_idx = saved_data["last_processed_idx"]
        logging.info(f"Resuming from song {last_processed_idx}/{len(df)}")
else:
    lyrics_embeddings = []
    last_processed_idx = 0

# -------------------------------
# 4️⃣ Process Lyrics with Batch Encoding
# -------------------------------
lyrics_list = df['lyrics'].tolist()
total_lyrics = len(lyrics_list)

logging.info(f"Processing {total_lyrics} lyrics for embedding generation.")

batch_size = 32  

# Initialize tqdm progress bar only once
progress_bar = tqdm(range(last_processed_idx, total_lyrics, batch_size), desc="Generating Embeddings", unit="batch")

for i in progress_bar:
    batch_lyrics = lyrics_list[i:i+batch_size]
    
    batch_embeddings = multi_bert_embedder.encode(batch_lyrics, convert_to_tensor=True).cpu().numpy()
    
    lyrics_embeddings.extend(batch_embeddings)

    # Update tqdm bar text instead of creating a new bar
    progress_bar.set_description(f"Processing {i+batch_size}/{total_lyrics}")

    # Save progress every 100 songs
    if (i + batch_size) % 100 < batch_size:
        with open(save_path, "wb") as f:
            pickle.dump({"embeddings": lyrics_embeddings, "last_processed_idx": i + batch_size}, f)
        tqdm.write(f"Checkpoint saved at song {i + batch_size}")
        logging.info(f"Checkpoint saved at song {i + batch_size}")

logging.info("Embedding generation completed!")

# -------------------------------
# 5️⃣ Merge Embeddings with Original Data
# -------------------------------
df_embeddings = pd.DataFrame(lyrics_embeddings, index=df.index[:len(lyrics_embeddings)])

# Merge embeddings back into original dataframe
df_final = df.iloc[:len(df_embeddings)].copy()  # Ensure index consistency
df_final = pd.concat([df_final, df_embeddings], axis=1)

# Save final output
df_final.to_csv("lyrics_with_embeddings.csv", index=True)  # Keeping index for tracking

logging.info("Final embeddings saved successfully!")
os.remove(save_path)  # Delete checkpoint after success

print("✅ Embedding generation complete! Merged with original dataset and saved as 'lyrics_with_embeddings.csv'.")

Generating Embeddings:   0%|          | 0/32 [00:00<?, ?batch/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing 32/1000:   3%|▎         | 1/32 [00:00<00:11,  2.76batch/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing 64/1000:   6%|▋         | 2/32 [00:00<00:08,  3.38batch/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing 96/1000:   9%|▉         | 3/32 [00:00<00:07,  3.97batch/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing 128/1000:  12%|█▎        | 4/32 [00:01<00:06,  4.27batch/s]

Checkpoint saved at song 128


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing 160/1000:  16%|█▌        | 5/32 [00:01<00:05,  4.64batch/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing 192/1000:  19%|█▉        | 6/32 [00:01<00:05,  4.57batch/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing 224/1000:  22%|██▏       | 7/32 [00:01<00:05,  4.78batch/s]

Checkpoint saved at song 224


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing 256/1000:  25%|██▌       | 8/32 [00:01<00:05,  4.75batch/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing 288/1000:  28%|██▊       | 9/32 [00:02<00:04,  4.89batch/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing 320/1000:  31%|███▏      | 10/32 [00:02<00:04,  4.83batch/s]

Checkpoint saved at song 320


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing 352/1000:  34%|███▍      | 11/32 [00:02<00:04,  4.97batch/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing 384/1000:  38%|███▊      | 12/32 [00:02<00:04,  4.74batch/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing 416/1000:  41%|████      | 13/32 [00:02<00:03,  4.77batch/s]

Checkpoint saved at song 416


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing 448/1000:  44%|████▍     | 14/32 [00:03<00:03,  4.64batch/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing 480/1000:  47%|████▋     | 15/32 [00:03<00:03,  4.79batch/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing 512/1000:  50%|█████     | 16/32 [00:03<00:03,  4.85batch/s]

Checkpoint saved at song 512


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing 544/1000:  53%|█████▎    | 17/32 [00:03<00:03,  4.97batch/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing 576/1000:  56%|█████▋    | 18/32 [00:03<00:02,  4.95batch/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing 608/1000:  59%|█████▉    | 19/32 [00:04<00:02,  4.91batch/s]

Checkpoint saved at song 608


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing 640/1000:  62%|██████▎   | 20/32 [00:04<00:02,  5.01batch/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing 672/1000:  66%|██████▌   | 21/32 [00:04<00:02,  5.12batch/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing 704/1000:  69%|██████▉   | 22/32 [00:04<00:01,  5.05batch/s]

Checkpoint saved at song 704


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing 736/1000:  72%|███████▏  | 23/32 [00:04<00:01,  5.00batch/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing 768/1000:  75%|███████▌  | 24/32 [00:05<00:01,  4.94batch/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing 800/1000:  78%|███████▊  | 25/32 [00:05<00:01,  5.10batch/s]

Checkpoint saved at song 800


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing 832/1000:  81%|████████▏ | 26/32 [00:05<00:01,  5.47batch/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing 864/1000:  84%|████████▍ | 27/32 [00:05<00:00,  5.79batch/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing 896/1000:  88%|████████▊ | 28/32 [00:05<00:00,  6.00batch/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing 928/1000:  91%|█████████ | 29/32 [00:05<00:00,  6.21batch/s]

Checkpoint saved at song 928


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing 960/1000:  94%|█████████▍| 30/32 [00:06<00:00,  6.24batch/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing 992/1000:  97%|█████████▋| 31/32 [00:06<00:00,  6.42batch/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing 1024/1000: 100%|██████████| 32/32 [00:06<00:00,  5.09batch/s]


Checkpoint saved at song 1024
✅ Embedding generation complete! Merged with original dataset and saved as 'lyrics_with_embeddings.csv'.


In [7]:
import torch
import pandas as pd
from transformers import pipeline
from tqdm import tqdm

# -------------------------------
# 1️⃣ Load Dataset
# -------------------------------
df = pd.read_csv("lyrics_with_embeddings.csv")  # Ensure it contains the 'lyrics' column

if 'lyrics' not in df.columns:
    raise ValueError("The dataset must contain a 'lyrics' column.")

df = df.dropna(subset=['lyrics'])  # Remove missing lyrics
lyrics_list = df['lyrics'].tolist()  

# -------------------------------
# 2️⃣ Load Pre-trained Zero-Shot Classifier (Enable GPU)
# -------------------------------
device = 0 if torch.cuda.is_available() else -1  # Use GPU if available
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=device)

# Define possible emotions
emotions = [
    "joy", "anger", "sadness", "fear", "love", "surprise", "disgust", "hope", "pride", "relief"
]

# -------------------------------
# 3️⃣ Perform Zero-Shot Classification (Batch Processing)
# -------------------------------
batch_size = 16  # Adjust batch size based on available memory
predicted_labels = []
predicted_scores = []

print(f"🚀 Processing {len(lyrics_list)} lyrics in batches of {batch_size}...")

for i in tqdm(range(0, len(lyrics_list), batch_size), desc="Classifying Emotions"):
    batch_lyrics = lyrics_list[i:i+batch_size]
    
    # Run classifier in batch mode
    results = classifier(batch_lyrics, candidate_labels=emotions, multi_label=True)
    
    # Extract the most probable emotions and scores
    for result in results:
        predicted_labels.append(result["labels"][0])  # Highest-ranked emotion
        predicted_scores.append(result["scores"][0])  # Confidence score

# -------------------------------
# 4️⃣ Save Results to CSV
# -------------------------------
df_filtered = df.iloc[:len(predicted_labels)].copy()
df_filtered["predicted_emotion"] = predicted_labels
df_filtered["emotion_score"] = predicted_scores

df_filtered[['predicted_emotion','emotion_score']].to_csv("lyrics_with_predicted_emotions.csv", index=False)
print("✅ Emotion classification complete! Results saved to 'lyrics_with_predicted_emotions.csv'.")

Device set to use cpu


🚀 Processing 999 lyrics in batches of 16...


Classifying Emotions:   0%|          | 0/63 [00:15<?, ?it/s]


KeyboardInterrupt: 

In [None]:
df = pd.read_csv(r'/Users/xavierhua/Documents/GitHub/bt4222grp9/testing/lyrics_with_predicted_emotions.csv')
df = df[['track_idx','predicted_emotion','emotion_score']]
df.sort_values('emotion_score')

Unnamed: 0,track_idx,predicted_emotion,emotion_score
41,60,anger,0.494299
49,72,relief,0.498735
36,54,relief,0.498735
15,22,relief,0.498735
48,70,relief,0.498735
...,...,...,...
34,52,relief,0.995897
67,98,sadness,0.996977
39,57,surprise,0.997426
86,121,sadness,0.997516
