# Lyrics search engine

In [2]:
# imports
!pip install rank_bm25
!pip install sentence_transformers
!pip install nltk
!pip install requests

import rank_bm25
import pandas as pd
from collections import defaultdict
import numpy as np
import json
from rank_bm25 import BM25Okapi
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re
import requests




# Import dataset

In [24]:
ls= pd.read_csv('preprocessed_genius_lyrics.csv')
ls.drop(['Unnamed: 0'], axis=1, inplace=True)
ls=ls.rename(columns={'tag':'genre'})
# ls['preprocessed_lyrics'].tolist()

In [24]:
ls['lyrics']

0       [Chorus: Opera Steve & Cam'ron]\nKilla Cam, Ki...
1       [Produced by Irv Gotti]\n\n[Intro]\nYeah, hah,...
2       [Produced by Kanye West and Brian Miller]\n\n[...
3       [Intro]\nSo they ask me\n"Young boy\nWhat you ...
4       [Intro: Lil Wayne]\nHaha\nUh-huh\nNo homo (You...
                              ...                        
1144    [Hook]\nAnother night, slips away\nIn other wo...
1145    [Intro: Michael Jackson and 60 Minutes' Ed Bra...
1146    [Chorus: Kid Cudi & John Legend]\nI got the wo...
1147    [Chorus 1: Bruno Mars & B.o.B]\nBeautiful girl...
1148    [Intro]\nK-K-K-K-K-K-Mac\n\n[Verse 1: Chris Br...
Name: lyrics, Length: 1149, dtype: object

# Indexer

The indexer creates an inverted index, mapping terms to their locations in documents for fast retrieval. 
- Needs preprocessed lyrics
- Consists of two levels: a vocabulary of index terms (typically words) and lists that map each term to the documents where it appears.
- Elasticsearch/Lucene builds inverted index
- Metadata fields (genre, year, and song section) indexed separately.
- Search engine incorporates a section-based filter: users to determine where their query terms appear in the lyrics (e.g., verse, chorus, bridge). - - These fields are structured using Elasticsearch mappings, allowing users to refine searches based on genre, release year, and specific song sections.

In [9]:
# Apache Lucene or ElasticSearch
# BM25 Inverted index
# Query processor BM25. Prepare corpus fro BM25 (tokenized texts)

def build_bm25_index(ls):
    inverted_index = defaultdict(dict)
    doc_lengths = {}
    total_docs = len(ls)

    for idx, row in ls.iterrows():
        doc_id = row['id']
        # Ensure tokens are separated properly
        tokens = row['preprocessed_lyrics']
        if isinstance(tokens, str):
            tokens = tokens.split()

        doc_lengths[doc_id] = len(tokens)

        term_freqs = defaultdict(int)
        for token in tokens:
            term_freqs[token] += 1

        for token, freq in term_freqs.items():
            inverted_index[token][doc_id] = freq

    return inverted_index, doc_lengths, total_docs

In [10]:
# BERT Embedder
bert_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def build_bert_index(ls, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    model = SentenceTransformer(model_name)
    lyrics_texts = ls['preprocessed_lyrics'].astype(str).tolist()
    embeddings = model.encode(lyrics_texts, convert_to_numpy=True)
    doc_ids = ls["id"].tolist()
    doc_embeddings = dict(zip(doc_ids, embeddings))
    return doc_embeddings, model  #  returns both index and model

In [11]:
bert_embeddings = np.load('bert_embeddings.npy',allow_pickle=False)

# Get document IDs from your DataFrame
doc_ids = ls["id"].tolist()

# Build a dictionary: {doc_id: embedding}
bert_embeddings = dict(zip(doc_ids, bert_embeddings))
# bert_embeddings

# Query processor
- Tokenizing, stemming, and normalizing the query using NLTK or spaCy
- Input: Query
1. Fuzzy matching incorporated using Levenshtein distance to approximate string matching
    - Datamuse API: Related words and phrases through Datamuse API
2. Filtering with Elasticsearch’s boolean queries (based on song metadata attributes, such as genre, release year, artist, and specific song sections)

In [12]:
base_stopwords = set(stopwords.words("english"))
lyrical_keep_words = {
    # Pronouns
    'i', 'you', 'me', 'we', 'my', 'your', 'she', 'her', 'his',
    # Negations (full words + contractions)
    'not', 'no', 'never',
    "don't", "can't", "won't", "didn't", "isn't", "aren't",
    # Emotional/vocal
    'oh', 'hey', 'yeah'
}

custom_stopwords = base_stopwords - lyrical_keep_words
stemmer = PorterStemmer()
def bm25_query(query):
    # Tokenize using the same regex: keep words, apostrophes, dashes, and brackets
    words = re.findall(r"[\w'\-\[\]]+", query.lower())

    # Keep if:
    # - in lyrical keep words
    # - contains an apostrophe (e.g., don't)
    # - not in base stopwords
    filtered = [
        word for word in words
        if (word in lyrical_keep_words) or ("'" in word) or (word not in base_stopwords)
    ]

    # Stem
    stemmed = [stemmer.stem(word) for word in filtered]

    return stemmed


In [13]:
# BERT Scoring logic
def encode_bert_query(query, model):
    # Encode the raw query string
    embedding = model.encode(query)
    return embedding

# Baseline model: BM25 + BERT
- Generates relevance scores for lyrics based on user's query

- Ranking layer 1: BM25+BERT
- Ranking layer 2: Datamuse API for phonetic alignment of lyrics with query

In [18]:
# Create ranking of lyrics with BM25

# Load preprocessed lyrics
tokenized_corpus = [str(doc).split() for doc in ls['preprocessed_lyrics']]
bm25 = BM25Okapi(tokenized_corpus)

def bm25_search(query, bm25_model, ls, top_k=5):
    tokens = preprocess_bm25_query(query)
    scores = bm25_model.get_scores(tokens)
    top_indices = np.argsort(scores)[::-1][:top_k]
    return ls.iloc[top_indices], scores 


In [19]:
# Embed query for BERT model to compare with document embeddings
from sentence_transformers import SentenceTransformer
def bert_search(query, doc_embeddings, model, ls, top_k=5):
    query_vec = encode_bert_query(query, model)

    # Compute cosine similarity to each document
    doc_ids = list(doc_embeddings.keys())
    doc_vecs = np.array([doc_embeddings[doc_id] for doc_id in doc_ids])
    
    similarities = cosine_similarity([query_vec], doc_vecs)[0]
    top_indices = np.argsort(similarities)[::-1][:top_k]

    top_doc_ids = [doc_ids[i] for i in top_indices]
    return ls[ls['id'].isin(top_doc_ids)][['title', 'artist', 'lyrics']]


# Enter query

In [17]:
# Step 1: Build BM25 model using preprocessed_lyrics
tokenized_corpus = [str(doc).split() for doc in ls['preprocessed_lyrics']]
bm25_model = BM25Okapi(tokenized_corpus)

# # Step 2: Build BERT index
# bert_index, bert_model = build_bert_index(ls, model_name)

# Step 3: Example Query
query = "Young boy"

print("BM25 Results:\n")
print(bm25_search(query, 5, ls))

print("\nBERT Results:\n")
print(bert_search(query, bert_embeddings, bert_model, ls))

BM25 Results:



NameError: name 'preprocess_bm25_query' is not defined

In [20]:
ls.head()

Unnamed: 0,title,genre,artist,year,views,features,lyrics,id,preprocessed_lyrics
0,Killa Cam,rap,Cam'ron,2004,173166,"{""Cam\\'ron"",""Opera Steve""}","[Chorus: Opera Steve & Cam'ron]\nKilla Cam, Ki...",1,[choru opera steve cam'ron] killa cam killa ca...
1,Can I Live,rap,JAY-Z,1996,468624,{},"[Produced by Irv Gotti]\n\n[Intro]\nYeah, hah,...",3,[produc irv gotti] [intro] yeah hah yeah roc-a...
2,Down and Out,rap,Cam'ron,2004,144404,"{""Cam\\'ron"",""Kanye West"",""Syleena Johnson""}",[Produced by Kanye West and Brian Miller]\n\n[...,5,[produc kany west brian miller] [intro cam'ron...
3,Fly In,rap,Lil Wayne,2005,78271,{},"[Intro]\nSo they ask me\n""Young boy\nWhat you ...",6,[intro] ask me young boy you gon' second time ...
4,Lollipop Remix,rap,Lil Wayne,2008,580832,"{""Kanye West"",""Static Major""}",[Intro: Lil Wayne]\nHaha\nUh-huh\nNo homo (You...,7,[intro lil wayne] haha uh-huh no homo young mu...


In [164]:
# Hybrid search
bert_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def hybrid_search(query, ls, bm25_model, bert_model, bert_embeddings, top_k_bm25=20, top_k_final=5, weight=0.5, 
                  genre=None, artist=None, release_year=None, title=None):
    # Apply metadata filters
    filtered_ls = ls.copy()
    if title:
        
    if genre:
        filtered_ls = filtered_ls[filtered_ls['genre'].str.lower() == genre.lower()]
    if artist:
        filtered_ls = filtered_ls[filtered_ls['artist'].str.lower() == artist.lower()]
    if release_year:
        filtered_ls = filtered_ls[filtered_ls['year'] == release_year]

    if filtered_ls.empty:
        print("No matching documents found after applying filters.")
        return pd.DataFrame()  # Safe exit

    # Build BM25 model on filtered data
    filtered_lyrics_tokens = [bm25_query(lyrics) for lyrics in filtered_ls['preprocessed_lyrics']]
    bm25_model = BM25Okapi(filtered_lyrics_tokens)

    # Get BM25 scores and top results
    bm25_results, bm25_scores = bm25_search(query, bm25_model=bm25_model, ls=filtered_ls, top_k=top_k_bm25)
    bm25_top_ids = bm25_results['id'].tolist()

    # BERT query embedding
    query_embedding = encode_bert_query(query, bert_model)

    # Score using BERT similarity
    bert_scores = []
    bm25_selected_scores = []
    for doc_id in bm25_top_ids:
        doc_embedding = bert_embeddings[doc_id]
        sim = cosine_similarity([query_embedding], [doc_embedding])[0][0]
        bert_scores.append(sim)

        # Get BM25 score using filtered_ls index
        index_in_filtered = filtered_ls[filtered_ls['id'] == doc_id].index[0]
        bm25_selected_scores.append(bm25_scores[filtered_ls.index.get_loc(index_in_filtered)])

    # Normalize scores
    bert_scores = np.array(bert_scores)
    bm25_selected_scores = np.array(bm25_selected_scores)

    bert_norm = (bert_scores - bert_scores.min()) / (bert_scores.max() - bert_scores.min() + 1e-8)
    bm25_norm = (bm25_selected_scores - bm25_selected_scores.min()) / (bm25_selected_scores.max() - bm25_selected_scores.min() + 1e-8)

    # Combine scores
    combined_scores = weight * bert_norm + (1 - weight) * bm25_norm
    sorted_indices = np.argsort(combined_scores)[::-1][:top_k_final]
    final_ids = [bm25_top_ids[i] for i in sorted_indices]

    # Build the result DataFrame
    results_df = ls[ls['id'].isin(final_ids)][['id', 'title', 'artist', 'lyrics']].copy()
    results_df['bm25_score'] = [bm25_selected_scores[i] for i in sorted_indices]
    results_df['bert_score'] = [bert_scores[i] for i in sorted_indices]
    results_df['combined_score'] = [combined_scores[i] for i in sorted_indices]

    return results_df.sort_values(by='combined_score', ascending=False).reset_index(drop=True)

In [21]:
# Same function with title filter incorporated and no errors?
def hybrid_search(query, ls, bm25_model, bert_model, bert_embeddings, top_k_bm25=20, top_k_final=5, weight=0.5, 
                  genre=None, artist=None, release_year=None, search_title_first=False):
    
    # Apply metadata filters
    filtered_ls = ls.copy()
    if genre:
        filtered_ls = filtered_ls[filtered_ls['genre'].str.lower() == genre.lower()]
    if artist:
        filtered_ls = filtered_ls[filtered_ls['artist'].str.lower() == artist.lower()]
    if release_year:
        filtered_ls = filtered_ls[filtered_ls['year'] == release_year]

    if filtered_ls.empty:
        print("No matching documents found after applying filters.")
        return pd.DataFrame()

    # First try title search
    if search_title_first:
        tokenized_titles = [bm25_query(title) for title in filtered_ls['title']]
        title_bm25 = BM25Okapi(tokenized_titles)
        query_tokens = bm25_query(query)
        title_scores = title_bm25.get_scores(query_tokens)
        
        top_title_indices = np.argsort(title_scores)[::-1][:top_k_final]
        top_title_scores = [title_scores[i] for i in top_title_indices]
        top_title_matches = filtered_ls.iloc[top_title_indices].copy()
        top_title_matches['bm25_score'] = top_title_scores
        
        # Optional: threshold to fall back to full search
        if top_title_matches['bm25_score'].max() > 1.0:  # Adjust threshold as needed
            return top_title_matches[['id', 'title', 'artist', 'lyrics']].reset_index(drop=True)

    # Proceed with full hybrid search (lyrics)
    tokenized_lyrics = [bm25_query(lyrics) for lyrics in filtered_ls['preprocessed_lyrics']]
    bm25_model = BM25Okapi(tokenized_lyrics)
    
    bm25_results, bm25_scores = bm25_search(query, bm25_model=bm25_model, ls=filtered_ls, top_k=top_k_bm25)
    bm25_top_ids = bm25_results['id'].tolist()

    # BERT query embedding
    query_embedding = encode_bert_query(query, bert_model)

    # BERT scores
    bert_scores = []
    bm25_selected_scores = []
    for doc_id in bm25_top_ids:
        doc_embedding = bert_embeddings[doc_id]
        sim = cosine_similarity([query_embedding], [doc_embedding])[0][0]
        bert_scores.append(sim)

        index_in_filtered = filtered_ls[filtered_ls['id'] == doc_id].index[0]
        bm25_selected_scores.append(bm25_scores[filtered_ls.index.get_loc(index_in_filtered)])

    # Normalize
    bert_scores = np.array(bert_scores)
    bm25_selected_scores = np.array(bm25_selected_scores)

    bert_norm = (bert_scores - bert_scores.min()) / (bert_scores.max() - bert_scores.min() + 1e-8)
    bm25_norm = (bm25_selected_scores - bm25_selected_scores.min()) / (bm25_selected_scores.max() - bm25_selected_scores.min() + 1e-8)
    combined_scores = weight * bert_norm + (1 - weight) * bm25_norm

    # Final sort
    sorted_indices = np.argsort(combined_scores)[::-1][:top_k_final]
    final_ids = [bm25_top_ids[i] for i in sorted_indices]

    results_df = ls[ls['id'].isin(final_ids)][['id', 'title', 'artist', 'lyrics']].copy()
    results_df['bm25_score'] = [bm25_selected_scores[i] for i in sorted_indices]
    results_df['bert_score'] = [bert_scores[i] for i in sorted_indices]
    results_df['combined_score'] = [combined_scores[i] for i in sorted_indices]

    return results_df.sort_values(by='combined_score', ascending=False).reset_index(drop=True)


In [22]:
# Implement hybrid search
bm25_model = BM25Okapi([str(doc).split() for doc in ls['preprocessed_lyrics']])
query = "Alejandro"
results = hybrid_search(query, ls, bm25_model, bert_model, bert_embeddings, 20, 5, weight=0.5, genre= 'pop', search_title_first=True)
print(results)


      id        title             artist  \
0   1189    Alejandro          Lady Gaga   
1   1313  Bad Romance          Lady Gaga   
2  21575   Pirate Jet           Gorillaz   
3   1299    Paper Bag        Fiona Apple   
4   1282     My Girls  Animal Collective   

                                              lyrics  
0  [Intro]\nI know that we are young, and I know ...  
1  [Intro]\nOh-oh-oh-oh-oh, oh-oh-oh-oh, oh-oh-oh...  
2  [Verse: 2-D & The Purple, the People, the Plas...  
3  [Verse 1]\nI was staring at the sky, just look...  
4  [Intro]\nIsn't much that I feel I need\nA soli...  


# Augmented model: BM25 + Datamuse API + BERT

In [188]:
def get_related_words(query, max_results=10):
    url = "https://api.datamuse.com/words"
    params = {
        "ml": query,      # 'ml' = means like (semantic similarity)
        "max": max_results
    }

    response = requests.get(url, params=params)
    if response.status_code == 200:
        words = [item['word'] for item in response.json()]
        return words
    else:
        print("Datamuse API error:", response.status_code)
        return []


# Expand the query by adding related terms from Datamuse
def expand_query_with_related_terms(query, max_related=5):
    related = get_related_words(query, max_related)
    expanded = query + " " + " ".join(related)
    return expanded


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [201]:
import requests

# Function to expand the query using the Datamuse API
def expand_query_with_related_terms(query, max_related=5):
    # Send request to Datamuse API to get related words and phrases
    response = requests.get(f"https://api.datamuse.com/words?ml={query}&max={max_related}")
    related_words = [word_info['word'] for word_info in response.json()]
    
    # Join related words with the original query for expansion
    expanded_query = query + " " + " ".join(related_words)
    return expanded_query

def hybrid_search(query, ls, bm25_model, bert_model, bert_embeddings, top_k_bm25=20, top_k_final=5, weight=0.5, 
                  genre=None, artist=None, release_year=None, search_title_first=False):
    
    # Apply metadata filters
    filtered_ls = ls.copy()
    if genre:
        filtered_ls = filtered_ls[filtered_ls['genre'].str.lower() == genre.lower()]
    if artist:
        filtered_ls = filtered_ls[filtered_ls['artist'].str.lower() == artist.lower()]
    if release_year:
        filtered_ls = filtered_ls[filtered_ls['year'] == release_year]

    if filtered_ls.empty:
        print("No matching documents found after applying filters.")
        return pd.DataFrame()

    # Expand query with related terms from Datamuse
    expanded_query = expand_query_with_related_terms(query, max_related=5)
    print(f"Expanded Query: {expanded_query}")  # Optional: print expanded query for debugging

    # First try title search
    if search_title_first:
        tokenized_titles = [bm25_query(title) for title in filtered_ls['title']]
        title_bm25 = BM25Okapi(tokenized_titles)
        query_tokens = bm25_query(expanded_query)
        title_scores = title_bm25.get_scores(query_tokens)
        
        top_title_indices = np.argsort(title_scores)[::-1][:top_k_final]
        top_title_scores = [title_scores[i] for i in top_title_indices]
        top_title_matches = filtered_ls.iloc[top_title_indices].copy()
        top_title_matches['bm25_score'] = top_title_scores
        
        # Optional: threshold to fall back to full search
        if top_title_matches['bm25_score'].max() > 1.0:  # Adjust threshold as needed
            return top_title_matches[['id', 'title', 'artist', 'lyrics']].reset_index(drop=True)

    # Proceed with full hybrid search (lyrics)
    tokenized_lyrics = [bm25_query(lyrics) for lyrics in filtered_ls['preprocessed_lyrics']]
    bm25_model = BM25Okapi(tokenized_lyrics)
    
    bm25_results, bm25_scores = bm25_search(expanded_query, bm25_model=bm25_model, ls=filtered_ls, top_k=top_k_bm25)
    bm25_top_ids = bm25_results['id'].tolist()

    # BERT query embedding
    query_embedding = encode_bert_query(expanded_query, bert_model)

    # BERT scores
    bert_scores = []
    bm25_selected_scores = []
    for doc_id in bm25_top_ids:
        doc_embedding = bert_embeddings[doc_id]
        sim = cosine_similarity([query_embedding], [doc_embedding])[0][0]
        bert_scores.append(sim)

        index_in_filtered = filtered_ls[filtered_ls['id'] == doc_id].index[0]
        bm25_selected_scores.append(bm25_scores[filtered_ls.index.get_loc(index_in_filtered)])

    # Normalize
    bert_scores = np.array(bert_scores)
    bm25_selected_scores = np.array(bm25_selected_scores)

    bert_norm = (bert_scores - bert_scores.min()) / (bert_scores.max() - bert_scores.min() + 1e-8)
    bm25_norm = (bm25_selected_scores - bm25_selected_scores.min()) / (bm25_selected_scores.max() - bm25_selected_scores.min() + 1e-8)
    combined_scores = weight * bert_norm + (1 - weight) * bm25_norm

    # Final sort
    sorted_indices = np.argsort(combined_scores)[::-1][:top_k_final]
    final_ids = [bm25_top_ids[i] for i in sorted_indices]

    results_df = ls[ls['id'].isin(final_ids)][['id', 'title', 'artist', 'lyrics']].copy()
    results_df['bm25_score'] = [bm25_selected_scores[i] for i in sorted_indices]
    results_df['bert_score'] = [bert_scores[i] for i in sorted_indices]
    results_df['combined_score'] = [combined_scores[i] for i in sorted_indices]

    return results_df.sort_values(by='combined_score', ascending=False).reset_index(drop=True)




In [202]:
# Implement hybrid search
bm25_model = BM25Okapi([str(doc).split() for doc in ls['preprocessed_lyrics']])
query = "don't call my name"
results = hybrid_search(query, ls, bm25_model, bert_model, bert_embeddings, 20, 5, weight=0.5, genre= 'pop', search_title_first=False)
print(results)


Expanded Query: don't call my name names naming term give title
     id               title           artist  \
0   741             Anxiety  Black Eyed Peas   
1   911           Ego Remix           Beyonc   
2   916         Billionaire     Travie McCoy   
3  1170  Runaway Love Remix    Justin Bieber   
4  1189           Alejandro        Lady Gaga   

                                              lyrics  bm25_score  bert_score  \
0  [Verse 1]\nI feel like I wanna smack somebody\...    7.396500    0.318183   
1  [Verse 1: Kanye West]\nI got a big ego (Ha ha ...    6.256792    0.213124   
2  [Pre-Chorus: Bruno Mars]\nI wanna be a billion...    5.148655    0.141291   
3  [Intro: Raekwon]\nVisual visual, JB\nYo, Ye, w...    2.757639    0.157454   
4  [Intro]\nI know that we are young, and I know ...    2.791590    0.147648   

   combined_score  
0        1.000000  
1        0.738902  
2        0.538147  
3        0.404832  
4        0.389947  


# Analyze results

In [189]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Exact match precision and recall calculation
def exact_match_precision_recall(retrieved_titles, relevant_titles, k=5):

    retrieved_set = set(retrieved_titles[:k])  # Top k retrieved titles
    relevant_set = set(relevant_titles)  # Ground truth titles
    
    # Precision
    precision = len(retrieved_set & relevant_set) / k
    # Recall
    recall = len(retrieved_set & relevant_set) / len(relevant_titles) if relevant_titles else 0.0
    
    return precision, recall

# F1-score 
def f1_at_k(precision, recall):
    if precision + recall == 0:
        return 0.0
    return 2 * (precision * recall) / (precision + recall)

# NDCG (by title relevance)
def ndcg_at_k(retrieved_titles, relevant_titles, k=5):
    """
    NDCG for title-based search.
    """
    def dcg(scores):
        return sum(score / np.log2(idx + 2) for idx, score in enumerate(scores))
    
    # Create binary relevance scores (1 if the title is in ground truth, else 0)
    relevance_scores = [1 if title in relevant_titles else 0 for title in retrieved_titles[:k]]
    ideal_relevance_scores = sorted(relevance_scores, reverse=True)
    
    dcg_val = dcg(relevance_scores)
    idcg_val = dcg(ideal_relevance_scores)
    
    return dcg_val / idcg_val if idcg_val != 0 else 0.0

# Example queries
queries = ["killer queen", "we will we will rock you", "she wears short skirts i wear"]


ground_truth_titles = {
    "killer queen": ["Killer Queen", "Alone in the Dark", "Solitude Anthem"],
    "we will we will rock you": ["We Will Rock You", "Hot Summer Nights", "Sunshine Romance"],
    "she wears short skirts i wear": ["You Belong With Me", "Broken Hearted", "Dreams Gone Bad"]
}

for query in queries:
    retrieved_titles = search_titles(query)  # Assume the search engine returns top 5 titles
    relevant_titles = ground_truth_titles.get(query, [])  # The ground truth titles for this query

    # Exact Match Evaluation
    exact_precision, exact_recall = exact_match_precision_recall(retrieved_titles, relevant_titles, k=5)
    
    # F1-Score 
    exact_f1 = f1_at_k(exact_precision, exact_recall)
    # NDCG score by title
    ndcg = ndcg_at_k(retrieved_titles, relevant_titles, k=5)

    print(f"\nQuery: '{query}'")
    print(f"Exact Match Precision@5: {exact_precision:.2f}")
    print(f"Exact Match Recall@5:    {exact_recall:.2f}")
    print(f"Exact Match F1@5:        {exact_f1:.2f}")
    print(f"NDCG@5 (Title):          {ndcg:.2f}")




In [33]:
ls[ls['artist']=='Queen']

Unnamed: 0,title,genre,artist,year,views,features,lyrics,id,preprocessed_lyrics
571,Bohemian Rhapsody,rock,Queen,1975,9247817,{},[Intro]\nIs this the real life? Is this just f...,1063,[intro] real life fantasi caught landslid no e...


# References
https://medium.com/@bormotovk/hybrid-retrieval-combining-bert-and-bm25-for-enhanced-performance-4f6f80881c13
