# Lyrics search engine

In [1]:
# imports
!pip install rank_bm25
!pip install sentence_transformers
import rank_bm25
import pandas as pd
from collections import defaultdict
import numpy as np
import json
from rank_bm25 import BM25Okapi
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi

Collecting rank_bm25
  Using cached rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Using cached rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2
Collecting sentence_transformers
  Using cached sentence_transformers-4.0.2-py3-none-any.whl.metadata (13 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence_transformers)
  Using cached transformers-4.51.2-py3-none-any.whl.metadata (38 kB)
Collecting torch>=1.11.0 (from sentence_transformers)
  Using cached torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl.metadata (28 kB)
Collecting huggingface-hub>=0.20.0 (from sentence_transformers)
  Using cached huggingface_hub-0.30.2-py3-none-any.whl.metadata (13 kB)
Collecting filelock (from huggingface-hub>=0.20.0->sentence_transformers)
  Using cached filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_nvrtc_cu

# Import dataset

In [2]:
ls= pd.read_csv('preprocessed_genius_lyrics.csv')
ls.drop(['Unnamed: 0'], axis=1, inplace=True)
ls.head()

Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,preprocessed_lyrics
0,Killa Cam,rap,Cam'ron,2004,173166,"{""Cam\\'ron"",""Opera Steve""}","[Chorus: Opera Steve & Cam'ron]\nKilla Cam, Ki...",1,[choru opera steve cam'ron] killa cam killa ca...
1,Can I Live,rap,JAY-Z,1996,468624,{},"[Produced by Irv Gotti]\n\n[Intro]\nYeah, hah,...",3,[produc irv gotti] [intro] yeah hah yeah roc-a...
2,Down and Out,rap,Cam'ron,2004,144404,"{""Cam\\'ron"",""Kanye West"",""Syleena Johnson""}",[Produced by Kanye West and Brian Miller]\n\n[...,5,[produc kany west brian miller] [intro cam'ron...
3,Fly In,rap,Lil Wayne,2005,78271,{},"[Intro]\nSo they ask me\n""Young boy\nWhat you ...",6,[intro] ask me young boy you gon' second time ...
4,Lollipop Remix,rap,Lil Wayne,2008,580832,"{""Kanye West"",""Static Major""}",[Intro: Lil Wayne]\nHaha\nUh-huh\nNo homo (You...,7,[intro lil wayne] haha uh-huh no homo young mu...


In [24]:
ls['lyrics']

0       [Chorus: Opera Steve & Cam'ron]\nKilla Cam, Ki...
1       [Produced by Irv Gotti]\n\n[Intro]\nYeah, hah,...
2       [Produced by Kanye West and Brian Miller]\n\n[...
3       [Intro]\nSo they ask me\n"Young boy\nWhat you ...
4       [Intro: Lil Wayne]\nHaha\nUh-huh\nNo homo (You...
                              ...                        
1144    [Hook]\nAnother night, slips away\nIn other wo...
1145    [Intro: Michael Jackson and 60 Minutes' Ed Bra...
1146    [Chorus: Kid Cudi & John Legend]\nI got the wo...
1147    [Chorus 1: Bruno Mars & B.o.B]\nBeautiful girl...
1148    [Intro]\nK-K-K-K-K-K-Mac\n\n[Verse 1: Chris Br...
Name: lyrics, Length: 1149, dtype: object

# Indexer

The indexer creates an inverted index, mapping terms to their locations in documents for fast retrieval. 
- Needs preprocessed lyrics
- Consists of two levels: a vocabulary of index terms (typically words) and lists that map each term to the documents where it appears.
- Elasticsearch/Lucene builds inverted index
- Metadata fields (genre, year, and song section) indexed separately.
- Search engine incorporates a section-based filter: users to determine where their query terms appear in the lyrics (e.g., verse, chorus, bridge). - - These fields are structured using Elasticsearch mappings, allowing users to refine searches based on genre, release year, and specific song sections.

In [4]:
# Apache Lucene or ElasticSearch
# BM25 Inverted index
# Query processor BM25. Prepare corpus fro BM25 (tokenized texts)

def build_bm25_index(ls):
    inverted_index = defaultdict(dict)
    doc_lengths = {}
    total_docs = len(ls)

    for idx, row in ls.iterrows():
        doc_id = row['id']
        # Ensure tokens are separated properly
        tokens = row['preprocessed_lyrics']
        if isinstance(tokens, str):
            tokens = tokens.split()

        doc_lengths[doc_id] = len(tokens)

        term_freqs = defaultdict(int)
        for token in tokens:
            term_freqs[token] += 1

        for token, freq in term_freqs.items():
            inverted_index[token][doc_id] = freq

    return inverted_index, doc_lengths, total_docs

In [44]:
# BERT Embedder
bert_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def build_bert_index(ls, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    model = SentenceTransformer(model_name)
    lyrics_texts = ls['preprocessed_lyrics'].astype(str).tolist()
    embeddings = model.encode(lyrics_texts, convert_to_numpy=True)
    doc_ids = ls["id"].tolist()
    doc_embeddings = dict(zip(doc_ids, embeddings))
    return doc_embeddings, model  #  returns both index and model

In [36]:
bert_embeddings = np.load('bert_embeddings.npy',allow_pickle=False)

# Get document IDs from your DataFrame
doc_ids = ls["id"].tolist()

# Build a dictionary: {doc_id: embedding}
bert_embeddings = dict(zip(doc_ids, bert_embeddings))
# bert_embeddings

# Query processor
- Tokenizing, stemming, and normalizing the query using NLTK or spaCy
- Input: Query
1. Fuzzy matching incorporated using Levenshtein distance to approximate string matching
    - Datamuse API: Related words and phrases through Datamuse API
2. Filtering with Elasticsearch’s boolean queries (based on song metadata attributes, such as genre, release year, artist, and specific song sections)

In [30]:
#BM25 Scoring logic
import re

def preprocess_bm25_query(query):
    # Lowercase, remove non-alphanumeric chars, split
    query = query.lower()
    tokens = re.findall(r'\b\w+\b', query)  # Only words

    return tokens  # Return list of tokens


In [31]:
# BERT Scoring logic
def encode_bert_query(query, model):
    # Encode the raw query string
    embedding = model.encode(query)
    return embedding

# Baseline model: BM25 + BERT
- Generates relevance scores for lyrics based on user's query

- Ranking layer 1: BM25+BERT
- Ranking layer 2: Datamuse API for phonetic alignment of lyrics with query

In [8]:
# Create ranking of lyrics with BM25

# Load preprocessed lyrics
tokenized_corpus = [str(doc).split() for doc in ls['preprocessed_lyrics']]
bm25 = BM25Okapi(tokenized_corpus)

def bm25_search(query, top_k=5):
    tokens = preprocess_bm25_query(query)
    scores = bm25.get_scores(tokens)
    top_indices = np.argsort(scores)[::-1][:top_k]
    return ls.iloc[top_indices][['title', 'artist', 'lyrics']]


In [32]:
# Embed query for BERT model to compare with document embeddings
from sentence_transformers import SentenceTransformer
def bert_search(query, doc_embeddings, model, ls, top_k=5):
    query_vec = encode_bert_query(query, model)

    # Compute cosine similarity to each document
    doc_ids = list(doc_embeddings.keys())
    doc_vecs = np.array([doc_embeddings[doc_id] for doc_id in doc_ids])
    
    similarities = cosine_similarity([query_vec], doc_vecs)[0]
    top_indices = np.argsort(similarities)[::-1][:top_k]

    top_doc_ids = [doc_ids[i] for i in top_indices]
    return ls[ls['id'].isin(top_doc_ids)][['title', 'artist', 'lyrics']]

# def build_bert_index(ls, model_name="sentence-transformers/all-MiniLM-L6-v2"):
#     model = SentenceTransformer(model_name)
#     lyrics_texts = ls['preprocessed_lyrics'].astype(str).tolist()
#     embeddings = model.encode(lyrics_texts, convert_to_numpy=True)
#     doc_ids = ls["id"].tolist()
#     doc_embeddings = dict(zip(doc_ids, embeddings))
#     return doc_embeddings, model  #  returns both index and model



# Enter query

In [37]:
# Step 1: Build BM25 model using preprocessed_lyrics
tokenized_corpus = [str(doc).split() for doc in ls['preprocessed_lyrics']]
bm25_model = BM25Okapi(tokenized_corpus)

# # Step 2: Build BERT index
# bert_index, bert_model = build_bert_index(ls, model_name)

# Step 3: Example Query
query = "Young boy"

print("BM25 Results:\n")
print(bm25_search(query, 5))

print("\nBERT Results:\n")
print(bert_search(query, bert_embeddings, bert_model, ls))

BM25 Results:

                  title     artist  \
61             Go Crazy      Jeezy   
134  This Is the Carter  Lil Wayne   
847            Mr. 17.5      Jeezy   
174         I Told Yall  Lil Wayne   
320     Look Like Jesus      Lil B   

                                                lyrics  
61   [Produced by Don Cannon]\n\n[Verse 1: Young Je...  
134  [Intro: Lil Wayne & Mannie Fresh]\nOkay, um, f...  
847  [Verse 1: Young Jeezy]\nNew shoes on the Range...  
174  [Intro: DJ K-Swift & Lil Wayne]\nTold y'all, I...  
320  [Intro]\nNiggas hatin on me bro, but I don't g...  

BERT Results:

                          title     artist  \
3                        Fly In  Lil Wayne   
12   What Happened to That Boy?    Birdman   
413                My President      Jeezy   
495                   Hype Boys       Sway   
590               Young Forever      JAY-Z   

                                                lyrics  
3    [Intro]\nSo they ask me\n"Young boy\nWhat you ...  
12   [I

In [41]:
# Hybrid search
bert_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def hybrid_search(query, ls, bm25_model, bert_model, bert_embeddings, top_k_bm25=20, top_k_final=5, weight=0.5):

    # Step 1: BM25 Search
    bm25_tokens = query.lower().split()
    bm25_scores = bm25_model.get_scores(bm25_tokens)
    bm25_top_indices = np.argsort(bm25_scores)[::-1][:top_k_bm25]
    bm25_top_ids = ls.iloc[bm25_top_indices]['id'].tolist()
    
    # Step 2: BERT query embedding
    query_embedding = bert_model.encode(query, convert_to_numpy=True)

    # Step 3: BERT scores for BM25 top candidates
    bert_scores = []
    bm25_selected_scores = []
    for doc_id in bm25_top_ids:
        doc_embedding = bert_embeddings[doc_id]
        sim = cosine_similarity([query_embedding], [doc_embedding])[0][0]
        bert_scores.append(sim)
        bm25_selected_scores.append(bm25_scores[ls[ls['id'] == doc_id].index[0]])

    # Step 4: Normalize scores
    bert_scores = np.array(bert_scores)
    bm25_selected_scores = np.array(bm25_selected_scores)

    bert_norm = (bert_scores - bert_scores.min()) / (bert_scores.max() - bert_scores.min() + 1e-8)
    bm25_norm = (bm25_selected_scores - bm25_selected_scores.min()) / (bm25_selected_scores.max() - bm25_selected_scores.min() + 1e-8)

    # Step 5: Combine and sort
    combined_scores = weight * bert_norm + (1 - weight) * bm25_norm
    sorted_indices = np.argsort(combined_scores)[::-1][:top_k_final]
    final_ids = [bm25_top_ids[i] for i in sorted_indices]

    return ls[ls['id'].isin(final_ids)][['id', 'title', 'artist', 'lyrics']]


In [51]:
# Implement hybrid search
bm25_model = BM25Okapi([str(doc).split() for doc in ls['preprocessed_lyrics']])
query = "i hear jerusalem bells"
results = hybrid_search(query, ls, bm25_model, bert_model, bert_embeddings, 20, 5, weight=0.1)
print(results)


        id         title     artist  \
70      83  Mrs. Officer  Lil Wayne   
347    410       My Time   Fabolous   
711    820         Trill     Clipse   
915   1077     Music Box     Eminem   
1135  1395     Blasphemy       2Pac   

                                                 lyrics  
70    [Intro: Bobby Valentino & (Lil Wayne)]\nAyy\nA...  
347   [Hook: Jeremih]\nGo hard today\nCan't worry 'b...  
711   [Chorus: Pharrell]\nUh, I got jewels (Uh), plu...  
915   [Intro]\nYeah\nYeah, girl\nCan you hear that? ...  
1135  [Intro: This Week In Bible Prophecy]\nGod has ...  


# Analyze results

In [None]:
#Analyze results
def analyze_results(results_df):
    comparison_counts = results_df['comparison'].value_counts()

    total_results = len(results_df)

    analysis_df = pd.DataFrame({
        "Count": comparison_counts,
        "Percentage": (comparison_counts / total_results) * 100
    })

    for status in ['Improved', 'Unchanged', 'Worsened']:
        if status not in analysis_df.index:
            analysis_df.loc[status] = [0, 0.0]  # Add missing status with 0 count and 0% percentage

    analysis_df = analysis_df.loc[['Improved', 'Unchanged', 'Worsened']]

    return analysis_df

# Augmented model: BM25 + Datamuse API + BERT

# References
https://medium.com/@bormotovk/hybrid-retrieval-combining-bert-and-bm25-for-enhanced-performance-4f6f80881c13
