# Lyrics search engine

In [43]:
# imports
# !pip install rank_bm25
# !pip install sentence_transformers
import rank_bm25
import pandas as pd
from collections import defaultdict
import numpy as np
import json
from rank_bm25 import BM25Okapi
from sklearn.metrics.pairwise import cosine_similarity

# Import dataset

In [29]:
ls= pd.read_csv('preprocessed_genius_lyrics.csv')
ls.drop(['Unnamed: 0'], axis=1, inplace=True)
ls.head()

Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,preprocessed_lyrics
0,Killa Cam,rap,Cam'ron,2004,173166,"{""Cam\\'ron"",""Opera Steve""}","[Chorus: Opera Steve & Cam'ron]\nKilla Cam, Ki...",1,[choru opera steve cam'ron] killa cam killa ca...
1,Can I Live,rap,JAY-Z,1996,468624,{},"[Produced by Irv Gotti]\n\n[Intro]\nYeah, hah,...",3,[produc irv gotti] [intro] yeah hah yeah roc-a...
2,Down and Out,rap,Cam'ron,2004,144404,"{""Cam\\'ron"",""Kanye West"",""Syleena Johnson""}",[Produced by Kanye West and Brian Miller]\n\n[...,5,[produc kany west brian miller] [intro cam'ron...
3,Fly In,rap,Lil Wayne,2005,78271,{},"[Intro]\nSo they ask me\n""Young boy\nWhat you ...",6,[intro] ask me young boy you gon' second time ...
4,Lollipop Remix,rap,Lil Wayne,2008,580832,"{""Kanye West"",""Static Major""}",[Intro: Lil Wayne]\nHaha\nUh-huh\nNo homo (You...,7,[intro lil wayne] haha uh-huh no homo young mu...


In [14]:
ls['Unnamed: 0']

0          0
1          1
2          2
3          3
4          4
        ... 
1144    1144
1145    1145
1146    1146
1147    1147
1148    1148
Name: Unnamed: 0, Length: 1149, dtype: int64

# Indexer

The indexer creates an inverted index, mapping terms to their locations in documents for fast retrieval. 
- Needs preprocessed lyrics
- Consists of two levels: a vocabulary of index terms (typically words) and lists that map each term to the documents where it appears.
- Elasticsearch/Lucene builds inverted index
- Metadata fields (genre, year, and song section) indexed separately.
- Search engine incorporates a section-based filter: users to determine where their query terms appear in the lyrics (e.g., verse, chorus, bridge). - - These fields are structured using Elasticsearch mappings, allowing users to refine searches based on genre, release year, and specific song sections.

In [35]:
# Apache Lucene or ElasticSearch
# BM25 Inverted index
# Query processor BM25. Prepare corpus fro BM25 (tokenized texts)

def build_bm25_index(ls):
    inverted_index = defaultdict(dict)
    doc_lengths = {}
    total_docs = len(ls)

    for idx, row in ls.iterrows():
        doc_id = row['id']
        # Ensure tokens are separated properly
        tokens = row['preprocessed_lyrics']
        if isinstance(tokens, str):
            tokens = tokens.split()

        doc_lengths[doc_id] = len(tokens)

        term_freqs = defaultdict(int)
        for token in tokens:
            term_freqs[token] += 1

        for token, freq in term_freqs.items():
            inverted_index[token][doc_id] = freq

    return inverted_index, doc_lengths, total_docs

In [36]:
# BERT Embedder
from sentence_transformers import SentenceTransformer
def build_bert_index(ls, model_name='sentence-transformers/all-MiniLM-L6-v2'):
    model = SentenceTransformer(model_name)
    doc_texts = ls['preprocessed_lyrics'].astype(str).tolist()
    doc_ids = ls['id'].tolist()

    # Get embeddings
    embeddings = model.encode(doc_texts, convert_to_numpy=True, show_progress_bar=True)

    # Return doc IDs and embeddings
    return dict(zip(doc_ids, embeddings))

# Query processor
- Tokenizing, stemming, and normalizing the query using NLTK or spaCy
- Input: Query
1. Fuzzy matching incorporated using Levenshtein distance to approximate string matching
    - Datamuse API: Related words and phrases through Datamuse API
2. Filtering with Elasticsearch’s boolean queries (based on song metadata attributes, such as genre, release year, artist, and specific song sections)

In [39]:
#BM25 Scoring logic
import re

def preprocess_bm25_query(query):
    # Lowercase, remove non-alphanumeric chars, split
    query = query.lower()
    tokens = re.findall(r'\b\w+\b', query)  # Only words

    return tokens  # Return list of tokens


In [40]:
# BERT Scoring logic
def encode_bert_query(query, model):
    # Encode the raw query string
    embedding = model.encode(query, convert_to_numpy=True)
    return embedding

# Baseline model: BM25 + BERT
- Generates relevance scores for lyrics based on user's query

- Ranking layer 1: BM25+BERT
- Ranking layer 2: Datamuse API for phonetic alignment of lyrics with query

In [None]:
# Create ranking of lyrics with BM25

# Load preprocessed lyrics
tokenized_corpus = [str(doc).split() for doc in ls['preprocessed_lyrics']]
bm25 = BM25Okapi(tokenized_corpus)

def bm25_search(query, top_k=5):
    tokens = preprocess_bm25_query(query)
    scores = bm25.get_scores(tokens)
    top_indices = np.argsort(scores)[::-1][:top_k]
    return ls.iloc[top_indices][['title', 'artist', 'lyrics']]


In [41]:
# Embed query for BERT model to compare with document embeddings

def bert_search(query, doc_embeddings, model, ls, top_k=5):
    query_vec = encode_bert_query(query, model)

    # Compute cosine similarity to each document
    doc_ids = list(doc_embeddings.keys())
    doc_vecs = np.array([doc_embeddings[doc_id] for doc_id in doc_ids])
    
    similarities = cosine_similarity([query_vec], doc_vecs)[0]
    top_indices = np.argsort(similarities)[::-1][:top_k]

    top_doc_ids = [doc_ids[i] for i in top_indices]
    return ls[ls['id'].isin(top_doc_ids)][['title', 'artist', 'lyrics']]


# Enter query

In [44]:
# Step 1: Build BM25 model using preprocessed_lyrics
tokenized_corpus = [str(doc).split() for doc in ls['preprocessed_lyrics']]
bm25_model = BM25Okapi(tokenized_corpus)

# Step 2: Build BERT index
bert_index, bert_model = build_bert_index(ls)

# Step 3: Example Query
query = "feeling heartbroken and lost love"

print("BM25 Results:\n")
print(bm25_search(query, ls, bm25_model))

print("\nBERT Results:\n")
print(bert_search(query, ls, bert_index, bert_model))

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/36 [00:00<?, ?it/s]

ValueError: too many values to unpack (expected 2)

In [45]:
# Hybrid search
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi

def hybrid_search(query, ls, bm25_model, bert_model, bert_index, top_k_bm25=20, top_k_final=5, weight=0.5):
    """
    Performs hybrid search using BM25 followed by BERT re-ranking.

    Parameters:
    - query (str): user query string
    - ls (DataFrame): lyrics DataFrame
    - bm25_model (BM25Okapi): pre-built BM25 model
    - bert_model (SentenceTransformer): preloaded BERT model
    - bert_index (dict): doc_id -> embedding
    - top_k_bm25 (int): how many candidates to retrieve from BM25
    - top_k_final (int): how many final results to return
    - weight (float): weight of BERT in final score (0–1)

    Returns:
    - DataFrame of top results with combined ranking
    """
    # Step 1: BM25 Search
    bm25_tokens = query.lower().split()
    bm25_scores = bm25_model.get_scores(bm25_tokens)
    bm25_top_indices = np.argsort(bm25_scores)[::-1][:top_k_bm25]
    bm25_top_ids = ls.iloc[bm25_top_indices]['id'].tolist()
    
    # Step 2: BERT query embedding
    query_embedding = bert_model.encode(query, convert_to_numpy=True)

    # Step 3: BERT scores for BM25 top candidates
    bert_scores = []
    bm25_selected_scores = []
    for doc_id in bm25_top_ids:
        doc_embedding = bert_index[doc_id]
        sim = cosine_similarity([query_embedding], [doc_embedding])[0][0]
        bert_scores.append(sim)
        bm25_selected_scores.append(bm25_scores[ls[ls['id'] == doc_id].index[0]])

    # Step 4: Normalize scores
    bert_scores = np.array(bert_scores)
    bm25_selected_scores = np.array(bm25_selected_scores)

    bert_norm = (bert_scores - bert_scores.min()) / (bert_scores.max() - bert_scores.min() + 1e-8)
    bm25_norm = (bm25_selected_scores - bm25_selected_scores.min()) / (bm25_selected_scores.max() - bm25_selected_scores.min() + 1e-8)

    # Step 5: Combine and sort
    combined_scores = weight * bert_norm + (1 - weight) * bm25_norm
    sorted_indices = np.argsort(combined_scores)[::-1][:top_k_final]
    final_ids = [bm25_top_ids[i] for i in sorted_indices]

    return ls[ls['id'].isin(final_ids)][['id', 'title', 'artist', 'lyrics']]


In [None]:
# Implement hybrid search
bm25_model = BM25Okapi([str(doc).split() for doc in ls['preprocessed_lyrics']])
bert_index, bert_model = build_bert_index(ls)

query = "lonely heartbreak in the rain"
results = hybrid_search(query, ls, bm25_model, bert_model, bert_index, weight=0.6)
print(results)

Batches:   0%|          | 0/36 [00:00<?, ?it/s]

# Analyze results

In [None]:
#Analyze results
def analyze_results(results_df):
    comparison_counts = results_df['comparison'].value_counts()

    total_results = len(results_df)

    analysis_df = pd.DataFrame({
        "Count": comparison_counts,
        "Percentage": (comparison_counts / total_results) * 100
    })

    for status in ['Improved', 'Unchanged', 'Worsened']:
        if status not in analysis_df.index:
            analysis_df.loc[status] = [0, 0.0]  # Add missing status with 0 count and 0% percentage

    analysis_df = analysis_df.loc[['Improved', 'Unchanged', 'Worsened']]

    return analysis_df

# Augmented model: BM25 + Datamuse API + BERT

# References
https://medium.com/@bormotovk/hybrid-retrieval-combining-bert-and-bm25-for-enhanced-performance-4f6f80881c13
