In [None]:
# Upgrade pip and install pipx
!pip install --quiet --upgrade pip
!pip install --quiet --user pipx
!pipx ensurepath

# Install system dependencies
!sudo apt-get update && sudo apt-get install -y python3.10-venv ffmpeg

# Install Python packages
!pip install --quiet yt_dlp transformers torch faiss-cpu

# Install insanely-fast-whisper
!/root/.local/bin/pipx install git+https://github.com/Vaibhavs10/insanely-fast-whisper.git

!sudo apt-get install ffmpeg

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.5/1.8 MB[0m [31m16.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
[0m/bin/bash: line 1: pipx: command not found
Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,185 kB]
Get:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:6 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:8 https://r2u.stat.illinois

In [None]:
import json
import os
import re
import shutil
import subprocess
from typing import List, Dict, Any, Optional

import numpy as np
import pandas as pd
from yt_dlp import YoutubeDL
from sentence_transformers import SentenceTransformer
import faiss
import torch

# Choose the compute device once for the whole notebook: use the GPU when
# torch reports a usable CUDA device, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

# Define the LyricsEmbedding class
class LyricsEmbedding:
    """
    A class for generating semantic embeddings of song lyrics using the Sentence Transformers framework.

    By default this class uses the 'all-mpnet-base-v2' model, which is optimized for
    semantic similarity tasks.

    Attributes:
        model_name (str): Name of the pre-trained model that was loaded.
            Defaults to 'sentence-transformers/all-mpnet-base-v2'
        model: The loaded SentenceTransformer model, placed on DEVICE
    """

    def __init__(self, model_name: str = 'sentence-transformers/all-mpnet-base-v2'):
        """
        Initialize the LyricsEmbedding class with a specified model.

        Args:
            model_name (str): The name of the sentence-transformer model to use.
        """
        # Keep the name around so callers can tell which model produced a vector.
        self.model_name = model_name
        self.model = SentenceTransformer(model_name, device=DEVICE)

    def embed(self, lyrics: str) -> np.ndarray:
        """
        Generate embeddings for a single piece of lyrics.

        The embedding is normalized to unit length, so an inner-product search
        (as done by the FAISS index in this notebook) yields cosine similarity.

        Args:
            lyrics (str): The input lyrics text to embed

        Returns:
            np.ndarray: A 1-D normalized embedding vector as float32 (768 values
                       for the default model). float32 is used for compatibility
                       with FAISS.
        """
        with torch.no_grad():
            embedding = self.model.encode(
                lyrics,
                convert_to_numpy=True,
                normalize_embeddings=True
            )
        return embedding.astype('float32')

    def batch_embed(self, lyrics_list: List[str], batch_size: int = 32) -> np.ndarray:
        """
        Generate embeddings for multiple pieces of lyrics efficiently.

        Uses batching to process multiple lyrics simultaneously, which is significantly
        faster than processing them individually, especially on GPU.

        Args:
            lyrics_list (List[str]): List of lyrics texts to embed
            batch_size (int): Number of texts encoded per forward pass (default: 32)

        Returns:
            np.ndarray: A float32 matrix of shape (n_lyrics, dim) where each row is a
                       normalized embedding vector (dim is 768 for the default model).
                       An empty input yields an array of shape (0, dim).
        """
        # Encoding an empty list gives back a 1-D empty array, which breaks callers
        # that read shape[1]; return a correctly shaped empty matrix instead.
        if len(lyrics_list) == 0:
            dimension = self.model.get_sentence_embedding_dimension()
            return np.empty((0, dimension), dtype='float32')

        with torch.no_grad():
            embeddings = self.model.encode(
                lyrics_list,
                batch_size=batch_size,
                convert_to_numpy=True,
                normalize_embeddings=True
            )
        return embeddings.astype('float32')

# Clean lyrics
def clean_lyrics(lyrics: str) -> str:
    """
    Clean and standardize lyrics text to improve matching accuracy.

    This function applies two preprocessing steps:

    1. Removes metadata annotations: Eliminates text within square brackets like
       [Verse], [Chorus], [Producer], etc., as they don't contribute to the semantic
       meaning of the lyrics. Only annotations that open and close on the same
       line are removed, because '.' does not match a newline here.

    2. Normalizes whitespace: every run of whitespace (spaces, tabs, newlines,
       blank lines) is collapsed into a single space and leading/trailing
       whitespace is stripped, so the result is a single line of text.

    Args:
        lyrics (str): Raw lyrics text that may contain metadata, irregular spacing,
                     and other formatting elements

    Returns:
        str: Cleaned single-line lyrics without bracketed metadata annotations
    """
    lyrics = re.sub(r'\[.*?\]', '', lyrics)
    # One pass is enough: '\s+' already covers newlines and blank lines, so a
    # separate blank-line collapse beforehand would not change the result.
    lyrics = re.sub(r'\s+', ' ', lyrics)
    return lyrics.strip()

# Load and preprocess the lyrics dataset
def load_and_preprocess_dataset(file_path: str, top_n: int) -> pd.DataFrame:
    """
    Load and preprocess a large lyrics dataset efficiently using chunking.

    The function processes the dataset in chunks of 500,000 rows to handle
    large files without loading everything into memory at once. It selects
    the top_n songs by view count to create a manageable, high-quality subset.
    Only the selected rows have their lyrics cleaned, so the cleaning cost is
    proportional to top_n rather than to the size of the file.

    Args:
        file_path (str): Path to the CSV file containing the lyrics dataset.
            It must provide the columns 'title', 'artist', 'lyrics' and 'views'.
        top_n (int): Number of most viewed songs to keep

    Returns:
        pd.DataFrame: Processed dataset with columns ['title', 'artist', 'lyrics']
                     containing the top_n most viewed songs, ordered by views
                     (descending) with a fresh 0..n-1 index. Empty (but with
                     those columns) when the file yields no rows.
    """
    output_columns = ['title', 'artist', 'lyrics']
    chunksize = 500000

    # Collect the per-chunk winners in a list and concatenate once at the end
    # instead of repeatedly growing a DataFrame.
    chunk_tops = []
    # usecols skips columns this function never reads; the context manager
    # closes the file even if a chunk fails to parse.
    with pd.read_csv(
        file_path,
        chunksize=chunksize,
        usecols=output_columns + ['views'],
    ) as reader:
        for chunk in reader:
            chunk_tops.append(chunk.nlargest(top_n, 'views'))

    if not chunk_tops:
        return pd.DataFrame(columns=output_columns)

    # The global top_n must be among the per-chunk top_n rows.
    top_views_df = pd.concat(chunk_tops).nlargest(top_n, 'views')

    # Clean only the survivors; missing lyrics become empty strings first so
    # clean_lyrics always receives a str.
    top_views_df = top_views_df.assign(
        lyrics=top_views_df['lyrics'].fillna('').apply(clean_lyrics)
    )
    return top_views_df[output_columns].reset_index(drop=True)


# Create and save the FAISS index and metadata
def create_and_save_index(
    top_views_df: pd.DataFrame,
    index_path: str = "lyrics_index.faiss",
    metadata_path: str = "lyrics_metadata.json",
    model_name: str = 'sentence-transformers/all-mpnet-base-v2',
) -> None:
    """
    Create and save a FAISS similarity index for lyrics along with metadata.

    Uses a FAISS IndexFlatIP (Inner Product) index. Because the embeddings are
    normalized to unit length, the inner product equals cosine similarity.

    The function:
    1. Generates embeddings for all lyrics using the transformer model
    2. Creates a FAISS index from these embeddings
    3. Saves song metadata (titles, artists) separately for retrieval; entry i
       of each metadata list describes vector i of the index

    Args:
        top_views_df (pd.DataFrame): DataFrame with columns ['title', 'artist', 'lyrics']
        index_path (str): Path to save the FAISS index
        metadata_path (str): Path to save the JSON metadata
        model_name (str): Sentence-transformer model used to embed the lyrics.
            Queries against the index must be embedded with the same model.

    Raises:
        ValueError: If top_views_df has no rows, since there is nothing to index.
    """
    # Fail with a clear message instead of an IndexError on embeddings.shape[1].
    if top_views_df.empty:
        raise ValueError("Cannot build a lyrics index from an empty DataFrame.")

    embedder = LyricsEmbedding(model_name)
    embeddings = embedder.batch_embed(top_views_df['lyrics'].tolist())

    dimension = embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension)
    index.add(embeddings)

    faiss.write_index(index, index_path)

    metadata = {
        'titles': top_views_df['title'].tolist(),
        'artists': top_views_df['artist'].tolist(),
    }
    # ensure_ascii=False keeps non-ASCII titles/artists readable in the file.
    with open(metadata_path, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, ensure_ascii=False, indent=2)


# Download and transcribe audio from YouTube
def get_lyrics_from_youtube_url(youtube_url: str) -> Optional[str]:
    """
    Download a YouTube video's audio and transcribe it to obtain lyrics.

    Uses a three-step process:
    1. Downloads audio using yt-dlp (converted to temp.mp3 via ffmpeg)
    2. Transcribes using the insanely-fast-whisper command-line tool
    3. Processes the transcription output (output.json) into plain text

    Side effects: works with fixed file names in the current directory.
    temp.mp3 is removed after transcription; output.json and
    combined_lyrics.txt are left behind with the latest transcription.

    Args:
        youtube_url (str): URL of the YouTube music video

    Returns:
        Optional[str]: Transcribed lyrics if successful, None if any step fails
            (the error is printed rather than raised)
    """
    audio_path = 'temp.mp3'
    transcript_path = 'output.json'

    try:
        # Remove any transcript left by an earlier run. Otherwise a failed
        # transcription would go unnoticed and the previous video's lyrics
        # would be returned for this URL.
        if os.path.exists(transcript_path):
            os.remove(transcript_path)

        ydl_opts = {
            'format': 'bestaudio/best',
            'outtmpl': 'temp.%(ext)s',
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
            }],
            'quiet': True
        }

        with YoutubeDL(ydl_opts) as ydl:
            ydl.download([youtube_url])

        if not os.path.exists(audio_path):
            raise FileNotFoundError("The MP3 file (temp.mp3) was not created successfully.")

        try:
            # Prefer whatever is on PATH; fall back to the pipx location used
            # by the install cell of this notebook.
            whisper_bin = (
                shutil.which('insanely-fast-whisper')
                or '/root/.local/bin/insanely-fast-whisper'
            )
            # Argument list (no shell) and check=True so a failing
            # transcription raises instead of being silently ignored.
            subprocess.run([whisper_bin, '--file-name', audio_path], check=True)
        finally:
            # Clean up the downloaded audio whether or not transcription worked.
            if os.path.exists(audio_path):
                os.remove(audio_path)

        try:
            with open(transcript_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON: {e}")
            return None

        # Prefer the per-chunk texts when present; otherwise use the
        # top-level "text" field (empty string if that is missing too).
        if "chunks" in data:
            lyrics = " ".join(chunk["text"] for chunk in data["chunks"])
        else:
            lyrics = data.get("text", "")

        # Explicit UTF-8 so non-ASCII lyrics do not depend on the locale.
        with open('combined_lyrics.txt', 'w', encoding='utf-8') as output_file:
            output_file.write(lyrics)

        return lyrics

    except Exception as e:
        print(f"Error processing URL: {e}")
        return None

# Main function to get covers
def get_covers(
    youtube_url: str,
    k: int = 5,
    model_name: str = 'sentence-transformers/all-mpnet-base-v2',
    index_path: str = "lyrics_index.faiss",
    metadata_path: str = "lyrics_metadata.json",
) -> List[Dict[str, Any]]:
    """
    Find potential cover songs by comparing lyrics similarity between a YouTube video
    and a database of songs.

    The function performs these steps:
    1. Loads the FAISS index and its metadata (cheap, so problems surface before
       any download or transcription is attempted)
    2. Extracts and transcribes lyrics from the YouTube video
    3. Converts lyrics to embeddings using the transformer model
    4. Performs similarity search against the FAISS index
    5. Returns the k most similar songs with similarity scores

    Args:
        youtube_url (str): URL of the YouTube video to analyze
        k (int): Number of similar songs to return (default: 5)
        model_name (str): Name of the sentence transformer model to use
                         (default uses all-mpnet-base-v2 for its strong performance
                         on semantic similarity tasks). It must be the model the
                         index was built with.
        index_path (str): Path of the FAISS index file to search
        metadata_path (str): Path of the JSON metadata file holding the
                            'titles' and 'artists' lists aligned with the index

    Returns:
        List[Dict[str, Any]]: List of up to k dictionaries (fewer when the index
            holds fewer than k songs) containing:
            - Title: Song title
            - Artist: Artist name
            - Score: Similarity score (0-100) as a Python float
            Sorted by score in descending order. An empty list is returned
            (and the error printed) if any step fails.

    """
    try:
        if k < 1:
            raise ValueError("k must be a positive integer.")

        try:
            index = faiss.read_index(index_path)
            with open(metadata_path, 'r', encoding='utf-8') as f:
                metadata = json.load(f)
        except (OSError, RuntimeError, json.JSONDecodeError) as e:
            # faiss reports an unreadable index file as RuntimeError.
            raise RuntimeError(f"Failed to load index or metadata: {str(e)}") from e

        if index.ntotal == 0:
            return []

        embedder = LyricsEmbedding(model_name)

        lyrics = get_lyrics_from_youtube_url(youtube_url)
        if not lyrics:
            raise ValueError("Failed to extract lyrics from YouTube video.")

        embedding = embedder.embed(lyrics)

        if embedding.shape[-1] != index.d:
            raise ValueError(
                f"Embedding dimension {embedding.shape[-1]} does not match index "
                f"dimension {index.d}; was the index built with a different model?"
            )

        # Never ask for more neighbours than the index holds: FAISS pads the
        # result with label -1, which would otherwise index the LAST metadata row.
        top_k = min(k, index.ntotal)
        similarities, indices = index.search(embedding.reshape(1, -1), top_k)

        results = []
        for similarity, idx in zip(similarities[0], indices[0]):
            if idx < 0:
                continue
            # Map cosine similarity from [-1, 1] to [0, 100]; clamp to absorb
            # floating-point drift just outside that range.
            score = ((float(similarity) + 1) / 2) * 100
            score = min(100.0, max(0.0, score))
            results.append({
                "Title": metadata['titles'][int(idx)],
                "Artist": metadata['artists'][int(idx)],
                "Score": round(score, 1)
            })

        return results

    except Exception as e:
        print(f"Error in get_covers: {str(e)}")
        return []

  from tqdm.autonotebook import tqdm, trange


In [None]:
import kagglehub
import os

if __name__ == "__main__":
    # Build parameters, named so they are easy to find and tune.
    dataset_handle = "carlosgdcj/genius-song-lyrics-with-language-information"
    top_n_songs = 1000
    index_path = "lyrics_index.faiss"
    metadata_path = "lyrics_metadata.json"

    if os.path.exists(index_path) and os.path.exists(metadata_path):
        # Embedding the dataset is expensive, so reuse artifacts from an
        # earlier run when both are present.
        print(
            f"Found existing {index_path} and {metadata_path}; skipping rebuild. "
            "Delete them to force a fresh build."
        )
    else:
        # Load and preprocess the dataset
        dataset_directory = kagglehub.dataset_download(dataset_handle)
        csv_file_path = os.path.join(dataset_directory, 'song_lyrics.csv')
        top_views_df = load_and_preprocess_dataset(csv_file_path, top_n=top_n_songs)

        # Create and save the index
        create_and_save_index(
            top_views_df,
            index_path=index_path,
            metadata_path=metadata_path,
        )



Downloading from https://www.kaggle.com/api/v1/datasets/download/carlosgdcj/genius-song-lyrics-with-language-information?dataset_version_number=1...


100%|██████████| 3.04G/3.04G [00:52<00:00, 62.5MB/s]

Extracting files...



Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Video 1: fetch the k closest lyric matches for this clip and list them.
k = 5
youtube_url = 'https://www.youtube.com/watch?v=BDC8Jr-gp_4'
covers = get_covers(youtube_url, k)

print("Top matches:")
for cover in covers:
    print(f"{cover['Title']} by {cover['Artist']} (Score: {cover['Score']})")

Top matches:
Shape of You by Ed Sheeran (Score: 97.5)
Perfect by Ed Sheeran (Score: 83.8)
Luis Fonsi  Daddy Yankee - Despacito Remix ft. Justin Bieber English Translation by Genius English Translations (Score: 83.7)
Thinking Out Loud by Ed Sheeran (Score: 83.2)
Perfect Duet by Ed Sheeran & Beyonc (Score: 83.0)


In [None]:
# Video 2: fetch the k closest lyric matches for this clip and list them.
# k is set here so the cell does not depend on a value left over from
# whichever cell happened to run before it.
k = 5
youtube_url = 'https://www.youtube.com/watch?v=W_97b97G5ds'
covers = get_covers(youtube_url, k)

print("Top matches:")
for cover in covers:
    print(f"{cover['Title']} by {cover['Artist']} (Score: {cover['Score']})")

Top matches:
Believer by Imagine Dragons (Score: 97.8)
BTS - Magic Shop English Translation by Genius English Translations (Score: 83.6)
Bitch Dont Kill My Vibe by Kendrick Lamar (Score: 83.5)
​my tears ricochet by Taylor Swift (Score: 83.4)
I spoke to the devil in miami he said everything would be fine by XXXTENTACION (Score: 83.0)


In [None]:
# Video 3: fetch the k closest lyric matches for this clip and list them.
k = 5
youtube_url = 'https://www.youtube.com/watch?v=L53MZzuE0QY'
covers = get_covers(youtube_url, k)

print("Top matches:")
for cover in covers:
    print(f"{cover['Title']} by {cover['Artist']} (Score: {cover['Score']})")

Top matches:
Rap God by Eminem (Score: 87.1)
Duppy Freestyle by Drake (Score: 85.0)
Homicide by Logic (Score: 85.0)
Greatest by Eminem (Score: 84.6)
N.Y. State of Mind by Nas (Score: 84.3)


In [None]:
# Video 4: fetch the k closest lyric matches for this clip and list them.
k = 5
youtube_url = 'https://www.youtube.com/watch?v=9vmrPrYJPqI'
covers = get_covers(youtube_url, k)

print("Top matches:")
for cover in covers:
    print(f"{cover['Title']} by {cover['Artist']} (Score: {cover['Score']})")

Top matches:
Get Lucky by Daft Punk (Score: 93.7)
Circles by Post Malone (Score: 77.4)
All Star by Smash Mouth (Score: 76.5)
The Greatest Show by Hugh Jackman, Keala Settle, Zac Efron, Zendaya & The Greatest Showman Ensemble (Score: 76.3)
Rewrite the Stars by Zac Efron (Score: 76.2)


In [None]:
# Video 5: fetch the k closest lyric matches for this clip and list them.
k = 5
youtube_url = 'https://www.youtube.com/watch?v=R6ATpAr7rQU'
covers = get_covers(youtube_url, k)

print("Top matches:")
for cover in covers:
    print(f"{cover['Title']} by {cover['Artist']} (Score: {cover['Score']})")

Top matches:
Get Lucky by Daft Punk (Score: 86.1)
MONEY LONG by kizaru (Score: 80.0)
One Dance by Drake (Score: 76.7)
All Star by Smash Mouth (Score: 76.5)
​through the late night by Travis Scott (Score: 76.1)


In [None]:
# Video 6: fetch the k closest lyric matches for this clip and list them.
# k is set here so the cell does not depend on a value left over from
# whichever cell happened to run before it.
k = 5
youtube_url = 'https://www.youtube.com/watch?v=RmtP8X4ZErs'
covers = get_covers(youtube_url, k)

print("Top matches:")
for cover in covers:
    print(f"{cover['Title']} by {cover['Artist']} (Score: {cover['Score']})")

Top matches:
Bohemian Rhapsody by Queen (Score: 93.6)
Sing About Me Im Dying of Thirst by Kendrick Lamar (Score: 80.4)
Ride by twenty one pilots (Score: 79.6)
Pink  White by Frank Ocean (Score: 79.4)
When Im Gone by Eminem (Score: 78.8)


In [None]:
# Video 7: fetch the k closest lyric matches for this clip and list them.
k = 5
youtube_url = 'https://www.youtube.com/watch?v=DfMnRP0pk3A'
covers = get_covers(youtube_url, k)

print("Top matches:")
for cover in covers:
    print(f"{cover['Title']} by {cover['Artist']} (Score: {cover['Score']})")

Top matches:
The Hills by The Weeknd (Score: 92.8)
Star Shopping by Lil Peep (Score: 86.6)
PRBLMS by 6LACK (Score: 86.1)
Bad Things by Machine Gun Kelly & Camila Cabello (Score: 85.9)
I Feel It Coming by The Weeknd (Score: 85.6)


In [None]:
# Video 8: fetch the k closest lyric matches for this clip and list them.
k = 5
youtube_url = 'https://www.youtube.com/watch?v=1BVP72VrGQs'
covers = get_covers(youtube_url, k)

print("Top matches:")
for cover in covers:
    print(f"{cover['Title']} by {cover['Artist']} (Score: {cover['Score']})")

Top matches:
Amorfoda by Bad Bunny (Score: 83.5)
Tuyo by Rodrigo Amarante (Score: 83.4)
Despacito by Luis Fonsi (Score: 82.9)
Tú No Metes Cabra by Bad Bunny (Score: 79.4)
Mi Gente by J Balvin & Willy William (Score: 78.3)
