In [2]:
import pandas as pd

# Reading dataset
songgs = pd.read_csv('ds2.csv')

In [3]:
# Printing first 10 songs
songgs.head(10)

Unnamed: 0,title,tag,artist,year,views,features,lyrics,id
0,Killa Cam,rap,Cam'ron,2004,173166,"{""Cam\\'ron"",""Opera Steve""}","[Chorus: Opera Steve & Cam'ron]\nKilla Cam, Ki...",1
1,Can I Live,rap,JAY-Z,1996,468624,{},"[Produced by Irv Gotti]\n\n[Intro]\nYeah, hah,...",3
2,Forgive Me Father,rap,Fabolous,2003,4743,{},Maybe cause I'm eatin\nAnd these bastards fien...,4
3,Down and Out,rap,Cam'ron,2004,144404,"{""Cam\\'ron"",""Kanye West"",""Syleena Johnson""}",[Produced by Kanye West and Brian Miller]\n\n[...,5
4,Fly In,rap,Lil Wayne,2005,78271,{},"[Intro]\nSo they ask me\n""Young boy\nWhat you ...",6
5,Lollipop Remix,rap,Lil Wayne,2008,580832,"{""Kanye West"",""Static Major""}",[Intro: Lil Wayne]\nHaha\nUh-huh\nNo homo (You...,7
6,Im Not You,rap,Clipse,2002,28645,"{Jadakiss,""Styles P"",""Roscoe P. Coldchain""}","[Intro: Pusha T]\nNo, no, no!\nI told you, I l...",8
7,Family Ties,rap,Cam'ron,2004,41960,"{""Cam\\'ron"",""Lady Wray""}","[Verse 1: Cam'ron]\nKilla, Dipset\nMan I spit ...",9
8,Rockin and Rollin,rap,Cam'ron,1998,6399,"{""Cam\\'ron""}",[Verse 1]\nAy yo you wonder who I are\nI guzzl...,10
9,Lord You Know,rap,Cam'ron,2004,11882,"{""Cam\\'ron"",""Juelz Santana"",Jaheim}","[Chorus: Jaheim]\nNow Lord you know, just how ...",11


In [4]:
len(songgs) # number of songs in dataset

5913411

In [19]:
songs = songgs[songgs['views']>= 20000]  # extracting only popular songs (with view count greater than 20k)

In [20]:
import re

# Text cleansing function
def clean_text(text):
    if not isinstance(text, str):
        text = str(text)  # Convert to string if not already
    text = text.replace('\n', ' ')  # Replace newline characters with spaces
    text = re.sub(r'\[.*?\]', '', text)  # Remove annotations
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    return text

# Apply the cleaning function to the lyrics
songs['lyrics'] = songs['lyrics'].apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  songs['lyrics'] = songs['lyrics'].apply(clean_text)


In [72]:
songs[:10]['lyrics']  # verifying that cleansing was successful

0      killa cam killa cam cam killa cam killa cam k...
1        yeah hah yeah rocafella we invite you to so...
3        ugh killa baby kanye this that s heron flow...
4      so they ask me young boy what you gon do the ...
5      haha uhhuh no homo young mula baby i say hes ...
6      no no no i told you i lived this shit i aint ...
7      killa dipset man i spit that pimp talk you ha...
10     yeah money on my mind money on my mind yeah m...
12       dear god  i wonder can you save me illumina...
13       shawn carter was born december th weighing ...
Name: lyrics, dtype: object

In [71]:
# Summarization of the song by its id
from transformers import pipeline

summarizer = pipeline("summarization", model="Falconsai/text_summarization")

# ARTICLE = songs.loc[1, 'lyrics']
ARTICLE = songs.loc[songs['id'] == 614, 'lyrics'].values[0]
# print(len(ARTICLE))
print(summarizer(ARTICLE, max_length=50, min_length=30, do_sample=False))


Token indices sequence length is longer than the specified maximum sequence length for this model (1063 > 512). Running this sequence through the model will result in indexing errors


[{'summary_text': 'i know i can be what i wanna be if i work hard at it ill be where i want to be . you can be anything in the world in god we trust an architect doctor maybe an actress'}]


In [56]:
from sentence_transformers import SentenceTransformer
import faiss

def find_similar_songs(songs_df, input_prompt, n=5):
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Drop rows with missing lyrics
    songs_df = songs_df.dropna(subset=['lyrics'])

    # Extract song lyrics and IDs
    lyrics_sentences = songs_df['lyrics'].tolist()
    song_ids = songs_df['id'].tolist()  # Assuming the DataFrame has a column 'id' for song IDs

    # Compute embeddings for song lyrics
    lyrics_embeddings = model.encode(lyrics_sentences, convert_to_tensor=True, batch_size=1000)

    # Compute embedding for the input prompt
    prompt_embedding = model.encode([input_prompt], convert_to_tensor=True, batch_size=1)

    # Convert embeddings to numpy arrays
    lyrics_embeddings_np = lyrics_embeddings.cpu().numpy()
    prompt_embedding_np = prompt_embedding.cpu().numpy()

    # Normalize embeddings to use cosine similarity
    faiss.normalize_L2(lyrics_embeddings_np)
    faiss.normalize_L2(prompt_embedding_np)

    # Use FAISS for efficient nearest neighbor search
    d = lyrics_embeddings_np.shape[1]
    index = faiss.IndexFlatIP(d)  # Index for inner product (cosine similarity)
    index.add(lyrics_embeddings_np)  # Add lyrics embeddings to the index

    # Perform search
    D, I = index.search(prompt_embedding_np, n)  # Search for top n nearest neighbors

    # Collect top similarities
    top_similarities = []
    for j in range(n):
        top_similarities.append((song_ids[I[0, j]], D[0, j]))

    # Create a DataFrame from the collected data
    result_df = pd.DataFrame(top_similarities, columns=['song_id', 'score'])

    return result_df

In [70]:
input_prompt = "I want to dance, I feel so energetic"
n = 3
sngs = songs[:500]  # taking only 500 songs due to computational time

similar_songs_df = find_similar_songs(sngs, input_prompt, n)
similar_songs = pd.merge(similar_songs_df, songs, how='left', left_on='song_id', right_on='id')
similar_songs

Unnamed: 0,song_id,score_x,title,tag,artist,year,views,features,lyrics,id,score_y
0,558,0.310672,Must Be the Ganja,rap,Eminem,2009,215816,{},yeah ohh yeah yeah ohh i feel like dancing i ...,558,0.009242
1,80,0.282379,We Fly High,rap,Jim Jones,2006,63337,{},i wear a mean dark pair of shades and you can...,80,0.002712
2,614,0.270811,I Can,rap,Nas,2003,336049,{},i know i can be what i wanna be if i work har...,614,0.014391
