<a href="https://colab.research.google.com/github/parimal-art/test_repo/blob/main/Task_5_Spotify.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import re
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fetch NLTK data packages at runtime:
#   - 'stopwords' backs the `stopwords.words('english')` lookup used when cleaning lyrics.
#   - 'punkt' is a tokenizer package. NOTE(review): no tokenizer call is visible in
#     this notebook, so this download may be unnecessary -- confirm before removing.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Read the lyrics dataset from the Colab working directory.
# The pure-Python parser is used, and rows it cannot parse are skipped
# (on_bad_lines='skip') instead of aborting the load.
df = pd.read_csv('/content/spotify_songs.csv', engine='python', on_bad_lines='skip')


In [None]:
# Lazily-built resources shared by every clean_text call (filled on first use).
_CLEAN_TEXT_RESOURCES = {}


def _get_clean_text_resources():
    """Return the (stopword set, stemmer) pair, building it only once."""
    if not _CLEAN_TEXT_RESOURCES:
        # A frozenset gives constant-time membership tests; the original list
        # was scanned linearly for every single word of every lyric.
        _CLEAN_TEXT_RESOURCES['stopwords'] = frozenset(stopwords.words('english'))
        _CLEAN_TEXT_RESOURCES['stemmer'] = SnowballStemmer('english')
    return _CLEAN_TEXT_RESOURCES['stopwords'], _CLEAN_TEXT_RESOURCES['stemmer']


def clean_text(text):
    """
    Normalize a lyric string: lowercase it, drop every character that is not a
    letter or whitespace, collapse whitespace, remove English stopwords and
    stem the remaining words with the Snowball stemmer.

    Parameters:
        text (str): raw text to clean.

    Returns:
        str: the stemmed, stopword-free words joined by single spaces.

    NOTE(review): a later cell defines another `clean_text`; when the notebook
    is run top to bottom that later definition replaces this one.
    """
    stopword_set, stemmer = _get_clean_text_resources()

    # Convert text to lowercase
    text = text.lower()

    # Remove any character that is not a letter or whitespace
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Replace multiple spaces with a single space and strip leading/trailing whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Drop stopwords and stem the surviving words in a single pass.
    # Punctuation is stripped before this step, so contractions such as
    # "don't" arrive here as "dont" and are compared in that form.
    words = [stemmer.stem(word) for word in text.split() if word not in stopword_set]

    # Rejoin the list of words into a single string
    return ' '.join(words)


In [None]:
import re

def clean_text(text):
    """
    Normalize a piece of text for matching.

    The text is lowercased, then URL-like tokens (starting with "http" or
    "www") are removed, every character other than a-z or whitespace is
    deleted, and runs of whitespace are collapsed to one space. Leading and
    trailing whitespace is trimmed from the result.
    """
    # Applied in order: URLs first (while their punctuation is still present),
    # then non-letters, then whitespace runs.
    substitutions = (
        (r'http\S+|www\S+', ''),
        (r'[^a-z\s]', ''),
        (r'\s+', ' '),
    )

    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()


In [None]:
# Blank out missing lyrics before the string cast: astype(str) alone would turn
# NaN into the literal word "nan", which would then be treated as real lyrics.
df['text'] = df['text'].fillna('').astype(str).apply(clean_text)

# Character count of each cleaned lyric
df['text_length'] = df['text'].str.len()

df.head()


Unnamed: 0,artist,song,link,text,text_length
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,look at her face its a wonderful face and it m...,700
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,take it easy with me please touch me gently li...,1275
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,ill never know why i had to go why i had to pu...,1366
3,ABBA,Bang,/a/abba/bang_20598415.html,making somebody happy is a question of give an...,1095
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,making somebody happy is a question of give an...,1111



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.



In [None]:
# Corpus for the vectorizer: one cleaned lyric string per song
texts = df['text'].tolist()

# Fit TF-IDF over unigrams and bigrams, keeping only terms that occur in
# at least two documents (min_df=2)
vectorizer = TfidfVectorizer(
    min_df=2,
    ngram_range=(1, 2),
)
tfidf_matrix = vectorizer.fit_transform(texts)


In [None]:
def find_best_match(query):
    """
    Find the song whose lyrics are most similar to a lyric snippet.

    The snippet is cleaned with `clean_text`, projected with the already
    fitted `vectorizer`, and compared against every row of `tfidf_matrix`
    using cosine similarity.

    Returns a tuple (best_index, best_score): the row position of the highest
    scoring song and its similarity score.
    """
    # The query must go through the same cleaning as the indexed lyrics
    query_vector = vectorizer.transform([clean_text(query)])

    # One query row -> take row 0 to get a flat vector of per-song scores
    scores = cosine_similarity(query_vector, tfidf_matrix)[0]

    # Position of the largest score (the first one if several tie)
    best_index = scores.argmax()
    return best_index, scores[best_index]


In [None]:
# Lyric fragment to look up (edit to try other songs)
query_lyric = "my kind of girl without her im blue and if she ever"

# Locate the closest song and pull its row from the dataset
best_index, score = find_best_match(query_lyric)
best_match = df.iloc[best_index]

# Report the match, one field per line; other columns of the row
# (e.g. 'link', 'text') can be added to the tuple below if wanted
print("Best Match:")
for label, value in (
    ("Song Row:", best_index),
    ("Artist:", best_match['artist']),
    ("Song:", best_match['song']),
    ("Similarity Score:", score),
):
    print(label, value)




Best Match:
Song Row: 0
Artist: ABBA
Song: Ahe's My Kind Of Girl
Similarity Score: 0.5179084534896136


In [None]:
def find_top_matches(query, top_n=3):
    """
    Rank songs by similarity to a lyric snippet and return the best few.

    The snippet is cleaned with `clean_text`, projected with the fitted
    `vectorizer`, and scored against every row of `tfidf_matrix` with cosine
    similarity.

    Returns a list of (song_index, similarity_score) tuples holding the
    `top_n` highest scoring songs, best first.
    """
    # The query must go through the same cleaning as the indexed lyrics
    query_vector = vectorizer.transform([clean_text(query)])

    # Flat vector with one similarity score per song
    scores = cosine_similarity(query_vector, tfidf_matrix)[0]

    # argsort is ascending, so reverse it to rank the highest scores first
    ranked = np.argsort(scores)[::-1]

    return [(song_idx, scores[song_idx]) for song_idx in ranked[:top_n]]

# Reuses `query_lyric` from the earlier cell; assign a new snippet here to
# search for something else, e.g.
# query_lyric = "sing us a song you're the piano man sing us a song tonight"

# Three highest-scoring songs for the snippet
top_matches = find_top_matches(query_lyric, top_n=3)

# Print each match as a blank-line-separated group of fields
print("Top Three Matches:")
for idx, score in top_matches:
    match = df.iloc[idx]
    fields = (
        ("\nSong Row:", idx),
        ("Artist:", match['artist']),
        ("Song:", match['song']),
        ("Similarity Score:", score),
    )
    for label, value in fields:
        print(label, value)


Top Three Matches:

Song Row: 594
Artist: Allman Brothers Band
Song: Straight From The Heart
Similarity Score: 0.32584314036930184

Song Row: 970
Artist: Backstreet Boys
Song: Everytime I Close My Eyes
Similarity Score: 0.2858571839109592

Song Row: 35380
Artist: Hanson
Song: Every Word I Say
Similarity Score: 0.23685543270765752
