In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
import re

In [4]:
!pip install scikit-learn




In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
!pip install nltk




In [7]:
from sklearn.metrics.pairwise import cosine_similarity
import nltk

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Download the NLTK 'stopwords' corpus, from which the English stop-word set
# is built further down.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\milip\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
from nltk.corpus import stopwords


In [11]:
# Load the song dataset; the path is relative, so the CSV must be in the
# current working directory of the kernel.
df = pd.read_csv("Spotify Million Song Dataset_exported.csv")

In [12]:
# Keep only the three columns the rest of the notebook reads:
# 'artist' and 'song' are returned as results, 'text' is what gets cleaned.
df = df[['artist', 'song', 'text']]

In [13]:
# Drop rows with any missing value by rebinding instead of mutating in place:
# the cell stays idempotent on re-run and cannot raise SettingWithCopyWarning
# for modifying a frame that was derived from another one.  The index is
# renumbered so labels agree with the positional (iloc) lookups used later.
df = df.dropna().reset_index(drop=True)

In [14]:
# English stop words from NLTK, stored as a set for fast membership tests
# inside clean_text.
stop_words = set(stopwords.words('english'))

In [15]:
def clean_text(text, stopword_set=None):
    r"""Normalize raw lyrics into a space-separated string of content words.

    The text is lower-cased, every character other than ``a``-``z`` and
    whitespace is removed, the result is split on whitespace, and stop words
    are dropped.

    Whitespace such as newlines and tabs is kept as a word separator instead
    of being deleted, so words on adjacent lines (``"face\nAnd"``) are not
    fused into one token (``"faceand"``).

    Args:
        text: Raw lyrics or snippet string.
        stopword_set: Optional collection of words to discard.  When omitted,
            the module-level ``stop_words`` set is used.

    Returns:
        The remaining words joined by single spaces (possibly an empty string).
    """
    if stopword_set is None:
        stopword_set = stop_words
    # \s is preserved so line breaks and tabs still separate words at split().
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    return " ".join(w for w in letters_only.split() if w not in stopword_set)

In [16]:
# Run every lyric in 'text' through clean_text and store the result alongside
# the raw text in a new 'clean_lyrics' column.
df['clean_lyrics'] = df['text'].map(clean_text)

In [17]:
# Split the rows 90/10 with a fixed seed.
# NOTE(review): train_df and test_df are not referenced by any later cell
# shown here; the vectorizer and the accuracy check both operate on the full
# df, so the reported accuracy is not a held-out estimate.
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)

In [18]:
# Fit TF-IDF on every cleaned lyric in df and keep the resulting matrix.
# Rows follow the order of df['clean_lyrics'], so row i belongs to df.iloc[i].
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['clean_lyrics'])

In [19]:
def predict_song_snippet(snippet):
    """Find the song in ``df`` whose lyrics are most similar to a snippet.

    The snippet is cleaned with ``clean_text``, projected with the fitted
    ``vectorizer`` and compared against every row of ``tfidf_matrix`` using
    cosine similarity.

    Args:
        snippet: Free-text lyric fragment.

    Returns:
        ``(song, artist)`` of the highest-scoring row, or ``(None, None)``
        when no row has a positive similarity (for example an empty snippet,
        or one made only of stop words or words unknown to the vectorizer).
    """
    snippet_vec = vectorizer.transform([clean_text(snippet)])

    # One row of scores, one column per row of tfidf_matrix.
    similarity = cosine_similarity(snippet_vec, tfidf_matrix)
    index = similarity.argmax()

    # With no shared vocabulary every score is 0 and argmax falls back to
    # position 0; do not report that arbitrary first row as a match.
    if similarity[0, index] <= 0:
        return None, None

    # Single positional lookup; tfidf_matrix rows are in df's row order.
    row = df.iloc[index]
    return row['song'], row['artist']



In [20]:
# Interactive demo: read a snippet from the user and show the best match.
user_input = input("\nEnter a lyric snippet: ")
song, artist = predict_song_snippet(user_input)
print("\n Song Found!")
for label, value in (("Song:", song), ("Artist:", artist)):
    print(label, value)


Enter a lyric snippet:  Look at her face, it's a wonderful face   And it means something special to me   Look at the way that she smiles when she sees me   How lucky can one fellow be? 



 Song Found!
Song: Ahe's My Kind Of Girl
Artist: ABBA


In [23]:
def check_accuracy(samples=100, snippet_words=30, seed=None):
    """Estimate how often a snippet of a song's own lyrics retrieves that song.

    Rows of ``df`` are sampled without replacement.  For each sampled row the
    first ``snippet_words`` words of its ``clean_lyrics`` are passed to
    ``predict_song_snippet`` and the predicted title is compared with the
    row's own title.

    Note: the snippets are taken from ``df`` itself, the same frame that
    ``predict_song_snippet`` looks results up in, and only the song title is
    compared (the predicted artist is ignored).

    Args:
        samples: Number of rows to evaluate; capped at ``len(df)``.
        snippet_words: Number of leading cleaned words used as the query.
        seed: Optional seed for a reproducible sample.  When ``None`` the
            global NumPy random state is used.

    Returns:
        Fraction of evaluated rows whose title was predicted correctly, or
        ``0.0`` when there are no rows to evaluate.
    """
    # Sampling without replacement cannot return more rows than exist.
    samples = min(samples, len(df))
    if samples == 0:
        # Avoid dividing by zero when nothing is evaluated.
        return 0.0

    rng = np.random if seed is None else np.random.default_rng(seed)
    random_rows = rng.choice(len(df), size=samples, replace=False)

    correct = 0
    for i in random_rows:
        row = df.iloc[i]
        # Query with the first `snippet_words` cleaned words of the lyric.
        snippet = " ".join(row['clean_lyrics'].split()[:snippet_words])

        predicted_song, _ = predict_song_snippet(snippet)
        if predicted_song == row['song']:
            correct += 1

    return correct / samples


In [24]:
# Evaluate on 100 randomly chosen songs and report the hit rate as a percentage.
accuracy = check_accuracy(samples=100)
accuracy_pct = round(accuracy * 100, 2)
print(" Model Accuracy:", accuracy_pct, "%")


 Model Accuracy: 94.0 %
