In [1]:
# ------------------ Import libraries ------------------
import pandas as pd
import nltk
from nltk.stem.porter import PorterStemmer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import numpy as np
from textblob import TextBlob

In [2]:
# Load dataset
df = pd.read_csv("spotify_millsongdata.csv")
df = df.sample(5000).drop('link', axis=1).reset_index(drop=True)

In [5]:
import nltk

nltk.download('punkt')
nltk.download('punkt_tab')  # <-- NEW in NLTK >= 3.9

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nitis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\nitis\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [7]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

# Verify tokenizer works
sample = "This is a test sentence."
tokens = word_tokenize(sample)
print(tokens)  # Should print: ['This', 'is', 'a', 'test', 'sentence', '.']

# Preprocessing function
stemmer = PorterStemmer()
def tokenization(txt):
    tokens = word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)

# Apply to your dataframe
df['text'] = df['text'].str.lower().replace(r'^\w\s', ' ').replace(r'\n', ' ', regex=True)
df['text'] = df['text'].apply(lambda x: tokenization(x))

['This', 'is', 'a', 'test', 'sentence', '.']


In [8]:
# ------------------ Generate Lyrics Embeddings ------------------
model = SentenceTransformer('all-MiniLM-L6-v2')
lyrics_embeddings = model.encode(df['text'].tolist(), convert_to_numpy=True)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [9]:
# ------------------ Precompute Context Columns ------------------

# ---- Mood Detection ----
def detect_mood(text):
    text = text.lower()
    blob = TextBlob(text)
    polarity = blob.sentiment.polarity
    # Energetic keywords
    energetic_keywords = ["dance", "move", "jump", "run", "energy", "party"]
    if any(word in text for word in energetic_keywords):
        return "Energetic"
    elif polarity > 0.3:
        return "Happy"
    elif polarity < -0.2:
        return "Sad"
    else:
        return "Chill"

df['mood'] = df['text'].apply(detect_mood)

In [10]:
# ---- Activity Detection ----
def detect_activity(text):
    text = text.lower()
    if any(w in text for w in ["dance", "move", "run", "jump", "energy"]):
        return "Workout"
    if any(w in text for w in ["study", "calm", "focus", "read", "think"]):
        return "Study"
    if any(w in text for w in ["party", "club", "celebrate", "night"]):
        return "Party"
    if any(w in text for w in ["relax", "chill", "slow", "peace"]):
        return "Relax"
    return "Any"

df['activity'] = df['text'].apply(detect_activity)

In [11]:
# ---- Genre Detection ----
def detect_genre(text):
    text = text.lower()
    genres = ["pop", "rock", "hip-hop", "jazz", "classical", "electronic", "country"]
    for g in genres:
        if g in text:
            return g.capitalize()
    return "Any"

df['genre'] = df['text'].apply(detect_genre)

In [12]:
# ------------------ Build Similarity Matrix ------------------
similarity_matrix = cosine_similarity(lyrics_embeddings)

In [13]:
# ------------------ Recommendation Function ------------------
def recommend_with_context(input_song, df, similarity_matrix, top_n=5, mood="Any", activity="Any", genre="Any"):
    try:
        idx = df[df['song'] == input_song].index[0]
    except IndexError:
        print(f"Song '{input_song}' not found in the dataset.")
        return []

    sim_scores = similarity_matrix[idx]
    top_indices = np.argsort(sim_scores)[::-1][1:50]  # top 50 candidates

    recommendations = []
    for i in top_indices:
        song_name = df.iloc[i].song

        # Use precomputed columns
        if mood != "Any" and df.iloc[i]['mood'] != mood:
            continue
        if activity != "Any" and df.iloc[i]['activity'] != activity:
            continue
        if genre != "Any" and df.iloc[i]['genre'] != genre:
            continue

        recommendations.append(song_name)
        if len(recommendations) == top_n:
            break

    # Fallback if not enough matches
    if len(recommendations) < top_n:
        for i in top_indices:
            song_name = df.iloc[i].song
            if song_name not in recommendations:
                recommendations.append(song_name)
            if len(recommendations) == top_n:
                break

    return recommendations

In [15]:
# ------------------ Test the Recommendation ------------------
selected_song = 'Hey Jude'
recommended_songs = recommend_with_context(
    selected_song, df, similarity_matrix, top_n=5, mood="Any", activity="Any", genre="Any"
)

print(f"Recommendations for '{selected_song}':")
print(recommended_songs)

Recommendations for 'Hey Jude':
['Another Nail In My Heart', 'You Got It', 'Honey', 'Four Letter Word', 'Hurts So Good']


In [16]:
# ------------------ Save Data for app.py ------------------
with open("df.pkl", "wb") as f:
    pickle.dump(df, f)

with open("similarity.pkl", "wb") as f:
    pickle.dump(similarity_matrix, f)

print("Pickle files saved: df.pkl and similarity.pkl")

Pickle files saved: df.pkl and similarity.pkl
