In [6]:
from langchain_community.document_loaders import TextLoader #this will convert our descriptions into something langchain can understand
from langchain_text_splitters import CharacterTextSplitter # will basically divide the document created into individual descriptions
from langchain_huggingface import HuggingFaceEmbeddings # an open source model we will use to convert text into numerical vectors
from langchain_chroma import Chroma # a vector database to store our vectors

In [3]:
import pandas as pd

In [4]:
# Since textloader doesn't read pandas' dataframes, so we convert the column we are going to use to make the vector database into a text file
songs = pd.read_csv("dataset/final_track_list.csv")
songs["tagged_description"].to_csv("dataset/tagged_description.txt", index=False, header=False)

In [7]:
raw_doc = TextLoader("dataset/tagged_description.txt").load()
# initialize splitter
splitter = CharacterTextSplitter(chunk_size=0, chunk_overlap=0, separator="\n")
# chunk overlap and separator and their values are self-explanatory, for chunk size we set it to zero because we want to prioritize splitting on \n, if we set it to some non-zero size, maybe that size comprised of multiple lines, so splitter wouldn't care for \n.

#apply splitter on the document
doc = splitter.split_documents(raw_doc)

Created a chunk of size 308, which is longer than the specified 0
Created a chunk of size 298, which is longer than the specified 0
Created a chunk of size 337, which is longer than the specified 0
Created a chunk of size 316, which is longer than the specified 0
Created a chunk of size 337, which is longer than the specified 0
Created a chunk of size 314, which is longer than the specified 0
Created a chunk of size 301, which is longer than the specified 0
Created a chunk of size 263, which is longer than the specified 0
Created a chunk of size 289, which is longer than the specified 0
Created a chunk of size 289, which is longer than the specified 0
Created a chunk of size 258, which is longer than the specified 0
Created a chunk of size 281, which is longer than the specified 0
Created a chunk of size 261, which is longer than the specified 0
Created a chunk of size 278, which is longer than the specified 0
Created a chunk of size 272, which is longer than the specified 0
Created a 

In [8]:
# checking if splitting was done correctly
doc[0]

Document(metadata={'source': 'dataset/tagged_description.txt'}, page_content='"2ZzZ2qwZBWlDKs420hhloc A laid-back desi-pop groove with mellow rap verses and smooth vocals. Warm guitar licks and a head-nodding beat lend moderate danceability, while lyrics about cyclical love tug at the heart. Feels reflective and subtly upliftingâ€”perfect for late-night drives or chilled gatherings."')

In [9]:
embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [10]:
# Making the vector database
db_songs = Chroma.from_documents(doc,embedding=embedder)

This is, believe it or not pretty much it for the basic analyser

In [11]:
# Test
query = "An indie song about heartbreak"
response = db_songs.similarity_search(query, k = 5)

In [12]:
response

[Document(id='44428aa5-5242-4723-8214-7e513749c951', metadata={'source': 'dataset/tagged_description.txt'}, page_content='"5pMmWfuL0FTGshYt7HVJ8P Classic, dramatic pop. Emotional, catchy, and a little desperate, itâ€™s a song for singing along, dancing, and feeling every word of heartbreak."'),
 Document(id='1ad8f097-5c91-495c-b930-85fa7acd42ab', metadata={'source': 'dataset/tagged_description.txt'}, page_content='"3JvKfv6T31zO0ini8iNItO Heart-wrenching indie ballad. Raw, emotive vocals and piano build a storm of longing and loss. Perfect for cathartic crying, late-night reflection, or singing your pain out."'),
 Document(id='f7b1e52d-24b5-4ca7-883c-bd4314d97172', metadata={'source': 'dataset/tagged_description.txt'}, page_content='751srcHf5tUqcEa9pRCQwP Bittersweet alt-pop about emotional detachment. The upbeat melody contrasts aching lyricsâ€”like smiling through heartbreak. A car-ride cry-dance anthem.'),
 Document(id='3899abba-b5ec-4534-a4fd-271132b85173', metadata={'source': 'data

Alright now that this is working fine, we need to use the id we previously attached to descriptions to fetch the actual songs and return the songs instead.

In [15]:
songs[songs["id"] == response[0].page_content.split()[0].strip('"')]

"5pMmWfuL0FTGshYt7HVJ8P


Unnamed: 0,id,name,artist,album,release_date,duration_ms,genres,popularity,explicit,description,Unnamed: 10,tagged_description
553,5pMmWfuL0FTGshYt7HVJ8P,SOS,ABBA,ABBA Gold,2008-01-01,201360,,68,False,"Classic, dramatic pop. Emotional, catchy, and ...",,"5pMmWfuL0FTGshYt7HVJ8P Classic, dramatic pop. ..."


Now that we have configured fetching a song, we can make a function to fetch however many songs we need to fetch in a bundle.

In [22]:
def fetch_songs(query: str, size: int=10) -> pd.DataFrame:
    response = db_songs.similarity_search(query, k=size)
    result = []
    for i in range(0,len(response)):
        result+= [response[i].page_content.split()[0].strip('"')]
    return songs[songs["id"].isin(result)]

In [23]:
fetch_songs("mellow songs for a late night drive", 5)

Unnamed: 0,id,name,artist,album,release_date,duration_ms,genres,popularity,explicit,description,Unnamed: 10,tagged_description
101,5mCPDVBb16L4XQwDdbRUpz,Passionfruit,Drake,More Life,2017-03-18,298940,rap,87,True,"Mellow, moody R&B from Drake. Laid-back tropic...",,"5mCPDVBb16L4XQwDdbRUpz Mellow, moody R&B from ..."
294,0WQiDwKJclirSYG9v5tayI,There Is a Light That Never Goes Out - 2011 Re...,The Smiths,The Queen Is Dead,1986-06-16,244586,"jangle pop, madchester, new wave",84,False,"Melancholic, anthemic indie pop. Yearning, dra...",,"0WQiDwKJclirSYG9v5tayI Melancholic, anthemic i..."
319,3kxkjirben9RVm9NqYa6rm,Black Out Days - Future Islands Remix,"Phantogram, Future Islands",Black Out Days (Future Islands Remix),2017-09-01,248346,,72,False,"Brooding, synth-heavy indie pop. Dark, energet...",,"3kxkjirben9RVm9NqYa6rm Brooding, synth-heavy i..."
333,37oU0liybMY8aI9u6QMVM9,Inside of Love,Nada Surf,Let Go,2002,298533,power pop,52,False,"Melancholic, melodic indie rock. Tender, hones...",,"37oU0liybMY8aI9u6QMVM9 Melancholic, melodic in..."
893,3siwsiaEoU4Kuuc9WKMUy5,No One Noticed,The Marías,Submarine,2024-05-31,236906,bedroom pop,93,False,A dreamy track combining soft vocals with lush...,,3siwsiaEoU4Kuuc9WKMUy5 A dreamy track combinin...


Damn okay these responses are way better than what i expected. Spotify can learn a thing or two from me 