In [1]:
! pip install -r requirements.txt



In [19]:
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_ollama import OllamaEmbeddings, OllamaLLM
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import JSONLoader
from langchain.schema import Document

import pandas as pd
import re

In [3]:
def metadata_func(record: dict, metadata: dict) -> dict:
    """Copy the song title and id from a JSON record into the metadata dict.

    Args:
        record: Mapping for one song; read with ``.get`` so missing keys
            yield ``None`` instead of raising.
        metadata: Metadata dict to enrich; it is modified in place.

    Returns:
        The same ``metadata`` object, now holding ``"song"`` and ``"song_id"``.
    """
    for field in ("song", "song_id"):
        metadata[field] = record.get(field)
    return metadata

In [4]:
# Ollama-backed models: an LLM (temperature 0) used to tag songs and an
# embedding model used to build the vector index. Both use "llama3.1".
model = OllamaLLM(temperature=0.0, model="llama3.1")
embedder = OllamaEmbeddings(model="llama3.1")

# Loader for the songs file: the jq schema ".[]" iterates over the entries of
# the top-level JSON value, the "lyrics" field becomes the document text, and
# metadata_func attaches the song title and id.
loader = JSONLoader(
    metadata_func=metadata_func,
    content_key="lyrics",
    jq_schema=".[]",
    file_path="./data/songs.json",
)

In [5]:
docs = loader.load()
faiss_db_path = "data/test_DB"

# Sanity check: print the text and every metadata field of the first document.
print(f"Page content is: {docs[0].page_content}")
print("Metadata:")
for metadata_pair in docs[0].metadata.items():
    print(*metadata_pair)


Page content is: Better than home Lyrics I can feel my body breathing I can feel my heart is moving fast I am not afraid or lonely I am not chasing the ghosts of the past I have found the place where hunger Meets the edge and now I'm facing God I won't dare look into his eyes I can only hang my head to the ground When I try and open my mouth There's so many words but there's no sound And the angel comes upon me And the love I feel's a love I've never known And it's better than home Better than home Out on this long and winding road Chasing the sound with my friends And we ain't never rolling back again And who knows what we're gonna find Knows what we're gonna see Knows how we're gonna change Knows what we're gonna be Baby it's just you and me So keep driving on Cause this is better than home Everything inside me is stirring In every corner of every secret wound In the light I am adoring Feel like flying in circles around the moon When I gaze upon the world I know that I'm a part of so

In [6]:
# Cap how many songs go through the tagging step (generate_vector_df invokes
# the model twice per song). Set MAX_SONGS to None to process every song.
MAX_SONGS = 25
truncated_docs = docs[:MAX_SONGS]

In [7]:
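# NOTE(review): disabled alternative that would index the raw lyrics directly.
# The index used later in this notebook is built by get_vector_db instead.
# vector_db = FAISS.from_documents(truncated_docs, embedder)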

In [None]:
def clean_text(text):
    """Normalise text: lowercase it, delete punctuation, trim both ends.

    Every character that is neither a word character (letters, digits,
    underscore) nor whitespace is removed outright, so ``"R&B"`` becomes
    ``"rb"``. Whitespace inside the text is left as it is; only leading and
    trailing whitespace is stripped.
    """
    without_punctuation = re.sub(r"[^\w\s]", "", text.lower())
    return without_punctuation.strip()


def get_summary(page_content):
    """Ask the notebook-level ``model`` for a five-word summary of the lyrics.

    Args:
        page_content: Lyrics text, sent to the model as the human message.

    Returns:
        The model's reply after normalisation by ``clean_text``.
    """
    prompt = [
        SystemMessage(content="Summarise the lyrics of this song in a list of 5 words in this format: Sad, Regret, Betrayal. The output should only contain the five words"),
        HumanMessage(content=page_content),
    ]
    return clean_text(model.invoke(prompt))


def get_genres(page_content):
    """Ask the notebook-level ``model`` for up to five genres of the song.

    Args:
        page_content: Lyrics text, sent to the model as the human message.

    Returns:
        The model's reply, stripped and then normalised by ``clean_text``.
    """
    prompt = [
        SystemMessage(content="Extract a list of up to 5 genres of this song in this format: Pop, Rock, Folk. The output should only contain the five words"),
        HumanMessage(content=page_content),
    ]
    raw_genres = model.invoke(prompt)
    return clean_text(raw_genres.strip())


def update_entry(song_doc, summary, genres, df_dict):
    """Record a song's tags on the document and in the column accumulator.

    Args:
        song_doc: Object with a ``metadata`` dict; it is updated in place with
            ``summary``, ``genres`` and ``song_id`` (``None`` if it had no id).
        summary: Summary text for the song.
        genres: Genre text for the song.
        df_dict: Dict of lists keyed ``"song_id"``, ``"summary"`` and
            ``"genres"``; one value is appended to each list.

    Returns:
        None. Both ``song_doc.metadata`` and ``df_dict`` are mutated.
    """
    song_id = song_doc.metadata.get("song_id")

    song_doc.metadata.update(summary=summary, genres=genres, song_id=song_id)

    new_values = (("song_id", song_id), ("summary", summary), ("genres", genres))
    for column, value in new_values:
        df_dict[column].append(value)


def generate_vector_df(docs):
    """Tag every song with a summary and genres and collect them in a DataFrame.

    For each document, ``get_summary`` and ``get_genres`` are called on its
    ``page_content`` and ``update_entry`` stores the results both in the
    document's metadata and in the output columns. A progress line with the
    position and song id is printed after each document.

    Args:
        docs: Iterable of documents exposing ``page_content`` and a
            ``metadata`` dict.

    Returns:
        pandas.DataFrame with one row per document and the columns
        ``song_id``, ``summary`` and ``genres``.
    """
    df_dict = {
        "song_id": [],
        "summary": [],
        "genres": [],
    }

    for idx, doc in enumerate(docs):
        update_entry(
            doc,
            summary=get_summary(doc.page_content),
            genres=get_genres(doc.page_content),
            df_dict=df_dict,
        )

        # Fixed: this was `doc.metadadta.song_id`, a misspelled attribute that
        # raised AttributeError; metadata is a dict, so look the id up by key.
        print(idx, doc.metadata.get("song_id"), sep="\t")

    return pd.DataFrame(df_dict)
    

In [None]:
# Tag the truncated song list and collect the results in a DataFrame.
songs_df = generate_vector_df(docs=truncated_docs)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24


Unnamed: 0,song_id,summary,genres
0,spotify:track:1NbzT9otocflZ9d8yvA54D,freedom joy acceptance love escape,pop rock folk country americana
1,spotify:track:1crTf4RvraCjcPoqsaT9hd,anger betrayal confusion desperation madness,hip hop rap gangsta aggressive angry
2,spotify:track:4AGkBc4BiGLQNBdTUmUeF3,pain longing heartache suffering despair,pop country folk ballad acoustic
3,spotify:track:4jtIeveEJPs8SxuVa3FV21,regret betrayal frustration disappointment regret,hip hop rap pop rb gangsta
4,spotify:track:5Dbr7Fy5VhSTXRbt2h3EYF,sad regret betrayal heartache longing,pop rock soul blues gospel
5,spotify:track:6QhsQvMgEuqJE6z7re1ijE,deceit betrayal sadness regret deception,rock blues country folk americana
6,spotify:track:2UuJOEfAUXcD3dYb7eEEZu,sad regret betrayal isolation desperation,hip hop rap alternative rock pop
7,spotify:track:1z5E84ZZQP5o0k00oqpt13,loneliness regret betrayal sadness isolation,indie alternative rock pop britpop
8,spotify:track:2A50XhjzbW2JApFXochIDl,loss guilt redemption faith salvation,rock gospel soul blues funk
9,spotify:track:1GUj8To0eE4iV0qeZGNH6G,mistakes regret betrayal pain suffering,pop rock blues soul acoustic


In [24]:
def generate_embeddings(songs_df, embedder):
    """Add ``combined_text`` and ``embedding`` columns to ``songs_df`` in place.

    Args:
        songs_df: DataFrame with ``summary`` and ``genres`` columns; mutated.
        embedder: Object whose ``embed_documents(list_of_texts)`` returns one
            embedding per text.

    Returns:
        None. The results are written to the DataFrame that was passed in.
    """
    # Text that gets embedded: summary and genres joined by a single space.
    combined = songs_df["summary"] + " " + songs_df["genres"]
    songs_df["combined_text"] = combined

    # One batched call for all rows; each embedding is stored in its row.
    songs_df["embedding"] = embedder.embed_documents(combined.tolist())


def get_vector_db(songs_df, embedder, index_path="faiss_index"):
    """Build a FAISS index over each song's summary and genres and save it.

    Args:
        songs_df: DataFrame with ``song_id``, ``summary`` and ``genres``
            columns. A ``combined_text`` column (summary + " " + genres) is
            added to it in place, as before.
        embedder: Embedding model handed to ``FAISS.from_documents``.
        index_path: Folder passed to ``save_local``. Defaults to
            ``"faiss_index"``, the location that was previously hard-coded.

    Returns:
        The FAISS vector store; each indexed document carries its
        ``song_id`` in the metadata.
    """
    songs_df["combined_text"] = songs_df["summary"] + " " + songs_df["genres"]

    # Pair the two columns that are needed directly instead of materialising
    # a Series per row with iterrows().
    song_docs = [
        Document(page_content=text, metadata={"song_id": song_id})
        for text, song_id in zip(songs_df["combined_text"], songs_df["song_id"])
    ]

    vector_db = FAISS.from_documents(song_docs, embedder)
    vector_db.save_local(index_path)

    return vector_db


def query(vector_db, k=5):
    """Prompt for theme words and print the ids of the most similar songs.

    The earlier version also filtered the notebook-global ``songs_df`` into a
    variable that was never used. That made the function fail with NameError
    on a fresh kernel, so the lookup is removed.

    Args:
        vector_db: Vector store exposing ``similarity_search(text, k=...)``
            whose results carry ``metadata["song_id"]``.
        k: Number of neighbours to retrieve. Defaults to 5, the value that
            was previously hard-coded.

    Returns:
        None. The echoed query and the list of song ids are printed.
    """
    words = input("gimme 3 words: ")
    print(f"You asked for songs with themes like: {words}")

    results = vector_db.similarity_search(words, k=k)
    retrieved_song_ids = [doc.metadata["song_id"] for doc in results]

    print(retrieved_song_ids)


In [25]:
# Build the FAISS index from the tagged songs; get_vector_db also saves it to disk.
vector_db = get_vector_db(songs_df, embedder)

In [26]:
# Prompt for theme words and print the ids of the closest songs.
query(vector_db)

You asked for songs with themes like: jpop sadness 
['spotify:track:6QhsQvMgEuqJE6z7re1ijE', 'spotify:track:4AGkBc4BiGLQNBdTUmUeF3', 'spotify:track:1OqDvYVDhJyFZny7XlfIyZ', 'spotify:track:1z5E84ZZQP5o0k00oqpt13', 'spotify:track:2A50XhjzbW2JApFXochIDl']
