In [None]:
# Optional Google Colab setup: uncomment the lines below to install faiss-cpu there.
# in_google_colab = 1
# if in_google_colab:
#   !pip install faiss-cpu

In [1]:
from sentence_transformers import SentenceTransformer
import faiss

import pandas as pd
import numpy as np
from pathlib import Path
import tomllib

## Step 0 - Load Data

In [2]:
# Location of the lyrics dataset, relative to the notebook's working directory
data_path = Path("data") / "songs.csv"

In [3]:
# Load the song table from the CSV file into a DataFrame
lyrics = pd.read_csv(data_path)

In [4]:
# Preview the first rows to check the columns (Artist, Title, Lyrics)
lyrics.head()

Unnamed: 0,Artist,Title,Lyrics
0,Taylor Swift,cardigan,"Vintage tee, brand new phone\nHigh heels on co..."
1,Taylor Swift,exile,"I can see you standing, honey\nWith his arms a..."
2,Taylor Swift,Lover,We could leave the Christmas lights up 'til Ja...
3,Taylor Swift,the 1,"I'm doing good, I'm on some new shit\nBeen say..."
4,Taylor Swift,Look What You Made Me Do,I don't like your little games\nDon't like you...


In [5]:
# (number of songs, number of columns)
lyrics.shape

(745, 3)

In [6]:
# Distinct artists present in the dataset
lyrics['Artist'].unique()

array(['Taylor Swift', 'Billie Eilish', 'The Beatles', 'David Bowie',
       'Billy Joel', 'Ed Sheeran', 'Eric Clapton', 'Bruce Springsteen',
       'Vance Joy', 'Lana Del Rey', 'Bryan Adams', 'Leonard Cohen',
       'Nat King Cole', 'twenty one pilots', 'Ray LaMontagne',
       'Bob Dylan', 'John Denver', 'Frank Sinatra', 'Queen', 'Elton John',
       'George Michael'], dtype=object)

## Step 0.5 - Chunking

* By paragraph? By line? By song?
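* Check the embedding model's token limit (`model.max_seq_length`): text beyond the limit is truncated, so a long song may not be fully represented by its embedding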
* etc

## Step 1 - Create Embeddings

In [7]:
# Candidate embedding models; the last entry is the one used in this notebook.
# NOTE(review): 'nomic-embed-text' may need a full Hub id before it can be
# loaded with SentenceTransformer — confirm before switching to it.
models_possible = ['nomic-embed-text', 'all-mpnet-base-v2']

# Pick the second candidate and load it
model_name = models_possible[-1]
model = SentenceTransformer(model_name)

In [8]:
# Embed every song's full lyrics; row i of `embeddings` belongs to row i of `lyrics`.
# encode() is documented to take a str or a list of str. A pandas Series is
# indexed by label rather than by position, so passing it directly is only safe
# while the index is the default 0..n-1 range. A plain list avoids that trap.
embeddings = model.encode(lyrics['Lyrics'].tolist())

In [9]:
# encode() hands back a NumPy array, which is what FAISS consumes later on
type(embeddings)

numpy.ndarray

In [10]:
# (number of songs, embedding dimension)
embeddings.shape

(745, 768)

In [11]:
# Pairwise similarity scores between the first five songs (5x5 matrix);
# the diagonal compares each song with itself.
model.similarity(embeddings[0:5], embeddings[0:5])

tensor([[1.0000, 0.4468, 0.6080, 0.5934, 0.5363],
        [0.4468, 1.0000, 0.5352, 0.5242, 0.6023],
        [0.6080, 0.5352, 1.0000, 0.5728, 0.4673],
        [0.5934, 0.5242, 0.5728, 1.0000, 0.5427],
        [0.5363, 0.6023, 0.4673, 0.5427, 1.0000]])

## Step 2 - Visualization for Intuition

* Heatmap
* Dimensionality reduction (e.g. PCA, t-SNE or UMAP) to plot the embeddings in 2-D

## Step 3 - Create Index

In [12]:
# The index must be created with the dimension of our embeddings.
# Read it from the array's shape rather than from the first row, so this also
# works when `embeddings` has zero rows (embeddings[0] would raise IndexError).
d_emb = embeddings.shape[1]

In [13]:
# Build a flat L2 FAISS index for vectors of dimension d_emb, then report
# its training flag and the number of vectors it currently holds.
faiss_index = faiss.IndexFlatL2(d_emb)
print(faiss_index.is_trained, faiss_index.ntotal, sep="\n")

True
0


## Step 4 - Add embeddings to index

In [14]:
# Empty the index first so that re-running this cell does not add the same
# vectors a second time (duplicates would show up as repeated search hits).
faiss_index.reset()
faiss_index.add(embeddings)

In [15]:
# Index state after insertion: training flag on the first line, vector count on the second
print(faiss_index.is_trained, faiss_index.ntotal, sep="\n")

True
745


## Step 5 - Search

In [16]:
# Two free-text queries with opposite moods; they are embedded in the next cell
query_1 = (
    "Life is good and I will survive. "
    "I am happy that things turned out this way"
)
query_2 = (
    "Why did you leave me? "
    "I am so sad. "
    "The world is so cruel."
)

In [17]:
# Embed each query on its own with the same model used for the lyrics
qemb_1, qemb_2 = (model.encode(query_text) for query_text in (query_1, query_2))

Let's find the 4 closest songs to each query.

**Note**

FAISS expects a 2-D NumPy array (one row per query) as input to its search. If we only have one query, we must reshape it into a single-row array. In our case, since we have two queries, we can simply stack them together:

In [18]:
# Stack the two query vectors as rows of a single 2-D array (axis 0 is the default)
qembs = np.stack((qemb_1, qemb_2))

In [19]:
# (number of queries, embedding dimension)
qembs.shape

(2, 768)

In [20]:
# Number of nearest songs to retrieve for each query
k = 4
# search() returns two arrays with one row per query and k columns:
# the distances to the matches and the positions of the matches in the index.
distances_q_matched, indices_q_matched = faiss_index.search(qembs, k)

In [21]:
# Distances from the L2 index, one row per query; smaller means closer
distances_q_matched

array([[1.1402757, 1.1712399, 1.3248088, 1.3391268],
       [1.1180034, 1.1372833, 1.164562 , 1.1772972]], dtype=float32)

In [22]:
# Positions of the matched vectors in the index (the order in which embeddings were added)
indices_q_matched

array([[294, 608, 566, 621],
       [ 76, 682,  82, 657]])

In [25]:
# Let's look at the matched songs for the first query.
# FAISS fills a result slot with -1 when it finds fewer than k neighbours;
# skip those, otherwise .iloc[-1] would silently print the last song instead.
for i in indices_q_matched[0]:
    if i < 0:
        continue
    # One positional row lookup instead of three separate column lookups
    song = lyrics.iloc[i]
    artist, title, song_lyrics = song['Artist'], song['Title'], song['Lyrics']
    print(f"Matched song to your query \'{query_1}\':\nArtist: {artist}\nTitle: {title}\nLyrics:{song_lyrics}")

Matched song to your query 'Life is good and I will survive. I am happy that things turned out this way':
Artist: Eric Clapton
Title: I’ll Be Alright
Lyrics:I'll be alright
I'll be alright
I'll be alright someday
If in my heart
I do not give
Then I'll be alright someday
I'll overcome
I'll overcome
I'll overcome someday
If in my heart
I do not give
Then I'll overcome someday
I'm going home
I'm going home
I'm going home someday
If in my heart
I do not give
Then I’ll be going home someday
Lord I’ll be going home someday
Matched song to your query 'Life is good and I will survive. I am happy that things turned out this way':
Artist: John Denver
Title: Poems, Prayers, & Promises
Lyrics:I've been lately thinking
About my life's time
All the things I've done
And how it's been
And I can't help believing
In my own mind
I know I'm gonna hate to see it end

I've seen a lot of sunshine
Slept out in the rain
Spent a night or two all on my own
I've known my lady's pleasures
Had myself some friends
A

In [26]:
# Let's look at the matched songs for the second query.
# FAISS fills a result slot with -1 when it finds fewer than k neighbours;
# skip those, otherwise .iloc[-1] would silently print the last song instead.
for i in indices_q_matched[1]:
    if i < 0:
        continue
    # One positional row lookup instead of three separate column lookups
    song = lyrics.iloc[i]
    artist, title, song_lyrics = song['Artist'], song['Title'], song['Lyrics']
    print(f"Matched song to your query \'{query_2}\':\nArtist: {artist}\nTitle: {title}\nLyrics:{song_lyrics}")

Matched song to your query 'Why did you leave me? I am so sad. The world is so cruel.':
Artist: Billie Eilish
Title: ​bitches broken hearts
Lyrics:You can pretend you don't miss me (Me)
You can pretend you don't care
All you wanna do is kiss me (Me)
Oh, what a shame, I'm not there
You can pretend you don't miss me (Me)
You can pretend you don't care
All you wanna do is kiss me (Me)
Oh , what a shame, I'm not there

What is it you want?
You can lie, but I know that you're not fine
Every time you talk
It's all 'bout me, but you swear I'm not on your mind

You can pretend you don't miss me (Me)
You can pretend you don't care
All you wanna do is kiss me (Me)
Oh, what a shame, I'm not there

Everybody knows
You and I are suicide and stolen art (Art)
Pretty mama sews
Stitches into all your bitches' broken hearts

You can pretend you don't miss me (Me)
You can pretend you don't care
All you wanna do is kiss me (Me)
Oh, what a shame, I'm not there

Somebody new is gonna comfort you
Like you wa

## Step 6 - Save index on disk

## Step 7 - Add results to a system prompt