In [44]:
import pandas as pd 
import numpy as np
import nltk 
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [45]:
# embedding categorical data
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec

In [46]:
# timing
from tqdm.auto import tqdm

In [47]:
# vector DB
import os
import kdbai_client as kdbai
from getpass import getpass
import time

In [48]:
pd.set_option("max_colwidth", 1000)

In [49]:
pd.options.mode.chained_assignment = None

## Helper functions


In [50]:
def show_df(df: pd.DataFrame) -> pd.DataFrame:
    print(df.shape)
    return df.head()
    
def show_embeddings(embeddings: np.array) -> list[int]:
    print("Num Embeddings:", len(embeddings))
    print("Embedding Size:", len(embeddings[0]))
    return list(embeddings[0])

## Load songs

In [51]:
df = pd.read_csv('song_data.csv')

In [52]:
show_df(df)

(170653, 19)


Unnamed: 0,id,name,artists,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,release_date,speechiness,tempo,valence,year
0,4BJqT0PrAfrxzMOxytFOIz,"Piano Concerto No. 3 in D Minor, Op. 30: III. Finale. Alla breve","['Sergei Rachmaninoff', 'James Levine', 'Berliner Philharmoniker']",0.982,0.279,831667,0.211,0,0.878,10,0.665,-20.096,1,4,1921,0.0366,80.954,0.0594,1921
1,7xPhfUan2yNtyFG0cUWkt8,Clancy Lowered the Boom,['Dennis Day'],0.732,0.819,180533,0.341,0,0.0,7,0.16,-12.441,1,5,1921,0.415,60.936,0.963,1921
2,1o6I8BglA6ylDMrIELygv1,Gati Bali,['KHP Kridhamardawa Karaton Ngayogyakarta Hadiningrat'],0.961,0.328,500062,0.166,0,0.913,3,0.101,-14.85,1,5,1921,0.0339,110.339,0.0394,1921
3,3ftBPsC5vPBKxYSee08FDH,Danny Boy,['Frank Parker'],0.967,0.275,210000,0.309,0,2.8e-05,5,0.381,-9.316,1,3,1921,0.0354,100.109,0.165,1921
4,4d6HGyGT8e121BsdKmw9v6,When Irish Eyes Are Smiling,['Phil Regan'],0.957,0.418,166693,0.193,0,2e-06,3,0.229,-10.096,1,2,1921,0.038,101.665,0.253,1921


## Pre-process data

In [53]:
df.head(2)

Unnamed: 0,id,name,artists,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,release_date,speechiness,tempo,valence,year
0,4BJqT0PrAfrxzMOxytFOIz,"Piano Concerto No. 3 in D Minor, Op. 30: III. Finale. Alla breve","['Sergei Rachmaninoff', 'James Levine', 'Berliner Philharmoniker']",0.982,0.279,831667,0.211,0,0.878,10,0.665,-20.096,1,4,1921,0.0366,80.954,0.0594,1921
1,7xPhfUan2yNtyFG0cUWkt8,Clancy Lowered the Boom,['Dennis Day'],0.732,0.819,180533,0.341,0,0.0,7,0.16,-12.441,1,5,1921,0.415,60.936,0.963,1921


In [54]:
# add "song_" prefix to col names
song_df = df.add_prefix("song_")

In [55]:
song_df.head(1)

Unnamed: 0,song_id,song_name,song_artists,song_acousticness,song_danceability,song_duration_ms,song_energy,song_explicit,song_instrumentalness,song_key,song_liveness,song_loudness,song_mode,song_popularity,song_release_date,song_speechiness,song_tempo,song_valence,song_year
0,4BJqT0PrAfrxzMOxytFOIz,"Piano Concerto No. 3 in D Minor, Op. 30: III. Finale. Alla breve","['Sergei Rachmaninoff', 'James Levine', 'Berliner Philharmoniker']",0.982,0.279,831667,0.211,0,0.878,10,0.665,-20.096,1,4,1921,0.0366,80.954,0.0594,1921


In [56]:
# drop unused cols
song_df = song_df.drop(columns=["song_id", "song_release_date"])

In [57]:
# fix artists list names - remove quotes
def fix_artists(str_list):
    return ", ".join([v for v in str_list.rstrip("']").lstrip("['").split("', '")])


song_df["song_artists"] = song_df["song_artists"].apply(fix_artists)

In [58]:
song_df.head(1)

Unnamed: 0,song_name,song_artists,song_acousticness,song_danceability,song_duration_ms,song_energy,song_explicit,song_instrumentalness,song_key,song_liveness,song_loudness,song_mode,song_popularity,song_speechiness,song_tempo,song_valence,song_year
0,"Piano Concerto No. 3 in D Minor, Op. 30: III. Finale. Alla breve","Sergei Rachmaninoff, James Levine, Berliner Philharmoniker",0.982,0.279,831667,0.211,0,0.878,10,0.665,-20.096,1,4,0.0366,80.954,0.0594,1921


In [59]:
# combine song_name & song_artists into song_description (inserts into the first column "0" with a column name song_description)
song_df.insert(
    0, "song_description", song_df["song_name"] + " - " + song_df["song_artists"]
)

In [60]:
song_df.head(1)

Unnamed: 0,song_description,song_name,song_artists,song_acousticness,song_danceability,song_duration_ms,song_energy,song_explicit,song_instrumentalness,song_key,song_liveness,song_loudness,song_mode,song_popularity,song_speechiness,song_tempo,song_valence,song_year
0,"Piano Concerto No. 3 in D Minor, Op. 30: III. Finale. Alla breve - Sergei Rachmaninoff, James Levine, Berliner Philharmoniker","Piano Concerto No. 3 in D Minor, Op. 30: III. Finale. Alla breve","Sergei Rachmaninoff, James Levine, Berliner Philharmoniker",0.982,0.279,831667,0.211,0,0.878,10,0.665,-20.096,1,4,0.0366,80.954,0.0594,1921


In [61]:
# remove duplicate rows
song_data = song_df[
    ~song_df.duplicated(subset=["song_description"], keep="first")
].reset_index(drop=True)

In [62]:
show_df(song_data)

(157685, 18)


Unnamed: 0,song_description,song_name,song_artists,song_acousticness,song_danceability,song_duration_ms,song_energy,song_explicit,song_instrumentalness,song_key,song_liveness,song_loudness,song_mode,song_popularity,song_speechiness,song_tempo,song_valence,song_year
0,"Piano Concerto No. 3 in D Minor, Op. 30: III. Finale. Alla breve - Sergei Rachmaninoff, James Levine, Berliner Philharmoniker","Piano Concerto No. 3 in D Minor, Op. 30: III. Finale. Alla breve","Sergei Rachmaninoff, James Levine, Berliner Philharmoniker",0.982,0.279,831667,0.211,0,0.878,10,0.665,-20.096,1,4,0.0366,80.954,0.0594,1921
1,Clancy Lowered the Boom - Dennis Day,Clancy Lowered the Boom,Dennis Day,0.732,0.819,180533,0.341,0,0.0,7,0.16,-12.441,1,5,0.415,60.936,0.963,1921
2,Gati Bali - KHP Kridhamardawa Karaton Ngayogyakarta Hadiningrat,Gati Bali,KHP Kridhamardawa Karaton Ngayogyakarta Hadiningrat,0.961,0.328,500062,0.166,0,0.913,3,0.101,-14.85,1,5,0.0339,110.339,0.0394,1921
3,Danny Boy - Frank Parker,Danny Boy,Frank Parker,0.967,0.275,210000,0.309,0,2.8e-05,5,0.381,-9.316,1,3,0.0354,100.109,0.165,1921
4,When Irish Eyes Are Smiling - Phil Regan,When Irish Eyes Are Smiling,Phil Regan,0.957,0.418,166693,0.193,0,2e-06,3,0.229,-10.096,1,2,0.038,101.665,0.253,1921


## Creating vector embeddings using word2vec

In [63]:
# tokenize the descriptions
tokenised_song_descs = [word_tokenize(v.lower()) for v in song_data["song_description"]]

In [65]:
# create embedding model
embedding_dim = 15

word2Vec_model = Word2Vec(
    sentences=tokenised_song_descs,
    vector_size=embedding_dim,
    window=5,
    min_count=1,
    sg=1,
)

* vector_size: The dimensionality of the word vectors. In your code, you've set it to 15, meaning each word in the vocabulary will be represented by a vector of 15 dimensions.

* window: The maximum distance between the current and predicted word within a sentence. It is set to 5, meaning the model will consider up to 5 words before and after the current word in a sentence.

* min_count: Ignores all words with a total frequency lower than this. You've set it to 1, meaning all words, regardless of frequency, will be considered.

* sg: The training algorithm. 1 represents the Skip-gram model, while 0 represents CBOW (Continuous Bag of Words). You've chosen Skip-gram (sg=1).

In [66]:
# function to create embedding vector from tokens
def get_embedding(song_desc_tokens, model, embedding_dim):
    vectors = [model.wv[token] for token in song_desc_tokens if token in model.wv]

    # Average of word vectors OR zeros if no valid tokens found
    return sum(vectors) / len(vectors) if vectors else [0] * embedding_dim

In [67]:
# embed song descriptions as vectors
categorical_embeddings = [
    get_embedding(song_desc_tokens, word2Vec_model, embedding_dim)
    for song_desc_tokens in tokenised_song_descs
]


In [68]:
show_embeddings(categorical_embeddings)


Num Embeddings: 157685
Embedding Size: 15


[-1.2723106,
 1.1312245,
 1.4379098,
 -0.78529704,
 0.31583238,
 0.07468923,
 -1.3993856,
 -0.12268966,
 -0.6210166,
 1.1861943,
 0.8000995,
 0.4178811,
 0.3717347,
 -0.8247968,
 -0.9515019]

## Embed Numeric songs Metadata

In [69]:
# extract numeric columns
numeric_cols = list(
    song_data.drop(columns=["song_name", "song_artists", "song_description"]).columns
)
numeric_cols

['song_acousticness',
 'song_danceability',
 'song_duration_ms',
 'song_energy',
 'song_explicit',
 'song_instrumentalness',
 'song_key',
 'song_liveness',
 'song_loudness',
 'song_mode',
 'song_popularity',
 'song_speechiness',
 'song_tempo',
 'song_valence',
 'song_year']

In [71]:
# scale these columns
scaled_numeric_cols = [
    (song_data[col] - song_data[col].mean()) / np.std(song_data[col])
    for col in numeric_cols
]

In [72]:
#transpose the array to get row embeddings
numeric_embeddings = list(map(list, zip(*scaled_numeric_cols)))


In [74]:

show_embeddings(numeric_embeddings)

Num Embeddings: 157685
Embedding Size: 15


[1.2703070294949106,
 -1.461259048884883,
 4.752569009266444,
 -1.007676175100162,
 -0.3092011481361043,
 2.262496351074803,
 1.3649563314116429,
 2.6110012104955738,
 -1.5078176079821606,
 0.6453499264358126,
 -1.2499471942272533,
 -0.38364744367670833,
 -1.1655450558051375,
 -1.7786347004763523,
 -2.142666230649]

## Merge Categorical & Numeric Embeddings

In [75]:
row_embeddings = [
    np.concatenate([cat_row, num_row])
    for cat_row, num_row in zip(categorical_embeddings, numeric_embeddings)
]

In [76]:
show_embeddings(row_embeddings)

Num Embeddings: 157685
Embedding Size: 30


[-1.2723106145858765,
 1.131224513053894,
 1.4379098415374756,
 -0.7852970361709595,
 0.31583237648010254,
 0.07468923181295395,
 -1.3993855714797974,
 -0.12268965691328049,
 -0.6210166215896606,
 1.1861943006515503,
 0.8000994920730591,
 0.4178811013698578,
 0.37173470854759216,
 -0.8247967958450317,
 -0.9515019059181213,
 1.2703070294949106,
 -1.461259048884883,
 4.752569009266444,
 -1.007676175100162,
 -0.3092011481361043,
 2.262496351074803,
 1.3649563314116429,
 2.6110012104955738,
 -1.5078176079821606,
 0.6453499264358126,
 -1.2499471942272533,
 -0.38364744367670833,
 -1.1655450558051375,
 -1.7786347004763523,
 -2.142666230649]

## Create DataFrame With Embeddings


In [78]:
embedded_song_df = song_data[["song_name", "song_artists", "song_year"]]


In [79]:
embedded_song_df["song_embeddings"] = row_embeddings

In [80]:
show_df(embedded_song_df)

(157685, 4)


Unnamed: 0,song_name,song_artists,song_year,song_embeddings
0,"Piano Concerto No. 3 in D Minor, Op. 30: III. Finale. Alla breve","Sergei Rachmaninoff, James Levine, Berliner Philharmoniker",1921,"[-1.2723106145858765, 1.131224513053894, 1.4379098415374756, -0.7852970361709595, 0.31583237648010254, 0.07468923181295395, -1.3993855714797974, -0.12268965691328049, -0.6210166215896606, 1.1861943006515503, 0.8000994920730591, 0.4178811013698578, 0.37173470854759216, -0.8247967958450317, -0.9515019059181213, 1.2703070294949106, -1.461259048884883, 4.752569009266444, -1.007676175100162, -0.3092011481361043, 2.262496351074803, 1.3649563314116429, 2.6110012104955738, -1.5078176079821606, 0.6453499264358126, -1.2499471942272533, -0.38364744367670833, -1.1655450558051375, -1.7786347004763523, -2.142666230649]"
1,Clancy Lowered the Boom,Dennis Day,1921,"[-0.40999463200569153, 0.6959466338157654, 1.3106187582015991, -0.18623998761177063, 0.5852393507957458, 0.793241024017334, -0.7502458691596985, 0.13912875950336456, -0.18899276852607727, -0.19216479361057281, 0.7288581728935242, 0.45597043633461, 0.506697952747345, -0.6630322337150574, 0.12824217975139618, 0.6055353575765545, 1.600008527700499, -0.3958314054754836, -0.5219481676317469, -0.3092011481361043, -0.5349944602375606, 0.5117210944216953, -0.26640400859153673, -0.1646257835500665, 0.6453499264358126, -1.2044048830284118, 1.872793548131166, -1.8164526201212032, 1.6541851866630062, -2.142666230649]"
2,Gati Bali,KHP Kridhamardawa Karaton Ngayogyakarta Hadiningrat,1921,"[-0.5067959427833557, 0.5816471576690674, 0.09128624945878983, 0.3559354841709137, 0.9136020541191101, 0.3209366202354431, 0.13632585108280182, -0.13376779854297638, -0.21252591907978058, 0.24724194407463074, 0.7367221117019653, 0.3116314709186554, 0.28201544284820557, 0.49553152918815613, -1.067232608795166, 1.2144662090537686, -1.183477361379913, 2.1306274127125904, -1.1758127930699978, -0.3092011481361043, 2.374013638541697, -0.6259258882315683, -0.6025761034947833, -0.5873232499193564, 0.6453499264358126, -1.2044048830284118, -0.3997478418740689, -0.21005905433508043, -1.854615663007104, -2.142666230649]"
3,Danny Boy,Frank Parker,1921,"[-0.6843708753585815, 0.8865038156509399, 1.6762135028839111, -0.4014073312282562, 0.3611453175544739, 0.8218395113945007, -0.8597976565361023, 0.18321572244167328, -0.1815173625946045, 0.050726450979709625, 1.0371406078338623, 0.24682621657848358, 0.7631543874740601, -0.8319379091262817, 0.26675304770469666, 1.2304207291798093, -1.4839351050077376, -0.16284109162119192, -0.6415119848547415, -0.3092011481361043, -0.5349062022700511, -0.05710239690493648, 0.9928168892663868, 0.3837053008849818, 0.6453499264358126, -1.295489505426095, -0.39080317620886856, -0.5426988976237884, -1.3774552183139837, -2.142666230649]"
4,When Irish Eyes Are Smiling,Phil Regan,1921,"[-0.38810786604881287, 0.8657161593437195, 0.9275007843971252, -0.19844864308834076, 0.9640052318572998, 0.5460641384124756, -0.9974623322486877, 0.51706862449646, -0.12801410257816315, -0.2778356969356537, 1.3121838569641113, 0.14594393968582153, 0.559045672416687, -0.8403735756874084, 0.12305394560098648, 1.203829862303075, -0.6732660986156829, -0.5052618172494476, -1.0749308222880962, -0.3092011481361043, -0.5349891074077622, -0.6259258882315683, 0.12674640748175162, 0.24684186220999385, 0.6453499264358126, -1.3410318166249366, -0.3752990890558546, -0.4921038246856425, -1.0431389831786766, -2.142666230649]"


## Store Embeddings In KDB.AI

KDB.AI comes in two offerings:

KDB.AI Cloud - For experimenting with smaller generative AI projects with a vector database in our cloud.
KDB.AI Server - For evaluating large scale generative AI applications on-premises or on your own cloud provider.
Depending on which you use there will be different setup steps and connection details required.

Option 1. KDB.AI Cloud
To use KDB.AI Cloud, you will need two session details - a URL endpoint and an API key. To get these you can sign up for free here.

You can connect to a KDB.AI Cloud session using kdbai.Session and passing the session URL endpoint and API key details from your KDB.AI Cloud portal.

If the environment variables KDBAI_ENDPOINTS and KDBAI_API_KEY exist on your system containing your KDB.AI Cloud portal details, these variables will automatically be used to connect. If these do not exist, it will prompt you to enter your KDB.AI Cloud portal session URL endpoint and API key details.

In [82]:
KDBAI_ENDPOINT = (
    os.environ["KDBAI_ENDPOINT"]
    if "KDBAI_ENDPOINT" in os.environ
    else input("KDB.AI endpoint: ")
)
KDBAI_API_KEY = (
    os.environ["KDBAI_API_KEY"]
    if "KDBAI_API_KEY" in os.environ
    else getpass("KDB.AI API key: ")
)

In [83]:
session = kdbai.Session(api_key=KDBAI_API_KEY, endpoint=KDBAI_ENDPOINT)


## Define Vector DB Table Schema

our table will have four columns:

* Song Name
* Song Artists
* Song Year
* Song Embeddings

* When defining the schema, we must supply the types of these columns. We can use the .dtypes() function on the defined Pandas DataFrame to help with this.

In [85]:
embedded_song_df.dtypes

song_name          object
song_artists       object
song_year           int64
song_embeddings    object
dtype: object

In [86]:
schema = {
    "columns": [
        {
            "name": "song_name",
            "pytype": "str",
        },
        {
            "name": "song_artists",
            "pytype": "str",
        },
        {
            "name": "song_year",
            "pytype": "int64",
        },
        {
            "name": "song_embeddings",
            "vectorIndex": {
                "dims": len(numeric_cols) + embedding_dim,
                "metric": "L2",
                "type": "flat",
            },
        },
    ]
}

## Create vector DB table

In [87]:
# First ensure the table does not already exist
try:
    session.table("songs").drop()
    time.sleep(5)
except kdbai.KDBAIException:
    pass

In [88]:
table = session.create_table("songs", schema)


## Add Embedded Data to KDB.AI Table

* When adding larger amounts of data, you should insert data into an index in chunks.

* It is a good idea to first get an idea of how large your dataset to insert is.

In [89]:
embedded_song_df.memory_usage(deep=True).sum() / (1024**2)


79.63715744018555

* This dataset is 80MB which exceeds the insert limit of <10MB at a time. As such, we'll insert this data in chunks, inserting 10,000 rows at a time.

## Inserting in chunks

In [92]:
chunk_size = 10_000

In [93]:
for i in tqdm(range((len(embedded_song_df) // chunk_size) + 1)):
    index = i * chunk_size
    table.insert(
        embedded_song_df.iloc[index : index + chunk_size].reset_index(drop=True)
    )


  0%|          | 0/16 [00:00<?, ?it/s]

## Verify Data Has Been Inserted

* Running table.query() should show us that data has been added.

In [94]:
show_df(table.query())


(157685, 4)


Unnamed: 0,song_name,song_artists,song_year,song_embeddings
0,"Piano Concerto No. 3 in D Minor, Op. 30: III. Finale. Alla breve","Sergei Rachmaninoff, James Levine, Berliner Philharmoniker",1921,"[-1.2723106145858765, 1.131224513053894, 1.4379098415374756, -0.7852970361709595, 0.31583237648010254, 0.07468923181295395, -1.3993855714797974, -0.12268965691328049, -0.6210166215896606, 1.1861943006515503, 0.8000994920730591, 0.4178811013698578, 0.37173470854759216, -0.8247967958450317, -0.9515019059181213, 1.2703070294949106, -1.461259048884883, 4.752569009266444, -1.007676175100162, -0.3092011481361043, 2.262496351074803, 1.3649563314116429, 2.6110012104955738, -1.5078176079821606, 0.6453499264358126, -1.2499471942272533, -0.38364744367670833, -1.1655450558051375, -1.7786347004763523, -2.142666230649]"
1,Clancy Lowered the Boom,Dennis Day,1921,"[-0.40999463200569153, 0.6959466338157654, 1.3106187582015991, -0.18623998761177063, 0.5852393507957458, 0.793241024017334, -0.7502458691596985, 0.13912875950336456, -0.18899276852607727, -0.19216479361057281, 0.7288581728935242, 0.45597043633461, 0.506697952747345, -0.6630322337150574, 0.12824217975139618, 0.6055353575765545, 1.600008527700499, -0.3958314054754836, -0.5219481676317469, -0.3092011481361043, -0.5349944602375606, 0.5117210944216953, -0.26640400859153673, -0.1646257835500665, 0.6453499264358126, -1.2044048830284118, 1.872793548131166, -1.8164526201212032, 1.6541851866630062, -2.142666230649]"
2,Gati Bali,KHP Kridhamardawa Karaton Ngayogyakarta Hadiningrat,1921,"[-0.5067959427833557, 0.5816471576690674, 0.09128624945878983, 0.3559354841709137, 0.9136020541191101, 0.3209366202354431, 0.13632585108280182, -0.13376779854297638, -0.21252591907978058, 0.24724194407463074, 0.7367221117019653, 0.3116314709186554, 0.28201544284820557, 0.49553152918815613, -1.067232608795166, 1.2144662090537686, -1.183477361379913, 2.1306274127125904, -1.1758127930699978, -0.3092011481361043, 2.374013638541697, -0.6259258882315683, -0.6025761034947833, -0.5873232499193564, 0.6453499264358126, -1.2044048830284118, -0.3997478418740689, -0.21005905433508043, -1.854615663007104, -2.142666230649]"
3,Danny Boy,Frank Parker,1921,"[-0.6843708753585815, 0.8865038156509399, 1.6762135028839111, -0.4014073312282562, 0.3611453175544739, 0.8218395113945007, -0.8597976565361023, 0.18321572244167328, -0.1815173625946045, 0.050726450979709625, 1.0371406078338623, 0.24682621657848358, 0.7631543874740601, -0.8319379091262817, 0.26675304770469666, 1.2304207291798093, -1.4839351050077376, -0.16284109162119192, -0.6415119848547415, -0.3092011481361043, -0.5349062022700511, -0.05710239690493648, 0.9928168892663868, 0.3837053008849818, 0.6453499264358126, -1.295489505426095, -0.39080317620886856, -0.5426988976237884, -1.3774552183139837, -2.142666230649]"
4,When Irish Eyes Are Smiling,Phil Regan,1921,"[-0.38810786604881287, 0.8657161593437195, 0.9275007843971252, -0.19844864308834076, 0.9640052318572998, 0.5460641384124756, -0.9974623322486877, 0.51706862449646, -0.12801410257816315, -0.2778356969356537, 1.3121838569641113, 0.14594393968582153, 0.559045672416687, -0.8403735756874084, 0.12305394560098648, 1.203829862303075, -0.6732660986156829, -0.5052618172494476, -1.0749308222880962, -0.3092011481361043, -0.5349891074077622, -0.6259258882315683, 0.12674640748175162, 0.24684186220999385, 0.6453499264358126, -1.3410318166249366, -0.3752990890558546, -0.4921038246856425, -1.0431389831786766, -2.142666230649]"


## Find Songs By A Certain Artist

In [95]:
table.query(filter=[("like", "song_artists", "*Calvin Harris*")], sort_by="song_year")

Unnamed: 0,song_name,song_artists,song_year,song_embeddings
0,Flashback,Calvin Harris,2009,"[-0.6285380125045776, 0.8466413021087646, 1.0233451128005981, -0.28528594970703125, 0.7038078308105469, 0.5484970808029175, -0.8259185552597046, -0.11083891987800598, 0.2792535722255707, 0.11158110946416855, 0.672805905342102, 0.5593938827514648, 0.8224308490753174, -0.2334447205066681, 0.4897407591342926, -1.3351724705550174, -1.5406252453148743, -0.011030115749796763, 1.7460279903169285, -0.3092011481361043, -0.5313940449564866, 1.0805445857483271, -0.7039975151774578, 1.0250333372402145, -1.549546934208044, 0.7539144985217788, -0.307319630000332, 0.36420802995835755, -1.2520866301382436, 1.247808968405853]"
1,You Used To Hold Me,Calvin Harris,2009,"[-0.18722771108150482, 1.2554978132247925, 1.2012852430343628, -0.4599727988243103, 0.8321321606636047, 0.5613057017326355, -1.1689554452896118, 0.6158842444419861, 0.013475783169269562, 0.005883281119167805, 1.540282130241394, 0.15665671229362488, 0.7777714729309082, -0.24798770248889923, 0.2923077344894409, -1.279491195315136, 0.46053670752705134, 0.004253770518922344, 1.8244917453695186, -0.3092011481361043, -0.5349904137531296, 1.6493680770749588, 0.6224578016611152, 0.958707209266951, -1.549546934208044, 0.5717452537264123, -0.35562082459241384, 0.3955535635781341, -1.2824790151505443, 1.247808968405853]"
2,I'm Not Alone - Radio Edit,Calvin Harris,2009,"[-0.38968682289123535, 1.0622804164886475, 1.1023608446121216, -0.4288729727268219, 0.7453950643539429, 0.5736275911331177, -1.0266664028167725, 0.6310440301895142, 0.08780206739902496, 0.21935860812664032, 1.2747215032577515, 0.41602006554603577, 0.6876100301742554, -0.5072594881057739, 0.3114408552646637, -1.3273547556932577, 0.3074733286977821, -0.1491385696844665, 0.7932538218211915, -0.3092011481361043, -0.14309084999676247, 0.5117210944216953, 0.4914076629700189, 0.8665875870818629, 0.6453499264358126, 0.3895760089310457, -0.411077751716656, 0.46191895166730823, -0.36310936852844955, 1.247808968405853]"
3,We Found Love,"Rihanna, Calvin Harris",2011,"[-0.395832359790802, 1.1544952392578125, 1.2431914806365967, -0.4664417505264282, 0.6249920725822449, 0.5510679483413696, -1.0721592903137207, 0.26483750343322754, 0.2891601622104645, 0.11555533111095428, 1.21981942653656, 0.41196370124816895, 0.8828952312469482, -0.5569308400154114, 0.22388547658920288, -1.2744389306085564, 1.1181423350898372, -0.12151213480453658, 1.0660087798611477, -0.3092011481361043, -0.5305974929031516, -1.1947493795582, -0.5626912786757541, 1.2313812909348116, 0.6453499264358126, 1.9835569008905032, -0.37351015592281456, 0.36375280436636925, 0.2751307167298641, 1.3248652229298272]"
4,Dance Wiv Me - Radio Edit,"Dizzee Rascal, Calvin Harris, Chrome",2011,"[-0.4619847238063812, 0.8914391398429871, 1.063472867012024, -0.36505261063575745, 0.5351406931877136, 0.492635577917099, -0.6670371294021606, 0.22351916134357452, -0.13763128221035004, 0.0860186442732811, 0.6805768609046936, 0.4071314334869385, 0.47291994094848633, -0.17650116980075836, 0.2073664516210556, -1.2143435714671371, 1.9344803555126058, -0.20954668716662583, 0.9912813940967762, -0.3092011481361043, -0.5349944602375606, 1.6493680770749588, -0.30059100129356187, 1.2671763441267319, 0.6453499264358126, 1.5281337889020867, -0.3329610049072396, -0.1561798539118754, 1.00454795702508, 1.3248652229298272]"
5,Feel So Close - Radio Edit,Calvin Harris,2012,"[-0.5047733187675476, 1.0658838748931885, 1.1152290105819702, -0.47992005944252014, 0.687364935874939, 0.7315778732299805, -1.008978009223938, 0.4123622179031372, -0.07694371789693832, 0.12144271284341812, 1.1705317497253418, 0.36791425943374634, 0.8056957721710205, -0.35605528950691223, 0.46867498755455017, -1.3383314655399736, 0.9650789562605678, -0.19120286091549893, 1.6563551273996828, -0.3092011481361043, -0.5125954164977816, 0.5117210944216953, -0.015699395443352974, 1.5196718418873827, 0.6453499264358126, 2.2112684568847114, -0.4170408621601229, 0.36215951479440944, 1.4870270690953529, 1.363393350191814]"
6,Sweet Nothing (feat. Florence Welch),"Calvin Harris, Florence Welch",2012,"[-0.5463921427726746, 1.0532793998718262, 1.2234747409820557, -0.510463297367096, 0.35872483253479004, 0.49691084027290344, -0.9932032823562622, 0.11034901440143585, 0.1575445830821991, 0.09580708295106888, 1.049813151359558, 0.7195221781730652, 0.6010255217552185, -0.4451219141483307, 0.14200323820114136, -0.8170760203287275, 0.20543107614493594, -0.14259962817167257, 1.6750369738407758, -0.3092011481361043, -0.5346376049176665, 0.7961328400850112, -0.8549900662780685, 1.3266593001662457, -1.549546934208044, 1.8469299672939783, 0.048081752430295104, 0.36206196645326905, 0.20674785045218758, 1.363393350191814]"
7,I Need Your Love (feat. Ellie Goulding),"Calvin Harris, Ellie Goulding",2012,"[-0.43598711490631104, 1.1474380493164062, 1.2179644107818604, -0.5584360361099243, 0.39483460783958435, 0.5178744196891785, -0.9965513944625854, 0.26748108863830566, 0.15887410938739777, 0.3885841965675354, 1.2049258947372437, 0.5520941615104675, 0.6386106610298157, -0.3595333397388458, 0.28296908736228943, -0.2506905558542882, 0.8970507878920038, 0.030931386799656055, 1.4508548165476611, -0.3092011481361043, -0.5349944602375606, 0.7961328400850112, 0.17232906441778495, 1.1294355757166477, 0.6453499264358126, 1.8013876560951365, -0.31387905148814554, 0.2663020115671261, 0.1991497541991124, 1.363393350191814]"
8,Let's Go (feat. Ne-Yo),"Calvin Harris, Ne-Yo",2012,"[-0.5004657506942749, 1.1075575351715088, 1.317096471786499, -0.6495700478553772, 0.14445802569389343, 0.6780844926834106, -0.9535553455352783, 0.2808545231819153, -0.052567798644304276, 0.27093276381492615, 1.186755895614624, 0.6688621044158936, 0.8233105540275574, -0.3870493471622467, 0.23052085936069489, -1.3202549942371695, 0.9820859983527089, 0.01743444222608983, 1.4994276172945025, -0.3092011481361043, -0.5104287949127105, -0.34151414256825235, 0.4971054950870231, 1.5038799066556534, -1.549546934208044, 1.5736761001009283, -0.24709221452131636, 0.36472828777777266, 1.3198689515276991, 1.363393350191814]"
9,Thinking About You (feat. Ayah Marar),"Calvin Harris, Ayah Marar",2012,"[-0.46033337712287903, 0.9587727189064026, 0.9918168187141418, -0.39612647891044617, 0.3049544394016266, 0.45470932126045227, -0.7973266243934631, 0.2169901579618454, 0.061282020062208176, 0.2967797517776489, 0.9496392011642456, 0.4828426241874695, 0.6747055649757385, -0.3045465052127838, 0.18901750445365906, -1.3339492906786878, 1.067121208813414, 0.1370883744063931, 1.4695366629887539, -0.3092011481361043, -0.5336817424536646, -1.479161125221516, -0.6322048305032051, 1.3664900701396077, -1.549546934208044, 1.61921841129977, -0.36575811234630756, 0.3637202882526556, 0.8373898394574263, 1.363393350191814]"


## Find A Specific Song

In [97]:
song = table.query(
    filter=[
        ("like", "song_artists", "*Calvin Harris*"),
        ("like", "song_name", "*We Found Love*"),
    ]
)

In [106]:
song['song_embeddings'].values

array([array([-0.39583236,  1.15449524,  1.24319148, -0.46644175,  0.62499207,
               0.55106795, -1.07215929,  0.2648375 ,  0.28916016,  0.11555533,
               1.21981943,  0.4119637 ,  0.88289523, -0.55693084,  0.22388548,
              -1.27443893,  1.11814234, -0.12151213,  1.06600878, -0.30920115,
              -0.53059749, -1.19474938, -0.56269128,  1.23138129,  0.64534993,
               1.9835569 , -0.37351016,  0.3637528 ,  0.27513072,  1.32486522])],
      dtype=object)

## Find Similar Songs To This Song

* We can then copy and paste the vector associated with this song below and save it as the variable my_vec.

* We will then use KDB.AI's .search() function to find similar songs in the dataset to this song using this vector. We will pull out the 5 songs most similar to this song from the dataset.

In [107]:
my_vec = [-0.39583236,  1.15449524,  1.24319148, -0.46644175,  0.62499207,
               0.55106795, -1.07215929,  0.2648375 ,  0.28916016,  0.11555533,
               1.21981943,  0.4119637 ,  0.88289523, -0.55693084,  0.22388548,
              -1.27443893,  1.11814234, -0.12151213,  1.06600878, -0.30920115,
              -0.53059749, -1.19474938, -0.56269128,  1.23138129,  0.64534993,
               1.9835569 , -0.37351016,  0.3637528 ,  0.27513072,  1.32486522]

In [108]:
table.search(vectors=[my_vec], n=5)[0]

Unnamed: 0,song_name,song_artists,song_year,song_embeddings,__nn_distance
0,We Found Love,"Rihanna, Calvin Harris",2011,"[-0.395832359790802, 1.1544952392578125, 1.2431914806365967, -0.4664417505264282, 0.6249920725822449, 0.5510679483413696, -1.0721592903137207, 0.26483750343322754, 0.2891601622104645, 0.11555533111095428, 1.21981942653656, 0.41196370124816895, 0.8828952312469482, -0.5569308400154114, 0.22388547658920288, -1.2744389306085564, 1.1181423350898372, -0.12151213480453658, 1.0660087798611477, -0.3092011481361043, -0.5305974929031516, -1.1947493795582, -0.5626912786757541, 1.2313812909348116, 0.6453499264358126, 1.9835569008905032, -0.37351015592281456, 0.36375280436636925, 0.2751307167298641, 1.3248652229298272]",5.5511150000000004e-17
1,Bad At Love,Halsey,2017,"[-0.39427173137664795, 0.9801872372627258, 1.4450923204421997, -0.06416860967874527, 0.4925714433193207, 0.5829696655273438, -0.9506996870040894, 0.527362048625946, 0.0296584852039814, 0.06679753214120865, 1.4343153238296509, 0.22768719494342804, 0.5910521745681763, -0.7142414450645447, 0.2921365797519684, -1.1803072618649173, 0.7836705072777309, -0.3899329165171471, 1.0099632405378691, -0.3092011481361043, -0.5349944602375606, -1.479161125221516, -0.6692407392637322, 1.3973720768149898, 0.6453499264358126, 1.9380145896916616, -0.4253892167809766, 0.051533080489714826, 0.32071929424831513, 1.556033986501749]",1.072962
2,Sweet but Psycho,Ava Max,2020,"[-0.3100762665271759, 0.8217790722846985, 1.052780270576477, -0.3912770748138428, 0.8506655097007751, 0.44856008887290955, -0.9023488163948059, 0.4196978807449341, 0.06710892170667648, 0.3673953115940094, 1.010184407234192, 0.37544646859169006, 0.6761094927787781, -0.7477214336395264, 0.12259583920240402, -1.15903456836353, 1.0387761386598457, -0.3412506155567211, 0.841826622568033, -0.3092011481361043, -0.5349944602375606, -1.1947493795582, -0.23221701588951166, 1.1903222593323153, 0.6453499264358126, 2.393437701680078, -0.31984216193161247, 0.5268536307530907, 0.3511116792606158, 1.67161836828771]",1.074219
3,No One Compares To You,Jack & Jack,2019,"[-0.3926072120666504, 0.9080239534378052, 1.2196005582809448, -0.37756675481796265, 0.6168422698974609, 0.36700090765953064, -1.003049612045288, 0.1774161159992218, -0.2515731751918793, 0.07888683676719666, 1.47105073928833, 0.30899298191070557, 0.5672707557678223, -0.5869125723838806, 0.21700964868068695, -1.0032120884658673, 0.9253958580455721, -0.37695782217313883, 0.9987541326732133, -0.3092011481361043, -0.5349944602375606, -1.1947493795582, -0.6418911451021122, 0.9578298795318549, 0.6453499264358126, 1.8924722784928198, -0.3430982926611334, -0.15803327239354273, 0.39290120865252925, 1.6330902410257229]",1.23557
4,Stay Gold,BTS,2020,"[-0.44078055024147034, 0.7353340983390808, 1.1338534355163574, -0.2378760278224945, 0.5972615480422974, 0.6965059041976929, -0.8683232069015503, 0.3174038529396057, -0.29134994745254517, -0.02070758491754532, 1.2180349826812744, 0.30266204476356506, 0.4935379922389984, -0.6740594506263733, 0.288332998752594, -1.1037255652599227, 1.0557831807519866, 0.1001951445409801, 0.5728080338162957, -0.3092011481361043, -0.5349944602375606, -1.1947493795582, -0.7296377597039766, 1.0681979602069414, 0.6453499264358126, 2.2112684568847114, -0.2786966998716909, 0.39727691760494743, 0.20674785045218758, 1.67161836828771]",1.41196


## Automate This Song Similarity Search Process


In [109]:
def find_similar_songs(
    vectorDB_song_tab,
    song_name: str,
    song_artists: list[str] = None,
    song_year: int = None,
    n_similar: int = 5,
    exact: bool = False,
) -> None:
    # create filter list
    filter_list = [("like", "song_name", f"{song_name}" if exact else f"*{song_name}*")]
    if song_artists:
        if type(song_artists) == str:
            song_artists = list(song_artists)
        for artist in song_artists:
            filter_list.append(("like", "song_artists", f"*{artist}*"))
    if song_year:
        filter_list.append(("like", "song_year", f"{song_year}"))

    # find songs liks this in vector DB
    resulting_song = vectorDB_song_tab.query(filter=filter_list, sort_by="song_year")

    # quality check
    if resulting_song.empty:
        print(
            "Song Not Found! Please double check the values entered or try another song"
        )
        return

    # find vectors associated with these songs
    resulting_vectors = [v.tolist() for v in resulting_song["song_embeddings"]]

    # search for similar songs to selected songs
    similar_songs = vectorDB_song_tab.search(vectors=resulting_vectors, n=n_similar + 1)

    # process similar song table
    for i, similar_df in enumerate(similar_songs):
        name = resulting_song.loc[i, "song_name"]
        artists = resulting_song.loc[i, "song_artists"]
        year = resulting_song.loc[i, "song_year"]
        print(f"Songs Similar To '{name}' By '{artists}' ({year})")
        for j, song in similar_df[1:].iterrows():
            print(
                f"   {j}. {song['song_name']} - {song['song_artists']} ({song['song_year']})"
            )
        print()

In [110]:
find_similar_songs(table, song_name="Let's Go", song_artists=["Calvin Harris", "Ne-Yo"])

Songs Similar To 'Let's Go (feat. Ne-Yo)' By 'Calvin Harris, Ne-Yo' (2012)
   1. I Cry - Flo Rida (2012)
   2. Mmm Yeah (feat. Pitbull) - Austin Mahone, Pitbull (2014)
   3. Too Much (feat. Usher) - Marshmello, Imanbek, Usher (2020)
   4. All Around The World - Justin Bieber, Ludacris (2012)
   5. No Money - Galantis (2016)



In [111]:
find_similar_songs(
    table,
    song_name="Californication",
    song_artists="Red Hot Chili Peppers",
    n_similar=8,
)

Songs Similar To 'Californication' By 'Red Hot Chili Peppers' (1999)
   1. Police Station - Red Hot Chili Peppers (2011)
   2. Charlie - Red Hot Chili Peppers (2006)
   3. Especially in Michigan - Red Hot Chili Peppers (2006)
   4. Dark Necessities - Red Hot Chili Peppers (2016)
   5. Don't Forget Me - Red Hot Chili Peppers (2002)
   6. Cabron - Red Hot Chili Peppers (2002)
   7. Face Down - The Red Jumpsuit Apparatus (2006)
   8. Look Around - Red Hot Chili Peppers (2011)



## All artist with a given name song name

In [112]:
find_similar_songs(table, song_name="Love Me", exact=True)


Songs Similar To 'Love Me' By 'Elvis Presley' (1956)
   1. Without Him - Elvis Presley (1967)
   2. Don't - Elvis Presley (1959)
   3. Fine And Mellow - Billie Holiday (1957)
   4. Everything I Have Is Yours - 10'' Version - Billie Holiday (1956)
   5. Harbor Lights - Elvis Presley (1959)

Songs Similar To 'Love Me' By 'Buddy Holly' (1958)
   1. A Love That's Worth Having - Willie Hutch (1969)
   2. Midnight Shift - Buddy Holly (1958)
   3. Johnny be good - Radio Version - Jonny Bombastic (1955)
   4. Lonely Weekends - Wanda Jackson (1961)
   5. Rock & Roll Guitar - Johnny Knight (1959)

Songs Similar To 'Love Me' By 'Sarah Vaughan' (1958)
   1. I'm Just A Lucky So And So - Ella Fitzgerald (1957)
   2. Make the World Go Away - Ray Price (1956)
   3. Summer Is Gone - Carmen McRae (1956)
   4. A Pretty Girl Is Like A Melody - Irving Berlin, Ethel Merman, Dan Dailey (1954)
   5. Your Love Has Faded - Johnny Hodges (1961)

Songs Similar To 'Love Me' By 'Bo Diddley' (1960)
   1. Keepin' Out

## Delete KDB.AI table

In [113]:
table.drop()

True