In [1]:
import os
import time
import numpy as np
import pandas as pd
from pymilvus import Collection, FieldSchema, CollectionSchema, DataType, connections, utility
from getpass import getpass
import openai

### Spotify songs data from Kaggle

That data is available for download at - https://www.kaggle.com/datasets/maharshipandya/-spotify-tracks-dataset

The dataset contains information on ~114000 songs.  

In [2]:
## read the dataset and filter out the un-necessary columns
songs_dataset = pd.read_csv('../../songs.zip', compression='zip', header=0)
songs_dataset = songs_dataset.loc[:, ~songs_dataset.columns.str.contains('^Unnamed')]
songs_dataset.shape

(114000, 20)

In [5]:
print(openai.version)

<module 'openai.version' from '/home/maverick/miniconda3/envs/milvus-env/lib/python3.9/site-packages/openai/version.py'>


In [6]:
## filter the data for linkin park, choose only album name, track name and popularity
## columns for further processing

lp_dataset = songs_dataset[songs_dataset['artists'].str.contains("Linkin Park", na=False)]
lp_dataset = lp_dataset[['album_name', 'track_name', 'popularity']]
lp_dataset = lp_dataset.drop_duplicates(subset = ['album_name', 'track_name'])
lp_dataset = lp_dataset.sort_values('popularity', ascending=False)

print(lp_dataset.head())
print(lp_dataset.shape)

                         album_name       track_name  popularity
3006  Hybrid Theory (Bonus Edition)       In the End          85
3007                        Meteora             Numb          83
3262  Hybrid Theory (Bonus Edition)  One Step Closer          78
3263                        Meteora            Faint          77
3311  Hybrid Theory (Bonus Edition)         Crawling          76
(154, 3)


In [13]:
# Configs

COLLECTION_NAME = 'linkin_park' # collection name in milvus
DIMENSION = 1536 # embeddings size, depends on the embeddings model
COUNT = 200 # number of titles to embed and insert
OPENAI_ENGINE = 'text-embedding-3-small' # model to use
openai.api_key = getpass('OpenAI API Key: ')
print(openai.api_version)

OpenAI API Key:  ········


None


In [8]:
# connect to the database 

connections.connect(
    alias='default',
    host='localhost',
    port='19530'
)

In [9]:
# remove the collection is it already exists 

if(utility.has_collection(COLLECTION_NAME)):
    utility.drop_collection(COLLECTION_NAME)

# Create the collection
track_name_field = FieldSchema(name='track_name', dtype=DataType.VARCHAR, max_length=1024, is_primary=True, auto_id=False)
album_name_field = FieldSchema(name='album_name', dtype=DataType.VARCHAR, max_length=1024)
popularity_field = FieldSchema(name='popularity', dtype=DataType.INT64)
embedding_field = FieldSchema(name='embedding', dtype=DataType.FLOAT_VECTOR, description='Embedding vectors', dim=DIMENSION)

fields = [track_name_field, album_name_field, popularity_field, embedding_field]
schema = CollectionSchema(fields=fields, description='Linkin Park Songs Collection')
collection = Collection(name=COLLECTION_NAME, schema=schema)

# create an index for the collection
index_params = {
    'index_type': 'IVF_FLAT',
    'metric_type': 'L2',
    'params': {'nlist': 64}
}

collection.create_index(field_name='embedding', index_params=index_params)

Status(code=0, message=)

In [15]:
# get the embeddings

def embed(text):
    return openai.Embedding.create(
        input=text,
        engine=OPENAI_ENGINE)["data"][0]["embedding"]

In [14]:
# get embedding

count = 0
tracks = []
albums = []
popularity = []
embds = []

for index, row in lp_dataset.iterrows():
    tracks.append(row['track_name'])
    albums.append(row['album_name'])
    popularity.append(row['popularity'])
    embds.append(embed(row['track_name']))
    time.sleep(1)
    count += 1
    if count%10==0 and count>0:
        print(f'Embedding generated for {count} song titles')
    if count==COUNT:
        break

data_to_insert = [tracks, albums, popularity, embds]

None
None
None
None
None
None
None
None
None
None
Embedding geneated for 10 song titles
None
None
None
None
None
None
None
None
None
None
Embedding geneated for 20 song titles
None
None
None
None
None
None
None
None
None
None
Embedding geneated for 30 song titles
None
None
None
None
None
None
None
None
None
None
Embedding geneated for 40 song titles
None
None
None
None
None
None
None
None
None
None
Embedding geneated for 50 song titles
None
None
None
None
None
None
None
None
None
None
Embedding geneated for 60 song titles
None
None
None
None
None
None
None
None
None
None
Embedding geneated for 70 song titles
None
None
None
None
None
None
None
None
None
None
Embedding geneated for 80 song titles
None
None
None
None
None
None
None
None
None
None
Embedding geneated for 90 song titles
None
None
None
None
None
None
None
None
None
None
Embedding geneated for 100 song titles
None
None
None
None
None
None
None
None
None
None
Embedding geneated for 110 song titles
None
None
None
None
None
None


In [18]:
collection.insert(data_to_insert)

(insert count: 154, delete count: 0, upsert count: 0, timestamp: 448591266272247811, success count: 154, err count: 0)

In [27]:
# load the collection in memory
collection.load()

# search the DB for input
def search(text):
    search_params={"metric_type": "L2"}
    results = collection.search(
        data=[embed(text)],
        anns_field="embedding",
        param=search_params,
        limit=10,
        output_fields=['album_name', 'popularity']
    )
    return results

search_terms=['blade']

In [20]:
print(len(data_to_insert))

4


In [28]:
results = search(search_terms)

for indx, result in enumerate(results[0]):
    print('='*20)
    print('Name: ', result.id)
    print('Album: ', result.entity.get('album_name'))
    print('Popularity: ', result.entity.get('popularity'))

Name:  Papercut
Album:  Hybrid Theory (20th Anniversary Edition)
Popularity:  53
Name:  Sharp Edges
Album:  One More Light
Popularity:  57
Name:  Bleed It Out
Album:  Minutes to Midnight
Popularity:  58
Name:  SKIN TO BONE
Album:  LIVING THINGS
Popularity:  52
Name:  CASTLE OF GLASS
Album:  LIVING THINGS
Popularity:  73
Name:  The Catalyst
Album:  A Thousand Suns
Popularity:  36
Name:  POWERLESS
Album:  LIVING THINGS
Popularity:  60


In [29]:
collection.release()