<a href="https://colab.research.google.com/github/nixtasy/music-search/blob/main/Payload_and_vector_prep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Additional dataset that contains rich song metadata, including emotion features.
# It could potentially be used for distant supervision.
# !wget https://prod-dcd-datasets-cache-zipfiles.s3.eu-west-1.amazonaws.com/3t9vbwxgr5-3.zip
# !unzip 3t9vbwxgr5-3.zip

In [None]:
# We use SentenceTransformer pre-trained models to convert our text into vectors.
!pip install sentence-transformers
!pip install qdrant-client>=1.1.1
!pip install sentence-transformers numpy

In [None]:
import pandas as pd
import os.path as osp
import json
from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm.notebook import tqdm

In [None]:
# Folder holding the Vagalume dataset files (the value looks like a placeholder --
# point it at the real location before running).
src = "dir-to-dataset-Vagalume"

# Songs table. Columns: 'ALink', 'SName', 'SLink', 'Lyric', 'language'
lyrics_csv = osp.join(src, "lyrics-data.csv")
LYRICS = pd.read_csv(lyrics_csv)

# Artists table. Columns: 'Artist', 'Genres', 'Songs', 'Popularity', 'Link'
artists_csv = osp.join(src, "artists-data.csv")
ARTISTS = pd.read_csv(artists_csv)

# Stored lyric vectors (the file name suggests they come from all-MiniLM-L12-v2)
vectors_npy = osp.join(src, "lyrics-vectors-all-MiniLM-L12-v2.npy")
VECTORS = np.load(vectors_npy)

In [None]:
# Example lookup: if the artist is in the dataset, show the rows for one of their songs
# a_name = "Lady Gaga"
# if a_name in set(ARTISTS['Artist']):
#   print(LYRICS.loc[LYRICS.SName == "Bad Romance"].head())

In [None]:
# Language codes found in the dataset:
# 'ny', 'sw', 'fi', 'ht', 'gd', 'sl', 'pt', 'sv', 'zh', 'hu', nan, 'lv', 'mg', 'tl', 'vi', 'de', 'en', 'ko', 'et', 'id', 'ku', 'is', 'lg', 'cy',
# 'fr', 'ga', 'ms', 'rw', 'hmn', 'ru', 'no', 'sq', 'ro', 'cs', 'hr', 'es', 'it', 'iw', 'eu', 'jw', 'su', 'nl', 'ar', 'fa', 'af', 'gl', 'da', 'sr', 'pl', 'ca', 'st', 'tr', 'ja'
# print(list(set(LYRICS['language'])))
# lang_list = list(set(LYRICS['language']))

# Horizontal bar chart of the 20 most frequent languages
language_counts = LYRICS['language'].value_counts()
language_counts.iloc[:20].plot.barh()

In [None]:
# Build the English-only subset (191814 songs), keeping just the song columns
song_columns = ['ALink', 'SName', 'SLink', 'Lyric']
is_english = LYRICS['language'] == 'en'
EN_LYRICS = LYRICS.loc[is_english, song_columns]

In [None]:
# Preview the first rows of the English-only subset
EN_LYRICS.head()

In [None]:
# Attach the artist's display name to every English song.
# Only 'Link' and 'Artist' are taken from ARTISTS, and each link is kept once, so the
# left join can never add rows: en_subset has to stay row-aligned with EN_LYRICS
# because the lyric vectors are matched to these rows by position later on.
artist_names = ARTISTS[['Link', 'Artist']].drop_duplicates(subset='Link')
en_subset = (
    EN_LYRICS
    .merge(artist_names, how="left", left_on="ALink", right_on="Link", validate="many_to_one")
    .drop(columns=['Link'])
    .reset_index(drop=True)
)

In [None]:
# Serialize the payload table as JSON lines (one record per line), then write it out
payload_lines = en_subset.to_json(orient="records", lines=True)
with open('lyrics_payloads.json', 'w') as out_file:
  out_file.write(payload_lines)

In [None]:
# Load the JSON-lines payload file back into a DataFrame
payload_file = './lyrics_payloads.json'
df = pd.read_json(payload_file, lines=True)

In [None]:
# Full list of available models could be found here https://www.sbert.net/docs/pretrained_models.html
# model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
# No device is forced: when none is given, SentenceTransformer uses a GPU if one is
# available and falls back to the CPU otherwise, so this also runs on CPU-only machines.
model = SentenceTransformer('all-MiniLM-L12-v2')

In [None]:
# Here we encode all lyrics
# We do encoding in batches, as this reduces overhead costs and significantly speeds up the process
# vectors = []
# batch_size = 64
# batch = []
# for row in tqdm(EN_LYRICS.itertuples()):
#   content = str(row.SName) + "\n" + str(row.Lyric)
#   batch.append(content)
#   if len(batch) >= batch_size:
#     vectors.append(model.encode(batch))  # Text -> vector encoding happens here
#     batch = []

# if len(batch) > 0:
#   vectors.append(model.encode(batch))
#   batch = []

# vectors = np.concatenate(vectors)

0it [00:00, ?it/s]

In [None]:
# Now all lyrics have been converted into vectors.
# We have 191814 vectors of 384 dimensions; the output layer of the model has this dimension.
# vectors.shape

(191814, 384)

In [None]:
# You can download these saved vectors and continue with the rest of the tutorial.
# np.save('vectors2.npy', vectors, allow_pickle=False)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Inspect the row at position 12345 of the reloaded payload table
df.iloc[12345]

ALink                                        /eric-clapton/
SName                                   Hoochie Coochie Man
SLink                /eric-clapton/hoochie-coochie-man.html
Lyric     Gypsy woman told my mother 'fore I was born\nY...
Artist                                         Eric Clapton
Name: 12345, dtype: object

In [None]:
# Use the lyric of one song from the dataset as the query, and show its first line
sample_query = df.iloc[12345]["Lyric"]
first_line = sample_query.partition('\n')[0]
print(first_line)
# Alternative free-text query:
# sample_query = "i feel gloomy coz the weather is like shit"

Gypsy woman told my mother 'fore I was born


In [None]:
# Embed the query text with the sentence-transformer model loaded above
query_vector = model.encode(sample_query)  # Convert the query lyric into a vector.

In [None]:
# Brute-force search: score the query against every stored lyric vector
similarity_matrix = cosine_similarity([query_vector], VECTORS)
scores = similarity_matrix[0]

# Positions of the 5 highest scores, best match first
ranked_ids = np.argsort(scores)
top_scores_ids = np.flip(ranked_ids[-5:])

In [None]:
# Check whether the retrieved songs resemble the query.
# f-strings are used so that a missing or non-string song title is printed as-is
# instead of raising TypeError during string concatenation.
for top_id in top_scores_ids:
  song = df.iloc[top_id]  # look the row up once and reuse it
  print(f"####{song.SName}####")
  print(song.Lyric)
  print("-----")

In [None]:
# test on local machine
from qdrant_client import models, QdrantClient
from qdrant_client.models import VectorParams, Distance
# Use an in-memory Qdrant instance for local testing.
# To talk to a running server instead:
# qdrant_client = QdrantClient(host='localhost', port=6333)
qdrant_client = QdrantClient(location=":memory:")

In [None]:
# (Re)create the 'lyrics' collection with cosine distance.
# The vector size is read from the embeddings that get uploaded rather than being
# hard-coded, so the collection still matches if the vectors are regenerated with
# a model that has a different output size.
qdrant_client.recreate_collection(
    collection_name='lyrics',
    vectors_config=VectorParams(size=VECTORS.shape[1], distance=Distance.COSINE),
)

True

In [None]:
import numpy as np
import json

# Read the JSON-lines payload file (one song record per line) into a list.
# The file is closed as soon as it has been read, and a list -- unlike a one-shot
# iterator -- can be uploaded again if the upload cell is re-run.
with open('./lyrics_payloads.json') as fd:
  payload = [json.loads(line) for line in fd if line.strip()]

# Here we load all vectors into memory, numpy array works as iterable for itself.
# Other option would be to use Mmap, if we don't want to load all data into RAM
vectors = VECTORS

# Vectors and payloads are paired by position, so their counts must agree.
if len(payload) != len(vectors):
  raise ValueError(
      f"{len(payload)} payload records but {len(vectors)} vectors; "
      "regenerate lyrics_payloads.json or the vectors so that they line up"
  )

In [None]:
# Push every vector together with its payload into the 'lyrics' collection
upload_options = dict(
    vectors=vectors,
    payload=payload,
    ids=None,        # Vector ids will be assigned automatically
    batch_size=256,  # Number of vectors sent in a single request
)
qdrant_client.upload_collection(collection_name='lyrics', **upload_options)

In [None]:
# Semantic search with a free-text query: embed it, then fetch the 3 closest songs
query_text = "Aliens attack our planet"
query_embedding = model.encode(query_text).tolist()
hits = qdrant_client.search(collection_name="lyrics", query_vector=query_embedding, limit=3)

# Show each match's payload next to its similarity score
for hit in hits:
  print(hit.payload, "score:", hit.score)

{'ALink': '/misfits/', 'SName': 'Mars Attacks', 'SLink': '/misfits/mars-attacks.html', 'Lyric': 'Their eyes for many centuries\nPeered in from space\nSincere the hope, their wise believed\nThey could teach our race\n\nBut yet on Mars, a darker side\nLike all things that God made\nTheir tribe of war, would heed them not\n"Earth they must invade"\n\nSee: the fire in the skies\nSee: them devastate the land\nMars attacks: the warlord chief commands\n\nSee: the humans fight and die\nSee: our planet laid to waste\nMars attacks: monsters invade the earth from space\n\nIt was then, in our darkest hour\nWhen everything seemed lost\nThe hearts of men would not concede\nNo mctter what the cost\n\nThey forged a sword of sound and steel\nUpon the Martian doors\nThe voice of war would thunder there\nAnd "Mars would be no more"\n\nSee: the Martian cities fall\nSee: the death of the warrior tribe\nMars attacks, now their planet won\'t survive\n\nSee: their world turn into ash\nSee: the terror on their