In [1]:
import pickle
import faiss 
import torch
import numpy as np
import math
from asif import extract_candidate_sets_from_clusters
import random
from tqdm import tqdm

In [2]:
data_folder = "data"

print("Loading dataset...")
lyrics = pickle.load(open(f"{data_folder}/lyrics.pkl", "rb"))
chords = pickle.load(open(f"{data_folder}/chords.pkl", "rb"))
artist_song = pickle.load(open(f"{data_folder}/artist_song.pkl", "rb"))

print("Loading precomputed embeddings...")
chords_embeddings = pickle.load(open(f"{data_folder}/chords_embeddings_sbert_chocolm.pkl", "rb"))
lyrics_embeddings = pickle.load(open(f"{data_folder}/lyrics_embeddings_sbert_roberta.pkl", "rb"))

kmeans_lyrics = pickle.load(open(f"{data_folder}/lyrics_kmeans.pkl", "rb"))
kmeans_chords = pickle.load(open(f"{data_folder}/chords_kmeans.pkl", "rb"))


Loading dataset...
Loading precomputed embeddings...


In [None]:
# clustering
# find candidates by sampling each cluster
# proceeds with asif

def compute_relative_coordinates(candidate_embeddings, embeddings, denoise_and_normalize=True, k=800, p=8):
        
        sim = (1 / (1 + torch.cdist(candidate_embeddings, embeddings)))

        if not denoise_and_normalize:
              return sim
        
        result = torch.zeros(sim.size())
        
        for i, j in enumerate(torch.argsort(sim, descending=True)[:,:k]):
            result[i][j] = p

        return torch.nn.functional.normalize(result)

In [4]:
relative_coordinates_chord_clusters = compute_relative_coordinates(torch.from_numpy(kmeans_chords.cluster_centers_), chords_embeddings)
relative_coordinates_lyrics_clusters = compute_relative_coordinates(torch.from_numpy(kmeans_lyrics.cluster_centers_), lyrics_embeddings)

In [5]:
def compute_self_relative_coordinates(embeddings, anchors, batch_size=1_000, device="cpu"):
    
    result = []

    for i in tqdm(range(0, embeddings.size()[0], batch_size)):
        self_relative_coordinates_batch = compute_relative_coordinates(embeddings[i:min(i+batch_size, embeddings.size()[0])], embeddings)
        self_relative_coordinates_batch = self_relative_coordinates_batch.to(device)
        relative_coordinates_vs_anchors = compute_relative_coordinates(self_relative_coordinates_batch, anchors, denoise_and_normalize=False)
        relative_coordinates_vs_anchors = relative_coordinates_vs_anchors.to("cpu")
        result.append(relative_coordinates_vs_anchors)
    
    return torch.vstack(result)

In [6]:
anchors = torch.vstack([relative_coordinates_chord_clusters, relative_coordinates_lyrics_clusters])
anchors = anchors.to("cuda:0")
chords_embeddings = chords_embeddings.to("cuda:0")
relative_coordinates = compute_self_relative_coordinates(chords_embeddings, anchors, batch_size=100, device="cuda:0")

  6%|▋         | 2206/35173 [18:45<4:40:21,  1.96it/s]


KeyboardInterrupt: 

In [14]:
lyrics_candidates_ids = extract_candidate_sets_from_clusters(kmeans_lyrics.n_clusters, kmeans_lyrics.labels_, lyrics, retrieve_ids=True)
chords_candidates_ids = extract_candidate_sets_from_clusters(kmeans_chords.n_clusters, kmeans_chords.labels_, lyrics, retrieve_ids=True)

In [31]:
n_of_candidates = 500_000
n_of_lyrics_per_cluster = math.ceil(n_of_candidates / kmeans_lyrics.n_clusters)
n_of_chords_per_cluster = math.ceil(n_of_candidates / kmeans_chords.n_clusters)

lyrics_candidates_ids_sampled = {cluster_id: random.sample(cluster, min(n_of_lyrics_per_cluster, len(cluster))) for cluster_id, cluster in lyrics_candidates_ids.items()}
chords_candidates_ids_sampled = {cluster_id: random.sample(cluster, min(n_of_chords_per_cluster, len(cluster))) for cluster_id, cluster in chords_candidates_ids.items()}

sampled_chords = [chords for cluster in chords_candidates_ids_sampled.values() for chords in cluster]
sampled_lyrics = [lyrics for cluster in lyrics_candidates_ids_sampled.values() for lyrics in cluster]

print(len(sampled_chords), len(sampled_lyrics))

print(len(set(sampled_chords) & set(sampled_lyrics)))

anchors_ids = sorted(list(set(sampled_chords) & set(sampled_lyrics)))

500032 486031
103253


In [34]:
anchors_chord_embeddings = chords_embeddings[anchors_ids]
anchors_lyrics_embeddings = lyrics_embeddings[anchors_ids]

In [3]:
res = faiss.StandardGpuResources() 

In [4]:
lyrics_index = faiss.IndexFlatL2(kmeans_lyrics.cluster_centers_.shape[1])   # build the index
gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, lyrics_index)

print(gpu_index_flat.is_trained)
gpu_index_flat.add(kmeans_lyrics.cluster_centers_)                  # add vectors to the index
print(gpu_index_flat.ntotal)

True
64


In [6]:
chunk_size = 1_000_000
for i in range(0, len(lyrics_embeddings), chunk_size):
    print(i, min(i+chunk_size, len(lyrics_embeddings)))

0 1000000
1000000 2000000
2000000 3000000
3000000 3517221


In [None]:
print("Initialising asif...")
asif = ASIF(
    lyrics_candidates,
    chords_candidates,
    torch.from_numpy(kmeans_lyrics.cluster_centers_),
    torch.from_numpy(kmeans_chords.cluster_centers_),
    lyrics_embeddings,
    chords_embeddings
)

In [8]:
lyrics_embeddings.size()

torch.Size([3517221, 768])

In [12]:
#(1 / (1 + torch.cdist(candidate_embeddings, embeddings)))
D = 1 / (1 + torch.cdist(lyrics_embeddings, torch.from_numpy(kmeans_lyrics.cluster_centers_)))

In [13]:
D.size()
D[0]

tensor([0.4046, 0.4699, 0.3783, 0.4046, 0.4915, 0.3568, 0.3041, 0.4217, 0.4260,
        0.2637, 0.4239, 0.4301, 0.4801, 0.3477, 0.4009, 0.4156, 0.3465, 0.5012,
        0.5070, 0.4362, 0.4472, 0.4021, 0.3839, 0.4087, 0.3904, 0.3391, 0.3890,
        0.2890, 0.4971, 0.4680, 0.4450, 0.3502, 0.4416, 0.2477, 0.3754, 0.4043,
        0.3593, 0.3681, 0.3934, 0.3286, 0.4110, 0.4669, 0.3430, 0.3555, 0.4246,
        0.4473, 0.4002, 0.5011, 0.4886, 0.3456, 0.3676, 0.3959, 0.1414, 0.3203,
        0.3993, 0.3474, 0.3074, 0.4624, 0.4558, 0.3747, 0.4049, 0.4857, 0.4738,
        0.2195])

In [6]:
k = 800                          # we want to see 4 nearest neighbors
D, I = gpu_index_flat.search(lyrics_embeddings.numpy()[:1000], gpu_index_flat.ntotal) # sanity check
print(I)

[[18 17 47 ... 33 63 52]
 [ 4 18 28 ... 33 63 52]
 [ 4 18 61 ... 33 63 52]
 ...
 [58 48 12 ... 33 63 52]
 [58 44 19 ... 33 63 52]
 [58 48 12 ... 33 63 52]]


In [13]:
len(set(lyrics))

2376192

In [15]:
torch.zeros((2376192, 2376192))

RuntimeError: [enforce fail at alloc_cpu.cpp:118] err == 0. DefaultCPUAllocator: can't allocate memory: you tried to allocate 22585153683456 bytes. Error code 12 (Cannot allocate memory)

In [12]:
import torch
p = 8
result = torch.zeros((lyrics_embeddings.size()[0], lyrics_embeddings.size()[0]))
for i, row in enumerate(I):
    for j in row:
        result[i][j] = p

RuntimeError: [enforce fail at alloc_cpu.cpp:118] err == 0. DefaultCPUAllocator: can't allocate memory: you tried to allocate 49483374251364 bytes. Error code 12 (Cannot allocate memory)

In [None]:
def get_relative_coordinates(vectors, db, k):
    D, I = db.search(vectors, k) # sanity check
print(I)

In [23]:
I[0]

array([      0, 2766256,  244114, 3333898, 1985832, 1090844, 3020608,
       2906870,  212403, 2111220, 1755655, 1755687, 2647155, 1856240,
       2556382, 2556394, 1856225, 1856237, 2668070, 1578963, 2668058,
       1856252, 1578974, 2001890, 1626605, 1613684, 1969846, 1317591,
       1317604, 1317614, 2592626,  913615, 1923960, 3210497, 2693435,
       2111237,   24932, 2551017, 2551032, 2550995,  484609,  511430,
       3016549, 3016557, 1624156, 2209990, 1948638, 2341452,  695423,
       2022732, 2022736, 3277044,    1532, 3293959, 1610307, 1082582,
       2710271, 2710281, 2710291,  618348, 1850392, 2111227,  414581,
       2533395, 2111231,  667300, 3050540, 3050561, 2509091, 2509106,
        736855,  326224,  326264,  326179,  842336, 3459455,  433031,
       1337115,   48644,   48633, 1071513, 2996072, 2094142, 2094123,
       2094158, 2111239, 3319364, 3319352, 3319326,  559638,      31,
        881803,  881775, 1266077, 1266090, 3323586,  571326, 2804722,
        793786,  881