In [1]:
import faiss
import h5py
import numpy as np
import pandas as pd
from glob import glob
from scipy.spatial.distance import cosine

In [2]:
hdf_path = "../data/tables/20250320/specdiff.h5"
# hdf_path = "/media/nova/Datasets/sageev-midi/20250320/pitch-histogram.h5"

with h5py.File(hdf_path, "r") as f:
    # get filenames and embeddings from the file
    filenames = np.array([str(filename[0], "utf-8") for filename in f["filenames"][:]])
    embeddings = np.array(f["embeddings"][:])

    df = pd.DataFrame(
        {"embeddings": [np.array(e, dtype=np.float32) for e in embeddings]},
        index=filenames,
    )

print(df.head())
first_entry = df["embeddings"].iloc[0]
print(f"Type of first entry in df['embeddings']: {type(first_entry)}")
print(f"Dtype of first entry: {first_entry.dtype}")
print(f"Shape of first entry: {first_entry.shape}")

                                                                         embeddings
20231220-080-01_0000-0005_t00s00  [-0.04725124, -0.121528685, 0.09200393, 0.0497...
20231220-080-01_0000-0005_t00s01  [-0.054494906, -0.13553616, 0.08727736, 0.0638...
20231220-080-01_0000-0005_t00s02  [-0.042115744, -0.1166673, 0.10321771, 0.06491...
20231220-080-01_0000-0005_t00s03  [-0.053316344, -0.13745646, 0.08773589, 0.0467...
20231220-080-01_0000-0005_t00s04  [-0.0399059, -0.108871914, 0.11089589, 0.06425...
Type of first entry in df['embeddings']: <class 'numpy.ndarray'>
Dtype of first entry: float32
Shape of first entry: (768,)


In [16]:
faiss_path = "../data/tables/20250320/specdiff.faiss"
faiss_index = faiss.read_index(faiss_path)
normed_emb = df.iloc[0]["embeddings"] / np.linalg.norm(df.iloc[0]["embeddings"])
print(normed_emb.shape)
print(np.array(faiss_index.reconstruct(0)).shape)
cosine(normed_emb, faiss_index.reconstruct(0))

(768,)
(128,)


ValueError: shapes (768,) and (128,) not aligned: 768 (dim 0) != 128 (dim 0)

In [None]:
print(normed_emb.shape)
res = faiss_index.search(normed_emb, 10)
res

In [None]:

for i, (match, distance) in enumerate(zip(matches[0], distances[0])):
    print(f"Match {i+1}: {df.iloc[match].name}, Distance: {distance:.05f}")

In [None]:
# new_df = df.copy()  # avoid modifying the original DataFrame


# def rename_index(index):
#     if index.endswith(".mid"):
#         return f"{index[:-4]}_t00s00"  # remove '.mid' and append '_t00s00'
#     return index  # if not ending with '.mid', keep it as is


# new_df.index = [rename_index(idx) for idx in new_df.index]
# new_df.head()
# new_df.to_hdf("specdiff.h5", key="dataset")

In [3]:
def normalize_vector(vec):
    norm = np.linalg.norm(vec)
    return vec / norm if norm != 0 else vec


df["normed_embeddings"] = df["embeddings"].apply(normalize_vector)
df.head()

Unnamed: 0,embeddings,normed_embeddings
20240117-064-2b_0044-0052.mid,,
20240117-064-2b_0059-0067.mid,,
20240117-064-2b_0112-0119.mid,,
20240117-064-2b_0142-0149.mid,,
20240117-064-2b_0074-0082.mid,,


In [4]:
index = faiss.IndexFlatIP(768)
embedding_array = np.array(df["normed_embeddings"].tolist(), dtype=np.float32)
index.add(embedding_array)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (11356,) + inhomogeneous part.

In [None]:
k = 5
distances, indices = index.search(
    np.array(
        [df.loc["20240117-064-2b_0044-0052.mid", "normed_embeddings"]],
        dtype=np.float32,
    ),
    k=k,
)

In [12]:
# Print the most similar documents
for i, index in enumerate(indices[0]):
    distance = distances[0][i]
    print(f"Nearest neighbor {i+1}: {df.iloc[index, :].name}, Distance {distance:.05f}")

Nearest neighbor 1: 20240117-064-2b_0044-0052.mid, Distance 1.00000
Nearest neighbor 2: 20240117-064-2b_0037-0044.mid, Distance 0.97732
Nearest neighbor 3: 20240117-064-2b_0014-0022.mid, Distance 0.96528
Nearest neighbor 4: 20240117-064-2b_0007-0014.mid, Distance 0.96513
Nearest neighbor 5: 20240117-064-2b_0067-0074.mid, Distance 0.95847


In [4]:
h5_path = "/media/scratch/sageev-midi/20250320/specdiff.h5"
with h5py.File(h5_path, "r") as f:
    embeddings = np.array(f["embeddings"][:])
    num_files = len(embeddings)

In [6]:
# initialize faiss index
faiss_path = "/media/scratch/sageev-midi/20250320/specdiff.faiss"
index = faiss.IndexFlatIP(768)
# vecs = np.zeros((num_files, 12), dtype=np.float32)


# for i, file in enumerate(all_files):
#     vecs[i] = PrettyMIDI(file).get_pitch_class_histogram(True, True)

print("copying vectors to FAISS index")
index.add(embeddings)  # type: ignore
faiss.write_index(index, faiss_path)
print(f"FAISS index saved to '{faiss_path}'")

copying vectors to FAISS index
FAISS index saved to '/media/scratch/sageev-midi/20250320/specdiff.faiss'
