In [None]:
!pip install datasets faiss-cpu beir

In [None]:
from datasets import load_dataset
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES
from beir.retrieval.evaluation import EvaluateRetrieval
from transformers import AutoTokenizer, AutoModelForMaskedLM, BertConfig
from collections import defaultdict
import faiss
import torch
import random
import numpy as np
import pandas as pd

In [None]:
# Mount Google Drive (Colab only) and make the project folder the working
# directory, so the relative paths used in later cells resolve from there.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/cs566

In [None]:
# Read the embeddings table from a JSON Lines file (one record per line)
embeddings_df = pd.read_json(
    '../data/embeddings.jsonl',
    lines=True,
    orient='records',
)

In [None]:
def normalize_embedding(x):
    """Return ``x`` as a NumPy array scaled to unit Euclidean norm.

    A 3-D input is first averaged over its last axis and squeezed, which
    drops every length-1 axis. An input whose norm is not strictly positive
    (e.g. all zeros) is returned unscaled to avoid dividing by zero.
    """
    vec = np.array(x)

    # Collapse 3-D inputs by averaging over the last axis
    if vec.ndim == 3:
        vec = vec.mean(axis=2).squeeze()

    length = np.linalg.norm(vec)
    return vec / length if length > 0 else vec

In [None]:
import pandas as pd
import numpy as np

# Columns that hold embedding vectors stored as (nested) lists
embedding_columns = ["audio", "q_audio", "q_audio_eq", "q_audio_pitch", "q_audio_back"]

# Replace every stored value with the array returned by normalize_embedding.
# Columns absent from the DataFrame are skipped; None cells stay None.
for col in embedding_columns:
    if col not in embeddings_df.columns:
        continue
    embeddings_df[col] = embeddings_df[col].apply(
        lambda cell: None if cell is None else normalize_embedding(cell)
    )

In [None]:
# Sanity check: print the shape of the first row's vector in each embedding column
for col in embedding_columns:
    shape = embeddings_df[col].iloc[0].shape
    print(col, shape)

# Preview the first rows of the converted table
embeddings_df.head()

## Set Dataset Splits

In [None]:
# Shuffle the row positions with a fixed seed and cut them 70/30 into
# train and test sets.
train_test_split = 0.7
random.seed(42)
dataset_size = len(embeddings_df)
train_size = int(dataset_size * train_test_split)
indices = list(range(dataset_size))
random.shuffle(indices)
train_indices, test_indices = set(indices[:train_size]), set(indices[train_size:])

# Keep only the rows whose index label belongs to the test split
test_df = embeddings_df.loc[embeddings_df.index.isin(test_indices)]

## Perform Retrieval

In [None]:
import pandas as pd

# Load the TSV file with the qrels data
def load_qrels_from_tsv(file_path):
    """
    Load qrels from a TSV file into a dictionary format required by EvaluateRetrieval.

    The file is read without a header row and must have four tab-separated
    columns: query_id, iteration, doc_id, relevance. The iteration column is
    ignored. If the same (query_id, doc_id) pair occurs more than once, the
    last occurrence wins.

    Args:
        file_path (str): Path to the TSV file containing qrels data

    Returns:
        dict: A nested dictionary of {query_id: {doc_id: relevance_score}},
        with ids converted to str and relevance converted to int
    """
    df = pd.read_csv(file_path, sep='\t', header=None,
                     names=['query_id', 'iteration', 'doc_id', 'relevance'])

    qrels_dict = {}
    # Walk the columns directly instead of using iterrows(): iterrows() packs
    # each row into a single-dtype Series, so integer ids are upcast to float
    # (and stringified as "12.0") whenever the relevance column parses as
    # float. Column iteration keeps each value's own type and is also faster.
    for query_id, doc_id, relevance in zip(df['query_id'], df['doc_id'], df['relevance']):
        qrels_dict.setdefault(str(query_id), {})[str(doc_id)] = int(relevance)

    return qrels_dict

def format_retrievals_faiss(qids, retrieved_pids, scores):
    """
    Convert FAISS search output into the nested dict used for BEIR evaluation.

    Parameters:
    -----------
    qids : list or Series
        Query IDs, one per search row; anything with a ``tolist`` method is
        converted to a plain list first
    retrieved_pids : list of lists
        For each query, the document IDs that were retrieved
    scores : numpy.ndarray
        Score matrix; ``scores[i][j]`` belongs to ``retrieved_pids[i][j]``

    Returns:
    --------
    dict
        ``{str(query_id): {str(doc_id): float(score)}}``

    A warning is printed when the result is empty or when the first query
    has no retrieved documents.
    """
    query_ids = qids.tolist() if hasattr(qids, 'tolist') else qids

    retrievals = {}
    for row, qid in enumerate(query_ids):
        # Keys are forced to str and scores to built-in float
        retrievals[str(qid)] = {
            str(pid): float(scores[row][col])
            for col, pid in enumerate(retrieved_pids[row])
        }

    # Light structural sanity check (inspects the first query only)
    if not retrievals:
        print("Warning: Empty retrievals dictionary")
        return retrievals

    first_qid = next(iter(retrievals))
    if not retrievals[first_qid]:
        print(f"Warning: No documents for query {first_qid}")

    return retrievals

In [None]:
# Load the relevance judgments (qrels) from the local TSV file
# NOTE(review): this path is relative to the working directory, whereas the
# embeddings are read from '../data/embeddings.jsonl' — confirm both resolve
# to the intended folders.
qrels_file_path = "data/qrels.tsv"
qrels = load_qrels_from_tsv(qrels_file_path)

# Stack the document ("audio") embeddings into one matrix, one row per
# DataFrame row. FAISS operates on float32 matrices, so cast explicitly
# instead of relying on the Python wrapper to convert.
doc_embeddings = np.vstack(embeddings_df["audio"]).astype(np.float32)

# Build the exact inner-product index once: it only depends on the documents,
# not on which query column is being evaluated.
k = 10
d = doc_embeddings.shape[1]
index = faiss.IndexFlatIP(d)
index.add(doc_embeddings)

# Query ids (test split) and the pid lookup table. FAISS returns row
# positions within doc_embeddings, so pids are looked up positionally below.
qids = test_df["qid"]
pids = embeddings_df["pid"]
k_values = [1, 3, 5, 10]

# Evaluate every query variant against the same document index
query_columns = ["q_audio", "q_audio_eq", "q_audio_pitch", "q_audio_back"]
for query in query_columns:
    query_embeddings = np.vstack(test_df[query]).astype(np.float32)
    D, I = index.search(query_embeddings, k)

    # FAISS pads a result row with -1 when fewer than k neighbours exist.
    # The padding trails the row, so dropping it keeps the remaining ids
    # aligned with the leading columns of D.
    retrieved_pids = [[pids.iloc[idx] for idx in row if idx >= 0] for row in I]
    retrievals = format_retrievals_faiss(qids, retrieved_pids, D)

    # Score the retrievals against the qrels.
    # NOTE(review): BEIR's evaluate() by default discards hits whose doc id
    # equals the query id (ignore_identical_ids=True) — confirm qid and pid
    # do not share identifiers, otherwise relevant hits are dropped.
    # The MAP result is bound to `mean_ap` so the builtin `map` is not
    # overwritten for the rest of the kernel session.
    ndcg, mean_ap, recall, precision = EvaluateRetrieval.evaluate(qrels, retrievals, k_values)
    print(f"\nResults for {query}:")
    print(f"NDCG: {ndcg}")
    print(f"MAP: {mean_ap}")
    print(f"Recall: {recall}")
    print(f"Precision: {precision}")