In [29]:
import faiss
import numpy as np
import pickle

class SingleIndexManager:
    def __init__(self, embedding_dim):
        self.embedding_dim = embedding_dim
        self.index = faiss.IndexFlatL2(embedding_dim)
        self.index_to_doc_id = {}

    def add_documents(self, embeddings, doc_ids):
        if len(embeddings) != len(doc_ids):
            raise ValueError("Number of embeddings and document IDs must be the same.")
        
        # Add embeddings to FAISS index
        self.index.add(embeddings)
        
        # Map the index positions to document IDs
        start_pos = len(self.index_to_doc_id)
        for i, doc_id in enumerate(doc_ids):
            self.index_to_doc_id[start_pos + i] = doc_id

    def search(self, query_embedding, top_k=10):
        # Perform the search
        distances, indexes = self.index.search(np.array([query_embedding]), top_k)
        
        # Filter out invalid indices
        valid_indices = [i for i in indexes[0] if i >= 0]
        
        # Retrieve document IDs from the mapping
        retrieved_doc_ids = [self.index_to_doc_id[i] for i in valid_indices]

        # zip the distances and retrieved_doc_ids
        return list(zip(distances[0, :len(valid_indices)], retrieved_doc_ids))
        
    def save_index(self, index_filepath, mapping_filepath):
        faiss.write_index(self.index, index_filepath)
        with open(mapping_filepath, "wb") as f:
            pickle.dump(self.index_to_doc_id, f)

    def load_index(self, index_filepath, mapping_filepath):
        self.index = faiss.read_index(index_filepath)
        with open(mapping_filepath, "rb") as f:
            self.index_to_doc_id = pickle.load(f)

# Example usage
if __name__ == "__main__":
    embedding_dim = 128
    
    # Create instances for different indexes
    manager1 = SingleIndexManager(embedding_dim)
    manager2 = SingleIndexManager(embedding_dim)
    manager3 = SingleIndexManager(embedding_dim)
    
    # Example document embeddings and IDs for manager1
    doc_ids_1 = ["doc1", "doc2", "doc3"]
    embeddings_1 = np.random.random((len(doc_ids_1), embedding_dim)).astype('float32')
    manager1.add_documents(embeddings_1, doc_ids_1)
    
    # Example document embeddings and IDs for manager2
    doc_ids_2 = ["doc4", "doc5", "doc6"]
    embeddings_2 = np.random.random((len(doc_ids_2), embedding_dim)).astype('float32')
    manager2.add_documents(embeddings_2, doc_ids_2)
    
    # Perform a search on manager1
    query_embedding_1 = np.random.random((embedding_dim,)).astype('float32')
    results = manager1.search(query_embedding_1)

    print("Distances from manager1:", results)
    
    # Perform a search on manager2
    query_embedding_2 = np.random.random((embedding_dim,)).astype('float32')
    results = manager2.search(query_embedding_2)

    print("Distances from manager2:", results)

    
    # Save indexes to disk
    manager1.save_index("index1.index", "mapping1.pkl")
    manager2.save_index("index2.index", "mapping2.pkl")
    manager3.save_index("index3.index", "mapping3.pkl")
    
    # Load indexes from disk
    manager1.load_index("index1.index", "mapping1.pkl")
    manager2.load_index("index2.index", "mapping2.pkl")
    manager3.load_index("index3.index", "mapping3.pkl")
    
    # Perform the search again to verify
    results = manager1.search(query_embedding_1)
    print("Distances from manager1 after loading:", results)


Distances from manager1: [(18.08528, 'doc2'), (19.313034, 'doc3'), (19.591436, 'doc1')]
Distances from manager2: [(18.968998, 'doc6'), (21.379908, 'doc4'), (22.222712, 'doc5')]
Distances from manager1 after loading: [(18.08528, 'doc2'), (19.313034, 'doc3'), (19.591436, 'doc1')]


In [14]:
from beir import util
from beir.datasets.data_loader import GenericDataLoader

#### Download scifact.zip dataset and unzip the dataset
dataset = "scifact"
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset)
out_dir = '/Users/mishael/Desktop/SLMs-based-RAG/datasets'
data_path = util.download_and_unzip(url, out_dir)

#### Provide the data_path where scifact has been downloaded and unzipped
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")

/Users/mishael/Desktop/SLMs-based-RAG/datasets/scifact.zip: 100%|██████████| 2.69M/2.69M [00:00<00:00, 7.63MiB/s]
100%|██████████| 5183/5183 [00:00<00:00, 148479.81it/s]


In [39]:
from sentence_transformers import SentenceTransformer

models_name = "Snowflake/snowflake-arctic-embed-m"

model = SentenceTransformer(models_name)    

model_manager = SingleIndexManager(model.get_sentence_embedding_dimension())




In [40]:

# index all the documents
docs_embeddings = model.encode([doc['title'] + ' ' + doc['text'] for doc in corpus.values()])
docs_ids = list(corpus.keys())
model_manager.add_documents(docs_embeddings, docs_ids)


In [41]:
# Perform a search on the indexed documents
query = "What is the incubation period of COVID-19?"
query_embedding = model.encode(query)
results = model_manager.search(query_embedding)
results

[(0.70065176, '29495185'),
 (0.7251548, '22635278'),
 (0.78605103, '23531592'),
 (0.80004704, '13042119'),
 (0.85905784, '2762601')]

In [51]:
def calculate_ir_metrics(result_ids, relevant_ids):
    relevant_set = set(relevant_ids)
    num_relevant = len(relevant_set)
    
    if num_relevant == 0:
        return {"ap": 0.0, "p3": 0.0, "p5": 0.0, "recall": 0.0}
    
    hits = 0
    sum_precisions = 0.0
    p3, p5 = 0.0, 0.0
    
    for i, result in enumerate(result_ids, 1):
        if result in relevant_set:
            hits += 1
        precision = hits / i
        sum_precisions += precision
            
        if i == 3:
            p3 = precision
        elif i == 5:
            p5 = precision
    
    ap = sum_precisions / num_relevant
    recall = hits / num_relevant
    
    return {
        "ap": ap,
        "p3": p3,
        "p5": p5,
        "recall": recall
    }

# Example usage
result_ids = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
relevant_ids = [1, 3, 5, 7, 9]
metrics = calculate_ir_metrics(result_ids, relevant_ids)
print(metrics)

{'ap': 1.1787301587301586, 'p3': 0.6666666666666666, 'p5': 0.6, 'recall': 1.0}


In [52]:
import pandas as pd
# Calculate IR metrics for all queries
ir_metrics = []


for query_id, query in queries.items():
    query_embedding = model.encode(query)
    results = model_manager.search(query_embedding)
    result_ids = [doc_id for _, doc_id in results]
    relevant_ids = [doc_id for doc_id in qrels[query_id]]
    
    metrics = calculate_ir_metrics(result_ids, relevant_ids)
    ir_metrics.append(metrics)

metric_df = pd.DataFrame(ir_metrics)

metric_df

Unnamed: 0,ap,p3,p5,recall
0,0.000000,0.000000,0.0,0.00
1,0.000000,0.000000,0.0,0.00
2,0.000000,0.000000,0.0,0.00
3,0.000000,0.000000,0.0,0.00
4,0.000000,0.000000,0.0,0.00
...,...,...,...,...
295,0.195833,0.333333,0.2,0.25
296,0.000000,0.000000,0.0,0.00
297,0.000000,0.000000,0.0,0.00
298,0.000000,0.000000,0.0,0.00


In [54]:
# Calculate the average metrics

avg_metrics = metric_df.mean()

avg_metrics

ap        0.182597
p3        0.035556
p5        0.036000
recall    0.167500
dtype: float64

In [56]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from collections import defaultdict

def combsum_fusion(ranked_lists):
    """
    Perform CombSum fusion on multiple ranked lists with efficient score normalization.
    
    :param ranked_lists: A list of lists, where each inner list contains tuples of (doc_id, score)
    :return: A list of tuples (doc_id, fused_score) sorted by fused_score in descending order
    """
    # Create a set of all unique document IDs
    all_doc_ids = set(doc_id for ranked_list in ranked_lists for doc_id, _ in ranked_list)
    
    # Create a dictionary to map document IDs to their index in the numpy array
    doc_id_to_index = {doc_id: i for i, doc_id in enumerate(all_doc_ids)}
    
    # Initialize a numpy array to hold all scores
    scores_array = np.zeros((len(all_doc_ids), len(ranked_lists)))
    
    # Fill the scores array
    for list_idx, ranked_list in enumerate(ranked_lists):
        for doc_id, score in ranked_list:
            scores_array[doc_id_to_index[doc_id], list_idx] = score
    
    # Normalize scores using MinMaxScaler
    scaler = MinMaxScaler()
    normalized_scores = scaler.fit_transform(scores_array)
    
    # Sum up the normalized scores
    fused_scores = np.sum(normalized_scores, axis=1)
    
    # Create the fused list of (doc_id, score) tuples
    fused_list = [(doc_id, fused_scores[idx]) for doc_id, idx in doc_id_to_index.items()]
    
    # Sort the fused list by score in descending order
    fused_list.sort(key=lambda x: x[1], reverse=True)
    
    return fused_list

# Example usage
list1 = [(1, 90), (2, 80), (3, 70), (4, 60)]
list2 = [(2, 0.85), (3, 0.8), (1, 0.75), (5, 0.7)]
list3 = [(3, 950), (1, 900), (4, 850), (2, 800)]

ranked_lists = [list1, list2, list3]
fused_result = combsum_fusion(ranked_lists)
print("Fused and ranked result:")
for doc_id, score in fused_result:
    print(f"Document ID: {doc_id}, Fused Score: {score:.4f}")

Fused and ranked result:
Document ID: 1, Fused Score: 2.8297
Document ID: 2, Fused Score: 2.7310
Document ID: 3, Fused Score: 2.7190
Document ID: 4, Fused Score: 1.5614
Document ID: 5, Fused Score: 0.8235


In [57]:
def rrf_fusion(ranked_lists, k=60):
    """
    Perform Reciprocal Rank Fusion on multiple ranked lists.
    
    :param ranked_lists: A list of lists, where each inner list contains tuples of (doc_id, score)
    :param k: The constant in the RRF formula (default is 60 as per the original paper)
    :return: A list of tuples (doc_id, fused_score) sorted by fused_score in descending order
    """
    fused_scores = defaultdict(float)
    
    for ranked_list in ranked_lists:
        for rank, (doc_id, _) in enumerate(ranked_list, start=1):
            fused_scores[doc_id] += 1 / (k + rank)
    
    # Sort the fused list by score in descending order
    fused_list = sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
    
    return fused_list

# Example usage
list1 = [(1, 0.9), (2, 0.8), (3, 0.7), (4, 0.6)]
list2 = [(2, 0.85), (3, 0.8), (1, 0.75), (5, 0.7)]
list3 = [(3, 0.95), (1, 0.9), (4, 0.85), (2, 0.8)]

ranked_lists = [list1, list2, list3]
fused_result = rrf_fusion(ranked_lists)
print("Fused and ranked result using RRF:")
for doc_id, score in fused_result:
    print(f"Document ID: {doc_id}, Fused Score: {score:.4f}")

Fused and ranked result using RRF:
Document ID: 1, Fused Score: 0.0484
Document ID: 3, Fused Score: 0.0484
Document ID: 2, Fused Score: 0.0481
Document ID: 4, Fused Score: 0.0315
Document ID: 5, Fused Score: 0.0156


In [None]:

def get_queries_results(queries, model_manager, output_path):
    result_dict = {}
    for query_id, query in queries.items():
        query_embedding = model.encode(query)
        results = model_manager.search(query_embedding)
        result_dict[query_id] = results
    save_json(result_dict, output_path)

    
        
        
        



In [59]:
import json

def save_json(data, output_path):
    with open(output_path, "w") as f:
        json.dump(data, f, indent=4)

def load_json(input_path):
    with open(input_path, "r") as f:
        data = json.load(f)
    return data 

In [None]:
COMBSUM_RESULTS_PATH = "combsum_results.json"
RRF_RESULTS_PATH = "rrf_results.json"

def comb_results(queries_result_folder, output_path):
    comb_results = {}
    rrf_result = {}
    queries_results = {json_path.stem: load_json(json_path) for json_path in queries_result_folder.iterdir()}

    first_signal = list(queries_results.keys())[0]
    for query_id in queries_results[first_signal]:
        ranked_lists = [queries_results[signal][query_id] for signal in queries_results]
        comb_results[query_id] = combsum_fusion(ranked_lists)
        rrf_result[query_id] = rrf_fusion(ranked_lists)

    save_json(comb_results, output_path / COMBSUM_RESULTS_PATH)
    save_json(rrf_result, output_path / RRF_RESULTS_PATH)
        


In [63]:
a = {1: [(1, 0.9), (2, 0.8), (3, 0.7), (4, 0.6)], 2: [(2, 0.85), (3, 0.8), (1, 0.75), (5, 0.7)]}

list(a.keys())[0]


1