Fusion in Hybrid Search

In [15]:
def normalize_scores(scores: dict) -> dict:
    """
    Min-max normalize a dictionary of scores to the range [0, 1].

    The normalization is performed using the formula:
    normalized_score = (score - min_score) / (max_score - min_score)

    Args:
        scores: Mapping of document identifier to raw numeric score.

    Returns:
        A new dict with the same keys (in the same order) mapped to their
        normalized scores. The input dict is not modified.
        - An empty input yields an empty dict.
        - If all scores are equal (including a single-entry input), the
          score range is zero and every document is mapped to 0.0.
    """
    # min()/max() raise ValueError on an empty iterable, so handle that first.
    if not scores:
        return {}

    min_score = min(scores.values())
    max_score = max(scores.values())

    # Zero range: the formula would divide by zero, so fall back to 0.0.
    if not max_score > min_score:
        return {doc: 0.0 for doc in scores}

    # The range is loop-invariant; compute it once.
    score_range = max_score - min_score
    return {
        doc: (score - min_score) / score_range
        for doc, score in scores.items()
    }

In [16]:
from collections import defaultdict
from typing import List, Dict, Tuple

def fusion(results: List[Dict[str, float]]) -> List[Tuple[str, float]]:
    """
    Fuse multiple result sets by normalizing their scores, aggregating them,
    and returning a ranked list of documents based on combined scores.
    """
    normalized_results = [normalize_scores(result) for result in results]

    combined_scores = defaultdict(float)
    for norm_result in normalized_results:
        for doc, score in norm_result.items():
            combined_scores[doc] += score / len(results)

    ranked_docs = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)

    return ranked_docs

In [17]:
# Sample score dictionaries, one per result set, keyed by document id
bm25_results = {
    'doc1': 2.5,
    'doc2': 1.8,
    'doc3': 1.2,
}
vector_similarity_results = {
    'doc2': 0.8,
    'doc4': 0.7,
    'doc1': 0.6,
}

# Fuse both result sets into a single ranking
final_ranking = fusion([bm25_results, vector_similarity_results])

# Show the fused ranking, highest combined score first
print("Final Ranking:", final_ranking)

Final Ranking: [('doc2', 0.7307692307692308), ('doc1', 0.5), ('doc4', 0.24999999999999986), ('doc3', 0.0)]
