In [1]:
import os
import sys

# Make the project's `src` directory importable from this notebook.
# The location is resolved relative to the kernel's current working directory,
# so the kernel must be started from a directory that is a sibling of `src`.
# abspath() collapses the '..' segment so the entry stored in sys.path is
# canonical, and the membership test below also recognises an equivalent
# entry that was added in normalised form.
module_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'src'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
from hybrid_search.mapper import DictMapper

# --- Workbook location and column layout -----------------------------------
file_path = "../data/finance_template_map.xlsx"
sheet_name = "Income Statement"

# Spreadsheet column letters handed to the mapper: one target column and two
# disjoint groups of source columns (one for the base set, one for the test set).
target_column = "C"
base_source_columns = ["F", "G"]
test_source_columns = ["H", "I", "J"]

# --- Build the mapper and both mappings ------------------------------------
dmap = DictMapper(file_path, sheet_name)

# Same call for each column group, evaluated in order (base first, then test).
# NOTE(review): presumably each mapping relates source-column entries to the
# target-column entry of the same row -- confirm against DictMapper.
base_mapping, test_mapping = (
    dmap.create_mapping_dict(source_columns, target_column)
    for source_columns in (base_source_columns, test_source_columns)
)


In [3]:
from hybrid_search.search import HybridSearch
# Identifier of the sentence-transformer model passed to the search engine.
sbert_model_name = "uonyeka/bge-base-financial-matryoshka"
# Build the search engine over the base mapping only; the test mapping is kept
# out of the engine and used later for evaluation.
# NOTE(review): construction appears to load the model and NLTK resources
# (see the download log printed by this cell) -- expect this cell to be slow.
engine = HybridSearch(base_mapping, transformer_model=sbert_model_name)

  from tqdm.autonotebook import tqdm, trange
[nltk_data] Downloading package punkt to /home/nunenuh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/nunenuh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
import pandas as pd
# Ad-hoc query: ask for the five best hits for "venture", passing a weight of 1
# for both the BM25 and the transformer component.
results = engine.hybrid_search("venture", top_n=5, bm25_weight=1, transformer_weight=1)

# Each hit is read positionally: index 0 -> "acc_name", index 1 -> "score",
# index 2 -> "pref_acc_name". The score is stored as a 4-decimal string, so the
# resulting column is text rather than numeric.
result_data = [
    {
        "acc_name": hit[0],
        "pref_acc_name": hit[2],
        "score": f"{hit[1]:.4f}",
    }
    for hit in results
]

# Bare last expression so the notebook renders the table.
pd.DataFrame(result_data)


Unnamed: 0,acc_name,pref_acc_name,score
0,Share of results of joint ventures and associates,Share of profits from associates,2.4629
1,R&D,Research and Development Expenses,1.0
2,Purchases,Cost of Sales,0.5384
3,Marketing,Sales and Marketing Expenses,0.3747
4,Professional and Outside Services,General and Admin Expenses,0.0


In [10]:
from hybrid_search import evaluation
# Evaluate each search method on the held-out test mapping and print its accuracy.
# Every call passes the same mapping and engine and differs only in the bound
# search method; each returns a (results table, accuracy) pair, and the accuracy
# is printed as a percentage with two decimals.
# NOTE(review): no weights or top_n are passed here, so hybrid_search runs with
# whatever defaults the evaluation helper / engine use -- confirm they match
# the weights intended for the comparison.
bm25_eval_results, bm25_accuracy = evaluation.evaluate_search_accuracy(test_mapping, engine, engine.bm25_search)
print(f"BM25 Accuracy: {bm25_accuracy:.2f}%")

transformer_eval_results, transformer_accuracy = evaluation.evaluate_search_accuracy(test_mapping, engine, engine.transformer_search)
print(f"Transformer Accuracy: {transformer_accuracy:.2f}%")

hybrid_eval_results, hybrid_accuracy = evaluation.evaluate_search_accuracy(test_mapping, engine, engine.hybrid_search)
print(f"Hybrid Accuracy: {hybrid_accuracy:.2f}%")


BM25 Accuracy: 38.46%
Transformer Accuracy: 53.85%
Hybrid Accuracy: 53.85%


In [11]:
# Preview the first rows of the hybrid evaluation results (bare expression so
# the notebook renders the table).
hybrid_eval_results.head()

Unnamed: 0,Key,Predicted,Ground Truth,Correct,Score
0,Financing revenues,Revenue,Revenue,True,3.7364
1,Producing and manufacturing cost,Cost of Sales,Cost of Sales,True,2.1997
2,"Selling, administrative and general",General and Admin Expenses,Sales and Marketing Expenses,False,0.5473
3,General and administrative,General and Admin Expenses,General and Admin Expenses,True,0.5459
4,employee benefits,General and Admin Expenses,Personnel and Benefit Expenses,False,0.4663


In [16]:
# Generate the true and predicted label sequences for the hybrid search method,
# using the held-out test mapping as the source of queries and ground truth.
# NOTE(review): hybrid_search is passed without explicit weights, so the
# helper's / engine's defaults apply -- confirm they are the intended ones.
true_labels, hybrid_predicted_labels = evaluation.generate_labels(test_mapping, engine.hybrid_search)

# Print the confusion-matrix counts followed by precision, recall, F1 and
# accuracy for those labels (output only; the call's return value is not used).
evaluation.print_evaluation_metrics(true_labels, hybrid_predicted_labels)

Confusion Matrix Value:
-------------------------
True Positive   : 14
False Positive  : 12
False Negative  : 12
True Negative   : 378


Evaluation Metrics:
--------------------
Precision  0.4375
Recall     0.4271
F1 Score   0.3965
Accuracy   0.5385
