In [1]:
# Proprietary library
import aux_document_retrieval_bm25 as aux_bm25
import aux_document_retrieval_vsm as aux_vsm
import aux_semantic_search as aux_semantics
import aux_retrieval_evaluation as aux_retrieval
import aux_document_retrieval_hybrid as aux_hybrid
from dataclass import QueryResult

from typing import  Dict, Any, Union
import dataclass as data
import pandas as pd
from pathlib import Path
import logging
import nltk
import json
import importlib

# Re-import the project helper modules so edits made on disk are picked up
# without restarting the kernel.
for _module in (aux_vsm, aux_bm25, aux_semantics, aux_retrieval):
    importlib.reload(_module)
del _module

# NLTK resource needed for word processing.
nltk.download("punkt_tab")

# A document is either a RetrievedDocument instance or a plain dict.
DocLike = Union[data.RetrievedDocument, Dict[str, Any]]

2025-06-13 12:35:44,065 - numexpr.utils - INFO - NumExpr defaulting to 8 threads.
[nltk_data] Downloading package punkt_tab to /home/pablo/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
# Basic environment setup

# Report errors only. force=True makes this configuration take effect even if
# the root logger already has handlers attached.
logging.basicConfig(
    format="%(asctime)s %(name)s [%(levelname)s] %(message)s",
    level=logging.ERROR,
    force=True,
)
logger = logging.getLogger(__name__)

# Central registry of every file and folder location used in this notebook.
# All entries are relative paths (they start with "../"), so they resolve
# against the current working directory.
# The dict is also handed as a whole to the aux_* helpers; which keys each
# helper reads is not visible here.
# In the trailing comments, "QE" presumably stands for query expansion and
# "minmax"/"zscore" for the score normalisation used - TODO confirm.
paths = {
    # Word2Vec / VSM resources
    'word2vec': Path("../02-data/03-VSM/01-Word2Vec/word2vec-google-news-300.bin"),
    'idf_cache': Path("../02-data/03-VSM/idf_cache_path.pkl"),
    'word2vec_vsm_multivector': Path("../02-data/03-VSM/01-Word2Vec/word2vec-4-50-4-150-1.pkl"),
    'word2vec_vsm_singlevector': Path("../02-data/03-VSM/01-Word2Vec/word2vec-4-50-4-150-0.pkl"),
    # Test documents
    'file': Path("../02-data/00-testing/batteries-non-rechargable-primary/1cr2/1cr2.txt"),
    'output_path': Path("../02-data/00-testing/batteries-non-rechargable-primary/1cr2/sentence_expansions.txt"),
    'pdf_folder': Path("../02-data/00-testing/"),
    # BM25 retriever location
    'retriever': Path("../02-data/05-Retrieval/corpus_bm25"),
    # Evaluation query sets ('queries_1' is the one loaded in this notebook)
    'queries_1' : Path("../02-data/06-Evaluation/query_evaluation_1.json"),
    'queries_2' : Path("../02-data/06-Evaluation/query_evaluation_2.json"),
    # Ranking pickles; each trailing comment records the configuration used
    'ranking_query_evaluation_1' : Path('../02-data/06-Evaluation/ranking_query_evaluation_1.pkl'), ## using queries_1, singlevector, QE, minmax
    'ranking_query_evaluation_2' : Path('../02-data/06-Evaluation/ranking_query_evaluation_2.pkl'), ## using queries_2
    'ranking_query_evaluation_3' : Path('../02-data/06-Evaluation/ranking_query_evaluation_3.pkl'), ## using queries_1, multivector, QE, minmax
    'ranking_query_evaluation_4' : Path('../02-data/06-Evaluation/ranking_query_evaluation_4.pkl'), ## using queries_1, multivector, no QE, minmax
    'ranking_query_evaluation_5' : Path('../02-data/06-Evaluation/ranking_query_evaluation_5.pkl'), ## using queries_1, multivector, QE, zscore
    'ranking_query_evaluation_6' : Path('../02-data/06-Evaluation/ranking_query_evaluation_6.pkl'), ## using queries_1, multivector, QE, minmax/zscore
    # Evaluation pickles; each trailing comment records the configuration used
    'QE_eval' : Path("../02-data/06-Evaluation/QE_eval.pkl"),           ## query1 , multivector , minmax/zscore , QE
    'no_QE_eval' : Path("../02-data/06-Evaluation/no_QE_eval.pkl"),        ## query1 , multivector , minmax/zscore , no QE
    'single_eval' : Path("../02-data/06-Evaluation/single_eval.pkl"),       ## query1 , singlevector , minmax/zscore , QE
    'multi_eval' : Path("../02-data/06-Evaluation/multi_eval.pkl")         ## query1 , multivector , minmax/zscore , QE

}

# --- Run configuration: everything a reader might want to tune ---------------

# Pickle file that receives the evaluation statistics of this run.
save_file = str(paths['multi_eval'])

# Retrieval options forwarded to the aux_* helpers in the cells below.
use_multivector = True
use_expansion = True

# Score normalisation names used by the hybrid search.
norm_vsm = 'minmax'
norm_bm25 = 'zscore'

# Number of documents requested per query.
top_k = 100

# Load the evaluation queries. The encoding is stated explicitly so the file is
# read the same way on every platform instead of depending on the locale default.
with open(paths['queries_1'], encoding='utf-8') as f:
    raw = json.load(f)

# One DocumentSection per JSON entry; the entry's keys become the constructor arguments.
query_list_documents = [data.DocumentSection(**entry) for entry in raw]

--------------------------------
## VSM evaluation. Works for testing both multivector and singlevector


In [3]:
# Pick up any edits made to the helper modules since the last run.
importlib.reload(aux_vsm)
importlib.reload(aux_retrieval)

# Load the Word2Vec resources once and reuse them for every query.
resources = aux_vsm.load_word2vec_resources(paths, use_multivector=use_multivector)

# One record per evaluation query: the query's metadata plus its VSM result.
vsm_records = []
for section in query_list_documents:
    w2v_result: aux_vsm.Word2VecQueryResult = aux_vsm.run_word2vec_query_preloaded(
        resources, section.query, top_k=top_k, use_expansion=use_expansion
    )
    vsm_records.append(
        dict(doc=section.doc, label=section.label, query=section.query, result=w2v_result)
    )

# Compute statistics for evaluation
vsm_statistics = aux_retrieval.compute_query_run_stats(vsm_records)
display(vsm_statistics['per_record'], vsm_statistics['score_stats'])

# Inspect the documents returned for one specific record (here: the first query)
first_vsm_documents = vsm_records[0]['result'].results.documents
vsm_data = pd.DataFrame(data=first_vsm_documents)
display(vsm_data)

Unnamed: 0,doc,label,rank,score,label_count_top_5,label_count_top_10,label_count_top_20
0,1cr2,batteries-non-rechargable-primary,19.0,0.710978,3,6,12
1,cpi-2212-85pm,alarms-buzzers-and-sirens,,,1,2,3
2,106765806,cable-ties-zip-ties,11.0,0.584506,4,7,13
3,a4cec9f14f3566555c97c0046b10048597120954,coaxial-cables-rf,90.0,0.646548,1,5,7
4,1b3c3c99b9be83ca11f01be323700b027a38ccdd,controller-accesories,,,5,9,12
5,58f5bac3-3d38-43b1-b6d1-60bac2a5569a,microphones,24.0,0.558128,5,7,14
6,96PR-102-UB3-M_datasheet20180225090037,printers-label-makers,4.0,0.674221,5,10,19
7,ces-20134-088pm,speakers,,,2,5,12
8,LR43-DATASHEET,batteries-non-rechargable-primary,,,4,9,19
9,cds-25148,speakers,39.0,0.45637,2,5,13


Unnamed: 0,max,min,median
0,0.710978,0.45637,0.615527


Unnamed: 0,rank,doc_id,label,score
0,1,d9e2b7a7-5099-4612-b90a-796f8295ffd5,microphones,0.738324
1,2,8373d9aa-424a-4cf8-add9-8655f7a6700f,microphones,0.732933
2,3,SR41-392-384-DATASHEET,batteries-non-rechargable-primary,0.728945
3,4,392-384z,batteries-non-rechargable-primary,0.726830
4,5,pana-cr1220-ds,batteries-non-rechargable-primary,0.724038
...,...,...,...,...
95,96,AT-1224-TWT-12V-2-R,alarms-buzzers-and-sirens,0.686392
96,97,cms-181345-18s-x8,speakers,0.686356
97,98,LR621SPEC,batteries-non-rechargable-primary,0.686355
98,99,cms-151125-078x-67,speakers,0.686349


----------------
### Small test on the effects of query expansion


In [4]:
# Print every query, then the expansion terms stored with it, then a separator line.
for record in vsm_records:
    query_info = record['result'].query_info
    print(query_info.query, query_info.expansions, "------------------------", sep="\n")

this battery contains positive temperature coefficient element
['batteries', 'includes', 'negative', 'temperatures', 'coefficients', 'component', 'aspect', 'elements']
------------------------
buzzer with operating voltage 15 24 vdc
['voltages', 'inductor', 'capacitance']
------------------------
flexible tie body that safely  releasable and reusable
['bodies', 'it', 'reuseable']
------------------------
insulation is made of  plastic material and has  thickness of about 1 mm
['insulating', 'was', 'making', 'make', 'materials', 'thicknesses', 'width', 'millimeter', '6mm', 'cm', '7mm', '3mm', '1mm']
------------------------
busadapter supports protocol for profinet io
['supported', 'opposes', 'protocols']
------------------------
the system consists of  microphone  speaker and  sound card, the microphone is connected to the sound card which is connected to the speaker
['systems', 'comprises', 'consisting', 'consist', 'comprised', 'includes', 'comprise', 'encompasses', 'comprising', 'mic

------
## BM25 evaluation


In [6]:
# Pick up any edits made to the helper modules since the last run.
importlib.reload(aux_bm25)
importlib.reload(aux_retrieval)
logger = logging.getLogger(__name__)

# One record per evaluation query: the query's metadata plus its BM25 result.
bm25_records = []
for section in query_list_documents:
    bm25_result: aux_bm25.BM25QueryResult = aux_bm25.run_bm25_query(
        paths, section.query, top_k=top_k
    )
    bm25_records.append(
        dict(doc=section.doc, label=section.label, query=section.query, result=bm25_result)
    )

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

In [7]:
# Compute statistics for evaluation
bm25_statistics = aux_retrieval.compute_query_run_stats(bm25_records)
display(bm25_statistics['per_record'], bm25_statistics['score_stats'])

# Inspect the documents returned for one specific record (here: the first query)
first_bm25_documents = bm25_records[0]['result'].results.documents
bm25_data = pd.DataFrame(data=first_bm25_documents)
display(bm25_data)

Unnamed: 0,doc,label,rank,score,label_count_top_5,label_count_top_10,label_count_top_20
0,1cr2,batteries-non-rechargable-primary,2,5.740847,4,8,14
1,cpi-2212-85pm,alarms-buzzers-and-sirens,1,6.938643,5,10,20
2,106765806,cable-ties-zip-ties,1,12.732177,5,8,15
3,a4cec9f14f3566555c97c0046b10048597120954,coaxial-cables-rf,1,6.910951,3,6,8
4,1b3c3c99b9be83ca11f01be323700b027a38ccdd,controller-accesories,1,13.478672,5,9,17
5,58f5bac3-3d38-43b1-b6d1-60bac2a5569a,microphones,2,13.271488,5,9,19
6,96PR-102-UB3-M_datasheet20180225090037,printers-label-makers,2,16.425482,5,10,20
7,ces-20134-088pm,speakers,83,2.805447,5,10,20
8,LR43-DATASHEET,batteries-non-rechargable-primary,10,4.373617,5,10,20
9,cds-25148,speakers,14,2.79729,5,10,19


Unnamed: 0,max,min,median
0,16.425482,2.79729,6.924797


Unnamed: 0,rank,doc_id,label,score
0,1,123,batteries-non-rechargable-primary,5.875960
1,2,1cr2,batteries-non-rechargable-primary,5.740847
2,3,cms-402008-18sp,speakers,4.363099
3,4,maxell-alkaline-battery-002-2014,batteries-non-rechargable-primary,4.257775
4,5,alkaline_appman,batteries-non-rechargable-primary,4.058401
...,...,...,...,...
95,96,8024d901-2c69-4d26-b0cf-6e1bdb74ff9b,microphones,1.964044
96,97,LN92-Industrial-0920,batteries-non-rechargable-primary,1.949623
97,98,ces-803118-28pm,speakers,1.941543
98,99,EM-6050-14-Feb-2019,microphones,1.902635


--------------------------
## RRF Evaluation


In [98]:
logger = logging.getLogger(__name__)


def _ranking_frame(record):
    """Turn the documents of one query record into a DataFrame (one row per document)."""
    return pd.DataFrame([vars(doc) for doc in record["result"].results.documents])


# Fuse the BM25 and VSM rankings of each query with reciprocal rank fusion.
rrf_records = []
for i, section in enumerate(query_list_documents):
    ranking_frames = [_ranking_frame(bm25_records[i]), _ranking_frame(vsm_records[i])]
    rrf_result = aux_retrieval.rrf_from_dfs(ranking_frames, rrf_k=60, top_k=100)

    rrf_records.append(
        dict(doc=section.doc, label=section.label, query=section.query, result=rrf_result)
    )

rrf_statistics = aux_retrieval.compute_query_run_stats(rrf_records)
display(rrf_statistics['per_record'], rrf_statistics['score_stats'])

# Inspect the documents returned for one specific record (here: the first query)
first_rrf_documents = rrf_records[0]['result'].results.documents
rrf_data = pd.DataFrame(data=first_rrf_documents)
display(rrf_data)
    

Unnamed: 0,doc,label,rank,score,label_count_top_5,label_count_top_10,label_count_top_20
0,1cr2,batteries-non-rechargable-primary,2.0,0.028787,3,5,13
1,cpi-2212-85pm,alarms-buzzers-and-sirens,26.0,0.016393,5,10,17
2,106765806,cable-ties-zip-ties,3.0,0.030478,5,9,15
3,a4cec9f14f3566555c97c0046b10048597120954,coaxial-cables-rf,7.0,0.02306,2,5,8
4,1b3c3c99b9be83ca11f01be323700b027a38ccdd,controller-accesories,23.0,0.016393,5,10,17
5,58f5bac3-3d38-43b1-b6d1-60bac2a5569a,microphones,7.0,0.028034,5,10,18
6,96PR-102-UB3-M_datasheet20180225090037,printers-label-makers,1.0,0.031754,5,10,20
7,ces-20134-088pm,speakers,,,5,10,19
8,LR43-DATASHEET,batteries-non-rechargable-primary,64.0,0.014286,5,10,19
9,cds-25148,speakers,9.0,0.023615,4,8,18


Unnamed: 0,max,min,median
0,0.031754,0.014286,0.023615


Unnamed: 0,rank,doc_id,label,score
0,1,maxell-alkaline-battery-002-2014,batteries-non-rechargable-primary,0.030550
1,2,1cr2,batteries-non-rechargable-primary,0.028787
2,3,8373d9aa-424a-4cf8-add9-8655f7a6700f,microphones,0.027240
3,4,cms-402008-18sp,speakers,0.026626
4,5,LR927SPEC,batteries-non-rechargable-primary,0.025575
...,...,...,...,...
95,96,7c888c83-eb72-42df-a195-20bf39d6ad9b,speakers,0.009091
96,97,AAA4000C182,batteries-non-rechargable-primary,0.008929
97,98,DELTA_IA-TC_DTB_OM_EN_20201202,controller-accesories,0.008850
98,99,ces-26138-16l030,speakers,0.008850


----------------------
## Hybrid Search Evaluation


In [13]:
# Pick up any edits made to the helper modules since the last run.
importlib.reload(aux_retrieval)
importlib.reload(data)
logger = logging.getLogger(__name__)

# NOTE: these two assignments replace the module-level norm_vsm / norm_bm25
# values for this cell and for everything executed after it.
norm_vsm = 'zscore'
norm_bm25 = 'minmax'

# Combine the stored BM25 and VSM results of each query into one weighted ranking.
hybrid_records = []
for i, section in enumerate(query_list_documents):
    bm25_docs = bm25_records[i]['result'].results.documents
    vsm_docs = vsm_records[i]['result'].results.documents

    hybrid_result: QueryResult = aux_retrieval.hybrid_search(
        bm25_results=[vars(doc) for doc in bm25_docs],
        vsm_results=[vars(doc) for doc in vsm_docs],
        weight_bm25=0.7,
        weight_vsm=0.3,
        top_k=100,
        norm_bm25=norm_bm25,
        norm_vsm=norm_vsm,
    )

    hybrid_records.append(
        dict(doc=section.doc, label=section.label, query=section.query, result=hybrid_result)
    )

In [14]:
# Statistics of the hybrid run currently held in hybrid_records.
hybrid_statistics3 = aux_retrieval.compute_query_run_stats(hybrid_records)
display(hybrid_statistics3['per_record'], hybrid_statistics3['score_stats'])

# Inspect the documents returned for one specific record (here: the first query)
first_hybrid_documents = hybrid_records[0]['result'].results.documents
hybrid_data = pd.DataFrame(data=first_hybrid_documents)
display(hybrid_data)
    

Unnamed: 0,doc,label,rank,score,label_count_top_5,label_count_top_10,label_count_top_20
0,1cr2,batteries-non-rechargable-primary,4,0.958672,3,6,14
1,cpi-2212-85pm,alarms-buzzers-and-sirens,9,0.7,2,6,15
2,106765806,cable-ties-zip-ties,1,1.192675,4,8,14
3,a4cec9f14f3566555c97c0046b10048597120954,coaxial-cables-rf,14,0.403128,2,4,10
4,1b3c3c99b9be83ca11f01be323700b027a38ccdd,controller-accesories,6,0.7,5,10,15
5,58f5bac3-3d38-43b1-b6d1-60bac2a5569a,microphones,9,0.852338,5,10,17
6,96PR-102-UB3-M_datasheet20180225090037,printers-label-makers,1,1.42992,5,10,20
7,ces-20134-088pm,speakers,77,0.257165,4,9,17
8,LR43-DATASHEET,batteries-non-rechargable-primary,17,0.426781,5,9,19
9,cds-25148,speakers,25,0.354054,4,6,12


Unnamed: 0,max,min,median
0,1.42992,0.257165,0.7


Unnamed: 0,rank,doc_id,label,score
0,1,d9e2b7a7-5099-4612-b90a-796f8295ffd5,microphones,1.040747
1,2,8373d9aa-424a-4cf8-add9-8655f7a6700f,microphones,0.978891
2,3,maxell-alkaline-battery-002-2014,batteries-non-rechargable-primary,0.965849
3,4,1cr2,batteries-non-rechargable-primary,0.958672
4,5,SR41-392-384-DATASHEET,batteries-non-rechargable-primary,0.745547
...,...,...,...,...
95,96,MN1300_US_CT1,batteries-non-rechargable-primary,0.031880
96,97,96PR-102-UBM3-M_datasheet20190707090023,printers-label-makers,0.029852
97,98,Toshiba%20Tech%20Data%20LR44%202022,batteries-non-rechargable-primary,0.028722
98,99,Zeus_Alkaline_C_RevV2_spec_sheet,batteries-non-rechargable-primary,0.024920


In [16]:
import pickle

# Dedicated output path for the normalisation comparison. A separate name is
# used on purpose: reassigning `save_file` here would redirect every later dump
# that relies on it.
norm_eval_file = '../02-data/06-Evaluation/norm_eval.pkl'

# Output key -> (norm_vsm, norm_bm25).
# NOTE(review): the "<vsm>/<bm25>" key order is inferred from 'zs/minmax' being
# the run with norm_vsm='zscore' and norm_bm25='minmax' - confirm.
norm_combinations = {
    'minmax/zs': ('minmax', 'zscore'),
    'minmax/minmax': ('minmax', 'minmax'),
    'zs/zs': ('zscore', 'zscore'),
    'zs/minmax': ('zscore', 'minmax'),
}


def _hybrid_per_record(vsm_norm, bm25_norm):
    """Run the hybrid search for every query with the given normalisations.

    Uses the same weights (0.7 BM25 / 0.3 VSM) and top_k (100) as the hybrid
    evaluation cell and returns the 'per_record' statistics of that run.
    """
    records = []
    for i, section in enumerate(query_list_documents):
        # Copies of the attribute dicts are passed so that repeated runs cannot
        # alter the stored BM25 / VSM results.
        bm25_list = [dict(vars(doc)) for doc in bm25_records[i]['result'].results.documents]
        vsm_list = [dict(vars(doc)) for doc in vsm_records[i]['result'].results.documents]
        result = aux_retrieval.hybrid_search(
            bm25_results=bm25_list,
            vsm_results=vsm_list,
            weight_bm25=0.7,
            weight_vsm=0.3,
            top_k=100,
            norm_bm25=bm25_norm,
            norm_vsm=vsm_norm,
        )
        records.append({
            "doc": section.doc,
            "label": section.label,
            "query": section.query,
            "result": result,
        })
    return aux_retrieval.compute_query_run_stats(records)['per_record']


# Every combination is computed here, so the cell no longer depends on
# statistics left over from earlier manual runs with other settings.
dfs = {
    key: _hybrid_per_record(vsm_norm, bm25_norm)
    for key, (vsm_norm, bm25_norm) in norm_combinations.items()
}
with open(norm_eval_file, 'wb') as f:
    pickle.dump(dfs, f)

--------------------------
## Rerank evaluation ( BM25 - VSM )


In [101]:
# Pick up any edits made to the helper modules since the last run.
importlib.reload(aux_retrieval)
importlib.reload(data)
logger = logging.getLogger(__name__)

# NOTE: these two assignments replace the module-level top_k / use_multivector
# values for this cell and for everything executed after it.
top_k = 50
use_multivector = True

# One record per evaluation query: the query's metadata plus its rerank result.
rerank_records = []
for section in query_list_documents:
    rerank_results: QueryResult = aux_retrieval.run_hybrid_query(
        paths=paths,
        query=section.query,
        top_k=top_k,
        use_multivector=use_multivector,
    )
    rerank_records.append(
        dict(doc=section.doc, label=section.label, query=section.query, result=rerank_results)
    )

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

In [102]:
# Statistics of the rerank run.
rerank_statistics = aux_retrieval.compute_query_run_stats(rerank_records)
display(rerank_statistics['per_record'], rerank_statistics['score_stats'])

# Inspect one specific record (here: the first query); only its first five rows are shown
first_rerank_documents = rerank_records[0]['result'].results.documents
rerank_data = pd.DataFrame(data=first_rerank_documents)
display(rerank_data.head(5))
    

Unnamed: 0,doc,label,rank,score,label_count_top_5,label_count_top_10,label_count_top_20
0,1cr2,batteries-non-rechargable-primary,1.0,0.690095,4,9,14
1,cpi-2212-85pm,alarms-buzzers-and-sirens,1.0,0.449107,5,10,20
2,106765806,cable-ties-zip-ties,2.0,0.57698,5,8,14
3,a4cec9f14f3566555c97c0046b10048597120954,coaxial-cables-rf,7.0,0.698036,1,4,9
4,1b3c3c99b9be83ca11f01be323700b027a38ccdd,controller-accesories,22.0,0.0,4,9,17
5,58f5bac3-3d38-43b1-b6d1-60bac2a5569a,microphones,11.0,0.632503,4,9,17
6,96PR-102-UB3-M_datasheet20180225090037,printers-label-makers,3.0,0.663774,5,10,19
7,ces-20134-088pm,speakers,,,5,10,20
8,LR43-DATASHEET,batteries-non-rechargable-primary,38.0,0.0,5,10,20
9,cds-25148,speakers,5.0,0.526992,5,9,16


Unnamed: 0,max,min,median
0,0.698036,0.0,0.57698


Unnamed: 0,rank,doc_id,label,score
0,1,1cr2,batteries-non-rechargable-primary,0.690095
1,2,123,batteries-non-rechargable-primary,0.678843
2,3,8373d9aa-424a-4cf8-add9-8655f7a6700f,microphones,0.6739
3,4,l91,batteries-non-rechargable-primary,0.669383
4,5,maxell-alkaline-battery-002-2014,batteries-non-rechargable-primary,0.669118


In [103]:
import pickle

# Save the per-record statistics of every retrieval model in a single pickle.
# The hybrid entry reads `hybrid_statistics3`, the only hybrid statistics
# object this notebook defines; the previous name `hybrid_statistics` is not
# defined by any cell and raised NameError on a fresh kernel.
dfs = {
    'hybrid_statistics': hybrid_statistics3['per_record'],
    'rrf_statistics': rrf_statistics['per_record'],
    'bm25_statistics': bm25_statistics['per_record'],
    'vsm_statistics': vsm_statistics['per_record'],
    'rerank_statistics': rerank_statistics['per_record']
}
# `save_file` is the output path chosen in the setup cell; check that no
# intermediate cell has pointed it somewhere else before running this one.
with open(save_file, 'wb') as f:
    pickle.dump(dfs, f)

--------------
### Run all models and get the results back

In [104]:
import pickle

# Set to True to recompute every model in one go; left off by default so the
# cell does nothing unless explicitly enabled.
run = False

if run:
    statistics_results = aux_retrieval.run_all_models(
        query_list_documents,
        paths,
        use_expansion=use_expansion,
        use_multivector=use_multivector,
    )
    display(statistics_results['bm25_statistics'])

    # Write to the configured output file. The previous name `save_path` is not
    # defined anywhere in the notebook and raised NameError once run was True.
    with open(save_file, 'wb') as f:
        pickle.dump(statistics_results, f)