In [None]:
# Proprietary library
import aux_document_retrieval_bm25 as aux_bm25
import aux_document_retrieval_vsm as aux_vsm
import aux_semantic_search as aux_semantics
import aux_retrieval_evaluation as aux_retrieval
import aux_document_retrieval_hybrid as aux_hybrid
from dataclass import QueryResult

from typing import  Dict, Any, Union
import dataclass as data
import pandas as pd
from pathlib import Path
import logging
import nltk
import json
import importlib

# Re-execute the helper modules so that edits made to them on disk are picked
# up without restarting the kernel.
for helper_module in (aux_vsm, aux_bm25, aux_semantics, aux_retrieval):
    importlib.reload(helper_module)

# NLTK resource needed for word processing
nltk.download('punkt_tab')

# A document is either a data.RetrievedDocument or a plain str-keyed dict.
DocLike = Union[data.RetrievedDocument, Dict[str, Any]]

In [None]:
# Basic environment setup

# Only ERROR and above are reported. force=True removes handlers already
# attached to the root logger, so re-running this cell really reconfigures it.
logging.basicConfig(
    level=logging.ERROR,
    format="%(asctime)s %(name)s [%(levelname)s] %(message)s",
    force=True
)
logger = logging.getLogger(__name__)

# Filesystem locations handed to the retrieval helpers (relative paths).
paths = {
    'word2vec': Path("../02-data/03-VSM/01-Word2Vec/word2vec-google-news-300.bin"),
    'idf_cache': Path("../02-data/03-VSM/idf_cache_path.pkl"),
    'word2vec_vsm_multivector': Path("../02-data/03-VSM/01-Word2Vec/word2vec-4-50-4-150-1.pkl"),
    'word2vec_vsm_singlevector': Path("../02-data/03-VSM/01-Word2Vec/word2vec-4-50-4-150-0.pkl"),
    'file': Path("../02-data/00-testing/batteries-non-rechargable-primary/1cr2/1cr2.txt"),
    'output_path': Path("../02-data/00-testing/batteries-non-rechargable-primary/1cr2/sentence_expansions.txt"),
    'pdf_folder': Path("../02-data/00-testing/"),
    'retriever': Path("../02-data/05-Retrieval/corpus_bm25")
}

# Load the evaluation queries. The encoding is given explicitly because
# open() otherwise falls back to the platform's locale encoding, which is not
# UTF-8 everywhere and would make the parsed text machine-dependent.
with open("../02-data/06-Evaluation/document_queries.json", encoding="utf-8") as f:
    raw = json.load(f)

# One DocumentSection per JSON entry; each entry's keys become keyword arguments.
query_list_documents = [data.DocumentSection(**entry) for entry in raw]

--------------------------------
## VSM evaluation. Works for testing both the multivector and the single-vector setup


In [20]:
# Refresh the VSM and evaluation helpers before running the queries.
for module in (aux_vsm, aux_retrieval):
    importlib.reload(module)

# Run configuration
vsm_records = []
top_k = 100
use_expansion = True
use_multivector = False

# Load the Word2Vec resources once; every query below reuses them.
resources = aux_vsm.load_word2vec_resources(paths, use_multivector=use_multivector)

for section in query_list_documents:
    w2v_result: aux_vsm.Word2VecQueryResult = aux_vsm.run_word2vec_query_preloaded(
        resources, section.query, top_k=top_k, use_expansion=use_expansion
    )
    # Keep the query metadata next to its result for the statistics step.
    vsm_records.append(
        dict(doc=section.doc, label=section.label, query=section.query, result=w2v_result)
    )

# Compute statistics for evaluation
vsm_statistics = aux_retrieval.compute_query_run_stats(vsm_records)
for table_name in ('per_record', 'score_stats'):
    display(vsm_statistics[table_name])

# Inspect the documents of one specific record (the first one here)
first_vsm_documents = vsm_records[0]['result'].results.documents
vsm_data = pd.DataFrame(data=first_vsm_documents)
display(vsm_data)

2025-05-27 14:42:27,968 - root - INFO - Loading pre-trained Google News Word2Vec model...

2025-05-27 14:42:27,969 - gensim.models.keyedvectors - INFO - loading projection weights from ../02-data/03-VSM/01-Word2Vec/word2vec-google-news-300.bin
2025-05-27 14:42:57,934 - gensim.utils - INFO - KeyedVectors lifecycle event {'msg': 'loaded (3000000, 300) matrix of type float32 from ../02-data/03-VSM/01-Word2Vec/word2vec-google-news-300.bin', 'binary': True, 'encoding': 'utf8', 'datetime': '2025-05-27T14:42:57.934492', 'gensim': '4.3.2', 'python': '3.12.4 | packaged by Anaconda, Inc. | (main, Jun 18 2024, 15:12:24) [GCC 11.2.0]', 'platform': 'Linux-5.15.0-138-generic-x86_64-with-glibc2.35', 'event': 'load_word2vec_format'}
2025-05-27 14:42:57,956 - root - INFO - Loading cached IDF dictionary from ../02-data/03-VSM/idf_cache_path.pkl... 

2025-05-27 14:43:04,167 - aux_document_retrieval_vsm - INFO - Query embedding computed: 300 dims
2025-05-27 14:43:04,220 - aux_document_retrieval_vsm - INFO

Unnamed: 0,doc,label,rank,score,label_count_top_5,label_count_top_10,label_count_top_20
0,1cr2,batteries-non-rechargable-primary,13.0,0.710978,3,7,14
1,cpi-2212-85pm,alarms-buzzers-and-sirens,89.0,0.540444,1,1,4
2,106765806,cable-ties-zip-ties,22.0,0.56538,4,9,15
3,a4cec9f14f3566555c97c0046b10048597120954,coaxial-cables-rf,,,1,4,5
4,1b3c3c99b9be83ca11f01be323700b027a38ccdd,controller-accesories,,,5,9,18
5,58f5bac3-3d38-43b1-b6d1-60bac2a5569a,microphones,,,0,0,1
6,96PR-102-UB3-M_datasheet20180225090037,printers-label-makers,4.0,0.674221,5,10,19
7,cds-13138-smt,speakers,,,0,0,1
8,LR43-DATASHEET,batteries-non-rechargable-primary,,,4,9,19
9,cds-25148,speakers,,,4,9,14


Unnamed: 0,max,min,median
0,0.710978,0.540444,0.6198


Unnamed: 0,rank,doc_id,label,score
0,1,SR41-392-384-DATASHEET,batteries-non-rechargable-primary,0.728945
1,2,392-384z,batteries-non-rechargable-primary,0.726830
2,3,GT-0905A-19-Feb-2019,alarms-buzzers-and-sirens,0.723888
3,4,maxell-alkaline-battery-002-2014,batteries-non-rechargable-primary,0.718862
4,5,MU064602-1,microphones,0.716878
...,...,...,...,...
95,96,cms-16093-078x-67,speakers,0.685307
96,97,cem-1212s,alarms-buzzers-and-sirens,0.685244
97,98,cms-160925-18sp-x8,speakers,0.685211
98,99,cds-15158-smt,speakers,0.685202


------
## BM25 evaluation


In [None]:
# Refresh the BM25 and evaluation helpers before running the queries.
for module in (aux_bm25, aux_retrieval):
    importlib.reload(module)
logger = logging.getLogger(__name__)

# Run configuration
top_k = 100
bm25_records = []

for section in query_list_documents:
    bm25_result: aux_bm25.BM25QueryResult = aux_bm25.run_bm25_query(
        paths, section.query, top_k=top_k
    )
    # Keep the query metadata next to its result for the statistics step.
    bm25_records.append(
        dict(doc=section.doc, label=section.label, query=section.query, result=bm25_result)
    )

In [19]:
# Compute statistics for evaluation
bm25_statistics = aux_retrieval.compute_query_run_stats(bm25_records)
for table_name in ('per_record', 'score_stats'):
    display(bm25_statistics[table_name])

# Inspect the documents of one specific record (the first one here)
first_bm25_documents = bm25_records[0]['result'].results.documents
bm25_data = pd.DataFrame(data=first_bm25_documents)
display(bm25_data)

Unnamed: 0,doc,label,rank,score,label_count_top_5,label_count_top_10,label_count_top_20
0,1cr2,batteries-non-rechargable-primary,2.0,5.740847,4,8,14
1,cpi-2212-85pm,alarms-buzzers-and-sirens,1.0,6.938643,5,10,20
2,106765806,cable-ties-zip-ties,1.0,12.732177,5,8,15
3,a4cec9f14f3566555c97c0046b10048597120954,coaxial-cables-rf,1.0,6.910951,3,6,8
4,1b3c3c99b9be83ca11f01be323700b027a38ccdd,controller-accesories,1.0,13.478672,5,9,17
5,58f5bac3-3d38-43b1-b6d1-60bac2a5569a,microphones,,,0,0,0
6,96PR-102-UB3-M_datasheet20180225090037,printers-label-makers,2.0,16.425482,5,10,20
7,cds-13138-smt,speakers,2.0,6.421796,1,1,2
8,LR43-DATASHEET,batteries-non-rechargable-primary,10.0,4.373617,5,10,20
9,cds-25148,speakers,14.0,2.79729,5,10,19


Unnamed: 0,max,min,median
0,16.425482,2.79729,6.910951


Unnamed: 0,rank,doc_id,label,score
0,1,123,batteries-non-rechargable-primary,5.875960
1,2,1cr2,batteries-non-rechargable-primary,5.740847
2,3,cms-402008-18sp,speakers,4.363099
3,4,maxell-alkaline-battery-002-2014,batteries-non-rechargable-primary,4.257775
4,5,alkaline_appman,batteries-non-rechargable-primary,4.058401
...,...,...,...,...
95,96,8024d901-2c69-4d26-b0cf-6e1bdb74ff9b,microphones,1.964044
96,97,LN92-Industrial-0920,batteries-non-rechargable-primary,1.949623
97,98,ces-803118-28pm,speakers,1.941543
98,99,EM-6050-14-Feb-2019,microphones,1.902635


--------------------------
## RRF Evaluation


In [17]:
logger = logging.getLogger(__name__)

# This cell fuses results produced by the BM25 and VSM cells and pairs them
# with the queries purely by position. Fail loudly if those cells were run
# against a different query list, instead of silently fusing mismatched runs.
assert len(bm25_records) == len(vsm_records) == len(query_list_documents), (
    "bm25_records / vsm_records do not match query_list_documents in length; "
    "re-run the BM25 and VSM evaluation cells"
)

rrf_records = []

for section, bm25_record, vsm_record in zip(query_list_documents, bm25_records, vsm_records):
    assert bm25_record["query"] == section.query == vsm_record["query"], (
        f"BM25/VSM records are not aligned with the query for doc {section.doc!r}; "
        "re-run the BM25 and VSM evaluation cells"
    )

    # One row per retrieved document, one column per document attribute.
    df_bm25 = pd.DataFrame([d.__dict__ for d in bm25_record["result"].results.documents])
    df_vsm  = pd.DataFrame([d.__dict__ for d in vsm_record["result"].results.documents])

    # Fuse the two ranked lists into a single ranking.
    rrf_result = aux_retrieval.rrf_from_dfs([df_bm25, df_vsm], rrf_k=60, top_k=100)

    rrf_records.append({
        "doc": section.doc,
        "label": section.label,
        "query": section.query,
        "result": rrf_result
    })

# Compute statistics for evaluation
rrf_statistics = aux_retrieval.compute_query_run_stats(rrf_records)
display(rrf_statistics['per_record'])
display(rrf_statistics['score_stats'])

# Inspect the documents of one specific record (the first one here)
rrf_data = pd.DataFrame(data = rrf_records[0]['result'].results.documents)
display(rrf_data)
    

Unnamed: 0,doc,label,rank,score,label_count_top_5,label_count_top_10,label_count_top_20
0,1cr2,batteries-non-rechargable-primary,3.0,0.029828,2,5,10
1,cpi-2212-85pm,alarms-buzzers-and-sirens,12.0,0.023105,5,10,18
2,106765806,cable-ties-zip-ties,7.0,0.028589,5,9,16
3,a4cec9f14f3566555c97c0046b10048597120954,coaxial-cables-rf,21.0,0.016393,2,3,7
4,1b3c3c99b9be83ca11f01be323700b027a38ccdd,controller-accesories,27.0,0.016393,5,9,18
5,58f5bac3-3d38-43b1-b6d1-60bac2a5569a,microphones,,,0,0,0
6,96PR-102-UB3-M_datasheet20180225090037,printers-label-makers,1.0,0.031754,5,10,20
7,cds-13138-smt,speakers,37.0,0.016129,0,1,1
8,LR43-DATASHEET,batteries-non-rechargable-primary,62.0,0.014286,5,10,19
9,cds-25148,speakers,46.0,0.013514,5,9,18


Unnamed: 0,max,min,median
0,0.031754,0.013514,0.016393


Unnamed: 0,rank,doc_id,label,score
0,1,maxell-alkaline-battery-002-2014,batteries-non-rechargable-primary,0.031250
1,2,GT-0905A-19-Feb-2019,alarms-buzzers-and-sirens,0.029958
2,3,1cr2,batteries-non-rechargable-primary,0.029828
3,4,cms-402008-18sp,speakers,0.026290
4,5,a19b940c-ee08-4eed-ad40-067ec064beea,microphones,0.024086
...,...,...,...,...
95,96,cmt-4023s-smt-tr,alarms-buzzers-and-sirens,0.009174
96,97,7c888c83-eb72-42df-a195-20bf39d6ad9b,speakers,0.009091
97,98,MO044202-2,microphones,0.009091
98,99,cms-16093-078x,speakers,0.009009


----------------------
## Hybrid Search Evaluation


In [15]:
# Reload the dataclass module BEFORE the module that depends on it: names that
# aux_retrieval may have pulled in with `from dataclass import ...` are only
# rebound when aux_retrieval itself is re-executed afterwards.
# NOTE(review): assumes aux_retrieval imports from the dataclass module — confirm.
importlib.reload(data)
importlib.reload(aux_retrieval)
logger = logging.getLogger(__name__)

# This cell combines results produced by the BM25 and VSM cells and pairs them
# with the queries purely by position. Fail loudly if those cells were run
# against a different query list, instead of silently mixing mismatched runs.
assert len(bm25_records) == len(vsm_records) == len(query_list_documents), (
    "bm25_records / vsm_records do not match query_list_documents in length; "
    "re-run the BM25 and VSM evaluation cells"
)

hybrid_records = []

for section, bm25_record, vsm_record in zip(query_list_documents, bm25_records, vsm_records):
    assert bm25_record["query"] == section.query == vsm_record["query"], (
        f"BM25/VSM records are not aligned with the query for doc {section.doc!r}; "
        "re-run the BM25 and VSM evaluation cells"
    )

    # Pass shallow copies: `d.__dict__` is the live attribute dict of each
    # stored document, so handing it out directly would let the callee modify
    # the BM25/VSM records in place.
    bm25_list = [dict(d.__dict__) for d in bm25_record['result'].results.documents]
    vsm_list  = [dict(d.__dict__) for d in vsm_record['result'].results.documents]

    # Weighted combination: 70% BM25, 30% VSM, both min-max normalised.
    hybrid_result: data.QueryResult = aux_retrieval.hybrid_search(
        bm25_results= bm25_list,
        vsm_results=  vsm_list,
        weight_bm25=  0.7,
        weight_vsm=   0.3,
        top_k=        100,
        norm_bm25=    'minmax',
        norm_vsm=     'minmax'
    )

    hybrid_records.append({
        "doc":    section.doc,
        "label":  section.label,
        "query":  section.query,
        "result": hybrid_result
    })

In [16]:
# Compute statistics for evaluation
hybrid_statistics = aux_retrieval.compute_query_run_stats(hybrid_records)
for table_name in ('per_record', 'score_stats'):
    display(hybrid_statistics[table_name])

# Inspect the documents of one specific record (the first one here)
first_hybrid_documents = hybrid_records[0]['result'].results.documents
hybrid_data = pd.DataFrame(data=first_hybrid_documents)
display(hybrid_data)
    

Unnamed: 0,doc,label,rank,score,label_count_top_5,label_count_top_10,label_count_top_20
0,1cr2,batteries-non-rechargable-primary,1.0,0.853361,3,7,13
1,cpi-2212-85pm,alarms-buzzers-and-sirens,5.0,0.714446,5,10,20
2,106765806,cable-ties-zip-ties,1.0,0.819747,5,9,16
3,a4cec9f14f3566555c97c0046b10048597120954,coaxial-cables-rf,1.0,0.7,3,6,7
4,1b3c3c99b9be83ca11f01be323700b027a38ccdd,controller-accesories,2.0,0.7,5,10,18
5,58f5bac3-3d38-43b1-b6d1-60bac2a5569a,microphones,,,0,0,0
6,96PR-102-UB3-M_datasheet20180225090037,printers-label-makers,1.0,0.96258,5,10,20
7,cds-13138-smt,speakers,9.0,0.67885,0,1,2
8,LR43-DATASHEET,batteries-non-rechargable-primary,14.0,0.426781,5,10,20
9,cds-25148,speakers,16.0,0.352707,5,10,18


Unnamed: 0,max,min,median
0,0.96258,0.352707,0.7


Unnamed: 0,rank,doc_id,label,score
0,1,1cr2,batteries-non-rechargable-primary,0.853361
1,2,123,batteries-non-rechargable-primary,0.740523
2,3,maxell-alkaline-battery-002-2014,batteries-non-rechargable-primary,0.646206
3,4,cms-402008-18sp,speakers,0.534685
4,5,GT-0905A-19-Feb-2019,alarms-buzzers-and-sirens,0.501830
...,...,...,...,...
95,96,cr2025,batteries-non-rechargable-primary,0.069932
96,97,MO044202-2,microphones,0.069399
97,98,cms-16093-078x,speakers,0.069379
98,99,cr2016,batteries-non-rechargable-primary,0.067941


--------------------------
## Rerank evaluation ( BM25 - VSM )


In [None]:
# Reload the dataclass module BEFORE the module that depends on it: names that
# aux_retrieval may have pulled in with `from dataclass import ...` are only
# rebound when aux_retrieval itself is re-executed afterwards.
# NOTE(review): assumes aux_retrieval imports from the dataclass module — confirm.
importlib.reload(data)
importlib.reload(aux_retrieval)
logger = logging.getLogger(__name__)

# Run configuration
rerank_records = []
top_k = 50
use_multivector = True

for section in query_list_documents:

    # Annotated with data.QueryResult so the annotation refers to the class
    # from the freshly reloaded module.
    rerank_results: data.QueryResult = aux_retrieval.run_hybrid_query(
        paths = paths,
        query = section.query,
        top_k = top_k,
        use_multivector = use_multivector
    )

    # Keep the query metadata next to its result for the statistics step.
    rerank_records.append({
        "doc":    section.doc,
        "label":  section.label,
        "query":  section.query,
        "result": rerank_results
    })

In [14]:
# Compute statistics for evaluation
rerank_statistics = aux_retrieval.compute_query_run_stats(rerank_records)
for table_name in ('per_record', 'score_stats'):
    display(rerank_statistics[table_name])

# Inspect the documents of one specific record (the first one here)
first_rerank_documents = rerank_records[0]['result'].results.documents
rerank_data = pd.DataFrame(data=first_rerank_documents)
display(rerank_data)
    

Unnamed: 0,doc,label,rank,score,label_count_top_5,label_count_top_10,label_count_top_20
0,1cr2,batteries-non-rechargable-primary,1.0,0.690095,4,9,14
1,cpi-2212-85pm,alarms-buzzers-and-sirens,1.0,0.449107,5,10,20
2,106765806,cable-ties-zip-ties,2.0,0.57698,5,8,14
3,a4cec9f14f3566555c97c0046b10048597120954,coaxial-cables-rf,7.0,0.698036,1,4,9
4,1b3c3c99b9be83ca11f01be323700b027a38ccdd,controller-accesories,22.0,0.0,4,9,17
5,58f5bac3-3d38-43b1-b6d1-60bac2a5569a,microphones,,,0,0,0
6,96PR-102-UB3-M_datasheet20180225090037,printers-label-makers,3.0,0.663774,5,10,19
7,cds-13138-smt,speakers,35.0,0.0,0,1,3
8,LR43-DATASHEET,batteries-non-rechargable-primary,38.0,0.0,5,10,20
9,cds-25148,speakers,5.0,0.526992,5,9,16


Unnamed: 0,max,min,median
0,0.698036,0.0,0.526992


Unnamed: 0,rank,doc_id,label,score
0,1,1cr2,batteries-non-rechargable-primary,0.690095
1,2,123,batteries-non-rechargable-primary,0.678843
2,3,8373d9aa-424a-4cf8-add9-8655f7a6700f,microphones,0.6739
3,4,l91,batteries-non-rechargable-primary,0.669383
4,5,maxell-alkaline-battery-002-2014,batteries-non-rechargable-primary,0.669118
5,6,l92,batteries-non-rechargable-primary,0.663535
6,7,SCR1_3N-BTI,batteries-non-rechargable-primary,0.663419
7,8,LR1130SPEC,batteries-non-rechargable-primary,0.657868
8,9,SCR2-BTI,batteries-non-rechargable-primary,0.656875
9,10,SCR17335A_719-BTI,batteries-non-rechargable-primary,0.656388
