In [14]:
# Proprietary library
import aux_document_retrieval_bm25 as aux_bm25
import aux_document_retrieval_vsm as aux_vsm
import aux_semantic_search as aux_semantics
import aux_retrieval_evaluation as aux_retrieval
import aux_document_retrieval_hybrid as aux_hybrid
from dataclass import QueryResult

from typing import  Dict, Any, Union
import dataclass as data
import pandas as pd
from pathlib import Path
import logging
import nltk
import json
import importlib

# Reload Modules
importlib.reload(aux_vsm)
importlib.reload(aux_bm25)
importlib.reload(aux_semantics)
importlib.reload(aux_retrieval)

# For word processing
nltk.download('punkt_tab')

DocLike = Union[data.RetrievedDocument, Dict[str, Any]]

[nltk_data] Downloading package punkt_tab to /home/pablo/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [15]:
# Basic enviroment setup

logging.basicConfig(
    level=logging.ERROR,
    format="%(asctime)s %(name)s [%(levelname)s] %(message)s",
    force=True
)
logger = logging.getLogger(__name__)

paths = {
    'word2vec': Path("../02-data/03-VSM/01-Word2Vec/word2vec-google-news-300.bin"),
    'idf_cache': Path("../02-data/03-VSM/idf_cache_path.pkl"),
    'word2vec_vsm_multivector': Path("../02-data/03-VSM/01-Word2Vec/word2vec-4-50-4-150-1.pkl"),
    'word2vec_vsm_singlevector': Path("../02-data/03-VSM/01-Word2Vec/word2vec-4-50-4-150-0.pkl"),
    'file': Path("../02-data/00-testing/batteries-non-rechargable-primary/1cr2/1cr2.txt"),
    'output_path': Path("../02-data/00-testing/batteries-non-rechargable-primary/1cr2/sentence_expansions.txt"),
    'pdf_folder': Path("../02-data/00-testing/"),
    'retriever': Path("../02-data/05-Retrieval/corpus_bm25"),
    'queries_1' : Path("../02-data/06-Evaluation/query_evaluation_1.json"),
    'queries_2' : Path("../02-data/06-Evaluation/query_evaluation_2.json"),
    'ranking_query_evaluation_1' : Path('../02-data/06-Evaluation/ranking_query_evaluation_1.pkl'), ## using queries_1, singlevector, QE, minmax
    'ranking_query_evaluation_2' : Path('../02-data/06-Evaluation/ranking_query_evaluation_2.pkl'), ## using queries_2
    'ranking_query_evaluation_3' : Path('../02-data/06-Evaluation/ranking_query_evaluation_3.pkl'), ## using queries_1, multivector, QE, minmax
    'ranking_query_evaluation_4' : Path('../02-data/06-Evaluation/ranking_query_evaluation_4.pkl'), ## using queries_1, multivector, no QE, minmax
    'ranking_query_evaluation_5' : Path('../02-data/06-Evaluation/ranking_query_evaluation_5.pkl'), ## using queries_1, multivector, QE, zscore
    'ranking_query_evaluation_6' : Path('../02-data/06-Evaluation/ranking_query_evaluation_6.pkl'), ## using queries_1, multivector, QE, minmax/zscore
    'QE_eval' : Path("../02-data/06-Evaluation/QE_eval.pkl"),           ## query1 , multivector , minmax/zscore , QE
    'no_QE_eval' : Path("../02-data/06-Evaluation/QE_eval.pkl")         ## query1 , multivector , minmax/zscore , no QE

}

use_multivector = True
use_expansion = False

norm_vsm = 'minmax'
norm_bm25 = 'zscore'

top_k = 100

save_file = str(paths['no_QE_eval'])

with open(str(paths['queries_1'])) as f:
    raw = json.load(f)

query_list_documents = [data.DocumentSection(**entry) for entry in raw]

--------------------------------
## VSM evaluation. Works both for testing multivecror and singlevector


In [16]:
importlib.reload(aux_vsm)
importlib.reload(aux_retrieval)

vsm_records = []

resources = aux_vsm.load_word2vec_resources(paths, use_multivector=use_multivector)

for section in query_list_documents:
    w2v_result: aux_vsm.Word2VecQueryResult = aux_vsm.run_word2vec_query_preloaded(
        resources,
        section.query,
        top_k=top_k,
        use_expansion=use_expansion
    )

    vsm_records.append({
        "doc": section.doc,
        "label": section.label,
        "query": section.query,
        "result": w2v_result
    })
    
#compute statistics for evaluation
vsm_statistics = aux_retrieval.compute_query_run_stats(vsm_records)
display(vsm_statistics['per_record'])
display(vsm_statistics['score_stats'])

#if you want to look at one specific record
vsm_data = pd.DataFrame(data = vsm_records[0]['result'].results.documents)
display(vsm_data)

Unnamed: 0,doc,label,rank,score,label_count_top_5,label_count_top_10,label_count_top_20
0,1cr2,batteries-non-rechargable-primary,2.0,0.690095,5,8,16
1,cpi-2212-85pm,alarms-buzzers-and-sirens,1.0,0.449107,5,10,20
2,106765806,cable-ties-zip-ties,2.0,0.57698,5,7,14
3,a4cec9f14f3566555c97c0046b10048597120954,coaxial-cables-rf,15.0,0.698036,2,3,7
4,1b3c3c99b9be83ca11f01be323700b027a38ccdd,controller-accesories,,,3,7,15
5,58f5bac3-3d38-43b1-b6d1-60bac2a5569a,microphones,16.0,0.632503,4,7,15
6,96PR-102-UB3-M_datasheet20180225090037,printers-label-makers,3.0,0.663774,5,10,17
7,cds-13138-smt,speakers,,,1,2,3
8,LR43-DATASHEET,batteries-non-rechargable-primary,,,4,9,19
9,cds-25148,speakers,12.0,0.526992,4,7,14


Unnamed: 0,max,min,median
0,0.698036,0.449107,0.632503


Unnamed: 0,rank,doc_id,label,score
0,1,397-396,batteries-non-rechargable-primary,0.691751
1,2,1cr2,batteries-non-rechargable-primary,0.690095
2,3,357-303zb,batteries-non-rechargable-primary,0.689235
3,4,392-384z,batteries-non-rechargable-primary,0.687279
4,5,386-301,batteries-non-rechargable-primary,0.681582
...,...,...,...,...
95,96,lmr-400-llpx-coax-cables-datasheet,coaxial-cables-rf,0.638400
96,97,A675MF,batteries-non-rechargable-primary,0.638372
97,98,4062a60c-0db7-45f0-ba30-afabced83817,microphones,0.638330
98,99,toshiba-alkaline-battery-002-2014,batteries-non-rechargable-primary,0.638195


----------------
### Small test on the effects of query expansion


In [17]:
for result in vsm_records:
    w2v_result = result['result']

    query_embedding = w2v_result.query_info
    
    print(query_embedding.query)
    print(query_embedding.expansions)
    print("------------------------")

this battery contains positive temperature coefficient element
[]
------------------------
buzzer with operating voltage 15 24 vdc
[]
------------------------
flexible tie body that safely  releasable and reusable
[]
------------------------
insulation is made of  plastic material and has  thickness of about 1 mm
[]
------------------------
busadapter supports protocol for profinet io
[]
------------------------
the system consists of  microphone  speaker and  sound card, the microphone is connected to the sound card which is connected to the speaker
[]
------------------------
resolution 203 dpi max print speed 102mm 4sec
[]
------------------------
all specifications measured at 535c humidity at 4585 under 86106kpa pressure
[]
------------------------
alkaline manganese batteries
[]
------------------------
speaker with operating temperature 20 55 
[]
------------------------


------
## BM25 evaluation


In [18]:
importlib.reload(aux_bm25)
importlib.reload(aux_retrieval)
logger = logging.getLogger(__name__)

bm25_records = []

for section in query_list_documents:
    bm25_result: aux_bm25.BM25QueryResult = aux_bm25.run_bm25_query(
        paths,
        section.query,
        top_k=top_k,
    )
    
    bm25_records.append({
        "doc": section.doc,
        "label": section.label,
        "query": section.query,
        "result": bm25_result
    })

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

In [19]:
#compute statistics for evaluation
bm25_statistics = aux_retrieval.compute_query_run_stats(bm25_records)
display(bm25_statistics['per_record'])
display(bm25_statistics['score_stats'])

#if you want to look at one specific record
bm25_data = pd.DataFrame(data = bm25_records[0]['result'].results.documents)
display(bm25_data)

Unnamed: 0,doc,label,rank,score,label_count_top_5,label_count_top_10,label_count_top_20
0,1cr2,batteries-non-rechargable-primary,2,5.740847,4,8,14
1,cpi-2212-85pm,alarms-buzzers-and-sirens,1,6.938643,5,10,20
2,106765806,cable-ties-zip-ties,1,12.732177,5,8,15
3,a4cec9f14f3566555c97c0046b10048597120954,coaxial-cables-rf,1,6.910951,3,6,8
4,1b3c3c99b9be83ca11f01be323700b027a38ccdd,controller-accesories,1,13.478672,5,9,17
5,58f5bac3-3d38-43b1-b6d1-60bac2a5569a,microphones,2,13.271488,5,9,19
6,96PR-102-UB3-M_datasheet20180225090037,printers-label-makers,2,16.425482,5,10,20
7,cds-13138-smt,speakers,13,6.678161,0,2,11
8,LR43-DATASHEET,batteries-non-rechargable-primary,10,4.373617,5,10,20
9,cds-25148,speakers,14,2.79729,5,10,19


Unnamed: 0,max,min,median
0,16.425482,2.79729,6.924797


Unnamed: 0,rank,doc_id,label,score
0,1,123,batteries-non-rechargable-primary,5.875960
1,2,1cr2,batteries-non-rechargable-primary,5.740847
2,3,cms-402008-18sp,speakers,4.363099
3,4,maxell-alkaline-battery-002-2014,batteries-non-rechargable-primary,4.257775
4,5,alkaline_appman,batteries-non-rechargable-primary,4.058401
...,...,...,...,...
95,96,8024d901-2c69-4d26-b0cf-6e1bdb74ff9b,microphones,1.964044
96,97,LN92-Industrial-0920,batteries-non-rechargable-primary,1.949623
97,98,ces-803118-28pm,speakers,1.941543
98,99,EM-6050-14-Feb-2019,microphones,1.902635


--------------------------
## RRF Evaluation


In [20]:
logger = logging.getLogger(__name__)

rrf_records = []

for i, section in enumerate(query_list_documents):
    
    df_bm25 = pd.DataFrame([d.__dict__ for d in bm25_records[i]["result"].results.documents])
    df_vsm  = pd.DataFrame([d.__dict__ for d in vsm_records[i]["result"].results.documents])

    rrf_result = aux_retrieval.rrf_from_dfs([df_bm25, df_vsm], rrf_k=60, top_k=100)

    rrf_records.append({
        "doc": section.doc,
        "label": section.label,
        "query": section.query,
        "result": rrf_result
    })
    
rrf_statistics = aux_retrieval.compute_query_run_stats(rrf_records)
display(rrf_statistics['per_record'])
display(rrf_statistics['score_stats'])

#if you want to look at one specific record
rrf_data = pd.DataFrame(data = rrf_records[0]['result'].results.documents)
display(rrf_data)
    

Unnamed: 0,doc,label,rank,score,label_count_top_5,label_count_top_10,label_count_top_20
0,1cr2,batteries-non-rechargable-primary,1,0.032258,4,7,14
1,cpi-2212-85pm,alarms-buzzers-and-sirens,1,0.032787,5,10,20
2,106765806,cable-ties-zip-ties,1,0.032522,5,10,14
3,a4cec9f14f3566555c97c0046b10048597120954,coaxial-cables-rf,3,0.029727,3,4,10
4,1b3c3c99b9be83ca11f01be323700b027a38ccdd,controller-accesories,32,0.016393,5,8,17
5,58f5bac3-3d38-43b1-b6d1-60bac2a5569a,microphones,3,0.029287,5,10,18
6,96PR-102-UB3-M_datasheet20180225090037,printers-label-makers,1,0.032002,5,10,20
7,cds-13138-smt,speakers,67,0.013699,1,4,8
8,LR43-DATASHEET,batteries-non-rechargable-primary,64,0.014286,5,10,19
9,cds-25148,speakers,2,0.027402,5,9,17


Unnamed: 0,max,min,median
0,0.032787,0.013699,0.029507


Unnamed: 0,rank,doc_id,label,score
0,1,1cr2,batteries-non-rechargable-primary,0.032258
1,2,123,batteries-non-rechargable-primary,0.031099
2,3,maxell-alkaline-battery-002-2014,batteries-non-rechargable-primary,0.028783
3,4,8373d9aa-424a-4cf8-add9-8655f7a6700f,microphones,0.025000
4,5,LR1130SPEC,batteries-non-rechargable-primary,0.024688
...,...,...,...,...
95,96,er14505j_2pt,batteries-non-rechargable-primary,0.008929
96,97,DELTA_IA-TC_DTB_OM_EN_20201202,controller-accesories,0.008850
97,98,8460b150-25ad-428a-ac26-0d84eff34af3,microphones,0.008772
98,99,er14335j-s,batteries-non-rechargable-primary,0.008772


----------------------
## Hybrid Search Evaluation


In [21]:
importlib.reload(aux_retrieval)
importlib.reload(data)
logger = logging.getLogger(__name__)

hybrid_records = []


for i, section in enumerate(query_list_documents):
    bm25_list = [d.__dict__ for d in bm25_records[i]['result'].results.documents]
    vsm_list  = [d.__dict__ for d in vsm_records[i]['result'].results.documents]
    
    hybrid_result:QueryResult = aux_retrieval.hybrid_search(
        bm25_results= bm25_list,
        vsm_results=  vsm_list,
        weight_bm25=  0.7,
        weight_vsm=   0.3,
        top_k=        100,
        norm_bm25=    norm_bm25,
        norm_vsm=     norm_vsm
    )

    hybrid_records.append({
        "doc":    section.doc,
        "label":  section.label,
        "query":  section.query,
        "result": hybrid_result
    })

In [22]:
hybrid_statistics = aux_retrieval.compute_query_run_stats(hybrid_records)
display(hybrid_statistics['per_record'])
display(hybrid_statistics['score_stats'])

#if you want to look at one specific record
hybrid_data = pd.DataFrame(data = hybrid_records[0]['result'].results.documents)
display(hybrid_data)
    

Unnamed: 0,doc,label,rank,score,label_count_top_5,label_count_top_10,label_count_top_20
0,1cr2,batteries-non-rechargable-primary,2,3.480051,4,8,15
1,cpi-2212-85pm,alarms-buzzers-and-sirens,1,2.000634,5,10,20
2,106765806,cable-ties-zip-ties,1,5.165404,5,8,14
3,a4cec9f14f3566555c97c0046b10048597120954,coaxial-cables-rf,1,3.319255,3,6,8
4,1b3c3c99b9be83ca11f01be323700b027a38ccdd,controller-accesories,1,4.736688,5,10,17
5,58f5bac3-3d38-43b1-b6d1-60bac2a5569a,microphones,2,2.21274,5,9,19
6,96PR-102-UB3-M_datasheet20180225090037,printers-label-makers,2,3.002933,5,10,20
7,cds-13138-smt,speakers,15,0.965697,0,2,11
8,LR43-DATASHEET,batteries-non-rechargable-primary,11,1.11462,5,10,20
9,cds-25148,speakers,11,1.221994,5,10,19


Unnamed: 0,max,min,median
0,5.165404,0.965697,2.607836


Unnamed: 0,rank,doc_id,label,score
0,1,123,batteries-non-rechargable-primary,3.553676
1,2,1cr2,batteries-non-rechargable-primary,3.480051
2,3,maxell-alkaline-battery-002-2014,batteries-non-rechargable-primary,1.865527
3,4,cms-402008-18sp,speakers,1.839457
4,5,alkaline_appman,batteries-non-rechargable-primary,1.490533
...,...,...,...,...
95,96,SCR1_3N-BTI,batteries-non-rechargable-primary,-0.000303
96,97,SCR17335A_719-BTI,batteries-non-rechargable-primary,-0.004783
97,98,Toshiba%20Tech%20Data%20CR1616%202022,batteries-non-rechargable-primary,-0.024660
98,99,69b20e1f-93e2-4608-8364-ab0a6588db4b,microphones,-0.031990


--------------------------
## Rerank evaluation ( BM25 - VSM )


In [23]:
importlib.reload(aux_retrieval)
importlib.reload(data)
logger = logging.getLogger(__name__)

rerank_records = []
top_k = 50
use_multivector = True

for section in query_list_documents:

    rerank_results:QueryResult = aux_retrieval.run_hybrid_query(
        paths = paths,
        query = section.query,
        top_k = top_k,
        use_multivector = use_multivector
    )

    rerank_records.append({
        "doc":    section.doc,
        "label":  section.label,
        "query":  section.query,
        "result": rerank_results
    })

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

In [24]:
rerank_statistics = aux_retrieval.compute_query_run_stats(rerank_records)
display(rerank_statistics['per_record'])
display(rerank_statistics['score_stats'])

#if you want to look at one specific record
rerank_data = pd.DataFrame(data = rerank_records[0]['result'].results.documents)
display(rerank_data[:5])
    

Unnamed: 0,doc,label,rank,score,label_count_top_5,label_count_top_10,label_count_top_20
0,1cr2,batteries-non-rechargable-primary,1,0.690095,4,9,14
1,cpi-2212-85pm,alarms-buzzers-and-sirens,1,0.449107,5,10,20
2,106765806,cable-ties-zip-ties,2,0.57698,5,8,14
3,a4cec9f14f3566555c97c0046b10048597120954,coaxial-cables-rf,7,0.698036,1,4,9
4,1b3c3c99b9be83ca11f01be323700b027a38ccdd,controller-accesories,22,0.0,4,9,17
5,58f5bac3-3d38-43b1-b6d1-60bac2a5569a,microphones,11,0.632503,4,9,17
6,96PR-102-UB3-M_datasheet20180225090037,printers-label-makers,3,0.663774,5,10,19
7,cds-13138-smt,speakers,30,0.0,2,4,11
8,LR43-DATASHEET,batteries-non-rechargable-primary,38,0.0,5,10,20
9,cds-25148,speakers,5,0.526992,5,9,16


Unnamed: 0,max,min,median
0,0.698036,0.0,0.551986


Unnamed: 0,rank,doc_id,label,score
0,1,1cr2,batteries-non-rechargable-primary,0.690095
1,2,123,batteries-non-rechargable-primary,0.678843
2,3,8373d9aa-424a-4cf8-add9-8655f7a6700f,microphones,0.6739
3,4,l91,batteries-non-rechargable-primary,0.669383
4,5,maxell-alkaline-battery-002-2014,batteries-non-rechargable-primary,0.669118


In [25]:
import pickle

# Save DataFrames
dfs = {
    'hybrid_statistics': hybrid_statistics['per_record'],
    'rrf_statistics': rrf_statistics['per_record'],
    'bm25_statistics': bm25_statistics['per_record'],
    'vsm_statistics': vsm_statistics['per_record'],
    'rerank_statistics': rerank_statistics['per_record']
}
with open(save_file, 'wb') as f:
    pickle.dump(dfs, f)

--------------
### Run all models and get the results back

In [26]:
import pickle

run = False

if run:
    
    statistics_results = aux_retrieval.run_all_models(query_list_documents, paths, use_expansion=use_expansion, use_multivector=use_multivector,)
    display(statistics_results['bm25_statistics'])
    
    with open(save_path), 'wb') as f:
        pickle.dump(statistics_results, f)

SyntaxError: unmatched ')' (2907718880.py, line 10)