In [38]:
import aux_document_retrieval_bm25 as aux_bm25
import aux_document_retrieval_vsm as aux_vsm
import aux_semantic_search as aux_semantics
import aux_retrieval_evaluation as aux_retrieval
import aux_document_retrieval_hybrid as aux_hybrid

import pandas as pd
from pathlib import Path
import logging
import nltk
from dataclasses import dataclass
import json
import importlib


importlib.reload(aux_vsm)
importlib.reload(aux_bm25)
importlib.reload(aux_semantics)
importlib.reload(aux_retrieval)

nltk.download('punkt_tab')
logger = logging.getLogger(__name__)

def setup_logging():
    logging.basicConfig(
        level=logging.ERROR,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    )

setup_logging()

[nltk_data] Downloading package punkt_tab to /home/pablo/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [39]:
## Basic enviroment setup

paths = {
    'word2vec': Path("../02-data/03-VSM/01-Word2Vec/word2vec-google-news-300.bin"),
    'idf_cache': Path("../02-data/03-VSM/idf_cache_path.pkl"),
    'word2vec_vsm_multivector': Path("../02-data/03-VSM/01-Word2Vec/word2vec-4-50-4-150-1.pkl"),
    'word2vec_vsm_singlevector': Path("../02-data/03-VSM/01-Word2Vec/word2vec-4-50-4-150-0.pkl"),
    'file': Path("../02-data/00-testing/batteries-non-rechargable-primary/1cr2/1cr2.txt"),
    'output_path': Path("../02-data/00-testing/batteries-non-rechargable-primary/1cr2/sentence_expansions.txt"),
    'pdf_folder': Path("../02-data/00-testing/"),
    'retriever': Path("../02-data/05-Retrieval/corpus_bm25")
}

@dataclass
class DocumentSection:
    doc: str
    label: str
    query: str
    
with open("../02-data/06-Evaluation/document_queries.json") as f:
    raw = json.load(f)

query_list_documents = [DocumentSection(**entry) for entry in raw]

In [40]:
## VSM evaluation

summary_vsm_df, results_map = aux_retrieval.evaluate_vsm_queries(
    paths,
    query_list_documents,
    top_k=20,
    use_multivector=True,
    use_expansion=True
)

display(summary_vsm_df)
display(results_map['1cr2'][:5])

2025-05-25 18:02:07,448 - root - INFO - Loading pre-trained Google News Word2Vec model...
2025-05-25 18:02:07,450 - gensim.models.keyedvectors - INFO - loading projection weights from ../02-data/03-VSM/01-Word2Vec/word2vec-google-news-300.bin
2025-05-25 18:02:37,002 - gensim.utils - INFO - KeyedVectors lifecycle event {'msg': 'loaded (3000000, 300) matrix of type float32 from ../02-data/03-VSM/01-Word2Vec/word2vec-google-news-300.bin', 'binary': True, 'encoding': 'utf8', 'datetime': '2025-05-25T18:02:37.002070', 'gensim': '4.3.2', 'python': '3.12.4 | packaged by Anaconda, Inc. | (main, Jun 18 2024, 15:12:24) [GCC 11.2.0]', 'platform': 'Linux-5.15.0-138-generic-x86_64-with-glibc2.35', 'event': 'load_word2vec_format'}
2025-05-25 18:02:37,002 - root - INFO - Loading cached IDF dictionary from ../02-data/03-VSM/idf_cache_path.pkl...
2025-05-25 18:02:37,110 - root - INFO - Testing query: 'this battery contains positive temperature coefficient element' at top_k=20
2025-05-25 18:02:41,470 - a

Unnamed: 0,doc,label,label_count_top5,label_count_top10,label_count_top20,matched_rank,matched_score
0,1cr2,batteries-non-rechargable-primary,3,6,12,19.0,0.710978
1,cpi-2212-85pm,alarms-buzzers-and-sirens,1,2,3,,
2,106765806,cable-ties-zip-ties,4,7,13,11.0,0.584506
3,a4cec9f14f3566555c97c0046b10048597120954,coaxial-cables-rf,1,5,7,,
4,1b3c3c99b9be83ca11f01be323700b027a38ccdd,controller-accesories,5,9,12,,
5,58f5bac3-3d38-43b1-b6d1-60bac2a5569a,microphones,0,0,2,,
6,96PR-102-UB3-M_datasheet20180225090037,printers-label-makers,5,10,19,4.0,0.674221
7,cds-13138-smt,speakers,0,0,1,,
8,LR43-DATASHEET,batteries-non-rechargable-primary,4,9,19,,
9,cds-25148,speakers,2,5,13,,


Unnamed: 0_level_0,doc_id,score,label
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,d9e2b7a7-5099-4612-b90a-796f8295ffd5,0.738324,microphones
2,8373d9aa-424a-4cf8-add9-8655f7a6700f,0.732933,microphones
3,SR41-392-384-DATASHEET,0.728945,batteries-non-rechargable-primary
4,392-384z,0.72683,batteries-non-rechargable-primary
5,pana-cr1220-ds,0.724038,batteries-non-rechargable-primary


In [None]:
## BM25 evaluation

logging.disable(logging.CRITICAL) ##im just tired ok? this is it

summary_bm25_df, results_bm25 = aux_retrieval.evaluate_queries(
    paths=paths,
    documents=query_list_documents,
    top_k=20,
    method="bm25"
)
logging.disable(logging.NOTSET)

In [44]:
display(summary_bm25_df)
display(results_bm25['1cr2'][:5])

Unnamed: 0,doc,label,label_count_top5,label_count_top10,label_count_top20,matched_rank,matched_score
0,1cr2,batteries-non-rechargable-primary,4,8,14,2.0,5.740847
1,cpi-2212-85pm,alarms-buzzers-and-sirens,5,10,20,1.0,6.938643
2,106765806,cable-ties-zip-ties,5,8,15,1.0,12.732177
3,a4cec9f14f3566555c97c0046b10048597120954,coaxial-cables-rf,3,6,8,1.0,6.910951
4,1b3c3c99b9be83ca11f01be323700b027a38ccdd,controller-accesories,5,9,17,1.0,13.478672
5,58f5bac3-3d38-43b1-b6d1-60bac2a5569a,microphones,0,0,0,,
6,96PR-102-UB3-M_datasheet20180225090037,printers-label-makers,5,10,20,2.0,16.425482
7,cds-13138-smt,speakers,1,1,2,2.0,6.421796
8,LR43-DATASHEET,batteries-non-rechargable-primary,5,10,20,10.0,4.373617
9,cds-25148,speakers,5,10,19,14.0,2.79729


Unnamed: 0_level_0,doc_id,doc_name,label,score
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,37,123,batteries-non-rechargable-primary,5.87596
2,38,1cr2,batteries-non-rechargable-primary,5.740847
3,509,cms-402008-18sp,speakers,4.363099
4,105,maxell-alkaline-battery-002-2014,batteries-non-rechargable-primary,4.257775
5,60,alkaline_appman,batteries-non-rechargable-primary,4.058401


In [None]:
##make graphs pretty and whatnot

import matplotlib.pyplot as plt

# 1) Descriptive statistics
stats = summary_df[[
    'label_count_top5',
    'label_count_top10',
    'label_count_top20',
    'matched_rank',
    'matched_score'
]].describe()
print("Descriptive statistics:\n", stats)


# 2) Histograms for each label_count_*
for col in ['label_count_top5', 'label_count_top10', 'label_count_top20']:
    plt.figure()
    plt.hist(summary_df[col].dropna(), bins=range(0, summary_df[col].max() + 2))
    plt.title(f'Histogram of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')
    plt.show()


# 3) Scatter plot: matched_rank vs. label_count_top20
#    (only for rows where matched_rank is not null)
valid = summary_df.dropna(subset=['matched_rank'])
plt.figure()
plt.scatter(valid['matched_rank'], valid['label_count_top20'])
plt.title('Matched Rank vs. Label Count Top20')
plt.xlabel('Matched Rank')
plt.ylabel('Label Count Top20')
plt.show()