In [54]:
import aux_document_retrieval_bm25 as aux_bm25
import aux_document_retrieval_vsm as aux_vsm
import aux_semantic_search as aux_semantics
import aux_retrieval_evaluation as aux_retrieval

import aux_document_retrieval_hybrid as aux_hybrid
import pandas as pd
from pathlib import Path
import logging
import nltk
from dataclasses import dataclass
import json
import importlib


# Re-import the local helper modules so that edits made on disk are picked up
# without restarting the kernel. Every aux_* module imported by this notebook
# is reloaded, including aux_hybrid, which was previously left stale.
# NOTE(review): aux_hybrid and aux_retrieval are reloaded last on the
# assumption that they build on the other modules; confirm the dependency order.
importlib.reload(aux_vsm)
importlib.reload(aux_bm25)
importlib.reload(aux_semantics)
importlib.reload(aux_hybrid)
importlib.reload(aux_retrieval)

# Module-level logger for messages emitted from this notebook.
logger = logging.getLogger(__name__)
# Fetch the NLTK 'punkt_tab' resource; NLTK prints a status line and reports
# when the package is already up to date.
nltk.download("punkt_tab")

def setup_logging(level=logging.ERROR):
    """Configure the root logger for this notebook.

    Args:
        level: Minimum severity handled by the root logger. Defaults to
            ``logging.ERROR``, matching the previous hard-coded value.

    The configuration is applied with ``force=True``: without it,
    ``logging.basicConfig`` silently does nothing when the root logger already
    has handlers (for example after this cell has been run once in the same
    kernel), so changing the level and re-running the cell would have no effect.
    """
    logging.basicConfig(
        level=level,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        force=True,  # replace any handlers left over from an earlier run
    )

setup_logging()

[nltk_data] Downloading package punkt_tab to /home/pablo/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [55]:
## Basic environment setup

# Filesystem locations used by the retrieval evaluation, keyed by a short name.
# Each location is written once as a string and wrapped in a Path below.
paths = {
    key: Path(location)
    for key, location in (
        ('word2vec', "../02-data/03-VSM/01-Word2Vec/word2vec-google-news-300.bin"),
        ('idf_cache', "../02-data/03-VSM/idf_cache_path.pkl"),
        ('word2vec_vsm_multivector', "../02-data/03-VSM/01-Word2Vec/word2vec-4-50-4-150-1.pkl"),
        ('word2vec_vsm_singlevector', "../02-data/03-VSM/01-Word2Vec/word2vec-4-50-4-150-0.pkl"),
        ('file', "../02-data/00-testing/batteries-non-rechargable-primary/1cr2/1cr2.txt"),
        ('output_path', "../02-data/00-testing/batteries-non-rechargable-primary/1cr2/sentence_expansions.txt"),
        ('pdf_folder', "../02-data/00-testing/"),
        ('retriever', "../02-data/05-Retrieval/corpus_bm25"),
    )
}

@dataclass
class DocumentSection:
    """One evaluation query record loaded from document_queries.json.

    Instances are created with ``DocumentSection(**entry)``, so every JSON
    entry must provide exactly the keys ``doc``, ``label`` and ``query``;
    a missing or extra key raises ``TypeError`` at construction time.
    """
    # Document identifier — presumably the datasheet name (e.g. '1cr2'); TODO confirm.
    doc: str
    # Label attached to the query — presumably the expected section label; TODO confirm.
    label: str
    # Query text; the list of these records is handed to the retrieval evaluation.
    query: str
    
# Load the evaluation queries. The encoding is pinned to UTF-8 so that the
# file is decoded the same way on every platform instead of depending on the
# locale's default encoding.
with open("../02-data/06-Evaluation/document_queries.json", encoding="utf-8") as f:
    raw = json.load(f)

# One DocumentSection per JSON entry; each entry's keys must match the
# dataclass fields (doc, label, query).
query_list_documents = [DocumentSection(**entry) for entry in raw]

In [None]:
## VSM evaluation

# Run the VSM evaluation over every record in query_list_documents.
# The call returns a pair: summary_df (displayed below and analysed in the
# plotting cell) and results_map, which is indexed by a document key.
summary_df, results_map = aux_retrieval.evaluate_vsm_queries(
    paths,
    query_list_documents,
    top_k=20,  # presumably the number of ranked results kept per query — confirm in aux_retrieval
    use_multivector=True,
    use_expansion=True
)

display(summary_df)
# Detailed results for the '1cr2' document only; assumes results_map contains that key.
display(results_map['1cr2'])

2025-05-25 14:41:20,612 - root - INFO - Loading pre-trained Google News Word2Vec model...
2025-05-25 14:41:20,614 - gensim.models.keyedvectors - INFO - loading projection weights from ../02-data/03-VSM/01-Word2Vec/word2vec-google-news-300.bin
2025-05-25 14:41:50,818 - gensim.utils - INFO - KeyedVectors lifecycle event {'msg': 'loaded (3000000, 300) matrix of type float32 from ../02-data/03-VSM/01-Word2Vec/word2vec-google-news-300.bin', 'binary': True, 'encoding': 'utf8', 'datetime': '2025-05-25T14:41:50.817497', 'gensim': '4.3.2', 'python': '3.12.4 | packaged by Anaconda, Inc. | (main, Jun 18 2024, 15:12:24) [GCC 11.2.0]', 'platform': 'Linux-5.15.0-138-generic-x86_64-with-glibc2.35', 'event': 'load_word2vec_format'}
2025-05-25 14:41:50,835 - root - INFO - Loading cached IDF dictionary from ../02-data/03-VSM/idf_cache_path.pkl...
2025-05-25 14:41:51,070 - root - INFO - Testing query: 'this battery contains positive temperature coefficient element' at top_k=20


In [None]:
## Summary statistics and plots for the evaluation results

import matplotlib.pyplot as plt

# 1) Descriptive statistics
#    describe() over the label-count columns and the matched rank/score columns.
stats = summary_df.loc[
    :,
    ['label_count_top5', 'label_count_top10', 'label_count_top20', 'matched_rank', 'matched_score'],
].describe()
print("Descriptive statistics:\n", stats)


# 2) Histograms for each label_count_*
for col in ['label_count_top5', 'label_count_top10', 'label_count_top20']:
    # Drop missing values once and derive the bins from the same cleaned data.
    # A column containing NaN has a float dtype, so its max() cannot be passed
    # to range() directly; convert it to int and fall back to 0 when no values
    # remain.
    counts = summary_df[col].dropna()
    upper = int(counts.max()) if not counts.empty else 0
    plt.figure()
    # Unit-width bins with integer edges 0, 1, ..., upper + 1.
    plt.hist(counts, bins=range(0, upper + 2))
    plt.title(f'Histogram of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')
    plt.show()


# 3) Scatter plot: matched_rank vs. label_count_top20
#    Rows without a matched_rank are left out of the plot.
valid = summary_df.dropna(subset=['matched_rank'])
scatter_fig, scatter_ax = plt.subplots()
scatter_ax.scatter(valid['matched_rank'], valid['label_count_top20'])
scatter_ax.set_title('Matched Rank vs. Label Count Top20')
scatter_ax.set_xlabel('Matched Rank')
scatter_ax.set_ylabel('Label Count Top20')
plt.show()