In [14]:
import aux_document_retrieval_bm25 as aux_bm25
import aux_document_retrieval_vsm as aux_vsm
import aux_semantic_search as aux_semantics
import aux_retrieval_evaluation as aux_retrieval
import aux_document_retrieval_hybrid as aux_hybrid
import dataclass as data

import pandas as pd
from pathlib import Path
import logging
import nltk
import json
import importlib


# Reload the project helper modules so that edits made to them on disk take
# effect without restarting the kernel. aux_hybrid is imported above but is
# not reloaded here.
for helper_module in (aux_vsm, aux_bm25, aux_semantics, aux_retrieval, data):
    importlib.reload(helper_module)

# Left as the cell's final expression so its return value is shown as output.
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to /home/pablo/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [15]:
## Basic environment setup

# On-disk locations of the resources used by this notebook. The whole dict is
# handed to the retrieval helpers in the cells below, which read the entries
# they need.
paths = {
    'word2vec': Path("../02-data/03-VSM/01-Word2Vec/word2vec-google-news-300.bin"),
    'idf_cache': Path("../02-data/03-VSM/idf_cache_path.pkl"),
    'word2vec_vsm_multivector': Path("../02-data/03-VSM/01-Word2Vec/word2vec-4-50-4-150-1.pkl"),
    'word2vec_vsm_singlevector': Path("../02-data/03-VSM/01-Word2Vec/word2vec-4-50-4-150-0.pkl"),
    'file': Path("../02-data/00-testing/batteries-non-rechargable-primary/1cr2/1cr2.txt"),
    'output_path': Path("../02-data/00-testing/batteries-non-rechargable-primary/1cr2/sentence_expansions.txt"),
    'pdf_folder': Path("../02-data/00-testing/"),
    'retriever': Path("../02-data/05-Retrieval/corpus_bm25")
}

# Read the evaluation queries. The encoding is given explicitly so the file is
# decoded the same way on every platform instead of depending on the locale's
# default encoding.
with open("../02-data/06-Evaluation/document_queries.json", encoding="utf-8") as f:
    raw = json.load(f)

# Each JSON entry is unpacked as keyword arguments into a DocumentSection; the
# evaluation cells below read its .doc, .label and .query attributes.
query_list_documents = [data.DocumentSection(**entry) for entry in raw]

In [None]:
# Settings for the hybrid (VSM + BM25) evaluation run, gathered in one place
# so they are easy to tweak between runs.
hybrid_settings = {
    'top_k': 200,
    'weight_vsm': 0.6,
    'weight_bm25': 0.4,
    'norm_vsm': 'minmax',
    'norm_bm25': 'minmax',
    'use_expansion': True,
    'use_multivector': False,
}

# The call returns two values: the summary and the accompanying results.
summary_hybrid_df, results_hybrid = aux_retrieval.evaluate_hybrid_queries(
    paths=paths,
    documents=query_list_documents,
    **hybrid_settings,
)

In [None]:
# Show the summary returned as the first value of
# aux_retrieval.evaluate_hybrid_queries in the previous cell.
summary_hybrid_df

In [None]:
## Descriptive statistics and plots for the hybrid evaluation summary

import matplotlib.pyplot as plt

# This cell previously read from `summary_df`, a name that no cell in the
# notebook assigns; it now reads the summary produced by the hybrid
# evaluation cell above.
# NOTE(review): the column names below are assumed to exist in that frame — confirm.
label_count_cols = ['label_count_top5', 'label_count_top10', 'label_count_top20']

# 1) Descriptive statistics
stats = summary_hybrid_df[label_count_cols + ['matched_rank', 'matched_score']].describe()
print("Descriptive statistics:\n", stats)


# 2) Histograms for each label_count_*
for col in label_count_cols:
    counts = summary_hybrid_df[col].dropna()
    # range() only accepts integers, and the column maximum is a float whenever
    # the column contains NaN; an empty column falls back to a single bin.
    upper_edge = int(counts.max()) + 2 if not counts.empty else 2
    fig, ax = plt.subplots()
    ax.hist(counts, bins=range(0, upper_edge))
    ax.set_title(f'Histogram of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    plt.show()


# 3) Scatter plot: matched_rank vs. label_count_top20
#    (only for rows where matched_rank is not null)
valid = summary_hybrid_df.dropna(subset=['matched_rank'])
fig, ax = plt.subplots()
ax.scatter(valid['matched_rank'], valid['label_count_top20'])
ax.set_title('Matched Rank vs. Label Count Top20')
ax.set_xlabel('Matched Rank')
ax.set_ylabel('Label Count Top20')
plt.show()

--------------------------------

## VSM evaluation. Works for testing both the multivector and single-vector configurations

In [47]:
# Pick up on-disk edits to the helper modules used by this cell
for helper_module in (aux_vsm, aux_retrieval):
    importlib.reload(helper_module)

# Only ERROR and above is emitted; force=True replaces any handlers that an
# earlier cell already installed on the root logger.
logging.basicConfig(
    format="%(asctime)s %(name)s [%(levelname)s] %(message)s",
    level=logging.ERROR,
    force=True,
)
logger = logging.getLogger(__name__)

# Run settings: flip use_multivector to evaluate the other configuration
vsm_records = []
top_k = 100
use_expansion = True
use_multivector = False

# Loaded once, outside the loop, and reused for every query
resources = aux_vsm.load_word2vec_resources(paths, use_multivector=use_multivector)

for section in query_list_documents:
    w2v_result: aux_vsm.Word2VecQueryResult = aux_vsm.run_word2vec_query_preloaded(
        resources,
        section.query,
        top_k=top_k,
        use_expansion=use_expansion,
    )
    # Keep the query's metadata next to the result it produced
    vsm_records.append(
        dict(doc=section.doc, label=section.label, query=section.query, result=w2v_result)
    )

# Statistics for evaluation
vsm_statistics = aux_retrieval.compute_query_run_stats(vsm_records)
display(vsm_statistics['per_record'])
display(vsm_statistics['score_stats'])

# Inspect one specific record (here: the first one)
first_vsm_result = vsm_records[0]['result']
vsm_data = pd.DataFrame(first_vsm_result.results.documents)
display(vsm_data)

Unnamed: 0,doc,label,rank,score,label_count_top_5,label_count_top_10,label_count_top_20
0,1cr2,batteries-non-rechargable-primary,13.0,0.710978,3,7,14
1,cpi-2212-85pm,alarms-buzzers-and-sirens,89.0,0.540444,1,1,4
2,106765806,cable-ties-zip-ties,22.0,0.56538,4,9,15
3,a4cec9f14f3566555c97c0046b10048597120954,coaxial-cables-rf,,,1,4,5
4,1b3c3c99b9be83ca11f01be323700b027a38ccdd,controller-accesories,,,5,9,18
5,58f5bac3-3d38-43b1-b6d1-60bac2a5569a,microphones,,,0,0,1
6,96PR-102-UB3-M_datasheet20180225090037,printers-label-makers,4.0,0.674221,5,10,19
7,cds-13138-smt,speakers,,,0,0,1
8,LR43-DATASHEET,batteries-non-rechargable-primary,,,4,9,19
9,cds-25148,speakers,,,4,9,14


Unnamed: 0,max,min,median
0,0.710978,0.540444,0.6198


Unnamed: 0,rank,doc_id,score,doc_name,label
0,1,SR41-392-384-DATASHEET,0.728945,SR41-392-384-DATASHEET.txt,batteries-non-rechargable-primary
1,2,392-384z,0.726830,392-384z.txt,batteries-non-rechargable-primary
2,3,GT-0905A-19-Feb-2019,0.723888,GT-0905A-19-Feb-2019.txt,alarms-buzzers-and-sirens
3,4,maxell-alkaline-battery-002-2014,0.718862,maxell-alkaline-battery-002-2014.txt,batteries-non-rechargable-primary
4,5,MU064602-1,0.716878,MU064602-1.txt,microphones
...,...,...,...,...,...
95,96,cms-16093-078x-67,0.685307,cms-16093-078x-67.txt,speakers
96,97,cem-1212s,0.685244,cem-1212s.txt,alarms-buzzers-and-sirens
97,98,cms-160925-18sp-x8,0.685211,cms-160925-18sp-x8.txt,speakers
98,99,cds-15158-smt,0.685202,cds-15158-smt.txt,speakers


------

## BM25 evaluation

In [None]:
# Pick up on-disk edits to the helper modules used by this cell
for helper_module in (aux_bm25, aux_retrieval):
    importlib.reload(helper_module)

# Only ERROR and above is emitted; force=True replaces any handlers that an
# earlier cell already installed on the root logger.
logging.basicConfig(
    format="%(asctime)s %(name)s [%(levelname)s] %(message)s",
    level=logging.ERROR,
    force=True,
)
logger = logging.getLogger(__name__)

# Run settings
top_k = 100
bm25_records = []

for section in query_list_documents:
    bm25_result: aux_bm25.BM25QueryResult = aux_bm25.run_bm25_query(
        paths,
        section.query,
        top_k=top_k,
    )
    # Keep the query's metadata next to the result it produced
    bm25_records.append(
        dict(doc=section.doc, label=section.label, query=section.query, result=bm25_result)
    )

# Statistics for evaluation
bm25_statistics = aux_retrieval.compute_query_run_stats(bm25_records)
display(bm25_statistics['per_record'])
display(bm25_statistics['score_stats'])

# Inspect one specific record (here: the first one)
first_bm25_result = bm25_records[0]['result']
bm25_data = pd.DataFrame(first_bm25_result.results.documents)
display(bm25_data)

--------------------------

## RRF Evaluation

In [49]:
importlib.reload(aux_retrieval)

# VSM ranking for the first evaluated query. This used to index `records`, a
# name that no cell in the notebook assigns; the VSM evaluation cell stores the
# same data in `vsm_records`.
query_data = pd.DataFrame(data = vsm_records[0]['result'].results.documents)

# NOTE(review): `results_bm25` is not assigned by any cell visible in this
# notebook, so this line still depends on leftover kernel state — define it
# (or derive it from `bm25_records`) before relying on Restart & Run All.
# NOTE(review): in the recorded output every bm25_rank is 1.0, and documents
# present in both rankings are not joined (doc_name ends in '.txt' in the VSM
# frame but not in the BM25 frame) — verify the join inside rrf_df.
fused_df = aux_retrieval.rrf_df(query_data,results_bm25['1cr2'])

In [50]:
# Show the frame returned by aux_retrieval.rrf_df in the previous cell.
fused_df

Unnamed: 0,vsm_rank,doc_id_x,vsm_score,doc_name,label_x,doc_id_y,label_y,bm25_score,bm25_rank,rrf_vsm,rrf_bm25,rrf_score
0,1.0,SR41-392-384-DATASHEET,0.728945,SR41-392-384-DATASHEET.txt,batteries-non-rechargable-primary,,,,1.0,0.016393,0.016393,0.032787
1,2.0,392-384z,0.726830,392-384z.txt,batteries-non-rechargable-primary,,,,1.0,0.016129,0.016393,0.032522
2,3.0,GT-0905A-19-Feb-2019,0.723888,GT-0905A-19-Feb-2019.txt,alarms-buzzers-and-sirens,,,,1.0,0.015873,0.016393,0.032266
3,4.0,maxell-alkaline-battery-002-2014,0.718862,maxell-alkaline-battery-002-2014.txt,batteries-non-rechargable-primary,,,,1.0,0.015625,0.016393,0.032018
4,5.0,MU064602-1,0.716878,MU064602-1.txt,microphones,,,,1.0,0.015385,0.016393,0.031778
...,...,...,...,...,...,...,...,...,...,...,...,...
195,101.0,,,Toshiba%20Tech%20Data%20LR44%202022,,161.0,batteries-non-rechargable-primary,2.062522,1.0,0.006211,0.016393,0.022605
196,101.0,,,Toshiba%20Tech%20Data%20CR2477%202022,,160.0,batteries-non-rechargable-primary,2.155258,1.0,0.006211,0.016393,0.022605
197,101.0,,,Toshiba%20Tech%20Data%20CR2016%202022,,156.0,batteries-non-rechargable-primary,2.467143,1.0,0.006211,0.016393,0.022605
198,101.0,,,Toshiba%20Tech%20Data%20CR1616%202022,,155.0,batteries-non-rechargable-primary,2.557758,1.0,0.006211,0.016393,0.022605


In [53]:
# Show the BM25 entry stored under the key '1cr2'.
# NOTE(review): `results_bm25` is not assigned in any cell visible here — confirm where it is built.
results_bm25['1cr2']

Unnamed: 0_level_0,doc_id,doc_name,label,score
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,37,123,batteries-non-rechargable-primary,5.875960
2,38,1cr2,batteries-non-rechargable-primary,5.740847
3,509,cms-402008-18sp,speakers,4.363099
4,105,maxell-alkaline-battery-002-2014,batteries-non-rechargable-primary,4.257775
5,60,alkaline_appman,batteries-non-rechargable-primary,4.058401
...,...,...,...,...
96,279,8024d901-2c69-4d26-b0cf-6e1bdb74ff9b,microphones,1.964044
97,185,LN92-Industrial-0920,batteries-non-rechargable-primary,1.949623
98,556,ces-803118-28pm,speakers,1.941543
99,350,EM-6050-14-Feb-2019,microphones,1.902635


In [54]:
# Show the VSM ranking frame that was passed to rrf_df as its first argument.
query_data

Unnamed: 0,rank,doc_id,score,doc_name,label
0,1,SR41-392-384-DATASHEET,0.728945,SR41-392-384-DATASHEET.txt,batteries-non-rechargable-primary
1,2,392-384z,0.726830,392-384z.txt,batteries-non-rechargable-primary
2,3,GT-0905A-19-Feb-2019,0.723888,GT-0905A-19-Feb-2019.txt,alarms-buzzers-and-sirens
3,4,maxell-alkaline-battery-002-2014,0.718862,maxell-alkaline-battery-002-2014.txt,batteries-non-rechargable-primary
4,5,MU064602-1,0.716878,MU064602-1.txt,microphones
...,...,...,...,...,...
95,96,cms-16093-078x-67,0.685307,cms-16093-078x-67.txt,speakers
96,97,cem-1212s,0.685244,cem-1212s.txt,alarms-buzzers-and-sirens
97,98,cms-160925-18sp-x8,0.685211,cms-160925-18sp-x8.txt,speakers
98,99,cds-15158-smt,0.685202,cds-15158-smt.txt,speakers
