In [None]:
import sys
import os
import itertools

import pandas as pd

sys.path.append(os.path.abspath('..'))
from src.env import PACKAGE_DIR
from src.utils.call_api.schema import EmbeddingEngineEnum
from src.retrievers.models.bm25_retriever import KeywordRetriever
from src.retrievers.models.vector_retriever import VectorRetriever
from src.retrievers.models.ensemble_retriever import EnsembleRetriever
from scripts.evaluate_retriver import evaluate

# Display settings: show up to 500 characters per cell and never hide columns.
pd.options.display.max_colwidth = 500
pd.options.display.max_columns = None

# Query/answer CSV that is passed to every evaluate() call in this notebook.
query_csv_path = PACKAGE_DIR / 'data/raw/valid_sets/query_ans_txt.csv'
query_df = pd.read_csv(query_csv_path)

## 定量評価

In [None]:
# Grid search over chunking settings. For every (chunk size, overlap) pair the
# matching chunked CSV is loaded and four retrievers (vector / keyword, each on
# the normal and the augmented columns) are scored at several top-k cut-offs.
#
# The grid lists use plural names so the loop variables `chunksize` / `overlap`
# below no longer overwrite them.
chunksizes = [250, 300, 350, 400, 450, 500]
overlaps = [0.2, 0.3, 0.4, 0.5]
topks = [5, 10, 25, 50]
comb_list = list(itertools.product(chunksizes, overlaps))

# One column -> list mapping per retriever; each row is one grid point.
result_columns = ['chunksize', 'overlap'] + [f'top{k}' for k in topks]
vec_norm_result, vec_aug_result, key_norm_result, key_aug_result = (
    {column: [] for column in result_columns} for _unused in range(4)
)

for chunksize, overlap in comb_list:
    index_dir = PACKAGE_DIR.joinpath(f'data/database/valid_sets/chartext_chunk{chunksize}_lap{overlap}')
    base_chunked_df = pd.read_csv(index_dir/'openai_large.csv')

    # Normal index search: rows de-duplicated on the `text` column.
    normal_index = base_chunked_df.drop_duplicates(subset='text')
    vecretriever = VectorRetriever(normal_index, model=EmbeddingEngineEnum.Large)
    keyretriever = KeywordRetriever(normal_index)

    # Augmented index search: full frame, using the augmented_* columns.
    aug_vecretriever = VectorRetriever(base_chunked_df, emb_column_name='augmented_embedding', target_column_name='augmented_text', model=EmbeddingEngineEnum.Large)
    aug_keyretriever = KeywordRetriever(base_chunked_df, tokenized_column_name='augmented_tokenized_text', target_column_name='augmented_text')

    # (retriever, result dict to fill, whether it searches the augmented columns)
    runs = [
        (vecretriever, vec_norm_result, False),
        (aug_vecretriever, vec_aug_result, True),
        (keyretriever, key_norm_result, False),
        (aug_keyretriever, key_aug_result, True),
    ]

    # Record the grid point once per result dict so all columns stay aligned.
    for retriever, result, is_augmented in runs:
        result['chunksize'].append(chunksize)
        result['overlap'].append(overlap)

    # Same call order as before: for each top-k, score the four retrievers in turn.
    for topk in topks:
        for retriever, result, is_augmented in runs:
            # Augmented retrievers are scored against the `text` column; a fresh
            # kwargs dict is built per call so no list is shared between calls.
            eval_kwargs = (
                {'evidence_column': 'text', 'require_columns': ['chunk_id', 'text']}
                if is_augmented
                else {}
            )
            result[f'top{topk}'].append(evaluate(retriever, topk, query_df, **eval_kwargs))

## 定性評価

In [None]:
# Chunking configuration inspected in the cells that follow.
chunksize, overlap = 400, 0.3

# Chunked CSV for that configuration.
index_csv_path = PACKAGE_DIR.joinpath(f'data/database/valid_sets/chartext_chunk{chunksize}_lap{overlap}/openai_large.csv')
index_df = pd.read_csv(index_csv_path)

In [None]:
# Normal index: rows de-duplicated on the `text` column.
normal_index = index_df.drop_duplicates(subset='text')

# Retrievers over the normal index.
keyretriever = KeywordRetriever(normal_index)
vecretriever = VectorRetriever(
    normal_index,
    model=EmbeddingEngineEnum.Large,
)

# Retrievers over the augmented columns of the full frame.
aug_keyretriever = KeywordRetriever(
    index_df,
    tokenized_column_name='augmented_tokenized_text',
    target_column_name='augmented_text',
)
aug_vecretriever = VectorRetriever(
    index_df,
    emb_column_name='augmented_embedding',
    target_column_name='augmented_text',
    model=EmbeddingEngineEnum.Large,
)

# Ensemble retrievers: each pairs a vector retriever with a keyword retriever.
ensretriever = EnsembleRetriever(vecretriever, keyretriever)
aug_ensretriever = EnsembleRetriever(aug_vecretriever, aug_keyretriever)

In [None]:
# Top-5 evaluation of the normal vector retriever.
# NOTE(review): return_miss=True presumably also returns the missed queries;
# confirm against evaluate().
evaluate(
    vecretriever,
    5,
    query_df,
    return_miss=True,
)

In [None]:
# Top-5 evaluation of the augmented vector retriever, scored against the
# `text` column.
# NOTE(review): return_miss=True presumably also returns the missed queries;
# confirm against evaluate().
evaluate(
    aug_vecretriever,
    5,
    query_df,
    return_miss=True,
    require_columns=['text', 'chunk_id'],
    evidence_column='text',
)

In [None]:
# Sweep the vector-retriever weight for the normal ensemble at top-10; the
# keyword retriever receives the remaining weight (1 - vec).
vec_weight_grid = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
for vec in vec_weight_grid:
    score = evaluate(
        ensretriever,
        10,
        query_df,
        weights=[vec, 1-vec],
        ensemble_method='rrf',
        rank_impact_mitigator=0,
    )
    print(f'score is {score} when vec: {vec}, key: {1-vec}')

In [None]:
# Sweep the vector-retriever weight for the augmented ensemble at top-10,
# scored against the `text` column; the keyword retriever receives 1 - vec.
aug_vec_weight_grid = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
for vec in aug_vec_weight_grid:
    score = evaluate(
        aug_ensretriever,
        10,
        query_df,
        weights=[vec, 1-vec],
        ensemble_method='rrf',
        rank_impact_mitigator=0,
        require_columns=['text', 'chunk_id'],
        evidence_column='text',
    )
    print(f'score is {score} when vec: {vec}, key: {1-vec}')

In [None]:
# Inspect the augmented ensemble's top-10 results for a single query.
q_id = 0

# The ensemble weight is set explicitly instead of reusing the `vec` variable
# left over from the weight-sweep loop in an earlier cell. 1.0 is the value
# that loop ends on, so the output matches a Restart & Run All of the notebook.
# TODO(review): replace with the weight chosen from the sweep results.
vec_weight = 1.0

# Show the question and its expected evidence, then the retrieved chunks.
print(query_df.loc[q_id, ['problem', 'evidence']])
aug_ensretriever.retrieve(
    query_df.loc[q_id, 'problem'],
    top_k=10,
    weights=[vec_weight, 1 - vec_weight],
    ensemble_method='rrf',
    rank_impact_mitigator=0,
    require_columns=['text', 'chunk_id'],
)

In [None]:
# Ad-hoc check of the normal vector retriever with a hand-written question
# (roughly: "Where is the protagonist Yoshida's affected area, mainly?").
vecretriever.retrieve(
    '主人公の吉田の患部は主にどこですか',
    top_k=10,
    require_columns=['chunk_id', 'text'],
)