# Imports and preparation

In [4]:
import os

In [5]:
import numpy as np

In [6]:
import pandas as pd

In [7]:
from tqdm import tqdm

In [8]:
import time, gc

In [9]:
from itertools import product


In [10]:
from util import util_elastic

OpenAI tiktoken module is not available for Python < 3.8,Linux ARM64 and AARCH64. Falling back to GPT2TokenizerFast.


In [11]:
from util import util_search

In [12]:
# Seed intended for reproducible results.
# NOTE(review): in the cells visible here the value is only assigned, never
# passed to numpy/torch/etc. — confirm it is actually consumed somewhere.
random_state = 1

# Treat infinite values (+inf and -inf) as NaN in pandas missing-value handling.
# NOTE(review): this option is deprecated in recent pandas releases — confirm
# against the installed pandas version.
pd.options.mode.use_inf_as_na = True

# IMPORTANT to make figures interactive (currently disabled)
# %matplotlib notebook

# Default figure size
figsize=(10,6)

# Display limits for DataFrame output. 'display.max_row' is an abbreviated
# form of the 'display.max_rows' option key.
pd.set_option('display.max_row', 1000)

pd.set_option('display.max_columns', 50)

pd.set_option('display.column_space', 40)
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.width', 200)


In [13]:
# Input CSV files (relative to the notebook directory): the query table and
# the query/document relevance judgments (qrels), both read with pd.read_csv.
PATH_QUERY = '../data/juris_tcu_index/query.csv'
PATH_QREL =  '../data/juris_tcu_index/qrel.csv'

In [14]:
# CSV files handed to util_search.add_experiment_result in the experiment
# loops: one for the experiment records and one for their results.
PATH_SEARCH_EXPERIMENT =  '../data/search/juris_tcu_index/search_experiment_juris_tcu_index.csv'
PATH_SEARCH_RESULT =  '../data/search/juris_tcu_index/search_experiment_result_juris_tcu_index.csv'

# Data load

## Query data load

In [15]:
# Load the query table from PATH_QUERY.
df_query = pd.read_csv(PATH_QUERY)

In [16]:
# Sanity check: (rows, columns) of the query table.
df_query.shape

(16022, 11)

In [17]:
# Preview the first queries and their metadata columns.
df_query.head()

Unnamed: 0,ID,TEXT,REFERENCE_LIST,PARADIGMATIC,AREA_NAME,AREA_ID_DESCRIPTOR,NORMATIVE_PROCESS_TYPE,NORMATIVE_IDENTIFICATION,NORMATIVE_DATE,NORMATIVE_AUTHOR_TYPE,NORMATIVE_AUTHOR_NAME
0,34899,"A transferência de documentos da entidade para local impróprio ao armazenamento, causando a perd...","Lei Ordinária 8.443/1992, art. 58, inciso II",,Responsabilidade,775,REPRESENTAÇÃO,Acórdão 2669/2012 - Plenário,2012-10-03,RELATOR,JOSÉ JORGE
1,30271,"A contratação de médicos e profissionais da área de saúde, como colaboradores eventuais, com pag...",,,Pessoal,1131,REPRESENTAÇÃO,Acórdão 2669/2012 - Plenário,2012-10-03,RELATOR,JOSÉ JORGE
2,26574,"Para que seja conhecido o recurso de revisão, não basta apenas que se apresente documento ainda ...",,,Direito processual,5288,TOMADA DE CONTAS,Acórdão 514/2013 - Plenário,2013-03-13,RELATOR,ANA ARRAES
3,17902,"A contratação de serviços comuns de engenharia que possam ser objetivamente definidos em edital,...",,,Licitação,932,RELATÓRIO DE LEVANTAMENTO,Acórdão 3144/2012 - Plenário,2012-11-21,RELATOR,ANA ARRAES
4,26089,"A Fundação Banco do Brasil, por receber recursos da União, deve observar, quando do repasse de r...",,,Competência do TCU,5095,TOMADA DE CONTAS ESPECIAL,Acórdão 2071/2013 - Plenário,2013-08-07,RELATOR,JOSÉ JORGE


## Qrel data load

In [18]:
# Load the relevance judgments (qrels) from PATH_QREL.
df_qrel = pd.read_csv(PATH_QREL)

In [19]:
# Sanity check: (rows, columns) of the qrel table.
df_qrel.shape

(94653, 3)

In [20]:
# Preview the qrel rows (ID_QUERY, ID_DOCTO, TYPE).
df_qrel.head()

Unnamed: 0,ID_QUERY,ID_DOCTO,TYPE
0,158,15147,INDEXACAO_EXTRA
1,37,15147,INDEXACAO_EXTRA
2,178,15147,INDEXACAO_EXTRA
3,14564,15147,INDEXACAO_EXTRA
4,9219,15147,INDEXACAO_EXTRA


In [21]:
# Number of qrel rows, for comparison with the row count after the merge.
df_qrel.shape[0]

94653

In [22]:
# Attach the relevance judgments to their queries (df_query.ID matched against
# df_qrel.ID_QUERY). how='left' keeps every query, including any without
# judgments; the now-redundant ID_QUERY column is dropped after the join.
df_search_data = pd.merge(
    df_query,
    df_qrel,
    how='left',
    left_on='ID',
    right_on='ID_QUERY',
).drop(columns='ID_QUERY')

In [23]:
# Row count after the merge (compare with the qrel row count shown earlier).
df_search_data.shape[0]


94653

In [24]:
# Group the merged rows by query ID and build, for each query, a dict mapping
# ID_DOCTO -> TYPE, stored in the new column 'RELEVANCE_DICT'.
# If the same ID_DOCTO appears more than once for a query, the last TYPE wins
# (plain dict semantics).
# Only the two columns the lambda needs are selected before .apply, so the
# function no longer operates on the grouping column 'ID' (that usage is
# deprecated in recent pandas); the resulting frame is unchanged.
df_new = (
    df_search_data
    .groupby('ID')[['ID_DOCTO', 'TYPE']]
    .apply(lambda x: dict(zip(x['ID_DOCTO'], x['TYPE'])))
    .reset_index(name='RELEVANCE_DICT')
)


In [25]:
# Preview the per-query relevance dictionaries.
df_new.head()

Unnamed: 0,ID,RELEVANCE_DICT
0,5,"{1298: 'INDEXACAO_EXTRA', 15340: 'INDEXACAO_EXTRA', 15961: 'INDEXACAO_EXTRA', 1131: 'AREA', 5106..."
1,6,"{207: 'INDEXACAO_EXTRA', 15961: 'INDEXACAO_EXTRA', 1131: 'AREA', 5106: 'SUBTEMA', 1113: 'TEMA'}"
2,7,"{108: 'INDEXACAO_EXTRA', 1526: 'TEMA', 1727: 'SUBTEMA', 223: 'INDEXACAO_EXTRA', 650: 'INDEXACAO_..."
3,8,"{887: 'SUBTEMA', 92: 'TEMA', 1131: 'AREA', 1100820: 'INDEXACAO_EXTRA'}"
4,9,"{1215: 'INDEXACAO_EXTRA', 14715: 'SUBTEMA', 46: 'TEMA', 1480: 'INDEXACAO_EXTRA', 5095: 'AREA'}"


In [26]:
# Join the per-query relevance dictionaries with the remaining query columns.
# drop_duplicates keeps the first merged row of each ID, so the result still
# has exactly one row per query.
df_new = df_new.merge(
    right=df_search_data.drop_duplicates(subset='ID'),
    how='left',
    on='ID',
)

In [27]:

# Keep the query attributes plus the aggregated RELEVANCE_DICT, in this fixed
# column order; the per-document ID_DOCTO / TYPE columns are left out.
selected_columns = [
    'ID',
    'TEXT',
    'REFERENCE_LIST',
    'PARADIGMATIC',
    'AREA_NAME',
    'AREA_ID_DESCRIPTOR',
    'NORMATIVE_PROCESS_TYPE',
    'NORMATIVE_IDENTIFICATION',
    'NORMATIVE_DATE',
    'NORMATIVE_AUTHOR_TYPE',
    'NORMATIVE_AUTHOR_NAME',
    'RELEVANCE_DICT',
]
df_search_data = df_new[selected_columns]


In [28]:
# Sanity check: one row per query, 11 query columns plus RELEVANCE_DICT.
df_search_data.shape

(16022, 12)

In [29]:
# Move the current index into a regular column named 'index' and install a
# fresh default index.
# NOTE(review): this cell is not idempotent (running it again adds yet another
# index column). Before switching to reset_index(drop=True), confirm whether
# util_search.experiment_run relies on the 'index' column.
df_search_data = df_search_data.reset_index()

In [30]:
# Preview the final search dataset passed to the experiments.
df_search_data.head()

Unnamed: 0,index,ID,TEXT,REFERENCE_LIST,PARADIGMATIC,AREA_NAME,AREA_ID_DESCRIPTOR,NORMATIVE_PROCESS_TYPE,NORMATIVE_IDENTIFICATION,NORMATIVE_DATE,NORMATIVE_AUTHOR_TYPE,NORMATIVE_AUTHOR_NAME,RELEVANCE_DICT
0,0,5,"SÚMULA TCU 1: Não se compreendem como vencimento, para efeito de concessão da pensão especial co...","Lei Ordinária 3738/1960 || Lei Ordinária 1711/1952, art. 184",SUMULA,Pessoal,1131,,Ata 88/1973 - Plenário,1973-12-04,RELATOR,OCTÁVIO GALLOTTI,"{1298: 'INDEXACAO_EXTRA', 15340: 'INDEXACAO_EXTRA', 15961: 'INDEXACAO_EXTRA', 1131: 'AREA', 5106..."
1,1,6,"SÚMULA TCU 2: Configura-se como vencimento, para efeito da concessão da pensão especial com fund...",Lei Ordinária 3738/1960,SUMULA,Pessoal,1131,,Ata 88/1973 - Plenário,1973-12-04,RELATOR,OCTÁVIO GALLOTTI,"{207: 'INDEXACAO_EXTRA', 15961: 'INDEXACAO_EXTRA', 1131: 'AREA', 5106: 'SUBTEMA', 1113: 'TEMA'}"
2,2,7,SÚMULA TCU 3: O arquivamento é a solução indicada para as hipóteses em que as contas de responsá...,,SUMULA,Direito processual,5288,,Ata 88/1973 - Plenário,1973-12-04,RELATOR,OCTÁVIO GALLOTTI,"{108: 'INDEXACAO_EXTRA', 1526: 'TEMA', 1727: 'SUBTEMA', 223: 'INDEXACAO_EXTRA', 650: 'INDEXACAO_..."
3,3,8,"SÚMULA TCU 4: A reclassificação de cargos não aproveita ao servidor aposentado, a menos que lei ...",,SUMULA,Pessoal,1131,,Ata 88/1973 - Plenário,1973-12-04,RELATOR,OCTÁVIO GALLOTTI,"{887: 'SUBTEMA', 92: 'TEMA', 1131: 'AREA', 1100820: 'INDEXACAO_EXTRA'}"
4,4,9,"SÚMULA TCU 5 (REVOGADA): As sociedades de economia mista, salvo disposição expressa em lei, não ...",,SUMULA,Competência do TCU,5095,,AC 2082/2007-PL,2007-10-03,RELATOR,UBIRATAN AGUIAR,"{1215: 'INDEXACAO_EXTRA', 14715: 'SUBTEMA', 46: 'TEMA', 1480: 'INDEXACAO_EXTRA', 5095: 'AREA'}"


In [31]:
# Drop the intermediate frame; its selected columns now live in df_search_data.
del df_new

# Create a reference to the search index (Elasticsearch)

- Haystack documentation: https://docs.haystack.deepset.ai/docs
- Haystack tutorial on DPR training: https://haystack.deepset.ai/tutorials/09_dpr_training


In [32]:
# Ask the project helper for the available indexes; parm_print=True makes it
# print each one as well.
# NOTE(review): 'indir' looks like an index-name prefix (the index listed in
# the output starts with it) — confirm in util_elastic.
index_dict = util_elastic.return_indexes('indir', parm_print=True)

Index: indir_juris_tcu_index
{'health': 'yellow', 'status': 'open', 'index': 'indir_juris_tcu_index', 'uuid': 'XqjmOmuaQxqmmxmdE65Q2Q', 'pri': '1', 'rep': '1', 'docs.count': '13252', 'docs.deleted': '13252', 'store.size': '289.2mb', 'pri.store.size': '289.2mb'}



In [33]:
# Show the index metadata returned by the helper.
index_dict

{'indir_juris_tcu_index': {'health': 'yellow',
  'status': 'open',
  'index': 'indir_juris_tcu_index',
  'uuid': 'XqjmOmuaQxqmmxmdE65Q2Q',
  'pri': '1',
  'rep': '1',
  'docs.count': '13252',
  'docs.deleted': '13252',
  'store.size': '289.2mb',
  'pri.store.size': '289.2mb'}}

In [34]:
# Name of the index searched by every pipeline in this notebook.
INDEX_NAME = 'indir_juris_tcu_index'

In [35]:
# Obtain the index object that is later passed to the pipeline factories.
# The recorded output (document/embedding counts and a sample document) is
# presumably printed by the helper itself — confirm in util_elastic.
index = util_elastic.return_index(parm_index_name=INDEX_NAME)


Qtd de documentos 13252

Qtd de embeddings 13252

Documento.id=1: <Document: id=1, content='O termo é "Abandono de cargo".
Abandono de cargo tem definição: "Configura abandono de cargo a ausên...'>


# Pipelines creation

In [36]:
# Registry of pipelines to evaluate; each pipeline section appends one dict
# here and the experiment grids iterate over this list.
pipes = []

In [37]:
# Query used to smoke-test each pipeline right after it is created.
# The commented-out line is an alternative test query kept for reference.
# parm_query = "trata-se de uma denúncia contra o prefeito de Manhuaçu por não haver pago os funcionários da área de limpeza urbana"
parm_query = "A transferência de documentos da entidade para local impróprio ao armazenamento, causando a perda de informações ou inviabilizando seu manuseio, de forma a impedir a atuação do TCU, é causa de responsabilização do gestor que a ordenou."

In [38]:
# Re-ranker model: identifier plus the local directory holding its files.
# NOTE(review): the description previously attached to this cell ("A mono-ptT5
# reranker model (850 mb) pretrained in the BrWac corpus, finetuned for 100k
# steps on Portuguese translated version of MS MARCO passage dataset")
# does not seem to match a model named mt5-3B — confirm which model it describes.
# NOTE(review): the base directory is an absolute, machine-specific path.
nome_modelo_monot5_3b = 'unicamp-dl/mt5-3B-mmarco-en-pt'
nome_caminho_modelo_monot5_3b = os.path.join(
    "/home/borela/fontes/relevar-busca/modelo/",
    nome_modelo_monot5_3b,
)
# Fail early, before any pipeline is built, if the model files are missing.
assert os.path.exists(nome_caminho_modelo_monot5_3b), f"Path para {nome_caminho_modelo_monot5_3b} não existe!"

In [39]:
# Second re-ranker model: identifier plus the local directory holding its files.
# NOTE(review): the base directory is an absolute, machine-specific path.
nome_modelo_ranking_minilm = 'unicamp-dl/mMiniLM-L6-v2-pt-v2'
nome_caminho_modelo_minilm = os.path.join(
    "/home/borela/fontes/relevar-busca/modelo/",
    nome_modelo_ranking_minilm,
)
# Fail early if the model files are missing.
assert os.path.exists(nome_caminho_modelo_minilm), f"Path para {nome_caminho_modelo_minilm} não existe!"

In [40]:
# Embedding model used by the sentence-similarity (sts) retriever: identifier
# plus the local directory holding its files.
# NOTE(review): the base directory is an absolute, machine-specific path.
nome_modelo_embedding_model_sts = "rufimelo/Legal-BERTimbau-sts-large-ma-v3"
nome_caminho_modelo_sts = os.path.join(
    "/home/borela/fontes/relevar-busca/modelo/",
    nome_modelo_embedding_model_sts,
)
# Fail early if the model files are missing.
assert os.path.exists(nome_caminho_modelo_sts), f"Path para {nome_caminho_modelo_sts} não existe!"


In [41]:
# Deliberate stop: raising here interrupts a "Run All" so that only the
# pipeline cells chosen by the user below are executed by hand.
# NOTE(review): this means the notebook cannot be reproduced end-to-end with
# Restart & Run All; the cells after this point depend on manual execution.
raise Exception ('Stop execution - create only desired pipelines in code below ')

Exception: Stop execution - create only desired pipelines in code below 

## First stage = BM25

### pipe_bm25_ranker_monot5_3b

In [None]:
# Build the BM25 + MONOT5 re-ranker pipeline over `index`.
# NOTE(review): parm_limit_query_size=350 presumably caps the query length —
# confirm the unit (characters/tokens) in util_search.
pipe_bm25_ranker_monot5_3b = util_search.return_pipeline_bm25_reranker(index, 'MONOT5', nome_caminho_modelo_monot5_3b, parm_limit_query_size=350)

In [None]:
# Register this pipeline for the experiment grids (they use `pipes` as the
# values of their 'PIPE' axis).
# Any earlier entry with the same PIPE_NAME is removed first, in place, so that
# re-running this cell replaces the entry instead of appending a duplicate;
# a duplicate would make every grid combination run twice.
pipes[:] = [p for p in pipes if p['PIPE_NAME'] != 'pipe_bm25_ranker_monot5_3b']
pipes.append({'PIPE_NAME': 'pipe_bm25_ranker_monot5_3b',
              'PIPE_OBJECT': pipe_bm25_ranker_monot5_3b,
              'RETRIEVER_TYPE': 'bm25',  # one of 'bm25' / 'sts'
              'RETRIEVER_MODEL_NAME': '',  # left empty for the bm25 retriever
              'RANKER_MODEL_NAME': nome_modelo_monot5_3b})

In [None]:
# Smoke test: run the pipeline on the test query and print a summary of the
# returned documents via the project helper.
doctos_retornados_ranker = pipe_bm25_ranker_monot5_3b.run(query=parm_query)
util_search.detail_document_found(doctos_retornados_ranker)

Parâmetros usados: {}
Consulta: A transferência de documentos da entidade para local impróprio ao armazenamento, causando a perda de informações ou inviabilizando seu manuseio, de forma a impedir a atuação do TCU, é causa de responsabilização do gestor que a ordenou.
Qtd documentos retornados: 10
Primeiro docto:
<Document: id=1104189, content='O termo é "Processo administrativo de responsabilização". Processo administrativo de responsabilizaç...'>

Último (10):
<Document: id=1102005, content='O termo é "Plano de contratações".
Plano de contratações tem definição: "É o documento no qual a org...'>
Seguem os nomes dos termos recuperados em ordem de score
0 : ['Processo administrativo de responsabilização', -1.0603076219558716]
1 : ['Competência do TCU', -1.4214444160461426]
2 : ['Evidência documental', -1.9571717977523804]
3 : ['Ofício de requisição', -2.266594648361206]
4 : ['Julgamento de contas', -2.5948143005371094]
5 : ['Nexo de causalidade', -3.0983963012695312]
6 : ['Teletrabalho'

### pipe_bm25_ranker_minilm

In [None]:
# Build the BM25 + MINILM re-ranker pipeline over `index`.
# NOTE(review): parm_limit_query_size=350 presumably caps the query length —
# confirm the unit (characters/tokens) in util_search.
pipe_bm25_ranker_minilm = util_search.return_pipeline_bm25_reranker(index, 'MINILM', nome_caminho_modelo_minilm, parm_limit_query_size=350)

In [None]:
# Register this pipeline for the experiment grids (they use `pipes` as the
# values of their 'PIPE' axis).
# Any earlier entry with the same PIPE_NAME is removed first, in place, so that
# re-running this cell replaces the entry instead of appending a duplicate;
# a duplicate would make every grid combination run twice.
pipes[:] = [p for p in pipes if p['PIPE_NAME'] != 'pipe_bm25_ranker_minilm']
pipes.append({'PIPE_NAME': 'pipe_bm25_ranker_minilm',
              'PIPE_OBJECT': pipe_bm25_ranker_minilm,
              'RETRIEVER_TYPE': 'bm25',  # one of 'bm25' / 'sts'
              'RETRIEVER_MODEL_NAME': '',  # left empty for the bm25 retriever
              'RANKER_MODEL_NAME': nome_modelo_ranking_minilm})

In [None]:
# Smoke test: run the pipeline on the test query and print a summary of the
# returned documents via the project helper.
doctos_retornados_ranker = pipe_bm25_ranker_minilm.run(query=parm_query)
util_search.detail_document_found(doctos_retornados_ranker)

Parâmetros usados: {}
Consulta: A transferência de documentos da entidade para local impróprio ao armazenamento, causando a perda de informações ou inviabilizando seu manuseio, de forma a impedir a atuação do TCU, é causa de responsabilização do gestor que a ordenou.
Qtd documentos retornados: 10
Primeiro docto:
<Document: id=5095, content='O termo é "Competência do TCU". Competência do TCU tem definição: "São as seguintes as competências ...'>

Último (10):
<Document: id=4495, content='O termo é "Nexo de causalidade".
Nexo de causalidade tem definição: "Elemento que evidencia se a con...'>
Seguem os nomes dos termos recuperados em ordem de score
0 : ['Competência do TCU', 0.0014646538766101003]
1 : ['Processo administrativo de responsabilização', 0.0004114470211789012]
2 : ['Evidência documental', 0.00033812460605986416]
3 : ['Plano de contratações', 0.0002457168884575367]
4 : ['Julgamento de contas', 0.00022130725847091526]
5 : ['Teletrabalho', 0.00016929447883740067]
6 : ['Ofício de

## First stage = Sentence Similarity

### pipe_sts_ranker_minilm

In [None]:
# Build the sentence-similarity (sts) + MINILM re-ranker pipeline over `index`,
# using the local sts embedding model and the local MiniLM ranker model.
# NOTE(review): parm_limit_query_size=350 presumably caps the query length —
# confirm the unit (characters/tokens) in util_search.
pipe_sts_ranker_minilm = util_search.return_pipeline_sts_reranker(index, 'MINILM', parm_path_model_ranker=nome_caminho_modelo_minilm, parm_path_model_sts=nome_caminho_modelo_sts, parm_limit_query_size=350)

In [None]:
# Register this pipeline for the experiment grids (they use `pipes` as the
# values of their 'PIPE' axis).
# Any earlier entry with the same PIPE_NAME is removed first, in place, so that
# re-running this cell replaces the entry instead of appending a duplicate;
# a duplicate would make every grid combination run twice.
pipes[:] = [p for p in pipes if p['PIPE_NAME'] != 'pipe_sts_ranker_minilm']
pipes.append({'PIPE_NAME': 'pipe_sts_ranker_minilm',
              'PIPE_OBJECT': pipe_sts_ranker_minilm,
              'RETRIEVER_TYPE': 'sts',  # one of 'bm25' / 'sts'
              'RETRIEVER_MODEL_NAME': nome_modelo_embedding_model_sts,  # embedding model of the sts retriever
              'RANKER_MODEL_NAME': nome_modelo_ranking_minilm})

In [None]:
# Smoke test: run the pipeline on the test query and print a summary of the
# returned documents via the project helper.
doctos_retornados_ranker = pipe_sts_ranker_minilm.run(query=parm_query)
util_search.detail_document_found(doctos_retornados_ranker)

Parâmetros usados: {}
Consulta: A transferência de documentos da entidade para local impróprio ao armazenamento, causando a perda de informações ou inviabilizando seu manuseio, de forma a impedir a atuação do TCU, é causa de responsabilização do gestor que a ordenou.
Qtd documentos retornados: 10
Primeiro docto:
<Document: id=1110487, content='O termo é "Vazamento de dados".
Vazamento de dados tem definição: "Transmissão não-autorizada de dad...'>

Último (10):
<Document: id=15939, content='O termo é "Erro de procedimento".
Erro de procedimento tem definição: "É um vício de forma, extrínse...'>
Seguem os nomes dos termos recuperados em ordem de score
0 : ['Vazamento de dados', 0.0024509401991963387]
1 : ['Termo de sigilo', 0.0007997120846994221]
2 : ['Trancamento das contas', 0.0002845456183422357]
3 : ['Risco de controle', 0.00022979704954195768]
4 : ['Revisão de ofício', 0.00013715452223550528]
5 : ['Responsabilidade perante o controle externo', 0.00013104191748425364]
6 : ['Anulação

### pipe_sts_ranker_monot5_3b

In [42]:
# Build the sentence-similarity (sts) + MONOT5 re-ranker pipeline over `index`,
# using the local sts embedding model and the local monoT5 ranker model.
# NOTE(review): parm_limit_query_size=350 presumably caps the query length —
# confirm the unit (characters/tokens) in util_search.
pipe_sts_ranker_monot5_3b = util_search.return_pipeline_sts_reranker(index, 'MONOT5', parm_path_model_ranker=nome_caminho_modelo_monot5_3b, parm_path_model_sts=nome_caminho_modelo_sts, parm_limit_query_size=350)

In [43]:
# Register this pipeline for the experiment grids (they use `pipes` as the
# values of their 'PIPE' axis).
# Any earlier entry with the same PIPE_NAME is removed first, in place, so that
# re-running this cell replaces the entry instead of appending a duplicate;
# a duplicate would make every grid combination run twice.
pipes[:] = [p for p in pipes if p['PIPE_NAME'] != 'pipe_sts_ranker_monot5_3b']
pipes.append({'PIPE_NAME': 'pipe_sts_ranker_monot5_3b',
              'PIPE_OBJECT': pipe_sts_ranker_monot5_3b,
              'RETRIEVER_TYPE': 'sts',  # one of 'bm25' / 'sts'
              'RETRIEVER_MODEL_NAME': nome_modelo_embedding_model_sts,  # embedding model of the sts retriever
              'RANKER_MODEL_NAME': nome_modelo_monot5_3b})

In [44]:
# Smoke test: run the pipeline on the test query and print a summary of the
# returned documents via the project helper.
doctos_retornados_ranker = pipe_sts_ranker_monot5_3b.run(query=parm_query)
util_search.detail_document_found(doctos_retornados_ranker)

Parâmetros usados: {}
Consulta: A transferência de documentos da entidade para local impróprio ao armazenamento, causando a perda de informações ou inviabilizando seu manuseio, de forma a impedir a atuação do TCU, é causa de responsabilização do gestor que a ordenou.
Qtd documentos retornados: 10
Primeiro docto:
<Document: id=5098, content='O termo é "Responsabilidade perante o controle externo".
Responsabilidade perante o controle externo...'>

Último (10):
<Document: id=83, content='O termo é "Anulação". Anulação tem definição: "É o ato ou a decisão, de caráter judicial ou administ...'>
Seguem os nomes dos termos recuperados em ordem de score
0 : ['Responsabilidade perante o controle externo', -0.29301929473876953]
1 : ['Termo de sigilo', -0.6637465357780457]
2 : ['Afastamento de responsável', -0.8156272172927856]
3 : ['Revisão de ofício', -0.9923231601715088]
4 : ['Trancamento das contas', -1.1499707698822021]
5 : ['Erro de procedimento', -2.8979315757751465]
6 : ['Vazamento de dado

# Experiments

## Run monoT5_3b: bm25

### topk_ranker 100

In [None]:
# Check which pipelines are registered before building the experiment grid.
pipes

[{'PIPE_NAME': 'pipe_bm25_ranker_monot5_3b',
  'PIPE_OBJECT': <haystack.pipelines.base.Pipeline at 0x7fd807582b50>,
  'RETRIEVER_TYPE': 'bm25',
  'RETRIEVER_MODEL_NAME': '',
  'RANKER_MODEL_NAME': 'unicamp-dl/mt5-3B-mmarco-en-pt'}]

In [None]:
# Experiment grid: every combination of the values below becomes one
# experiment. Key order matters: the grid is expanded with itertools.product,
# so the first key varies slowest in the resulting experiment list.
# 'DONE' starts as False and is flipped to True by the run loop.
# NOTE(review): the meaning of the CRITERIA values is defined inside
# util_search.experiment_run and is not visible in this notebook.
grid_experiment = {'CRITERIA' : ['total', 'total_gte_5'],
             'TOPK_RETRIEVER' : [300, 200, 100],
             'TOPK_RANKER' : [100],
             'PIPE': pipes,
             'DONE': [False]
}

In [None]:
# Expand the grid into a flat list with one dict per combination of axis
# values; each dict has the same keys, in the same order, as grid_experiment.
list_experiment = [
    dict(zip(grid_experiment, combination))
    for combination in product(*grid_experiment.values())
]


In [None]:
# Print how many experiments the grid expands to, then force a garbage
# collection. gc.collect() is the cell's last expression, so its return value
# is what appears as the cell output.
print(len(list_experiment))
gc.collect()

6


103

In [None]:
# Development aid: re-import util.util_search so that edits made to the module
# file take effect without restarting the kernel.
import importlib
importlib.reload(util_search)

dict_idcg_relevance_fixed {1: 1.0, 2: 1.6309297535714575, 3: 2.1309297535714578, 4: 2.5616063116448506, 5: 2.9484591188793923, 6: 3.3046663059874146, 7: 3.637999639320748, 8: 3.953464516106477, 9: 4.254494511770458, 10: 4.543559338088346, 11: 4.8225022837394755, 12: 5.092740438166795, 13: 5.355389973203989, 14: 5.611347998013804}


<module 'util.util_search' from '/home/borela/fontes/ind-ir/code/util/util_search.py'>

In [None]:
%%time
# colocar pequeno só para testes, depois voltar para 9999999 (número maior do que o número de termos a pesquisar)
limit_query = 100 # 9999999
for cnt, experiment in enumerate(list_experiment):
    if not experiment['DONE']:
        list_result_experiment = [] # por experiment
        # no caso de normas segecex, bastaria 224 para qtd5 e 891 para qtd1
        print(f"==={cnt}/{len(list_experiment)}===>  {time.strftime('%Y-%b-%d %H:%M:%S')} experiment: {experiment}")
        result_run = util_search.experiment_run(parm_df=df_search_data, 
                                                parm_experiment=experiment,
                                                parm_ndcg_position=12,
                                                parm_limit_query=limit_query,
                                                parm_print=True)
        list_result_experiment.append(result_run)
        # print(f"qtd_encontrado sim:{result_run['qtd_encontrado']}, não:{result_run['qtd_nao_encontrado']} ({result_run['percent_nao_encontrado']}%)")
        experiment['DONE'] = True
        util_search.add_experiment_result(parm_list_result=list_result_experiment, 
                                    parm_path_experiment= PATH_SEARCH_EXPERIMENT,
                                    parm_path_experiment_result= PATH_SEARCH_RESULT)



===0/6===>  2023-Jun-11 19:44:36 experiment: {'CRITERIA': 'total', 'TOPK_RETRIEVER': 300, 'TOPK_RANKER': 100, 'PIPE': {'PIPE_NAME': 'pipe_bm25_ranker_monot5_3b', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7fd807582b50>, 'RETRIEVER_TYPE': 'bm25', 'RETRIEVER_MODEL_NAME': '', 'RANKER_MODEL_NAME': 'unicamp-dl/mt5-3B-mmarco-en-pt'}, 'DONE': False}


 99%|█████████▉| 99/100 [35:03<00:21, 21.25s/it]


RANK1_MEAN: 4.1
NDCG_MEAN: 39.937
TIME_SPENT_MEAN: 21.039
===1/6===>  2023-Jun-11 20:19:40 experiment: {'CRITERIA': 'total', 'TOPK_RETRIEVER': 200, 'TOPK_RANKER': 100, 'PIPE': {'PIPE_NAME': 'pipe_bm25_ranker_monot5_3b', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7fd807582b50>, 'RETRIEVER_TYPE': 'bm25', 'RETRIEVER_MODEL_NAME': '', 'RANKER_MODEL_NAME': 'unicamp-dl/mt5-3B-mmarco-en-pt'}, 'DONE': False}


 99%|█████████▉| 99/100 [23:58<00:14, 14.53s/it]


RANK1_MEAN: 3.52
NDCG_MEAN: 38.97
TIME_SPENT_MEAN: 14.389
===2/6===>  2023-Jun-11 20:43:39 experiment: {'CRITERIA': 'total', 'TOPK_RETRIEVER': 100, 'TOPK_RANKER': 100, 'PIPE': {'PIPE_NAME': 'pipe_bm25_ranker_monot5_3b', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7fd807582b50>, 'RETRIEVER_TYPE': 'bm25', 'RETRIEVER_MODEL_NAME': '', 'RANKER_MODEL_NAME': 'unicamp-dl/mt5-3B-mmarco-en-pt'}, 'DONE': False}


 99%|█████████▉| 99/100 [12:00<00:07,  7.28s/it]


RANK1_MEAN: 2.87
NDCG_MEAN: 38.851
TIME_SPENT_MEAN: 7.206
===3/6===>  2023-Jun-11 20:55:40 experiment: {'CRITERIA': 'total_gte_5', 'TOPK_RETRIEVER': 300, 'TOPK_RANKER': 100, 'PIPE': {'PIPE_NAME': 'pipe_bm25_ranker_monot5_3b', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7fd807582b50>, 'RETRIEVER_TYPE': 'bm25', 'RETRIEVER_MODEL_NAME': '', 'RANKER_MODEL_NAME': 'unicamp-dl/mt5-3B-mmarco-en-pt'}, 'DONE': False}


 99%|█████████▉| 99/100 [35:48<00:21, 21.70s/it]


RANK1_MEAN: 3.16
NDCG_MEAN: 39.915
TIME_SPENT_MEAN: 21.487
===4/6===>  2023-Jun-11 21:31:28 experiment: {'CRITERIA': 'total_gte_5', 'TOPK_RETRIEVER': 200, 'TOPK_RANKER': 100, 'PIPE': {'PIPE_NAME': 'pipe_bm25_ranker_monot5_3b', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7fd807582b50>, 'RETRIEVER_TYPE': 'bm25', 'RETRIEVER_MODEL_NAME': '', 'RANKER_MODEL_NAME': 'unicamp-dl/mt5-3B-mmarco-en-pt'}, 'DONE': False}


 99%|█████████▉| 99/100 [24:01<00:14, 14.56s/it]


RANK1_MEAN: 2.81
NDCG_MEAN: 40.249
TIME_SPENT_MEAN: 14.419
===5/6===>  2023-Jun-11 21:55:31 experiment: {'CRITERIA': 'total_gte_5', 'TOPK_RETRIEVER': 100, 'TOPK_RANKER': 100, 'PIPE': {'PIPE_NAME': 'pipe_bm25_ranker_monot5_3b', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7fd807582b50>, 'RETRIEVER_TYPE': 'bm25', 'RETRIEVER_MODEL_NAME': '', 'RANKER_MODEL_NAME': 'unicamp-dl/mt5-3B-mmarco-en-pt'}, 'DONE': False}


 99%|█████████▉| 99/100 [12:06<00:07,  7.33s/it]

RANK1_MEAN: 2.82
NDCG_MEAN: 39.205
TIME_SPENT_MEAN: 7.26
CPU times: user 1h 26min 8s, sys: 55min 59s, total: 2h 22min 7s
Wall time: 2h 23min 1s





### topk_ranker 50

In [None]:
# Check which pipelines are registered before building the experiment grid.
pipes

[{'PIPE_NAME': 'pipe_bm25_ranker_monot5_3b',
  'PIPE_OBJECT': <haystack.pipelines.base.Pipeline at 0x7f3ff38305d0>,
  'RETRIEVER_TYPE': 'bm25',
  'RETRIEVER_MODEL_NAME': '',
  'RANKER_MODEL_NAME': 'unicamp-dl/mt5-3B-mmarco-en-pt'}]

In [None]:
# Experiment grid: every combination of the values below becomes one
# experiment. Key order matters: the grid is expanded with itertools.product,
# so the first key varies slowest in the resulting experiment list.
# 'DONE' starts as False and is flipped to True by the run loop.
# NOTE(review): the meaning of the CRITERIA values is defined inside
# util_search.experiment_run and is not visible in this notebook.
grid_experiment = {
             'CRITERIA' : ['total', 'total_gte_5'],
             'TOPK_RETRIEVER' : [400, 300, 200, 100],
             'TOPK_RANKER' : [50],
             'DONE': [False],
             'PIPE': pipes,
}

In [None]:
# Expand the grid into a flat list with one dict per combination of axis
# values; each dict has the same keys, in the same order, as grid_experiment.
list_experiment = [
    dict(zip(grid_experiment, combination))
    for combination in product(*grid_experiment.values())
]


In [None]:
# Print how many experiments the grid expands to, then force a garbage
# collection. gc.collect() is the cell's last expression, so its return value
# is what appears as the cell output.
print(len(list_experiment))
gc.collect()

8


114

In [None]:
# Development aid: re-import util.util_search so that edits made to the module
# file take effect without restarting the kernel.
import importlib
importlib.reload(util_search)

dict_idcg_relevance_fixed {1: 1.0, 2: 1.6309297535714575, 3: 2.1309297535714578, 4: 2.5616063116448506, 5: 2.9484591188793923, 6: 3.3046663059874146, 7: 3.637999639320748, 8: 3.953464516106477, 9: 4.254494511770458, 10: 4.543559338088346, 11: 4.8225022837394755, 12: 5.092740438166795, 13: 5.355389973203989, 14: 5.611347998013804}


<module 'util.util_search' from '/home/borela/fontes/ind-ir/code/util/util_search.py'>

In [None]:
%%time
# colocar pequeno só para testes, depois voltar para 9999999 (número maior do que o número de termos a pesquisar)
limit_query = 100 # 9999999
for cnt, experiment in enumerate(list_experiment):
    if not experiment['DONE']:
        list_result_experiment = [] # por experiment
        # no caso de normas segecex, bastaria 224 para qtd5 e 891 para qtd1
        print(f"==={cnt}/{len(list_experiment)}===>  {time.strftime('%Y-%b-%d %H:%M:%S')} experiment: {experiment}")
        result_run = util_search.experiment_run(parm_df=df_search_data, 
                                                parm_experiment=experiment,
                                                parm_ndcg_position=12,
                                                parm_limit_query=limit_query,
                                                parm_print=True)
        list_result_experiment.append(result_run)
        # print(f"qtd_encontrado sim:{result_run['qtd_encontrado']}, não:{result_run['qtd_nao_encontrado']} ({result_run['percent_nao_encontrado']}%)")
        experiment['DONE'] = True
        util_search.add_experiment_result(parm_list_result=list_result_experiment, 
                                    parm_path_experiment= PATH_SEARCH_EXPERIMENT,
                                    parm_path_experiment_result= PATH_SEARCH_RESULT)



===0/8===>  2023-Jun-12 11:37:50 experiment: {'CRITERIA': 'total', 'TOPK_RETRIEVER': 400, 'TOPK_RANKER': 50, 'DONE': False, 'PIPE': {'PIPE_NAME': 'pipe_bm25_ranker_monot5_3b', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f3ff38305d0>, 'RETRIEVER_TYPE': 'bm25', 'RETRIEVER_MODEL_NAME': '', 'RANKER_MODEL_NAME': 'unicamp-dl/mt5-3B-mmarco-en-pt'}}


 99%|█████████▉| 99/100 [46:28<00:28, 28.17s/it]


RANK1_MEAN: 3.57
NDCG_MEAN: 39.843
TIME_SPENT_MEAN: 27.889
===1/8===>  2023-Jun-12 12:24:19 experiment: {'CRITERIA': 'total', 'TOPK_RETRIEVER': 300, 'TOPK_RANKER': 50, 'DONE': False, 'PIPE': {'PIPE_NAME': 'pipe_bm25_ranker_monot5_3b', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f3ff38305d0>, 'RETRIEVER_TYPE': 'bm25', 'RETRIEVER_MODEL_NAME': '', 'RANKER_MODEL_NAME': 'unicamp-dl/mt5-3B-mmarco-en-pt'}}


 99%|█████████▉| 99/100 [35:27<00:21, 21.49s/it]


RANK1_MEAN: 4.1
NDCG_MEAN: 39.937
TIME_SPENT_MEAN: 21.274
===2/8===>  2023-Jun-12 12:59:47 experiment: {'CRITERIA': 'total', 'TOPK_RETRIEVER': 200, 'TOPK_RANKER': 50, 'DONE': False, 'PIPE': {'PIPE_NAME': 'pipe_bm25_ranker_monot5_3b', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f3ff38305d0>, 'RETRIEVER_TYPE': 'bm25', 'RETRIEVER_MODEL_NAME': '', 'RANKER_MODEL_NAME': 'unicamp-dl/mt5-3B-mmarco-en-pt'}}


 99%|█████████▉| 99/100 [23:48<00:14, 14.43s/it]


RANK1_MEAN: 3.52
NDCG_MEAN: 38.97
TIME_SPENT_MEAN: 14.286
===3/8===>  2023-Jun-12 13:23:36 experiment: {'CRITERIA': 'total', 'TOPK_RETRIEVER': 100, 'TOPK_RANKER': 50, 'DONE': False, 'PIPE': {'PIPE_NAME': 'pipe_bm25_ranker_monot5_3b', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f3ff38305d0>, 'RETRIEVER_TYPE': 'bm25', 'RETRIEVER_MODEL_NAME': '', 'RANKER_MODEL_NAME': 'unicamp-dl/mt5-3B-mmarco-en-pt'}}


 99%|█████████▉| 99/100 [11:57<00:07,  7.24s/it]


RANK1_MEAN: 2.87
NDCG_MEAN: 38.851
TIME_SPENT_MEAN: 7.172
===4/8===>  2023-Jun-12 13:35:33 experiment: {'CRITERIA': 'total_gte_5', 'TOPK_RETRIEVER': 400, 'TOPK_RANKER': 50, 'DONE': False, 'PIPE': {'PIPE_NAME': 'pipe_bm25_ranker_monot5_3b', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f3ff38305d0>, 'RETRIEVER_TYPE': 'bm25', 'RETRIEVER_MODEL_NAME': '', 'RANKER_MODEL_NAME': 'unicamp-dl/mt5-3B-mmarco-en-pt'}}


 99%|█████████▉| 99/100 [47:07<00:28, 28.56s/it]


RANK1_MEAN: 3.6
NDCG_MEAN: 39.936
TIME_SPENT_MEAN: 28.278
===5/8===>  2023-Jun-12 14:22:41 experiment: {'CRITERIA': 'total_gte_5', 'TOPK_RETRIEVER': 300, 'TOPK_RANKER': 50, 'DONE': False, 'PIPE': {'PIPE_NAME': 'pipe_bm25_ranker_monot5_3b', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f3ff38305d0>, 'RETRIEVER_TYPE': 'bm25', 'RETRIEVER_MODEL_NAME': '', 'RANKER_MODEL_NAME': 'unicamp-dl/mt5-3B-mmarco-en-pt'}}


 99%|█████████▉| 99/100 [35:29<00:21, 21.51s/it]


RANK1_MEAN: 3.16
NDCG_MEAN: 39.915
TIME_SPENT_MEAN: 21.299
===6/8===>  2023-Jun-12 14:58:12 experiment: {'CRITERIA': 'total_gte_5', 'TOPK_RETRIEVER': 200, 'TOPK_RANKER': 50, 'DONE': False, 'PIPE': {'PIPE_NAME': 'pipe_bm25_ranker_monot5_3b', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f3ff38305d0>, 'RETRIEVER_TYPE': 'bm25', 'RETRIEVER_MODEL_NAME': '', 'RANKER_MODEL_NAME': 'unicamp-dl/mt5-3B-mmarco-en-pt'}}


 99%|█████████▉| 99/100 [23:48<00:14, 14.43s/it]


RANK1_MEAN: 2.81
NDCG_MEAN: 40.249
TIME_SPENT_MEAN: 14.283
===7/8===>  2023-Jun-12 15:22:00 experiment: {'CRITERIA': 'total_gte_5', 'TOPK_RETRIEVER': 100, 'TOPK_RANKER': 50, 'DONE': False, 'PIPE': {'PIPE_NAME': 'pipe_bm25_ranker_monot5_3b', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f3ff38305d0>, 'RETRIEVER_TYPE': 'bm25', 'RETRIEVER_MODEL_NAME': '', 'RANKER_MODEL_NAME': 'unicamp-dl/mt5-3B-mmarco-en-pt'}}


 99%|█████████▉| 99/100 [11:59<00:07,  7.27s/it]


RANK1_MEAN: 2.82
NDCG_MEAN: 39.205
TIME_SPENT_MEAN: 7.197
CPU times: user 2h 20min 58s, sys: 1h 33min 51s, total: 3h 54min 50s
Wall time: 3h 56min 9s


## Run monoT5_3b: sts

### topk_ranker 100

In [None]:
# Check which pipelines are registered before building the experiment grid.
pipes

[{'PIPE_NAME': 'pipe_sts_ranker_monot5_3b',
  'PIPE_OBJECT': <haystack.pipelines.base.Pipeline at 0x7f45196091d0>,
  'RETRIEVER_TYPE': 'sts',
  'RETRIEVER_MODEL_NAME': 'rufimelo/Legal-BERTimbau-sts-large-ma-v3',
  'RANKER_MODEL_NAME': 'unicamp-dl/mt5-3B-mmarco-en-pt'}]

In [None]:
# Experiment grid: every combination of the values below becomes one
# experiment. Key order matters: the grid is expanded with itertools.product,
# so the first key varies slowest in the resulting experiment list.
# 'DONE' starts as False and is flipped to True by the run loop.
# NOTE(review): the meaning of the CRITERIA values is defined inside
# util_search.experiment_run and is not visible in this notebook.
grid_experiment = {'CRITERIA' : ['total', 'total_gte_5'],
             'TOPK_RETRIEVER' : [300, 200, 100],
             'TOPK_RANKER' : [100],
             'PIPE': pipes,
             'DONE': [False]
}

In [None]:
# Expand the grid into a flat list with one dict per combination of axis
# values; each dict has the same keys, in the same order, as grid_experiment.
list_experiment = [
    dict(zip(grid_experiment, combination))
    for combination in product(*grid_experiment.values())
]


In [None]:
# Print how many experiments the grid expands to, then force a garbage
# collection. gc.collect() is the cell's last expression, so its return value
# is what appears as the cell output.
print(len(list_experiment))
gc.collect()

6


110

In [None]:
# Re-execute util/util_search.py in the running kernel so that recent edits to the module
# are picked up without restarting (anything the module prints at import time shows up here).
# NOTE(review): this import is repeated in several cells; consider `%autoreload 2` in the setup cell.
import importlib
importlib.reload(util_search)

dict_idcg_relevance_fixed {1: 1.0, 2: 1.6309297535714575, 3: 2.1309297535714578, 4: 2.5616063116448506, 5: 2.9484591188793923, 6: 3.3046663059874146, 7: 3.637999639320748, 8: 3.953464516106477, 9: 4.254494511770458, 10: 4.543559338088346, 11: 4.8225022837394755, 12: 5.092740438166795, 13: 5.355389973203989, 14: 5.611347998013804}


<module 'util.util_search' from '/home/borela/fontes/ind-ir/code/util/util_search.py'>

In [None]:
%%time
# Keep this small for test runs only; afterwards restore 9999999
# (a number larger than the number of terms to search).
limit_query = 100  # full run: 9999999

# Run every experiment of the grid that is not flagged as done and persist its result
# right after it finishes, so re-running the cell resumes where the grid stopped.
for cnt, experiment in enumerate(list_experiment):
    if experiment['DONE']:
        continue

    list_result_experiment = []  # results of this experiment only
    # Original note: for the "normas segecex" case, 224 would be enough for qtd5 and 891 for qtd1
    # (presumably refers to limit_query -- confirm).
    print(
        f"==={cnt}/{len(list_experiment)}===>  "
        f"{time.strftime('%Y-%b-%d %H:%M:%S')} experiment: {experiment}"
    )
    result_run = util_search.experiment_run(
        parm_df=df_search_data,
        parm_experiment=experiment,
        parm_ndcg_position=12,
        parm_limit_query=limit_query,
        parm_print=True,
    )
    list_result_experiment.append(result_run)

    # The flag is set before saving, exactly as in the original statement order.
    experiment['DONE'] = True
    util_search.add_experiment_result(
        parm_list_result=list_result_experiment,
        parm_path_experiment=PATH_SEARCH_EXPERIMENT,
        parm_path_experiment_result=PATH_SEARCH_RESULT,
    )



===0/6===>  2023-Jun-12 07:11:01 experiment: {'CRITERIA': 'total', 'TOPK_RETRIEVER': 300, 'TOPK_RANKER': 100, 'PIPE': {'PIPE_NAME': 'pipe_sts_ranker_monot5_3b', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f45196091d0>, 'RETRIEVER_TYPE': 'sts', 'RETRIEVER_MODEL_NAME': 'rufimelo/Legal-BERTimbau-sts-large-ma-v3', 'RANKER_MODEL_NAME': 'unicamp-dl/mt5-3B-mmarco-en-pt'}, 'DONE': False}


 99%|█████████▉| 99/100 [33:37<00:20, 20.38s/it]


RANK1_MEAN: 3.81
NDCG_MEAN: 36.022
TIME_SPENT_MEAN: 20.177
===1/6===>  2023-Jun-12 07:44:39 experiment: {'CRITERIA': 'total', 'TOPK_RETRIEVER': 200, 'TOPK_RANKER': 100, 'PIPE': {'PIPE_NAME': 'pipe_sts_ranker_monot5_3b', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f45196091d0>, 'RETRIEVER_TYPE': 'sts', 'RETRIEVER_MODEL_NAME': 'rufimelo/Legal-BERTimbau-sts-large-ma-v3', 'RANKER_MODEL_NAME': 'unicamp-dl/mt5-3B-mmarco-en-pt'}, 'DONE': False}


 99%|█████████▉| 99/100 [23:00<00:13, 13.94s/it]


RANK1_MEAN: 3.859
NDCG_MEAN: 35.575
TIME_SPENT_MEAN: 13.801
===2/6===>  2023-Jun-12 08:07:39 experiment: {'CRITERIA': 'total', 'TOPK_RETRIEVER': 100, 'TOPK_RANKER': 100, 'PIPE': {'PIPE_NAME': 'pipe_sts_ranker_monot5_3b', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f45196091d0>, 'RETRIEVER_TYPE': 'sts', 'RETRIEVER_MODEL_NAME': 'rufimelo/Legal-BERTimbau-sts-large-ma-v3', 'RANKER_MODEL_NAME': 'unicamp-dl/mt5-3B-mmarco-en-pt'}, 'DONE': False}


 99%|█████████▉| 99/100 [11:39<00:07,  7.07s/it]


RANK1_MEAN: 3.547
NDCG_MEAN: 32.427
TIME_SPENT_MEAN: 6.998
===3/6===>  2023-Jun-12 08:19:19 experiment: {'CRITERIA': 'total_gte_5', 'TOPK_RETRIEVER': 300, 'TOPK_RANKER': 100, 'PIPE': {'PIPE_NAME': 'pipe_sts_ranker_monot5_3b', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f45196091d0>, 'RETRIEVER_TYPE': 'sts', 'RETRIEVER_MODEL_NAME': 'rufimelo/Legal-BERTimbau-sts-large-ma-v3', 'RANKER_MODEL_NAME': 'unicamp-dl/mt5-3B-mmarco-en-pt'}, 'DONE': False}


 99%|█████████▉| 99/100 [34:37<00:20, 20.98s/it]


RANK1_MEAN: 3.54
NDCG_MEAN: 36.67
TIME_SPENT_MEAN: 20.773
===4/6===>  2023-Jun-12 08:53:57 experiment: {'CRITERIA': 'total_gte_5', 'TOPK_RETRIEVER': 200, 'TOPK_RANKER': 100, 'PIPE': {'PIPE_NAME': 'pipe_sts_ranker_monot5_3b', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f45196091d0>, 'RETRIEVER_TYPE': 'sts', 'RETRIEVER_MODEL_NAME': 'rufimelo/Legal-BERTimbau-sts-large-ma-v3', 'RANKER_MODEL_NAME': 'unicamp-dl/mt5-3B-mmarco-en-pt'}, 'DONE': False}


 99%|█████████▉| 99/100 [23:14<00:14, 14.09s/it]


RANK1_MEAN: 3.33
NDCG_MEAN: 36.033
TIME_SPENT_MEAN: 13.947
===5/6===>  2023-Jun-12 09:17:11 experiment: {'CRITERIA': 'total_gte_5', 'TOPK_RETRIEVER': 100, 'TOPK_RANKER': 100, 'PIPE': {'PIPE_NAME': 'pipe_sts_ranker_monot5_3b', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f45196091d0>, 'RETRIEVER_TYPE': 'sts', 'RETRIEVER_MODEL_NAME': 'rufimelo/Legal-BERTimbau-sts-large-ma-v3', 'RANKER_MODEL_NAME': 'unicamp-dl/mt5-3B-mmarco-en-pt'}, 'DONE': False}


 99%|█████████▉| 99/100 [11:45<00:07,  7.12s/it]

RANK1_MEAN: 2.646
NDCG_MEAN: 35.36
TIME_SPENT_MEAN: 7.05
CPU times: user 1h 19min 45s, sys: 57min 19s, total: 2h 17min 5s
Wall time: 2h 17min 55s





### topk_ranker 50

In [45]:
# Display the pipeline configurations currently in memory; they are used as the 'PIPE' axis of the grid below.
pipes

[{'PIPE_NAME': 'pipe_sts_ranker_monot5_3b',
  'PIPE_OBJECT': <haystack.pipelines.base.Pipeline at 0x7f9037d80290>,
  'RETRIEVER_TYPE': 'sts',
  'RETRIEVER_MODEL_NAME': 'rufimelo/Legal-BERTimbau-sts-large-ma-v3',
  'RANKER_MODEL_NAME': 'unicamp-dl/mt5-3B-mmarco-en-pt'}]

In [46]:
# Parameter grid for this batch: each key maps to the list of values to try.
# Key order is significant: it defines the key order of the experiment dicts built from it.
grid_experiment = {
    'CRITERIA': ['total', 'total_gte_5'],
    'TOPK_RETRIEVER': [400, 300, 200, 100],
    'TOPK_RANKER': [50],
    'DONE': [False],
    'PIPE': pipes,
}

In [47]:
# Expand the grid into one experiment dict per combination
# (cartesian product of the value lists, keyed by the grid's keys).
list_experiment = [
    dict(zip(grid_experiment, combination))
    for combination in product(*grid_experiment.values())
]


In [48]:
# Sanity check: number of experiment configurations produced by the grid expansion.
print(len(list_experiment))
# Run a garbage collection before the long experiment loop. Being the last expression of the
# cell, its return value (number of unreachable objects found) is shown as the cell output.
gc.collect()

8


101

In [None]:
# Re-execute util/util_search.py in the running kernel so that recent edits to the module
# are picked up without restarting (anything the module prints at import time shows up here).
# NOTE(review): this import is repeated in several cells; consider `%autoreload 2` in the setup cell.
import importlib
importlib.reload(util_search)

dict_idcg_relevance_fixed {1: 1.0, 2: 1.6309297535714575, 3: 2.1309297535714578, 4: 2.5616063116448506, 5: 2.9484591188793923, 6: 3.3046663059874146, 7: 3.637999639320748, 8: 3.953464516106477, 9: 4.254494511770458, 10: 4.543559338088346, 11: 4.8225022837394755, 12: 5.092740438166795, 13: 5.355389973203989, 14: 5.611347998013804}


<module 'util.util_search' from '/home/borela/fontes/ind-ir/code/util/util_search.py'>

In [49]:
%%time
# Keep this small for test runs only; afterwards restore 9999999
# (a number larger than the number of terms to search).
limit_query = 100  # full run: 9999999

# Run every experiment of the grid that is not flagged as done and persist its result
# right after it finishes, so re-running the cell resumes where the grid stopped.
for cnt, experiment in enumerate(list_experiment):
    if experiment['DONE']:
        continue

    list_result_experiment = []  # results of this experiment only
    # Original note: for the "normas segecex" case, 224 would be enough for qtd5 and 891 for qtd1
    # (presumably refers to limit_query -- confirm).
    print(
        f"==={cnt}/{len(list_experiment)}===>  "
        f"{time.strftime('%Y-%b-%d %H:%M:%S')} experiment: {experiment}"
    )
    result_run = util_search.experiment_run(
        parm_df=df_search_data,
        parm_experiment=experiment,
        parm_ndcg_position=12,
        parm_limit_query=limit_query,
        parm_print=True,
    )
    list_result_experiment.append(result_run)

    # The flag is set before saving, exactly as in the original statement order.
    experiment['DONE'] = True
    util_search.add_experiment_result(
        parm_list_result=list_result_experiment,
        parm_path_experiment=PATH_SEARCH_EXPERIMENT,
        parm_path_experiment_result=PATH_SEARCH_RESULT,
    )



===0/8===>  2023-Jun-12 17:05:59 experiment: {'CRITERIA': 'total', 'TOPK_RETRIEVER': 400, 'TOPK_RANKER': 50, 'DONE': False, 'PIPE': {'PIPE_NAME': 'pipe_sts_ranker_monot5_3b', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f9037d80290>, 'RETRIEVER_TYPE': 'sts', 'RETRIEVER_MODEL_NAME': 'rufimelo/Legal-BERTimbau-sts-large-ma-v3', 'RANKER_MODEL_NAME': 'unicamp-dl/mt5-3B-mmarco-en-pt'}}


 99%|█████████▉| 99/100 [44:46<00:27, 27.13s/it]


RANK1_MEAN: 3.87
NDCG_MEAN: 36.152
TIME_SPENT_MEAN: 26.863
===1/8===>  2023-Jun-12 17:50:46 experiment: {'CRITERIA': 'total', 'TOPK_RETRIEVER': 300, 'TOPK_RANKER': 50, 'DONE': False, 'PIPE': {'PIPE_NAME': 'pipe_sts_ranker_monot5_3b', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f9037d80290>, 'RETRIEVER_TYPE': 'sts', 'RETRIEVER_MODEL_NAME': 'rufimelo/Legal-BERTimbau-sts-large-ma-v3', 'RANKER_MODEL_NAME': 'unicamp-dl/mt5-3B-mmarco-en-pt'}}


 99%|█████████▉| 99/100 [34:07<00:20, 20.68s/it]


RANK1_MEAN: 3.81
NDCG_MEAN: 36.022
TIME_SPENT_MEAN: 20.474
===2/8===>  2023-Jun-12 18:24:53 experiment: {'CRITERIA': 'total', 'TOPK_RETRIEVER': 200, 'TOPK_RANKER': 50, 'DONE': False, 'PIPE': {'PIPE_NAME': 'pipe_sts_ranker_monot5_3b', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f9037d80290>, 'RETRIEVER_TYPE': 'sts', 'RETRIEVER_MODEL_NAME': 'rufimelo/Legal-BERTimbau-sts-large-ma-v3', 'RANKER_MODEL_NAME': 'unicamp-dl/mt5-3B-mmarco-en-pt'}}


 99%|█████████▉| 99/100 [22:53<00:13, 13.88s/it]


RANK1_MEAN: 3.859
NDCG_MEAN: 35.575
TIME_SPENT_MEAN: 13.738
===3/8===>  2023-Jun-12 18:47:48 experiment: {'CRITERIA': 'total', 'TOPK_RETRIEVER': 100, 'TOPK_RANKER': 50, 'DONE': False, 'PIPE': {'PIPE_NAME': 'pipe_sts_ranker_monot5_3b', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f9037d80290>, 'RETRIEVER_TYPE': 'sts', 'RETRIEVER_MODEL_NAME': 'rufimelo/Legal-BERTimbau-sts-large-ma-v3', 'RANKER_MODEL_NAME': 'unicamp-dl/mt5-3B-mmarco-en-pt'}}


 99%|█████████▉| 99/100 [11:35<00:07,  7.02s/it]


RANK1_MEAN: 3.547
NDCG_MEAN: 32.427
TIME_SPENT_MEAN: 6.953
===4/8===>  2023-Jun-12 18:59:23 experiment: {'CRITERIA': 'total_gte_5', 'TOPK_RETRIEVER': 400, 'TOPK_RANKER': 50, 'DONE': False, 'PIPE': {'PIPE_NAME': 'pipe_sts_ranker_monot5_3b', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f9037d80290>, 'RETRIEVER_TYPE': 'sts', 'RETRIEVER_MODEL_NAME': 'rufimelo/Legal-BERTimbau-sts-large-ma-v3', 'RANKER_MODEL_NAME': 'unicamp-dl/mt5-3B-mmarco-en-pt'}}


 99%|█████████▉| 99/100 [45:39<00:27, 27.67s/it]


RANK1_MEAN: 4.25
NDCG_MEAN: 35.842
TIME_SPENT_MEAN: 27.392
===5/8===>  2023-Jun-12 19:45:03 experiment: {'CRITERIA': 'total_gte_5', 'TOPK_RETRIEVER': 300, 'TOPK_RANKER': 50, 'DONE': False, 'PIPE': {'PIPE_NAME': 'pipe_sts_ranker_monot5_3b', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f9037d80290>, 'RETRIEVER_TYPE': 'sts', 'RETRIEVER_MODEL_NAME': 'rufimelo/Legal-BERTimbau-sts-large-ma-v3', 'RANKER_MODEL_NAME': 'unicamp-dl/mt5-3B-mmarco-en-pt'}}


 99%|█████████▉| 99/100 [34:37<00:20, 20.98s/it]


RANK1_MEAN: 3.54
NDCG_MEAN: 36.67
TIME_SPENT_MEAN: 20.772
===6/8===>  2023-Jun-12 20:19:40 experiment: {'CRITERIA': 'total_gte_5', 'TOPK_RETRIEVER': 200, 'TOPK_RANKER': 50, 'DONE': False, 'PIPE': {'PIPE_NAME': 'pipe_sts_ranker_monot5_3b', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f9037d80290>, 'RETRIEVER_TYPE': 'sts', 'RETRIEVER_MODEL_NAME': 'rufimelo/Legal-BERTimbau-sts-large-ma-v3', 'RANKER_MODEL_NAME': 'unicamp-dl/mt5-3B-mmarco-en-pt'}}


 99%|█████████▉| 99/100 [23:12<00:14, 14.06s/it]


RANK1_MEAN: 3.33
NDCG_MEAN: 36.033
TIME_SPENT_MEAN: 13.921
===7/8===>  2023-Jun-12 20:42:53 experiment: {'CRITERIA': 'total_gte_5', 'TOPK_RETRIEVER': 100, 'TOPK_RANKER': 50, 'DONE': False, 'PIPE': {'PIPE_NAME': 'pipe_sts_ranker_monot5_3b', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f9037d80290>, 'RETRIEVER_TYPE': 'sts', 'RETRIEVER_MODEL_NAME': 'rufimelo/Legal-BERTimbau-sts-large-ma-v3', 'RANKER_MODEL_NAME': 'unicamp-dl/mt5-3B-mmarco-en-pt'}}


 99%|█████████▉| 99/100 [11:42<00:07,  7.10s/it]


RANK1_MEAN: 2.646
NDCG_MEAN: 35.36
TIME_SPENT_MEAN: 7.024
CPU times: user 2h 8min 33s, sys: 1h 38min 43s, total: 3h 47min 16s
Wall time: 3h 48min 36s


## Rodar minilm: bm25 e sts

In [None]:
# Display the pipeline configurations currently in memory; they are used as the 'PIPE' axis of the grid below.
pipes

[{'PIPE_NAME': 'pipe_bm25_ranker_minilm',
  'PIPE_OBJECT': <haystack.pipelines.base.Pipeline at 0x7f25f350d3d0>,
  'RETRIEVER_TYPE': 'bm25',
  'RETRIEVER_MODEL_NAME': '',
  'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'},
 {'PIPE_NAME': 'pipe_sts_ranker_minilm',
  'PIPE_OBJECT': <haystack.pipelines.base.Pipeline at 0x7f25f1badf10>,
  'RETRIEVER_TYPE': 'sts',
  'RETRIEVER_MODEL_NAME': 'rufimelo/Legal-BERTimbau-sts-large-ma-v3',
  'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}]

In [None]:
# Parameter grid for this batch: each key maps to the list of values to try.
# Key order is significant: it defines the key order of the experiment dicts built from it.
grid_experiment = {
    'CRITERIA': ['total', 'total_gte_5'],
    'TOPK_RETRIEVER': [300, 200, 100],
    'TOPK_RANKER': [100],
    'PIPE': pipes,
    'DONE': [False],
}

In [None]:
# Expand the grid into one experiment dict per combination
# (cartesian product of the value lists, keyed by the grid's keys).
list_experiment = [
    dict(zip(grid_experiment, combination))
    for combination in product(*grid_experiment.values())
]


In [None]:
# Sanity check: number of experiment configurations produced by the grid expansion.
print(len(list_experiment))
# Run a garbage collection before the long experiment loop. Being the last expression of the
# cell, its return value (number of unreachable objects found) is shown as the cell output.
gc.collect()

12


63

In [None]:
# Re-execute util/util_search.py in the running kernel so that recent edits to the module
# are picked up without restarting.
# NOTE(review): this import is repeated in several cells; consider `%autoreload 2` in the setup cell.
import importlib
importlib.reload(util_search)

<module 'util.util_search' from '/home/borela/fontes/ind-ir/code/util/util_search.py'>

In [None]:
%%time
# Keep this small for test runs only; afterwards restore 9999999
# (a number larger than the number of terms to search).
limit_query = 100  # full run: 9999999

# Run every experiment of the grid that is not flagged as done and persist its result
# right after it finishes, so re-running the cell resumes where the grid stopped.
for cnt, experiment in enumerate(list_experiment):
    if experiment['DONE']:
        continue

    list_result_experiment = []  # results of this experiment only
    # Original note: for the "normas segecex" case, 224 would be enough for qtd5 and 891 for qtd1
    # (presumably refers to limit_query -- confirm).
    print(
        f"==={cnt}/{len(list_experiment)}===>  "
        f"{time.strftime('%Y-%b-%d %H:%M:%S')} experiment: {experiment}"
    )
    result_run = util_search.experiment_run(
        parm_df=df_search_data,
        parm_experiment=experiment,
        parm_ndcg_position=12,
        parm_limit_query=limit_query,
        parm_print=True,
    )
    list_result_experiment.append(result_run)

    # The flag is set before saving, exactly as in the original statement order.
    experiment['DONE'] = True
    util_search.add_experiment_result(
        parm_list_result=list_result_experiment,
        parm_path_experiment=PATH_SEARCH_EXPERIMENT,
        parm_path_experiment_result=PATH_SEARCH_RESULT,
    )



===0/12===>  2023-Jun-12 10:02:16 experiment: {'CRITERIA': 'total', 'TOPK_RETRIEVER': 300, 'TOPK_RANKER': 100, 'PIPE': {'PIPE_NAME': 'pipe_bm25_ranker_minilm', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f25f350d3d0>, 'RETRIEVER_TYPE': 'bm25', 'RETRIEVER_MODEL_NAME': '', 'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}, 'DONE': False}


 99%|█████████▉| 99/100 [01:20<00:00,  1.23it/s]


RANK1_MEAN: 8.626
NDCG_MEAN: 24.26
TIME_SPENT_MEAN: 0.807
===1/12===>  2023-Jun-12 10:03:37 experiment: {'CRITERIA': 'total', 'TOPK_RETRIEVER': 300, 'TOPK_RANKER': 100, 'PIPE': {'PIPE_NAME': 'pipe_sts_ranker_minilm', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f25f1badf10>, 'RETRIEVER_TYPE': 'sts', 'RETRIEVER_MODEL_NAME': 'rufimelo/Legal-BERTimbau-sts-large-ma-v3', 'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}, 'DONE': False}


 99%|█████████▉| 99/100 [01:20<00:00,  1.23it/s]


RANK1_MEAN: 8.505
NDCG_MEAN: 22.769
TIME_SPENT_MEAN: 0.807
===2/12===>  2023-Jun-12 10:04:57 experiment: {'CRITERIA': 'total', 'TOPK_RETRIEVER': 200, 'TOPK_RANKER': 100, 'PIPE': {'PIPE_NAME': 'pipe_bm25_ranker_minilm', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f25f350d3d0>, 'RETRIEVER_TYPE': 'bm25', 'RETRIEVER_MODEL_NAME': '', 'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}, 'DONE': False}


 99%|█████████▉| 99/100 [00:54<00:00,  1.82it/s]


RANK1_MEAN: 8.808
NDCG_MEAN: 24.414
TIME_SPENT_MEAN: 0.545
===3/12===>  2023-Jun-12 10:05:52 experiment: {'CRITERIA': 'total', 'TOPK_RETRIEVER': 200, 'TOPK_RANKER': 100, 'PIPE': {'PIPE_NAME': 'pipe_sts_ranker_minilm', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f25f1badf10>, 'RETRIEVER_TYPE': 'sts', 'RETRIEVER_MODEL_NAME': 'rufimelo/Legal-BERTimbau-sts-large-ma-v3', 'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}, 'DONE': False}


 99%|█████████▉| 99/100 [00:55<00:00,  1.80it/s]


RANK1_MEAN: 8.866
NDCG_MEAN: 22.796
TIME_SPENT_MEAN: 0.55
===4/12===>  2023-Jun-12 10:06:47 experiment: {'CRITERIA': 'total', 'TOPK_RETRIEVER': 100, 'TOPK_RANKER': 100, 'PIPE': {'PIPE_NAME': 'pipe_bm25_ranker_minilm', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f25f350d3d0>, 'RETRIEVER_TYPE': 'bm25', 'RETRIEVER_MODEL_NAME': '', 'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}, 'DONE': False}


 99%|█████████▉| 99/100 [00:28<00:00,  3.53it/s]


RANK1_MEAN: 8.68
NDCG_MEAN: 24.616
TIME_SPENT_MEAN: 0.28
===5/12===>  2023-Jun-12 10:07:16 experiment: {'CRITERIA': 'total', 'TOPK_RETRIEVER': 100, 'TOPK_RANKER': 100, 'PIPE': {'PIPE_NAME': 'pipe_sts_ranker_minilm', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f25f1badf10>, 'RETRIEVER_TYPE': 'sts', 'RETRIEVER_MODEL_NAME': 'rufimelo/Legal-BERTimbau-sts-large-ma-v3', 'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}, 'DONE': False}


 99%|█████████▉| 99/100 [00:29<00:00,  3.37it/s]


RANK1_MEAN: 8.432
NDCG_MEAN: 21.228
TIME_SPENT_MEAN: 0.294
===6/12===>  2023-Jun-12 10:07:45 experiment: {'CRITERIA': 'total_gte_5', 'TOPK_RETRIEVER': 300, 'TOPK_RANKER': 100, 'PIPE': {'PIPE_NAME': 'pipe_bm25_ranker_minilm', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f25f350d3d0>, 'RETRIEVER_TYPE': 'bm25', 'RETRIEVER_MODEL_NAME': '', 'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}, 'DONE': False}


 99%|█████████▉| 99/100 [01:21<00:00,  1.21it/s]


RANK1_MEAN: 6.33
NDCG_MEAN: 24.784
TIME_SPENT_MEAN: 0.816
===7/12===>  2023-Jun-12 10:09:07 experiment: {'CRITERIA': 'total_gte_5', 'TOPK_RETRIEVER': 300, 'TOPK_RANKER': 100, 'PIPE': {'PIPE_NAME': 'pipe_sts_ranker_minilm', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f25f1badf10>, 'RETRIEVER_TYPE': 'sts', 'RETRIEVER_MODEL_NAME': 'rufimelo/Legal-BERTimbau-sts-large-ma-v3', 'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}, 'DONE': False}


 99%|█████████▉| 99/100 [01:21<00:00,  1.22it/s]


RANK1_MEAN: 7.639
NDCG_MEAN: 24.093
TIME_SPENT_MEAN: 0.811
===8/12===>  2023-Jun-12 10:10:28 experiment: {'CRITERIA': 'total_gte_5', 'TOPK_RETRIEVER': 200, 'TOPK_RANKER': 100, 'PIPE': {'PIPE_NAME': 'pipe_bm25_ranker_minilm', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f25f350d3d0>, 'RETRIEVER_TYPE': 'bm25', 'RETRIEVER_MODEL_NAME': '', 'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}, 'DONE': False}


 99%|█████████▉| 99/100 [00:54<00:00,  1.80it/s]


RANK1_MEAN: 7.58
NDCG_MEAN: 24.989
TIME_SPENT_MEAN: 0.549
===9/12===>  2023-Jun-12 10:11:23 experiment: {'CRITERIA': 'total_gte_5', 'TOPK_RETRIEVER': 200, 'TOPK_RANKER': 100, 'PIPE': {'PIPE_NAME': 'pipe_sts_ranker_minilm', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f25f1badf10>, 'RETRIEVER_TYPE': 'sts', 'RETRIEVER_MODEL_NAME': 'rufimelo/Legal-BERTimbau-sts-large-ma-v3', 'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}, 'DONE': False}


 99%|█████████▉| 99/100 [00:55<00:00,  1.79it/s]


RANK1_MEAN: 8.475
NDCG_MEAN: 23.86
TIME_SPENT_MEAN: 0.552
===10/12===>  2023-Jun-12 10:12:19 experiment: {'CRITERIA': 'total_gte_5', 'TOPK_RETRIEVER': 100, 'TOPK_RANKER': 100, 'PIPE': {'PIPE_NAME': 'pipe_bm25_ranker_minilm', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f25f350d3d0>, 'RETRIEVER_TYPE': 'bm25', 'RETRIEVER_MODEL_NAME': '', 'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}, 'DONE': False}


 99%|█████████▉| 99/100 [00:28<00:00,  3.52it/s]


RANK1_MEAN: 7.04
NDCG_MEAN: 25.979
TIME_SPENT_MEAN: 0.281
===11/12===>  2023-Jun-12 10:12:47 experiment: {'CRITERIA': 'total_gte_5', 'TOPK_RETRIEVER': 100, 'TOPK_RANKER': 100, 'PIPE': {'PIPE_NAME': 'pipe_sts_ranker_minilm', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f25f1badf10>, 'RETRIEVER_TYPE': 'sts', 'RETRIEVER_MODEL_NAME': 'rufimelo/Legal-BERTimbau-sts-large-ma-v3', 'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}, 'DONE': False}


 99%|█████████▉| 99/100 [00:29<00:00,  3.41it/s]


RANK1_MEAN: 5.865
NDCG_MEAN: 23.582
TIME_SPENT_MEAN: 0.29
CPU times: user 12min 22s, sys: 3.99 s, total: 12min 26s
Wall time: 11min


In [None]:
# Display the pipeline configurations currently in memory; they are used as the 'PIPE' axis of the grid below.
pipes

[{'PIPE_NAME': 'pipe_bm25_ranker_minilm',
  'PIPE_OBJECT': <haystack.pipelines.base.Pipeline at 0x7f25f350d3d0>,
  'RETRIEVER_TYPE': 'bm25',
  'RETRIEVER_MODEL_NAME': '',
  'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'},
 {'PIPE_NAME': 'pipe_sts_ranker_minilm',
  'PIPE_OBJECT': <haystack.pipelines.base.Pipeline at 0x7f25f1badf10>,
  'RETRIEVER_TYPE': 'sts',
  'RETRIEVER_MODEL_NAME': 'rufimelo/Legal-BERTimbau-sts-large-ma-v3',
  'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}]

In [None]:
# Parameter grid for this batch: each key maps to the list of values to try.
# Key order is significant: it defines the key order of the experiment dicts built from it.
# NOTE(review): TOPK_RANKER=200 is larger than TOPK_RETRIEVER=100; the ranker presumably cannot
# return more documents than it receives, so that combination likely duplicates a run with a
# smaller ranker top-k -- confirm before spending compute on it.
grid_experiment = {
    'CRITERIA': ['total', 'total_gte_5'],
    'TOPK_RETRIEVER': [400, 300, 200, 100],
    'TOPK_RANKER': [200, 50],
    'PIPE': pipes,
    'DONE': [False],
}

In [None]:
# Expand the grid into one experiment dict per combination
# (cartesian product of the value lists, keyed by the grid's keys).
list_experiment = [
    dict(zip(grid_experiment, combination))
    for combination in product(*grid_experiment.values())
]


In [None]:
# Sanity check: number of experiment configurations produced by the grid expansion.
print(len(list_experiment))
# Run a garbage collection before the long experiment loop. Being the last expression of the
# cell, its return value (number of unreachable objects found) is shown as the cell output.
gc.collect()

32


84

In [None]:
# Re-execute util/util_search.py in the running kernel so that recent edits to the module
# are picked up without restarting.
# NOTE(review): this import is repeated in several cells; consider `%autoreload 2` in the setup cell.
import importlib
importlib.reload(util_search)

<module 'util.util_search' from '/home/borela/fontes/ind-ir/code/util/util_search.py'>

In [None]:
%%time
# Keep this small for test runs only; afterwards restore 9999999
# (a number larger than the number of terms to search).
limit_query = 100  # full run: 9999999

# Run every experiment of the grid that is not flagged as done and persist its result
# right after it finishes, so re-running the cell resumes where the grid stopped.
for cnt, experiment in enumerate(list_experiment):
    if experiment['DONE']:
        continue

    list_result_experiment = []  # results of this experiment only
    # Original note: for the "normas segecex" case, 224 would be enough for qtd5 and 891 for qtd1
    # (presumably refers to limit_query -- confirm).
    print(
        f"==={cnt}/{len(list_experiment)}===>  "
        f"{time.strftime('%Y-%b-%d %H:%M:%S')} experiment: {experiment}"
    )
    result_run = util_search.experiment_run(
        parm_df=df_search_data,
        parm_experiment=experiment,
        parm_ndcg_position=12,
        parm_limit_query=limit_query,
        parm_print=True,
    )
    list_result_experiment.append(result_run)

    # The flag is set before saving, exactly as in the original statement order.
    experiment['DONE'] = True
    util_search.add_experiment_result(
        parm_list_result=list_result_experiment,
        parm_path_experiment=PATH_SEARCH_EXPERIMENT,
        parm_path_experiment_result=PATH_SEARCH_RESULT,
    )



===0/32===>  2023-Jun-12 10:13:17 experiment: {'CRITERIA': 'total', 'TOPK_RETRIEVER': 400, 'TOPK_RANKER': 200, 'PIPE': {'PIPE_NAME': 'pipe_bm25_ranker_minilm', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f25f350d3d0>, 'RETRIEVER_TYPE': 'bm25', 'RETRIEVER_MODEL_NAME': '', 'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}, 'DONE': False}


 99%|█████████▉| 99/100 [01:48<00:01,  1.10s/it]


RANK1_MEAN: 11.13
NDCG_MEAN: 24.237
TIME_SPENT_MEAN: 1.085
===1/32===>  2023-Jun-12 10:15:06 experiment: {'CRITERIA': 'total', 'TOPK_RETRIEVER': 400, 'TOPK_RANKER': 200, 'PIPE': {'PIPE_NAME': 'pipe_sts_ranker_minilm', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f25f1badf10>, 'RETRIEVER_TYPE': 'sts', 'RETRIEVER_MODEL_NAME': 'rufimelo/Legal-BERTimbau-sts-large-ma-v3', 'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}, 'DONE': False}


 99%|█████████▉| 99/100 [01:47<00:01,  1.09s/it]


RANK1_MEAN: 11.848
NDCG_MEAN: 23.229
TIME_SPENT_MEAN: 1.075
===2/32===>  2023-Jun-12 10:16:53 experiment: {'CRITERIA': 'total', 'TOPK_RETRIEVER': 400, 'TOPK_RANKER': 50, 'PIPE': {'PIPE_NAME': 'pipe_bm25_ranker_minilm', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f25f350d3d0>, 'RETRIEVER_TYPE': 'bm25', 'RETRIEVER_MODEL_NAME': '', 'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}, 'DONE': False}


 99%|█████████▉| 99/100 [01:47<00:01,  1.09s/it]


RANK1_MEAN: 6.295
NDCG_MEAN: 24.237
TIME_SPENT_MEAN: 1.076
===3/32===>  2023-Jun-12 10:18:41 experiment: {'CRITERIA': 'total', 'TOPK_RETRIEVER': 400, 'TOPK_RANKER': 50, 'PIPE': {'PIPE_NAME': 'pipe_sts_ranker_minilm', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f25f1badf10>, 'RETRIEVER_TYPE': 'sts', 'RETRIEVER_MODEL_NAME': 'rufimelo/Legal-BERTimbau-sts-large-ma-v3', 'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}, 'DONE': False}


 99%|█████████▉| 99/100 [01:47<00:01,  1.08s/it]


RANK1_MEAN: 5.839
NDCG_MEAN: 23.229
TIME_SPENT_MEAN: 1.071
===4/32===>  2023-Jun-12 10:20:29 experiment: {'CRITERIA': 'total', 'TOPK_RETRIEVER': 300, 'TOPK_RANKER': 200, 'PIPE': {'PIPE_NAME': 'pipe_bm25_ranker_minilm', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f25f350d3d0>, 'RETRIEVER_TYPE': 'bm25', 'RETRIEVER_MODEL_NAME': '', 'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}, 'DONE': False}


 99%|█████████▉| 99/100 [01:21<00:00,  1.22it/s]


RANK1_MEAN: 9.73
NDCG_MEAN: 24.26
TIME_SPENT_MEAN: 0.815
===5/32===>  2023-Jun-12 10:21:50 experiment: {'CRITERIA': 'total', 'TOPK_RETRIEVER': 300, 'TOPK_RANKER': 200, 'PIPE': {'PIPE_NAME': 'pipe_sts_ranker_minilm', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f25f1badf10>, 'RETRIEVER_TYPE': 'sts', 'RETRIEVER_MODEL_NAME': 'rufimelo/Legal-BERTimbau-sts-large-ma-v3', 'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}, 'DONE': False}


 99%|█████████▉| 99/100 [01:21<00:00,  1.22it/s]


RANK1_MEAN: 12.58
NDCG_MEAN: 22.769
TIME_SPENT_MEAN: 0.815
===6/32===>  2023-Jun-12 10:23:12 experiment: {'CRITERIA': 'total', 'TOPK_RETRIEVER': 300, 'TOPK_RANKER': 50, 'PIPE': {'PIPE_NAME': 'pipe_bm25_ranker_minilm', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f25f350d3d0>, 'RETRIEVER_TYPE': 'bm25', 'RETRIEVER_MODEL_NAME': '', 'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}, 'DONE': False}


 99%|█████████▉| 99/100 [01:21<00:00,  1.22it/s]


RANK1_MEAN: 6.396
NDCG_MEAN: 24.26
TIME_SPENT_MEAN: 0.811
===7/32===>  2023-Jun-12 10:24:33 experiment: {'CRITERIA': 'total', 'TOPK_RETRIEVER': 300, 'TOPK_RANKER': 50, 'PIPE': {'PIPE_NAME': 'pipe_sts_ranker_minilm', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f25f1badf10>, 'RETRIEVER_TYPE': 'sts', 'RETRIEVER_MODEL_NAME': 'rufimelo/Legal-BERTimbau-sts-large-ma-v3', 'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}, 'DONE': False}


 99%|█████████▉| 99/100 [01:20<00:00,  1.22it/s]


RANK1_MEAN: 5.699
NDCG_MEAN: 22.769
TIME_SPENT_MEAN: 0.81
===8/32===>  2023-Jun-12 10:25:54 experiment: {'CRITERIA': 'total', 'TOPK_RETRIEVER': 200, 'TOPK_RANKER': 200, 'PIPE': {'PIPE_NAME': 'pipe_bm25_ranker_minilm', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f25f350d3d0>, 'RETRIEVER_TYPE': 'bm25', 'RETRIEVER_MODEL_NAME': '', 'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}, 'DONE': False}


 99%|█████████▉| 99/100 [00:54<00:00,  1.81it/s]


RANK1_MEAN: 9.98
NDCG_MEAN: 24.414
TIME_SPENT_MEAN: 0.547
===9/32===>  2023-Jun-12 10:26:49 experiment: {'CRITERIA': 'total', 'TOPK_RETRIEVER': 200, 'TOPK_RANKER': 200, 'PIPE': {'PIPE_NAME': 'pipe_sts_ranker_minilm', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f25f1badf10>, 'RETRIEVER_TYPE': 'sts', 'RETRIEVER_MODEL_NAME': 'rufimelo/Legal-BERTimbau-sts-large-ma-v3', 'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}, 'DONE': False}


 99%|█████████▉| 99/100 [00:55<00:00,  1.78it/s]


RANK1_MEAN: 11.465
NDCG_MEAN: 22.796
TIME_SPENT_MEAN: 0.555
===10/32===>  2023-Jun-12 10:27:45 experiment: {'CRITERIA': 'total', 'TOPK_RETRIEVER': 200, 'TOPK_RANKER': 50, 'PIPE': {'PIPE_NAME': 'pipe_bm25_ranker_minilm', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f25f350d3d0>, 'RETRIEVER_TYPE': 'bm25', 'RETRIEVER_MODEL_NAME': '', 'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}, 'DONE': False}


 99%|█████████▉| 99/100 [00:54<00:00,  1.82it/s]


RANK1_MEAN: 5.29
NDCG_MEAN: 24.414
TIME_SPENT_MEAN: 0.545
===11/32===>  2023-Jun-12 10:28:40 experiment: {'CRITERIA': 'total', 'TOPK_RETRIEVER': 200, 'TOPK_RANKER': 50, 'PIPE': {'PIPE_NAME': 'pipe_sts_ranker_minilm', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f25f1badf10>, 'RETRIEVER_TYPE': 'sts', 'RETRIEVER_MODEL_NAME': 'rufimelo/Legal-BERTimbau-sts-large-ma-v3', 'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}, 'DONE': False}


 99%|█████████▉| 99/100 [00:55<00:00,  1.80it/s]


RANK1_MEAN: 5.772
NDCG_MEAN: 22.796
TIME_SPENT_MEAN: 0.55
===12/32===>  2023-Jun-12 10:29:35 experiment: {'CRITERIA': 'total', 'TOPK_RETRIEVER': 100, 'TOPK_RANKER': 200, 'PIPE': {'PIPE_NAME': 'pipe_bm25_ranker_minilm', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f25f350d3d0>, 'RETRIEVER_TYPE': 'bm25', 'RETRIEVER_MODEL_NAME': '', 'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}, 'DONE': False}


 99%|█████████▉| 99/100 [00:27<00:00,  3.55it/s]


RANK1_MEAN: 8.68
NDCG_MEAN: 24.616
TIME_SPENT_MEAN: 0.279
===13/32===>  2023-Jun-12 10:30:03 experiment: {'CRITERIA': 'total', 'TOPK_RETRIEVER': 100, 'TOPK_RANKER': 200, 'PIPE': {'PIPE_NAME': 'pipe_sts_ranker_minilm', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f25f1badf10>, 'RETRIEVER_TYPE': 'sts', 'RETRIEVER_MODEL_NAME': 'rufimelo/Legal-BERTimbau-sts-large-ma-v3', 'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}, 'DONE': False}


 99%|█████████▉| 99/100 [00:29<00:00,  3.40it/s]


RANK1_MEAN: 8.432
NDCG_MEAN: 21.228
TIME_SPENT_MEAN: 0.291
===14/32===>  2023-Jun-12 10:30:33 experiment: {'CRITERIA': 'total', 'TOPK_RETRIEVER': 100, 'TOPK_RANKER': 50, 'PIPE': {'PIPE_NAME': 'pipe_bm25_ranker_minilm', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f25f350d3d0>, 'RETRIEVER_TYPE': 'bm25', 'RETRIEVER_MODEL_NAME': '', 'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}, 'DONE': False}


 99%|█████████▉| 99/100 [00:27<00:00,  3.56it/s]


RANK1_MEAN: 6.208
NDCG_MEAN: 24.616
TIME_SPENT_MEAN: 0.278
===15/32===>  2023-Jun-12 10:31:01 experiment: {'CRITERIA': 'total', 'TOPK_RETRIEVER': 100, 'TOPK_RANKER': 50, 'PIPE': {'PIPE_NAME': 'pipe_sts_ranker_minilm', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f25f1badf10>, 'RETRIEVER_TYPE': 'sts', 'RETRIEVER_MODEL_NAME': 'rufimelo/Legal-BERTimbau-sts-large-ma-v3', 'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}, 'DONE': False}


 99%|█████████▉| 99/100 [00:28<00:00,  3.43it/s]


RANK1_MEAN: 6.283
NDCG_MEAN: 21.228
TIME_SPENT_MEAN: 0.289
===16/32===>  2023-Jun-12 10:31:30 experiment: {'CRITERIA': 'total_gte_5', 'TOPK_RETRIEVER': 400, 'TOPK_RANKER': 200, 'PIPE': {'PIPE_NAME': 'pipe_bm25_ranker_minilm', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f25f350d3d0>, 'RETRIEVER_TYPE': 'bm25', 'RETRIEVER_MODEL_NAME': '', 'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}, 'DONE': False}


 99%|█████████▉| 99/100 [01:48<00:01,  1.09s/it]


RANK1_MEAN: 9.79
NDCG_MEAN: 24.712
TIME_SPENT_MEAN: 1.084
===17/32===>  2023-Jun-12 10:33:18 experiment: {'CRITERIA': 'total_gte_5', 'TOPK_RETRIEVER': 400, 'TOPK_RANKER': 200, 'PIPE': {'PIPE_NAME': 'pipe_sts_ranker_minilm', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f25f1badf10>, 'RETRIEVER_TYPE': 'sts', 'RETRIEVER_MODEL_NAME': 'rufimelo/Legal-BERTimbau-sts-large-ma-v3', 'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}, 'DONE': False}


 99%|█████████▉| 99/100 [01:47<00:01,  1.08s/it]


RANK1_MEAN: 11.182
NDCG_MEAN: 23.659
TIME_SPENT_MEAN: 1.073
===18/32===>  2023-Jun-12 10:35:06 experiment: {'CRITERIA': 'total_gte_5', 'TOPK_RETRIEVER': 400, 'TOPK_RANKER': 50, 'PIPE': {'PIPE_NAME': 'pipe_bm25_ranker_minilm', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f25f350d3d0>, 'RETRIEVER_TYPE': 'bm25', 'RETRIEVER_MODEL_NAME': '', 'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}, 'DONE': False}


 99%|█████████▉| 99/100 [01:47<00:01,  1.09s/it]


RANK1_MEAN: 5.396
NDCG_MEAN: 24.712
TIME_SPENT_MEAN: 1.079
===19/32===>  2023-Jun-12 10:36:54 experiment: {'CRITERIA': 'total_gte_5', 'TOPK_RETRIEVER': 400, 'TOPK_RANKER': 50, 'PIPE': {'PIPE_NAME': 'pipe_sts_ranker_minilm', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f25f1badf10>, 'RETRIEVER_TYPE': 'sts', 'RETRIEVER_MODEL_NAME': 'rufimelo/Legal-BERTimbau-sts-large-ma-v3', 'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}, 'DONE': False}


 99%|█████████▉| 99/100 [01:47<00:01,  1.08s/it]


RANK1_MEAN: 5.84
NDCG_MEAN: 23.659
TIME_SPENT_MEAN: 1.071
===20/32===>  2023-Jun-12 10:38:41 experiment: {'CRITERIA': 'total_gte_5', 'TOPK_RETRIEVER': 300, 'TOPK_RANKER': 200, 'PIPE': {'PIPE_NAME': 'pipe_bm25_ranker_minilm', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f25f350d3d0>, 'RETRIEVER_TYPE': 'bm25', 'RETRIEVER_MODEL_NAME': '', 'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}, 'DONE': False}


 99%|█████████▉| 99/100 [01:21<00:00,  1.21it/s]


RANK1_MEAN: 9.38
NDCG_MEAN: 24.784
TIME_SPENT_MEAN: 0.816
===21/32===>  2023-Jun-12 10:40:03 experiment: {'CRITERIA': 'total_gte_5', 'TOPK_RETRIEVER': 300, 'TOPK_RANKER': 200, 'PIPE': {'PIPE_NAME': 'pipe_sts_ranker_minilm', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f25f1badf10>, 'RETRIEVER_TYPE': 'sts', 'RETRIEVER_MODEL_NAME': 'rufimelo/Legal-BERTimbau-sts-large-ma-v3', 'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}, 'DONE': False}


 99%|█████████▉| 99/100 [01:21<00:00,  1.21it/s]


RANK1_MEAN: 11.51
NDCG_MEAN: 24.093
TIME_SPENT_MEAN: 0.815
===22/32===>  2023-Jun-12 10:41:25 experiment: {'CRITERIA': 'total_gte_5', 'TOPK_RETRIEVER': 300, 'TOPK_RANKER': 50, 'PIPE': {'PIPE_NAME': 'pipe_bm25_ranker_minilm', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f25f350d3d0>, 'RETRIEVER_TYPE': 'bm25', 'RETRIEVER_MODEL_NAME': '', 'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}, 'DONE': False}


 99%|█████████▉| 99/100 [01:21<00:00,  1.22it/s]


RANK1_MEAN: 5.615
NDCG_MEAN: 24.784
TIME_SPENT_MEAN: 0.812
===23/32===>  2023-Jun-12 10:42:46 experiment: {'CRITERIA': 'total_gte_5', 'TOPK_RETRIEVER': 300, 'TOPK_RANKER': 50, 'PIPE': {'PIPE_NAME': 'pipe_sts_ranker_minilm', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f25f1badf10>, 'RETRIEVER_TYPE': 'sts', 'RETRIEVER_MODEL_NAME': 'rufimelo/Legal-BERTimbau-sts-large-ma-v3', 'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}, 'DONE': False}


 99%|█████████▉| 99/100 [01:20<00:00,  1.22it/s]


RANK1_MEAN: 4.72
NDCG_MEAN: 24.093
TIME_SPENT_MEAN: 0.81
===24/32===>  2023-Jun-12 10:44:08 experiment: {'CRITERIA': 'total_gte_5', 'TOPK_RETRIEVER': 200, 'TOPK_RANKER': 200, 'PIPE': {'PIPE_NAME': 'pipe_bm25_ranker_minilm', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f25f350d3d0>, 'RETRIEVER_TYPE': 'bm25', 'RETRIEVER_MODEL_NAME': '', 'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}, 'DONE': False}


 99%|█████████▉| 99/100 [00:55<00:00,  1.79it/s]


RANK1_MEAN: 7.58
NDCG_MEAN: 24.989
TIME_SPENT_MEAN: 0.553
===25/32===>  2023-Jun-12 10:45:03 experiment: {'CRITERIA': 'total_gte_5', 'TOPK_RETRIEVER': 200, 'TOPK_RANKER': 200, 'PIPE': {'PIPE_NAME': 'pipe_sts_ranker_minilm', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f25f1badf10>, 'RETRIEVER_TYPE': 'sts', 'RETRIEVER_MODEL_NAME': 'rufimelo/Legal-BERTimbau-sts-large-ma-v3', 'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}, 'DONE': False}


 99%|█████████▉| 99/100 [00:55<00:00,  1.79it/s]


RANK1_MEAN: 9.6
NDCG_MEAN: 23.86
TIME_SPENT_MEAN: 0.555
===26/32===>  2023-Jun-12 10:45:59 experiment: {'CRITERIA': 'total_gte_5', 'TOPK_RETRIEVER': 200, 'TOPK_RANKER': 50, 'PIPE': {'PIPE_NAME': 'pipe_bm25_ranker_minilm', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f25f350d3d0>, 'RETRIEVER_TYPE': 'bm25', 'RETRIEVER_MODEL_NAME': '', 'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}, 'DONE': False}


 99%|█████████▉| 99/100 [00:54<00:00,  1.81it/s]


RANK1_MEAN: 4.99
NDCG_MEAN: 24.989
TIME_SPENT_MEAN: 0.548
===27/32===>  2023-Jun-12 10:46:54 experiment: {'CRITERIA': 'total_gte_5', 'TOPK_RETRIEVER': 200, 'TOPK_RANKER': 50, 'PIPE': {'PIPE_NAME': 'pipe_sts_ranker_minilm', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f25f1badf10>, 'RETRIEVER_TYPE': 'sts', 'RETRIEVER_MODEL_NAME': 'rufimelo/Legal-BERTimbau-sts-large-ma-v3', 'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}, 'DONE': False}


 99%|█████████▉| 99/100 [00:55<00:00,  1.80it/s]


RANK1_MEAN: 5.117
NDCG_MEAN: 23.86
TIME_SPENT_MEAN: 0.551
===28/32===>  2023-Jun-12 10:47:49 experiment: {'CRITERIA': 'total_gte_5', 'TOPK_RETRIEVER': 100, 'TOPK_RANKER': 200, 'PIPE': {'PIPE_NAME': 'pipe_bm25_ranker_minilm', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f25f350d3d0>, 'RETRIEVER_TYPE': 'bm25', 'RETRIEVER_MODEL_NAME': '', 'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}, 'DONE': False}


 99%|█████████▉| 99/100 [00:28<00:00,  3.51it/s]


RANK1_MEAN: 7.04
NDCG_MEAN: 25.979
TIME_SPENT_MEAN: 0.282
===29/32===>  2023-Jun-12 10:48:18 experiment: {'CRITERIA': 'total_gte_5', 'TOPK_RETRIEVER': 100, 'TOPK_RANKER': 200, 'PIPE': {'PIPE_NAME': 'pipe_sts_ranker_minilm', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f25f1badf10>, 'RETRIEVER_TYPE': 'sts', 'RETRIEVER_MODEL_NAME': 'rufimelo/Legal-BERTimbau-sts-large-ma-v3', 'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}, 'DONE': False}


 99%|█████████▉| 99/100 [00:29<00:00,  3.40it/s]


RANK1_MEAN: 5.865
NDCG_MEAN: 23.582
TIME_SPENT_MEAN: 0.291
===30/32===>  2023-Jun-12 10:48:47 experiment: {'CRITERIA': 'total_gte_5', 'TOPK_RETRIEVER': 100, 'TOPK_RANKER': 50, 'PIPE': {'PIPE_NAME': 'pipe_bm25_ranker_minilm', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f25f350d3d0>, 'RETRIEVER_TYPE': 'bm25', 'RETRIEVER_MODEL_NAME': '', 'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}, 'DONE': False}


 99%|█████████▉| 99/100 [00:27<00:00,  3.54it/s]


RANK1_MEAN: 5.867
NDCG_MEAN: 25.979
TIME_SPENT_MEAN: 0.279
===31/32===>  2023-Jun-12 10:49:15 experiment: {'CRITERIA': 'total_gte_5', 'TOPK_RETRIEVER': 100, 'TOPK_RANKER': 50, 'PIPE': {'PIPE_NAME': 'pipe_sts_ranker_minilm', 'PIPE_OBJECT': <haystack.pipelines.base.Pipeline object at 0x7f25f1badf10>, 'RETRIEVER_TYPE': 'sts', 'RETRIEVER_MODEL_NAME': 'rufimelo/Legal-BERTimbau-sts-large-ma-v3', 'RANKER_MODEL_NAME': 'unicamp-dl/mMiniLM-L6-v2-pt-v2'}, 'DONE': False}


 99%|█████████▉| 99/100 [00:28<00:00,  3.43it/s]


RANK1_MEAN: 5.305
NDCG_MEAN: 23.582
TIME_SPENT_MEAN: 0.289
CPU times: user 40min 58s, sys: 10.7 s, total: 41min 9s
Wall time: 36min 27s
