In [1]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

# Embedding generation

In [7]:
# Load the BGE-M3 checkpoint; `model` produces and scores the embeddings in the cells below.
from FlagEmbedding import BGEM3FlagModel

# use_fp16=True would speed up computation with a slight performance degradation.
model = BGEM3FlagModel("BAAI/bge-m3", use_fp16=False)

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Fetching 30 files: 100%|███████████████████████████████████████████████████████████████| 30/30 [01:30<00:00,  3.02s/it]
  return self.fget.__get__(instance, owner)()


## Reading pickle from the QuestGen notebook with the questions and the subset of texts

In [None]:
## These are the 3448 texts used as the test set.
## NOTE(review): unpickling can execute arbitrary code; only load pickles from a trusted source.
meds = pd.read_pickle("./meds_3448.pkl")

In [41]:
## Split the rows that carry a generated question (Pregunta != 0) into their own frame.
## - .loc + .copy() yields an independent frame, so adding columns to it later
##   does not trigger SettingWithCopyWarning or depend on view/copy semantics.
## - reset_index(drop=True) relabels the rows 0..n-1, which the later cells
##   rely on when they look rows up with range(len(...)).
preguntas = (
    meds.loc[meds.Pregunta != 0, ['id', 'pagina', 'Pregunta']]
    .copy()
    .reset_index(drop=True)
)

## Generating Sparse (Lexical Weight), Dense and Multi-Vector (ColBERT) embeddings for every question and text

In [None]:
# Generation over texts: one encode() call per text, requesting the dense,
# sparse (lexical weights) and ColBERT multi-vector outputs together.
text_embs = []
# Iterate over the column values directly (positional) instead of looking rows
# up by label with range(len(meds)), which only works for a 0..n-1 index.
for texto in tqdm(meds.texto, total=len(meds)):
    try:
        text_embs.append(model.encode(texto.lower(),
                                      batch_size=12,
                                      max_length=8192, # If you don't need such a long length, you can set a smaller value to speed up the encoding process.
                                      return_dense=True, return_sparse=True, return_colbert_vecs=True))
    except Exception:
        # Best effort: keep one entry per row so text_embs stays aligned with meds.
        # `except Exception` (not a bare except) lets Ctrl-C interrupt this long loop.
        text_embs.append('NoInfo')

In [None]:
## Store each text's encode() output (or the 'NoInfo' placeholder) in a new '3Embs' column.
## text_embs has one entry per row of meds, in row order.
meds['3Embs'] = text_embs

In [None]:
# Generation over questions: same three outputs as for the texts, one encode() call per question.
preg_embs = []
for pregunta in tqdm(preguntas.Pregunta):
    try:
        preg_embs.append(model.encode(pregunta.lower(),
                                      batch_size=12,
                                      max_length=8192, # If you don't need such a long length, you can set a smaller value to speed up the encoding process.
                                      return_dense=True, return_sparse=True, return_colbert_vecs=True))
    except Exception:
        # Best effort: keep one entry per question so preg_embs stays aligned with preguntas.
        # `except Exception` (not a bare except) lets Ctrl-C interrupt the loop.
        preg_embs.append('NoInfo')

In [None]:
## Store each question's encode() output (or the 'NoInfo' placeholder) in a new '3Embs' column.
## preg_embs has one entry per row of preguntas, in row order.
preguntas['3Embs'] = preg_embs

# Retriever


For the 65 generated questions and the 3448 texts, we use the three scores provided by BGE-M3, plus two combinations of them, to retrieve documents and compare their performance:

* Sparse Embedding (Lexical Weight) -> model.compute_lexical_matching_score
* Multi-Vector (ColBERT) -> model.colbert_score
* Dense -> dot product
* sparse+dense
* sparse+dense+colbert

In [8]:
# Score every (question, text) pair with the three BGE-M3 scores plus the two
# combined scores, then keep the ids of the TOP_K best texts per question.
TOP_K = 10  # number of retrieved ids kept per question and per scoring scheme


def _top_ids(scored, k=TOP_K):
    """Return the ids of the `k` highest-scoring `(score, doc_id)` pairs, best first."""
    # Highest score first; sorted() is stable, so ties keep the order in which
    # the texts were scored.
    ranked = sorted(scored, key=lambda pair: pair[0], reverse=True)
    return [doc_id for _, doc_id in ranked[:k]]


ids_lexic = []
ids_dense = []
ids_colbe = []
ids_recup_lex_den = []
ids_recup_lex_den_col = []

# Read the columns positionally once. Label lookups such as series[i] with
# i in range(len(...)) only work when the index happens to be 0..n-1.
question_embs = list(preguntas['3Embs'])
doc_embs = list(meds['3Embs'])
doc_ids = [str(doc) + str(page) for doc, page in zip(meds.id, meds.pagina)]

for q_emb in tqdm(question_embs):  # loop over the questions
    lexic_d = []
    dense_d = []
    colbe_d = []
    lex_den_d = []
    lex_den_col_d = []
    for d_emb, doc_id in zip(doc_embs, doc_ids):  # loop over the contexts
        try:
            lex_score = model.compute_lexical_matching_score(q_emb['lexical_weights'], d_emb['lexical_weights'])
            dense_score = q_emb['dense_vecs'] @ d_emb['dense_vecs'].T
            colb_score = model.colbert_score(q_emb['colbert_vecs'], d_emb['colbert_vecs'])
            lex_dense = lex_score + dense_score
            lex_dense_colb = colb_score + lex_score + dense_score
        except Exception:
            # Raised e.g. when either side holds the 'NoInfo' placeholder left by a
            # failed encode. The pair has no usable score, so it is left out of the
            # ranking. (Appending a bare 0 here would break the (score, id) sort key.)
            continue
        lexic_d.append((lex_score, doc_id))
        dense_d.append((dense_score, doc_id))
        colbe_d.append((colb_score, doc_id))
        lex_den_d.append((lex_dense, doc_id))
        lex_den_col_d.append((lex_dense_colb, doc_id))
    # Rank once per question (not after every appended pair) and store the retrieved ids.
    ids_lexic.append(_top_ids(lexic_d))
    ids_dense.append(_top_ids(dense_d))
    ids_colbe.append(_top_ids(colbe_d))
    ids_recup_lex_den.append(_top_ids(lex_den_d))
    ids_recup_lex_den_col.append(_top_ids(lex_den_col_d))

100%|████████████████████████████████████████████████████████████████████████████████| 64/64 [1:20:05<00:00, 75.09s/it]


In [12]:
# Keep only the identifying columns (drops the bulky '3Embs' embeddings).
# .copy() makes this an independent frame, so the result columns added to it
# afterwards do not trigger SettingWithCopyWarning.
preguntas_1 = preguntas[['id', 'pagina', 'Pregunta']].copy()

In [None]:
# Attach the retrieved-id lists to the questions, one column per scoring scheme.
# The dict keeps insertion order, so the columns are added in the order listed.
retrieved_by_scheme = {
    'ids_lexic': ids_lexic,
    'ids_dense': ids_dense,
    'ids_colbe': ids_colbe,
    'ids_recup_lex_den': ids_recup_lex_den,
    'ids_recup_lex_den_col': ids_recup_lex_den_col,
}
for column, retrieved in retrieved_by_scheme.items():
    preguntas_1[column] = retrieved

In [17]:
# Persist the questions with their retrieved ids so the evaluation can be run
# without repeating the retrieval loop.
preguntas_1.to_pickle('../Archivos/65q_results_3embs.pkl')

In [2]:
# Reload the saved retrieval results. Note the different name: preguntas1, not preguntas_1.
# NOTE(review): unpickling can execute arbitrary code; only load pickles from a trusted source.
preguntas1 = pd.read_pickle('../Archivos/65q_results_3embs.pkl')

In [3]:
# Display the reloaded results for a visual check.
preguntas1

Unnamed: 0,id,pagina,Pregunta,ids_lexic,ids_dense,ids_colbe,ids_recup_lex_den,ids_recup_lex_den_col
0,1025126,PAG_00000029,Quel est le but ultime de l'auteur dans ces le...,"[1025126PAG_00000140, 5400519PAG_00000211, 102...","[1025126PAG_00000019, 1025126PAG_00000040, 102...","[1025126PAG_00000019, 1025126PAG_00000049, 102...","[1025126PAG_00000040, 1025126PAG_00000019, 102...","[1025126PAG_00000019, 1025126PAG_00000040, 102..."
1,1025126,PAG_00000042,Qu'est-ce qui est à l'évidence de laquelle per...,"[1025126PAG_00000042, 1025134PAG_00000082, 540...","[5400517PAG_00000082, 1025168PAG_00000017, 102...","[1025126PAG_00000095, 1025126PAG_00000042, 540...","[1025126PAG_00000042, 1025168PAG_00000017, 540...","[1025126PAG_00000042, 1025126PAG_00000095, 540..."
2,1025126,PAG_00000102,?,"[5400513PAG_00000061, 1025165PAG_00000022, 540...","[5400511PAG_00000139, 5400511PAG_00000004, 540...","[5400511PAG_00000004, 5400511PAG_00000139, 540...","[5400511PAG_00000139, 5400511PAG_00000004, 540...","[5400511PAG_00000004, 5400511PAG_00000139, 540..."
3,1025126,PAG_00000120,Quel est le poids de l'auteur à midi le mercre...,"[1025126PAG_00000126, 1025126PAG_00000121, 102...","[1025126PAG_00000122, 1025126PAG_00000124, 102...","[1025126PAG_00000120, 1025126PAG_00000121, 102...","[1025126PAG_00000122, 1025126PAG_00000121, 102...","[1025126PAG_00000122, 1025126PAG_00000121, 102..."
4,1025126,PAG_00000131,"Quel est le poids total de l'individu à midi, ...","[1025126PAG_00000111, 1025126PAG_00000109, 102...","[1025126PAG_00000118, 1025126PAG_00000113, 102...","[1025126PAG_00000118, 1025126PAG_00000114, 102...","[1025126PAG_00000114, 1025126PAG_00000111, 102...","[1025126PAG_00000118, 1025126PAG_00000114, 102..."
...,...,...,...,...,...,...,...,...
59,5400517,PAG_00000308,Pourquoi le système osseux est-il souvent oubl...,"[5400517PAG_00000308, 5400517PAG_00000309, 540...","[5400517PAG_00000308, 5400517PAG_00000309, 540...","[5400517PAG_00000308, 5400519PAG_00000012, 540...","[5400517PAG_00000308, 5400517PAG_00000309, 540...","[5400517PAG_00000308, 5400517PAG_00000309, 540..."
60,5400517,PAG_00000373,"Quel est le rôle de la nature selon l'auteur, ...","[1025133PAG_00000063, 5400517PAG_00000051, 540...","[5400517PAG_00000379, 5400517PAG_00000051, 540...","[5400517PAG_00000379, 5400517PAG_00000434, 540...","[5400517PAG_00000379, 5400517PAG_00000051, 540...","[5400517PAG_00000379, 5400517PAG_00000434, 540..."
61,5400517,PAG_00000374,?,"[5400513PAG_00000061, 1025165PAG_00000022, 540...","[5400511PAG_00000139, 5400511PAG_00000004, 540...","[5400511PAG_00000004, 5400511PAG_00000139, 540...","[5400511PAG_00000139, 5400511PAG_00000004, 540...","[5400511PAG_00000004, 5400511PAG_00000139, 540..."
62,5400519,PAG_00000042,Quel est le but de l'application d'une autre a...,"[5400519PAG_00000042, 5400519PAG_00000049, 540...","[5400519PAG_00000042, 5400519PAG_00000041, 540...","[5400519PAG_00000042, 5400519PAG_00000047, 540...","[5400519PAG_00000042, 5400519PAG_00000051, 540...","[5400519PAG_00000042, 5400519PAG_00000047, 540..."


# IR Evaluation

Checking whether the first retrieved document is the correct one or, if it is not, whether the correct document is in the retrieved set and at which position.
We need to remember that 65 questions were generated.

## Lexical embeddings Evaluation

In [23]:
# 1-based position of each question's own document (id + pagina) within its
# lexical retrieved list; 'NA' when that document was not retrieved.
ev_lexic = []
# zip() walks the columns positionally, so this does not depend on the index labels.
for doc, page, ids in zip(preguntas1.id, preguntas1.pagina, preguntas1.ids_lexic):
    correcto = str(doc) + str(page)  # same id format as the retrieved ids
    try:
        ev_lexic.append(ids.index(correcto) + 1)  # +1 because list positions start at 0
    except ValueError:
        # list.index raises ValueError when the document is absent: it was not retrieved.
        ev_lexic.append('NA')

In [24]:
# Keep the lexical rank (or 'NA') next to each question.
preguntas1['Eval_Lex'] = ev_lexic

In [1]:
# Count how many questions found their document at each rank.
# Ranks are printed in ascending order with 'NA' last; iterating the bare set
# gave an arbitrary order for the mixed int/str values.
for rank in sorted(set(ev_lexic), key=lambda r: (isinstance(r, str), r)):
    print(f'#times that the document is in {rank} place: {ev_lexic.count(rank)}')

#times that the document is in 1 place: 30
#times that the document is in 2 place: 5
#times that the document is in 3 place: 4
#times that the document is in 4 place: 3
#times that the document is in NA place: 16
#times that the document is in 6 place: 3
#times that the document is in 8 place: 2
#times that the document is in 10 place: 1


## Dense embeddings Evaluation

In [26]:
# 1-based position of each question's own document (id + pagina) within its
# dense retrieved list; 'NA' when that document was not retrieved.
ev_dense = []
# zip() walks the columns positionally, so this does not depend on the index labels.
for doc, page, ids in zip(preguntas1.id, preguntas1.pagina, preguntas1.ids_dense):
    correcto = str(doc) + str(page)  # same id format as the retrieved ids
    try:
        ev_dense.append(ids.index(correcto) + 1)  # +1 because list positions start at 0
    except ValueError:
        # list.index raises ValueError when the document is absent: it was not retrieved.
        ev_dense.append('NA')

In [2]:
# Count how many questions found their document at each rank.
# Ranks are printed in ascending order with 'NA' last; iterating the bare set
# gave an arbitrary order for the mixed int/str values.
for rank in sorted(set(ev_dense), key=lambda r: (isinstance(r, str), r)):
    print(f'#times that the document is in {rank} place: {ev_dense.count(rank)}')

#times that the document is in 1 place: 24
#times that the document is in 2 place: 8
#times that the document is in 3 place: 5
#times that the document is in 4 place: 3
#times that the document is in 5 place: 3
#times that the document is in NA place: 16
#times that the document is in 6 place: 3
#times that the document is in 8 place: 1
#times that the document is in 9 place: 1


In [32]:
# Keep the dense rank (or 'NA') next to each question.
preguntas1['Eval_Dense'] = ev_dense

## ColBERT embeddings Evaluation

In [30]:
# 1-based position of each question's own document (id + pagina) within its
# ColBERT retrieved list; 'NA' when that document was not retrieved.
ev_colb = []
# zip() walks the columns positionally, so this does not depend on the index labels.
for doc, page, ids in zip(preguntas1.id, preguntas1.pagina, preguntas1.ids_colbe):
    correcto = str(doc) + str(page)  # same id format as the retrieved ids
    try:
        ev_colb.append(ids.index(correcto) + 1)  # +1 because list positions start at 0
    except ValueError:
        # list.index raises ValueError when the document is absent: it was not retrieved.
        ev_colb.append('NA')

In [3]:
# Count how many questions found their document at each rank.
# Ranks are printed in ascending order with 'NA' last; iterating the bare set
# gave an arbitrary order for the mixed int/str values.
# NOTE(review): the output recorded under this cell uses an English message, so it
# was produced by a different version of this print; re-run the cell to refresh it.
for rank in sorted(set(ev_colb), key=lambda r: (isinstance(r, str), r)):
    print(f'Veces que se encontró en lugar {rank}: {ev_colb.count(rank)}')

#times that the document is in 1 place: 30
#times that the document is in 2 place: 6
#times that the document is in 3 place: 6
#times that the document is in 4 place: 3
#times that the document is in NA place: 15
#times that the document is in 7 place: 1
#times that the document is in 6 place: 1
#times that the document is in 9 place: 1
#times that the document is in 8 place: 1


In [33]:
# Keep the ColBERT rank (or 'NA') next to each question.
preguntas1['Eval_Colb'] = ev_colb

## Sparse + Dense score Evaluation

In [34]:
# 1-based position of each question's own document (id + pagina) within its
# sparse+dense retrieved list; 'NA' when that document was not retrieved.
ev_2comb = []
# zip() walks the columns positionally, so this does not depend on the index labels.
for doc, page, ids in zip(preguntas1.id, preguntas1.pagina, preguntas1.ids_recup_lex_den):
    correcto = str(doc) + str(page)  # same id format as the retrieved ids
    try:
        ev_2comb.append(ids.index(correcto) + 1)  # +1 because list positions start at 0
    except ValueError:
        # list.index raises ValueError when the document is absent: it was not retrieved.
        ev_2comb.append('NA')

In [4]:
# Count how many questions found their document at each rank.
# Ranks are printed in ascending order with 'NA' last; iterating the bare set
# gave an arbitrary order for the mixed int/str values.
# NOTE(review): the output recorded under this cell uses an English message, so it
# was produced by a different version of this print; re-run the cell to refresh it.
for rank in sorted(set(ev_2comb), key=lambda r: (isinstance(r, str), r)):
    print(f'Veces que se encontró en lugar {rank}: {ev_2comb.count(rank)}')

#times that the document is in 1 place: 32
#times that the document is in 2 place: 12
#times that the document is in 3 place: 3
#times that the document is in NA place: 16
#times that the document is in 6 place: 1


In [36]:
# Keep the sparse+dense rank (or 'NA') next to each question.
preguntas1['Eval_Lex_Dense'] = ev_2comb

## Sparse + Dense + ColBERT score Evaluation

In [37]:
# 1-based position of each question's own document (id + pagina) within its
# sparse+dense+ColBERT retrieved list; 'NA' when that document was not retrieved.
ev_3embs = []
# zip() walks the columns positionally, so this does not depend on the index labels.
for doc, page, ids in zip(preguntas1.id, preguntas1.pagina, preguntas1.ids_recup_lex_den_col):
    correcto = str(doc) + str(page)  # same id format as the retrieved ids
    try:
        ev_3embs.append(ids.index(correcto) + 1)  # +1 because list positions start at 0
    except ValueError:
        # list.index raises ValueError when the document is absent: it was not retrieved.
        ev_3embs.append('NA')

In [5]:
# Count how many questions found their document at each rank.
# Ranks are printed in ascending order with 'NA' last; iterating the bare set
# gave an arbitrary order for the mixed int/str values.
# NOTE(review): the output recorded under this cell uses an English message, so it
# was produced by a different version of this print; re-run the cell to refresh it.
for rank in sorted(set(ev_3embs), key=lambda r: (isinstance(r, str), r)):
    print(f'Veces que se encontró en lugar {rank}: {ev_3embs.count(rank)}')

#times that the document is in 1 place: 31
#times that the document is in 2 place: 13
#times that the document is in 3 place: 3
#times that the document is in NA place: 15
#times that the document is in 6 place: 1
#times that the document is in 10 place: 1


In [39]:
# Keep the sparse+dense+ColBERT rank (or 'NA') next to each question.
preguntas1['Eval_3embs'] = ev_3embs