Augmentation pipeline

In [65]:
import os
import torch
from mrq.aug import Augmentator, Paraphraser, augment_init_data
from mrq.logger import get_logger

log = get_logger(__name__)

# Set TOKENIZERS_PARALLELISM to "false" for this process before any model is
# created (presumably read by the Hugging Face tokenizers library — TODO confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Use the first CUDA device when torch reports one, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

paraphraser = Paraphraser(device=device)

# Run the augmentation step on "aug_init_data.json" with the paraphraser above.
# `return_=True` is passed so the result is also bound to `augs` here.
# NOTE(review): the second path looks like the output file — confirm in mrq.aug.
augs = augment_init_data(
    "aug_init_data.json",
    "test_models_augmentation.json",
    num_seq=6,
    epochs=3,
    return_=True,
    paraphraser=paraphraser,
)

# Wrap the returned augmentations in a callable augmenter.
augmenter = Augmentator(augs, aug_num=4)

# smoke test: call the augmenter on a two-sentence string and print the result
print(augmenter("My very relevant sentence. And another one"))

2023-05-28 19:24:49,163 - mrq.aug - INFO - Initial number of examples: 23
2023-05-28 19:24:49,165 - mrq.aug - INFO - Result number (before set) of examples: 138
2023-05-28 19:24:49,165 - mrq.aug - INFO - Epoch 0 of 1


  0%|          | 0/23 [00:00<?, ?it/s]

['My very relevant sentence.', 'Great lately, enjoy some much needed downtime and explore new hobbies.', 'And another one', "However my world has lately been quite hectic, but fortunately I'm managed to stay focused.", 'It has been stimulating my brain with some interesting projects lately.', 'To be honest, I have been feeling some very stressed, but now I am addressing it.']


In [4]:
import os

import pandas as pd
import torch
from mrq import PROJECT_PATHS
from mrq.algs import GoldenRetriever
from mrq.aug import Augmentator
from mrq.data import AnswerDB, Query, load_data
from mrq.logger import get_logger
from mrq.models import EmbedModel, NERClassifier
from tqdm.auto import tqdm

# Logger for this notebook, obtained from the project's logging helper.
log = get_logger(__name__)

# Set TOKENIZERS_PARALLELISM to "false" for this process.
# NOTE(review): presumably read by the Hugging Face tokenizers library to turn
# off its internal parallelism — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
# Use the first CUDA device when torch reports one, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Passed as `batch_len` to the `.encode(...)` calls in the comparison cell.
batch_len = 10

# Model names handed to NERClassifier in the comparison loop
# (presumably Hugging Face Hub identifiers — TODO confirm).
ner_models = [
    "ukkendane/bert-medical-ner",
    "samrawal/bert-base-uncased_clinical-ner",
    "samrawal/bert-large-uncased_med-ner",
]

# Model names handed to EmbedModel in the comparison loop
# (presumably Hugging Face Hub identifiers — TODO confirm).
emb_models = [
    "emilyalsentzer/Bio_ClinicalBERT",
    "medicalai/ClinicalBERT",
    "pritamdeka/S-Biomed-Roberta-snli-multinli-stsb",
    "menadsa/S-BioELECTRA",
    "TimKond/S-BioLinkBert-MedQuAD",
    "TimKond/S-PubMedBert-MedQuAD",
    "kamalkraj/bioelectra-base-discriminator-pubmed",
]

Data loading

In [None]:
# Sampling configuration applied to both datasets below.
sample_size = 100
seed = 1337

# Load a sample from each dataset with the same size and seed.
# Assumes load_data returns pandas DataFrames with "Q" and "A" columns, which is
# how `data` is used in the comparison cell — TODO confirm against mrq.data.
data = load_data("medmcqa", sample_size=sample_size, seed=seed)
data1 = load_data("AnonymousSub/MedQuAD_47441_Question_Answer_Pairs", sample_size=sample_size, seed=seed)

# Stack the two samples row-wise. DataFrame.append was removed in pandas 2.0;
# pd.concat is the supported replacement and yields the same frame. Index labels
# are kept as-is (no ignore_index), exactly as append did.
data = pd.concat([data, data1])

Additional setup

In [None]:
# Build the augmenter from the JSON file under PROJECT_PATHS.data, with aug_num=4.
# NOTE(review): this rebinds `augmenter`, which the augmentation-pipeline cell
# also assigns (there from the in-memory result of augment_init_data, written
# with a bare relative file name) — confirm both refer to the same data.
augmenter = Augmentator(PROJECT_PATHS.data / "test_models_augmentation.json", aug_num=4)


Comparison

It would be better to rewrite this with matrix multiplication, but I had to spend time on the service itself and its wrappers, so this is a very straightforward implementation with loops.

## WIP

In [None]:
# WIP: compare every (embedding model, NER model) pair on the loaded questions.
# `results_df` is created for the results, but nothing is written to it yet.
results_df = pd.DataFrame(columns=["embeddings", "ner", "scores"])

# Progress counters start at 1 so the log reads "1/N" ... "N/N" instead of
# "0/N" ... "(N-1)/N".
for i_emb, emb in enumerate(emb_models, start=1):
    log.info(
        "Starting process for {0} embedding model, {1}/{2}".format(
            emb, i_emb, len(emb_models)
        )
    )
    emb_model = EmbedModel(emb, device=device)

    # data preparation
    # NOTE(review): the questions are re-augmented for every embedding model; if
    # the augmenter is not deterministic, each model sees different inputs.
    # Consider augmenting once before the loop — confirm intent.
    augmented_questions = data["Q"].map(augmenter)
    answers = AnswerDB(data["A"].tolist()).encode(
        emb=emb_model, tqdm_flag=True, batch_len=batch_len
    )
    retriever = GoldenRetriever(answers)
    log.info("All answers encoded")
    # emb_model._model.to("cpu")
    # emb_model.device = "cpu"
    for i_ner, ner in enumerate(ner_models, start=1):
        log.info("Processing {0} model, {1}/{2}".format(ner, i_ner, len(ner_models)))

        ner_model = NERClassifier(ner, device=device)

        # Run NER extraction on each augmented question and join the extracted
        # pieces into one space-separated string per question.
        questions_extracted = augmented_questions.map(ner_model.extract).map(" ".join)
        questions = AnswerDB(questions_extracted.tolist()).encode(
            emb=emb_model, tqdm_flag=True, batch_len=batch_len
        )
        # this is bad, but AnswerDB refactoring is required otherwise
        for i_q in tqdm(range(len(questions)), desc="query"):
            q = Query(text=questions.data[i_q], embedded=questions.embedded[i_q])
            res = retriever.find_it(q=q)
            # WIP: stop after the first query; `res` is not scored or stored yet.
            break
        # WIP: stop after the first NER model.
        break
    # WIP: stop after the first embedding model.
    break

Approx pipeline:

1. augment data input 
2. ner classifier
3. query
4. embeddings
5. retrieval model
6. score


retrieval -> sorted indices (+ real answer from the start)-> score -> top_n or mean/median position

In [2]:
import torch

# Scratch check of torch.topk: two identical rows, take the 3 largest values per
# row in descending order; the cell displays the (values, indices) result.
row = [10, 2, 12, 145, 1]
x = torch.tensor([row, row])
torch.topk(x, k=3, largest=True, sorted=True)

torch.return_types.topk(
values=tensor([[145,  12,  10],
        [145,  12,  10]]),
indices=tensor([[3, 2, 0],
        [3, 2, 0]]))