<a href="https://colab.research.google.com/github/minhpham0201/MED-MCQA/blob/master/Main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Dependencies

In [1]:
! pip install transformers datasets accelerate -q
! pip install sentence-transformers -q
! pip install deep-translator -q
! pip install faiss-gpu -q

# ! pip install peft -q
# ! pip install bitsandbytes -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m40.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m50.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.1/311.1 kB[0m [31m37.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m94.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m61.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━

In [2]:
import torch
import pandas as pd
import tqdm
import numpy as np
import pickle
import datasets
import re
import json

from collections import Counter
import faiss
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification, AutoModelForSeq2SeqLM

# Helper Function

In [None]:
# to be deleted
from deep_translator import GoogleTranslator

def translate_text(text):
    translator = GoogleTranslator(source='vi', target='en')
    try:
        trans_text = translator.translate(text)
        return trans_text

    except Exception as e:
        return "Error: " + str(e)

# Use this to map translating only for test hf dataset
def translate_text_google(example):
    question = example['question']
    op1 = example['option_1']
    op2 = example['option_2']
    op3 = example['option_3']
    op4 = example['option_4']

    return {
        'trans_question': translate_text(question),
        'trans_option_1': translate_text(op1),
        'trans_option_2': translate_text(op2),
        'trans_option_3': translate_text(op3),
        'trans_option_4': translate_text(op4)
    }

In [None]:
# to be deleted
from transformers import pipeline

def summarize_text(text,min_length=220, max_length=400):
    summarizer = pipeline("summarization", model='sshleifer/distilbart-cnn-12-6')
    summary = summarizer(text, max_length=max_length, min_length=min_length)
    return summary

In [3]:
def convert_name_NER_batch(examples):
    model_name = 'NlpHUST/ner-vietnamese-electra-base'
    tokenizer = AutoTokenizer.from_pretrained(model_name, padding='longest', max_length=108, truncation=True, use_fast=True)
    model = AutoModelForTokenClassification.from_pretrained(model_name)
    ner = pipeline("ner", model=model, tokenizer=tokenizer)

    question_batch = examples['question']
    result_batch = ner(question_batch)

    ner_question_batch = []
    for result, question in zip(result_batch, question_batch):
        i = 1
        person_dict = {}
        for ent in result:
            ent_label = ent['entity']
            ent_word = ent['word']

            if ent_label == 'B-PERSON':
                if ent_word not in person_dict:
                    replace_name = 'Person' + str(i)
                    person_dict[ent_word] = replace_name
                    i += 1
        for k, v in person_dict.items():
            question = question.replace(k, v)

        ner_question_batch.append(question)

    examples['ner_question'] = ner_question_batch

    return examples

In [4]:
def translate_dataset_batch(examples):  # Faster to use in batch
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model_name = 'VietAI/envit5-translation'
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

    columns = ['ner_question','option_1','option_2','option_3','option_4','option_5']
    translated_dataset = {}

    for col in columns:
        if col == 'ner_question':
            inputs = [f'vi: {text}' for text in examples[col]]

        else:
            inputs = ['vi: ' + re.sub(r'[A-E]\.', '', item).strip() for item in examples[col]]

        inputs = tokenizer(inputs, return_tensors="pt", padding="longest", truncation=True, max_length=108)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Translate the inputs in batches
        with torch.inference_mode():
            outputs = model.generate(**inputs, min_length=0,max_length=108,no_repeat_ngram_size=2)

        # Decode the outputs and remove the special tokens
        translated_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        trans_texts = [text.replace('en:','').replace('vi:','').strip() for text in translated_texts]

        translated_dataset[f'trans_{col}'] = trans_texts

    return translated_dataset

In [None]:
# to be deleted when done
import datasets
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

def translate_term_for_mapping_batch(examples):
    ########### model AI translate
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model_name = 'VietAI/envit5-translation'
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

    term_batch = examples['Name']
    term_batch_prepend = [f'vi : {term}' for term in term_batch]

    inputs = tokenizer(term_batch_prepend, return_tensors="pt", padding="longest", truncation=True, max_length=56)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.inference_mode():
        outputs = model.generate(**inputs, min_length=0,max_length=56,no_repeat_ngram_size=2)

    translated_terms = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    trans_terms = [term.replace('en:','').replace('vi:','').strip() for term in translated_terms]

    examples['model_trans_term']= trans_terms

    ########### Google Translate
    ggtrans_terms = []
    for term in term_batch:
        ggtrans_term = translate_text(term)
        ggtrans_terms.append(ggtrans_term)

    examples['gg_trans_term']= ggtrans_terms

    return examples

# Sentence Embedding Corpus

## Process & Save (do not run, just run for set up 1 time)

In [None]:
df = pd.read_csv('/content/drive/MyDrive/merge_paragraphs_break500.csv')
df.head(5)

Unnamed: 0,Url,ItemId,ItemEn,Content,Content_Length
0,alzheimer,355086003,What is Alzheimer's disease?,Alzheimer's is a disease that causes memory lo...,90
1,alzheimer,448457066,History of disease detection.,One of the great mysteries of Alzheimer's dise...,245
2,alzheimer,1046973888,Reason.,The brain is the organ that consumes up to 20%...,366
3,alzheimer,669482261,Who is susceptible to the disease?,The risk of developing Alzheimer's disease inc...,65
4,alzheimer,526999626,Common symptoms of Alzheimer's syndrome.,Our brains are made up of more than 100 billio...,465


In [None]:
df.shape

(5422, 5)

In [None]:
df_half2 = df[2000:3000]

In [None]:
from sentence_transformers import SentenceTransformer, util

model_name = 'sentence-transformers/all-mpnet-base-v2'     # Accuracy model 768 dimension
# model_name = 'sentence-transformers/all-MiniLM-L6-v2'    # Light model 384 dimension

def get_content_embedding(text):
    model = SentenceTransformer(model_name)
    sentence_embedding = model.encode(text)
    return sentence_embedding

In [None]:
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()

try:
    df_half2['Content_Embedding'] = df_half2['Content'].progress_apply(get_content_embedding)
except Exception as e:
    print(f"An error occurred: {str(e)}")

In [None]:
with open('/content/drive/MyDrive/df_half2.pkl', 'wb') as file:
    pickle.dump(df_half2, file)

In [None]:
with open('/content/drive/MyDrive/OLD/df_half0.pkl', 'rb') as file:
    df_half0 = pickle.load(file)
with open('/content/drive/MyDrive/OLD/df_half1.pkl', 'rb') as file:
    df_half1 = pickle.load(file)
with open('/content/drive/MyDrive/OLD/df_half2.pkl', 'rb') as file:
    df_half2 = pickle.load(file)
with open('/content/drive/MyDrive/OLD/df_half3.pkl', 'rb') as file:
    df_half3 = pickle.load(file)
with open('/content/drive/MyDrive/OLD/df_half4.pkl', 'rb') as file:
    df_half4 = pickle.load(file)
with open('/content/drive/MyDrive/OLD/df_half5.pkl', 'rb') as file:
    df_half5 = pickle.load(file)

In [None]:
df_final = pd.concat([df_half0, df_half1, df_half2, df_half3, df_half4, df_half5], axis=0)

In [None]:
with open('/content/drive/MyDrive/df_final.pkl', 'wb') as file:
    pickle.dump(df_final, file)

In [None]:
df_768_csv = df_final.drop(columns='Content_Embedding')
df_768_csv.to_csv('/content/drive/MyDrive/df_768.csv')

## Load file embeddings (no need to run)

In [None]:
with open('/content/drive/MyDrive/df_final_768.pkl', 'rb') as file:
    df_final = pickle.load(file)

In [None]:
df_final.shape

(5422, 6)

# Text Retriever

## Save faiss index (do not run, just run when set up 1 time)

In [None]:
import faiss
dimension = 768
index = faiss.IndexFlatL2(dimension)

In [None]:
embeddings = np.vstack(df_final['Content_Embedding'].values)

In [None]:
embeddings.shape

(5422, 768)

In [None]:
index.add(embeddings)

In [None]:
index_file_path = '/content/drive/MyDrive/faiss_768.index'
# Save the index to the specified file
faiss.write_index(index, index_file_path)

## Load faiss index and query

In [None]:
# to be deleted

def get_context_from_question_faiss(question,k,faiss_index_path,trans_corpus_path):
    model_name = 'sentence-transformers/all-mpnet-base-v2'
    model = SentenceTransformer(model_name)

    index = faiss.read_index(faiss_index_path)
    trans_corpus = pd.read_csv(trans_corpus_path)

    xq = model.encode(question)
    xq = np.expand_dims(xq, axis=0)
    D, I = index.search(xq, k)

    return [{trans_corpus['Url'][i]: trans_corpus['Content'][i]} for i in I[0]]

In [None]:
# demo, to be deleted
question = 'What high technology is used to treat hormonal acne?'
get_context_from_question_faiss(question, k=2,faiss_index_path = '/content/drive/MyDrive/faiss_768.index',
                                                trans_corpus_path = '/content/drive/MyDrive/df_768.csv')

In [None]:
# to be deleted
def dataset_get_context(example):
    id = example['id']
    level = id.split('_')[0]
    question = example['trans_ner_question']

    if level == 'level1':
        k=1
    else:
        k=2

    context = get_context_from_question_faiss(question=question, k=k,
                                                    faiss_index_path = '/content/drive/MyDrive/faiss_768.index',
                                                    trans_corpus_path = '/content/drive/MyDrive/df_768.csv')
    concat_context = "\n".join(context) # Concat all text in to long
    example['context'] = concat_context
    return example

In [None]:
# use this for batch faiss encode

faiss_index_path = '/content/drive/MyDrive/faiss_768.index'
trans_corpus_path = '/content/drive/MyDrive/df_768.csv'
index = faiss.read_index(faiss_index_path)
trans_corpus = pd.read_csv(trans_corpus_path)

def dataset_get_context_batch(examples):
    id_batch = examples['id']
    level_batch = [id.split('_')[0] for id in id_batch]
    question_batch = examples['trans_ner_question']

    ## batch sentenceTransformers
    model_name = 'sentence-transformers/all-mpnet-base-v2'
    model = SentenceTransformer(model_name)

    xq_batch = model.encode(question_batch)
    context_batch = []
    for xq, level in zip(xq_batch, level_batch):
        if level == 'level1':
            k=1
        else:
            k=2
        xq = np.expand_dims(xq, axis=0)  # query vector
        D, I = index.search(xq, k)   # D: distance, I: index
        context = [trans_corpus['Content'][i] for i in I[0]]
        concat_context = "\n".join(context) # Concat all text in to long text

        context_batch.append(concat_context)

    examples['context'] = context_batch
    return examples

In [5]:
# use this for batch faiss encode (updated to fetch multiple context)

faiss_index_path = '/content/drive/MyDrive/faiss_768.index'
trans_corpus_path = '/content/drive/MyDrive/df_768.csv'
index = faiss.read_index(faiss_index_path)
trans_corpus = pd.read_csv(trans_corpus_path)

def dataset_get_multi_context_batch(examples):
    question_batch = examples['trans_ner_question']

    # batch sentenceTransformers
    model_name = 'sentence-transformers/all-mpnet-base-v2'
    model = SentenceTransformer(model_name)

    xq_batch = model.encode(question_batch)
    # Create 2 contexts for 1 question
    context_batch_1 = []
    context_batch_2 = []

    for xq in xq_batch:
        xq = np.expand_dims(xq, axis=0)  # query vector
        D, I = index.search(xq, 10)   # D: distance, I: index
        context = [trans_corpus['Content'][i] for i in I[0]]

        ok_context = ''
        ok_batch = []
        for text in context:
            if len(text.split()) <= 350:
                ok_context = ok_context + ' \n' + text
                if len(ok_context.split()) > 350:
                    ok_batch.append(ok_context)
                    ok_context = ''
            else:
                ok_context = ok_context + ' \n' + text
                ok_batch.append(ok_context)
                ok_context = ''

            if len(ok_batch) == 2:
                    break

        context_1 = ok_batch[0]
        context_2 = ok_batch[1]
        context_batch_1.append(context_1)
        context_batch_2.append(context_2)

    examples['context_1'] = context_batch_1
    examples['context_2'] = context_batch_2
    return examples

# Main

In [6]:
test_df = pd.read_csv('/content/drive/MyDrive/public_test.csv')

In [7]:
test_df.head(3)

Unnamed: 0,id,question,option_1,option_2,option_3,option_4,option_5,option_6
0,level3_1,Hương đang mang thai và lo lắng mình có thể gặ...,A. Tuần 10,B.Tuần 20,C. Tuần 30,D. Tuần 40,,
1,level3_2,Hương đang mang thai tuần thứ 5 và lo lắng mìn...,A. 5 tuần,B. 15 tuần,C. 25 tuần,D. 35 tuần,,
2,level3_5,Có bao nhiêu loại rau tiền đạo biết rằng trong...,A. 2,B.3,C. 4,D. 5,,


In [8]:
test_df = test_df.fillna('bầu trời có nhiều mây')

In [9]:
test_df.head(3)

Unnamed: 0,id,question,option_1,option_2,option_3,option_4,option_5,option_6
0,level3_1,Hương đang mang thai và lo lắng mình có thể gặ...,A. Tuần 10,B.Tuần 20,C. Tuần 30,D. Tuần 40,bầu trời có nhiều mây,bầu trời có nhiều mây
1,level3_2,Hương đang mang thai tuần thứ 5 và lo lắng mìn...,A. 5 tuần,B. 15 tuần,C. 25 tuần,D. 35 tuần,bầu trời có nhiều mây,bầu trời có nhiều mây
2,level3_5,Có bao nhiêu loại rau tiền đạo biết rằng trong...,A. 2,B.3,C. 4,D. 5,bầu trời có nhiều mây,bầu trời có nhiều mây


In [10]:
test_hf = datasets.Dataset.from_pandas(test_df)

# Convert NER Person

In [11]:
%%time
test_ner = test_hf.map(convert_name_NER_batch, batched=True)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/411k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/532M [00:00<?, ?B/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


CPU times: user 33.1 s, sys: 844 ms, total: 33.9 s
Wall time: 15.2 s


# Translate dataset

## Translate & Process Test set (by Google)

In [None]:
%%time
test_ggtrans = test.map(translate_text_google,num_proc=4)

NameError: ignored

In [None]:
%%time
test_ggtrans_context = test_ggtrans.map(dataset_get_context)

In [None]:
test_ggtrans_context

In [None]:
pd.DataFrame(test_ggtrans_context).head(2)

## Translate & Process (Using translation Model)

In [12]:
%%time
test_ner_trans = test_ner.map(translate_dataset_batch, batched=True)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/1.10M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/3.49M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/721 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.10G [00:00<?, ?B/s]

CPU times: user 14.7 s, sys: 3.13 s, total: 17.9 s
Wall time: 29.7 s


In [None]:
pd.DataFrame(test_ner_trans).head(2)

Unnamed: 0,id,question,option_1,option_2,option_3,option_4,option_5,option_6,ner_question,trans_ner_question,trans_option_1,trans_option_2,trans_option_3,trans_option_4,trans_option_5
0,level3_1,Hương đang mang thai và lo lắng mình có thể gặ...,A. Tuần 10,B.Tuần 20,C. Tuần 30,D. Tuần 40,bầu trời có nhiều mây,bầu trời có nhiều mây,Person1 đang mang thai và lo lắng mình có thể ...,Person1 is pregnant and is worried about the p...,Week 10,Week 20,30th,Week 40,Cloudy sky
1,level3_2,Hương đang mang thai tuần thứ 5 và lo lắng mìn...,A. 5 tuần,B. 15 tuần,C. 25 tuần,D. 35 tuần,bầu trời có nhiều mây,bầu trời có nhiều mây,Person1 đang mang thai tuần thứ 5 và lo lắng m...,Person1 is in her fifth week of pregnancy and ...,5 weeks,15 Weeks,25 Weeks,35 weeks,Cloudy sky


# Term Mapping

## Prepare term mapping dictionary

In [None]:
term_df = pd.read_csv('/content/strongs.csv')
term_df_hf = datasets.Dataset.from_pandas(term_df)

In [None]:
term_df_hf = term_df_hf.map(translate_term_for_mapping_batch, batched=True)

Map:   0%|          | 0/603 [00:00<?, ? examples/s]

In [None]:
pd.DataFrame(term_df_hf).to_csv('/content/drive/MyDrive/term_mapping.csv')

In [None]:
# convert to dict
term_dict = {}
for _, row in term_df.iterrows():
    wrong_term = row['model_trans_term']
    target_term = row['gg_trans_term']

    if wrong_term not in term_dict:
        term_dict[wrong_term] = target_term

In [None]:
with open('/content/drive/MyDrive/term_dict.json', 'w', encoding ='utf8') as json_file:
    json.dump(term_dict, json_file)

## Load Term Mapping & Apply To Question



In [13]:
term_dict_path = '/content/drive/MyDrive/term_dict.json'
with open(term_dict_path) as f:
    term_dict = json.load(f)
    f.close()

def to_map(texts):
    mapped_texts = []
    for text in texts:
        for k,v in term_dict.items():
            text = text.replace(k,v)
        mapped_texts.append(text)
    return mapped_texts

def mapping_term_batch(examples):
    examples['trans_ner_question'] = to_map(examples['trans_ner_question'])
    examples['trans_option_1'] = to_map(examples['trans_option_1'])
    examples['trans_option_2'] = to_map(examples['trans_option_2'])
    examples['trans_option_3'] = to_map(examples['trans_option_3'])
    examples['trans_option_4'] = to_map(examples['trans_option_4'])
    examples['trans_option_5'] = to_map(examples['trans_option_5'])

    return examples

In [14]:
%%time
test_ner_trans_term = test_ner_trans.map(mapping_term_batch, batched=True)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

CPU times: user 76.8 ms, sys: 791 µs, total: 77.6 ms
Wall time: 76.3 ms


# Get Context For Dataset

In [15]:
%%time
test_ner_trans_term_context = test_ner_trans_term.map(dataset_get_multi_context_batch, batched=True)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Downloading (…)99753/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)0cdb299753/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)db299753/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)753/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)99753/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)9753/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)0cdb299753/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)b299753/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

CPU times: user 3.31 s, sys: 873 ms, total: 4.18 s
Wall time: 11.8 s


In [None]:
pd.DataFrame(test_ner_trans_term_context)[['trans_ner_question','context_1','context_2']]

Unnamed: 0,trans_ner_question,context_1,context_2
0,Person1 is pregnant and is worried about the p...,\nDoctor Nguyen Duc Hinh said that most cases...,"\nUsually, the diagnosis is made using ultras..."
1,Person1 is in her fifth week of pregnancy and ...,\nTo diagnose uterine prolapse during pregnan...,\nDoctor Nguyen Duc Hinh said that most cases...
2,How many of these are forward vegetables that ...,\nPeople living in environments where lung fl...,\nNewborns with undescended testicles will co...
3,A patient with Person1 was diagnosed with hepa...,\nHepatitis B is classified into two types: a...,\nAcute hepatitis B is characterized by the p...
4,A patient presented with testicular pain. Afte...,\nLeft testicle pain comes from many differen...,\nMost cases of right testicular pain can be ...
...,...,...,...
95,Person1 Sugar is a famous female singer in Chi...,\nVoice is very important in each of our live...,\nLaryngitis often comes suddenly and becomes...
96,The morning is now 9 months pregnant. There ar...,\nDr. Huynh Van Trung answers some other freq...,\nPregnant women often have vaginal discharge...
97,Mr. Person1 is 73 years old. At a family dinne...,\nThis disease can affect almost any part of ...,"\nIn cases where hand, foot and mouth disease..."
98,Brain tumorss are neoplasms that form in the c...,\nDefinition: A benign brain tumor is a type ...,\nThere are more than 130 different types of ...


# Deploy Zero Shot Model

In [16]:
# remember to delete 'accepted_prompt' when testing complete!!!!!!!!

def create_prompt_zero_shot_model(examples):
    question_batch = examples['trans_ner_question']
    context_batch_1 = examples['context_1']
    context_batch_2 = examples['context_2']
    prompt_batch_1 = [f'Question: {question} [SEP] Answer question given Context: {context}' for question, context in zip(question_batch, context_batch_1)]
    prompt_batch_2 = [f'Question: {question} [SEP] Answer question given Context: {context}' for question, context in zip(question_batch, context_batch_2)]

    return {
            'prompt_1': prompt_batch_1,
            'prompt_2': prompt_batch_2
            }

In [17]:
%%time
test_ner_trans_term_context_prompt = test_ner_trans_term_context.map(create_prompt_zero_shot_model,batched=True)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

CPU times: user 24.5 ms, sys: 4.82 ms, total: 29.3 ms
Wall time: 28 ms


In [None]:
# MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7  --- 270M params
# MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli   --- 435M params

classifier = pipeline("zero-shot-classification", model="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli", device=0, use_fast=True)

def get_binary_answer(choices: list, answers: list):
    length = len(choices)
    result = ['0']*length
    for answer in answers:
        for i in range(len(choices)):
            if answer == choices[i]:
                result[i] = '1'
    return ''.join(result)


def get_prediction(example):
    prompt = example['prompt']

    id = example['id']
    choices = [example['trans_option_1'], example['trans_option_2'],
               example['trans_option_3'], example['trans_option_4'], example['trans_option_5']]

    choices = [item for item in choices if item != 'Cloudy sky']

    predicted_labels = []
    threshold = 0.85

    results = classifier(prompt, choices, multi_label=True)
    scores = results['scores']
    labels = results['labels']

    highest_score_index = np.argmax(scores)
    my_answers = [labels[highest_score_index]]

    # Include labels with scores above the threshold
    for i in range(len(labels)):
        if (scores[i] >= threshold) and (labels[i] not in my_answers):
            my_answers.append(labels[i])

    binary_answer = get_binary_answer(choices, my_answers)
    binary_answer = f'{id},{binary_answer}'
    example['binary_answer'] = binary_answer

    return example

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

In [None]:
# Get prediction using multiple context
classifier = pipeline("zero-shot-classification", model="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli", device=0, use_fast=True)

def get_binary_answer(choices: list, answers: list):
    length = len(choices)
    result = ['0']*length
    for answer in answers:
        for i in range(len(choices)):
            if answer == choices[i]:
                result[i] = '1'
    return ''.join(result)


def get_multi_context_prediction(example):
    prompt_1 = example['prompt_1']
    prompt_2 = example['prompt_2']
    id = example['id']
    choices = [example['trans_option_1'], example['trans_option_2'],
               example['trans_option_3'], example['trans_option_4'], example['trans_option_5']]

    choices = [item for item in choices if item != 'Cloudy sky']

    threshold = 0.85

    results = classifier([prompt_1, prompt_2], choices, multi_label=True)
    scores_1, labels_1 = results[0]['scores'], results[0]['labels']
    scores_2, labels_2 = results[1]['scores'], results[1]['labels']

    max_score_1_index = np.argmax(scores_1)
    my_answers_1 = [labels_1[max_score_1_index]]

    max_score_2_index = np.argmax(scores_2)
    my_answers_2 = [labels_2[max_score_2_index]]

    # Include labels with scores above the threshold
    for i in range(len(choices)):
        if (scores_1[i] >= threshold) and (labels_1[i] not in my_answers_1):
            my_answers_1.append(labels_1[i])

        if (scores_2[i] >= threshold) and (labels_2[i] not in my_answers_2):
            my_answers_2.append(labels_2[i])

    binary_answer_1 = get_binary_answer(choices, my_answers_1)
    binary_answer_2 = get_binary_answer(choices, my_answers_2)

    binary_answer_1 = f'{id},{binary_answer_1}'
    binary_answer_2 = f'{id},{binary_answer_2}'
    example['binary_answer_1'] = binary_answer_1
    example['binary_answer_2'] = binary_answer_2
    return example

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

In [None]:
prediction = test_ner_trans_term_context_prompt.map(get_multi_context_prediction)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]



In [None]:
prediction = test_ner_trans_term_context_prompt.map(get_prediction)

In [None]:
pd.DataFrame(prediction)[['binary_answer_1','binary_answer_2']]

In [None]:
prediction = pd.DataFrame(prediction)

In [None]:
df.to_csv('prediction2.csv')

# Ensemble Method

## Download Model to Disk to run offline

In [48]:
from transformers import AutoModelForSequenceClassification

model5 = AutoModelForSequenceClassification.from_pretrained("vicgalle/xlm-roberta-large-xnli-anli")
tokenizer5 = AutoTokenizer.from_pretrained('vicgalle/xlm-roberta-large-xnli-anli')

model5.save_pretrained("/content/drive/MyDrive/Zero Shot Model/xlm-roberta-large-xnli-anli")
tokenizer5.save_pretrained("/content/drive/MyDrive/Zero Shot Model/xlm-roberta-large-xnli-anli")

Downloading (…)lve/main/config.json:   0%|          | 0.00/734 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

('/content/drive/MyDrive/Zero Shot Model/xlm-roberta-large-xnli-anli/tokenizer_config.json',
 '/content/drive/MyDrive/Zero Shot Model/xlm-roberta-large-xnli-anli/special_tokens_map.json',
 '/content/drive/MyDrive/Zero Shot Model/xlm-roberta-large-xnli-anli/sentencepiece.bpe.model',
 '/content/drive/MyDrive/Zero Shot Model/xlm-roberta-large-xnli-anli/added_tokens.json',
 '/content/drive/MyDrive/Zero Shot Model/xlm-roberta-large-xnli-anli/tokenizer.json')

In [54]:
classifier1 = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=0, use_fast=True)
classifier2 = pipeline("zero-shot-classification", model="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli", device=0, use_fast=True)
classifier3 = pipeline("zero-shot-classification", model="alexandrainst/scandi-nli-large", device=0, use_fast=True)
classifier4 = pipeline("zero-shot-classification", model="cross-encoder/nli-deberta-v3-large", device=0, use_fast=True)
classifier5 = pipeline("zero-shot-classification", model="vicgalle/xlm-roberta-large-xnli-anli", device=0, use_fast=True)
classifier6 = pipeline("zero-shot-classification", model="MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli", device=0, use_fast=True)

classifiers = [classifier1, classifier2, classifier3, classifier4, classifier5, classifier6]



OutOfMemoryError: ignored

In [52]:
def get_binary_answer(choices: list, answers: list):
    length = len(choices)
    result = ['0']*length
    for answer in answers:
        for i in range(len(choices)):
            if answer == choices[i]:
                result[i] = '1'
    return ''.join(result)

def ensemble_binary(scores: list, vote=2):
    length = len(scores[0])
    answer = ['0']*length
    for i in range(len(scores[0])):
        count = 0
        for j in range(len(scores)):
          if scores[j][i] == '1':
            count += 1
          if count == vote:
            answer[i] = '1'
    return ''.join(answer)

def final_ensemble(scores):
    length = len(scores[0])
    answer = ['0']*length
    for i in range(len(scores[0])):
        for j in range(len(scores)):
          if scores[j][i] == '1':
            answer[i] = '1'
            break
    return ''.join(answer)

def get_multi_context_prediction_ensemble(example):
    prompt_1 = example['prompt_1']
    prompt_2 = example['prompt_2']
    id = example['id']
    choices = [example['trans_option_1'], example['trans_option_2'],
               example['trans_option_3'], example['trans_option_4'], example['trans_option_5']]

    choices = [item for item in choices if item != 'Cloudy sky']

    threshold = 0.8
    context1_for_ensemble = [] # result will be as binary : ['1001', '1001','1000',.....,'0001'] then used to merge into final answer
    context2_for_ensemble = []

    for classifier in classifiers:
        results = classifier([prompt_1, prompt_2], choices, multi_label=True)
        context1_scores, context1_labels = results[0]['scores'], results[0]['labels']
        context2_scores, context2_labels = results[1]['scores'], results[1]['labels']

        candidate1_answer = []
        candidate2_answer = []
        if np.max(context1_scores) >= threshold: # make sure max score > threshold
            max_score_1_index = np.argmax(context1_scores)
            candidate1_answer.append(context1_labels[max_score_1_index])

        if np.max(context2_scores) >= threshold: # make sure max score > threshold
            max_score_2_index = np.argmax(context2_scores)
            candidate2_answer.append(context2_labels[max_score_2_index])

        # Include labels with scores above the threshold
        for i in range(len(choices)):
            if (context1_scores[i] >= threshold) and (context1_labels[i] not in candidate1_answer):
                candidate1_answer.append(context1_labels[i])

            if (context2_scores[i] >= threshold) and (context2_labels[i] not in candidate2_answer):
                candidate2_answer.append(context2_labels[i])

        bin_candidate1_answer = get_binary_answer(choices, candidate1_answer)
        bin_candidate2_answer = get_binary_answer(choices, candidate2_answer)

        context1_for_ensemble.append(bin_candidate1_answer)
        context2_for_ensemble.append(bin_candidate2_answer)

    # Begin to ensemble answer
    vote = 2
    context1_final_answer = ensemble_binary(context1_for_ensemble, vote=vote)
    context2_final_answer = ensemble_binary(context2_for_ensemble, vote=vote)

    # Final ensemble
    final_answer = final_ensemble([context1_final_answer, context2_final_answer])
    binary_final_answer = f'{id},{final_answer}'

    example['context1_answer'] = context1_final_answer
    example['context2_answer'] = context2_final_answer
    example['binary_final_answer'] = binary_final_answer

    return example

In [53]:
prediction = test_ner_trans_term_context_prompt.map(get_multi_context_prediction_ensemble)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


RuntimeError: ignored

In [50]:
prediction_df = pd.DataFrame(prediction)

NameError: ignored

In [None]:
prediction_df.sample(5)

Unnamed: 0,id,question,option_1,option_2,option_3,option_4,option_5,option_6,ner_question,trans_ner_question,...,trans_option_3,trans_option_4,trans_option_5,context_1,context_2,prompt_1,prompt_2,context1_answer,context2_answer,binary_final_answer
11,level3_55,Bác sĩ chuẩn đoán Lan bị mắc bệnh u não. Sau k...,A. Có,B. Không,C. Không rõ,bầu trời có nhiều mây,bầu trời có nhiều mây,bầu trời có nhiều mây,Bác sĩ chuẩn đoán Person1 bị mắc bệnh u não. S...,The doctor diagnoses Person1 as brain tumor. A...,...,Unknown,Cloudy sky,Cloudy sky,\nBrain tumors in different brain regions wil...,\nThe process of diagnosing benign brain tumo...,Question: The doctor diagnoses Person1 as brai...,Question: The doctor diagnoses Person1 as brai...,0,101,"level3_55,101"
44,level1_54,Các biến chứng của suy tim phải bao gồm những ...,A. Rung nhĩ,B.Suy tim,C. Các vấn đề về van tim,D. Suy giảm chức năng thận,E. Tổn thương gan,bầu trời có nhiều mây,Các biến chứng của suy tim phải bao gồm những ...,What should complications of heart failure inc...,...,Valvular problems,Impaired kidney function,Liver damage,\nComplications become more severe in the lat...,"\nBefore a patient with heart failure, it is ...",Question: What should complications of heart f...,Question: What should complications of heart f...,11111,1111,"level1_54,11111"
20,level1_38,Bác sĩ có thể chỉ định người bệnh thực hiện mộ...,A. Chụp X-quang,B.Chụp MRI,C. Siêu âm,bầu trời có nhiều mây,bầu trời có nhiều mây,bầu trời có nhiều mây,Bác sĩ có thể chỉ định người bệnh thực hiện mộ...,Your doctor may order imaging tests such as:?,...,Ultrasound,Cloudy sky,Cloudy sky,"\nFirst, the doctor will examine and ask abou...",\nParaclinical examinations such as diagnosti...,Question: Your doctor may order imaging tests ...,Question: Your doctor may order imaging tests ...,1,100,"level1_38,101"
58,level1_106,Làm thế nào để phòng ngừa bệnh viêm thanh quản?.,A. Tránh hút thuốc và tránh xa khói thuốc,B.Uống nhiều nước,C. Tránh ăn khuya,bầu trời có nhiều mây,bầu trời có nhiều mây,bầu trời có nhiều mây,Làm thế nào để phòng ngừa bệnh viêm thanh quản?.,How to prevent laryngitis.,...,Avoid Late Eating,Cloudy sky,Cloudy sky,"\nTo prevent laryngitis, you need to follow t...",\nLaryngeal cancer is a dangerous disease but...,Question: How to prevent laryngitis. [SEP] Ans...,Question: How to prevent laryngitis. [SEP] Ans...,111,111,"level1_106,111"
92,level2_38,"Triệu chứng chung của suy tim, suy hô hấp, suy...",Sưng chân và mắt cá chân,"Ngón tay, ngón chân và môi xanh xao, nhợt nhạt",Dạ dày,Khó thở,bầu trời có nhiều mây,bầu trời có nhiều mây,"Triệu chứng chung của suy tim, suy hô hấp, suy...","General symptoms of heart failure, respiratory...",...,Stomach,Shortness of breath,Cloudy sky,\nSymptoms of the disease may be different fo...,\nHeart failure is one of the common causes o...,"Question: General symptoms of heart failure, r...","Question: General symptoms of heart failure, r...",1001,1011,"level2_38,1011"


# Testing different Classifier

In [49]:
classifier1 = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=0, use_fast=True)
classifier2 = pipeline("zero-shot-classification", model="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli", device=0, use_fast=True)
classifier3 = pipeline("zero-shot-classification", model="alexandrainst/scandi-nli-large", device=0, use_fast=True)
classifier4 = pipeline("zero-shot-classification", model="cross-encoder/nli-deberta-v3-large", device=0, use_fast=True)
classifier5 = pipeline("zero-shot-classification", model="vicgalle/xlm-roberta-large-xnli-anli", device=0, use_fast=True)
classifier6 = pipeline("zero-shot-classification", model="NbAiLab/nb-bert-base-mnli", device=0, use_fast=True)

classifiers = [classifier1, classifier2, classifier3, classifier4, classifier5, classifier6]



Downloading (…)lve/main/config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/711M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/312 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [24]:
# function for 1 context
def get_prompt_answer(df, index):
    row = df.iloc[index]
    prompt = row['prompt']
    option1 = row['trans_option_1']
    option2 = row['trans_option_2']
    option3 = row['trans_option_3']
    option4 = row['trans_option_4']
    option5 = row['trans_option_5']
    choices = [option1, option2, option3, option4, option5]
    choices = [item for item in choices if item != 'Cloudy sky']

    return prompt, choices


prompt, choices = get_prompt_answer(prediction, 95)

results1 = classifier1(prompt, choices, multi_label=True)
results2 = classifier2(prompt, choices, multi_label=True)
results3 = classifier3(prompt, choices, multi_label=True)
results4 = classifier4(prompt, choices, multi_label=True)
results5 = classifier5(prompt, choices, multi_label=True)
results6 = classifier6(prompt, choices, multi_label=True)

print(results1['sequence'])

print(results1['labels'], results1['scores'])
print(results2['labels'], results2['scores'])
print(results3['labels'], results3['scores'])
print(results4['labels'], results4['scores'])
print(results5['labels'], results5['scores'])
print(results6['labels'], results6['scores'])

NameError: ignored

In [None]:
function for multi context
def get_prompt_answer_multi_context(df, index):
    row = df.iloc[index]
    prompt1 = row['prompt_1']
    prompt2 = row['prompt_2']
    option1 = row['trans_option_1']
    option2 = row['trans_option_2']
    option3 = row['trans_option_3']
    option4 = row['trans_option_4']
    option5 = row['trans_option_5']
    choices = [option1, option2, option3, option4, option5]
    choices = [item for item in choices if item != 'Cloudy sky']
    prompts = [prompt1, prompt2]
    return prompts, choices

prompts, choices = get_prompt_answer_multi_context(prediction, 95)

results1 = classifier1(prompts, choices, multi_label=True)
results2 = classifier2(prompts, choices, multi_label=True)
results3 = classifier3(prompts, choices, multi_label=True)
results4 = classifier4(prompts, choices, multi_label=True)
results5 = classifier5(prompts, choices, multi_label=True)
results6 = classifier6(prompts, choices, multi_label=True)

print('Prompting')
print(results1[0]['sequence'])
print(results1[1]['sequence'])
print('----------------------')

print(results1[0]['labels'], results1[0]['scores'])
print(results1[1]['labels'], results1[1]['scores'])
print('########')
print(results2[0]['labels'], results2[0]['scores'])
print(results2[1]['labels'], results2[1]['scores'])
print('########')
print(results3[0]['labels'], results3[0]['scores'])
print(results3[1]['labels'], results3[1]['scores'])
print('########')
print(results4[0]['labels'], results4[0]['scores'])
print(results4[1]['labels'], results4[1]['scores'])
print('########')
print(results5[0]['labels'], results5[0]['scores'])
print(results5[1]['labels'], results5[1]['scores'])
print('########')
print(results6[0]['labels'], results6[0]['scores'])
print(results6[1]['labels'], results6[1]['scores'])
print('########')

Prompting
Question: Person1 Sugar is a famous female singer in China. Recently her voice has deteriorated drastically due to uncomfortable symptoms such as sore throat, throat burning and difficulty in swallowing. What could Person 1 Sugar have suffered from? [SEP] Answer question given Context:  
Voice is very important in each of our lives. If you know how to protect your voice, you will prevent voice disorders and keep your voice sweet and clear. Dr. Thuy Hang recommends that you do the following to protect your voice:Avoid overusing your voice: Limit shouting, stop the habit of clearing your throat, and only use your voice when necessary. Quit smoking. Smoking not only harms vocal cord tissue but also increases the risk of dangerous cancers. Always keep your throat moist by drinking lots of water and fruit juice. Be sure to limit caffeine and alcohol because these drinks can make you more susceptible to dehydration. Try to adjust your home and workplace space to not be too dry, ide

In [None]:
# nếu score của classifier1 ko có score nào cao thì sẽ retrieve đoạn text khác

# Multiple Choice Model

In [None]:
from transformers import AutoTokenizer, AutoModelForMultipleChoice
import torch

model_name = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load your fine-tuned model
model = AutoModelForMultipleChoice.from_pretrained("/content/drive/MyDrive/modelMC/checkpoint-2500")

In [None]:
def predict_example(example):
    question = example['question']
    context = example['context']
    opa = example['trans_option_1']
    opb = example['trans_option_2']
    opc = example['trans_option_3']
    opd = example['trans_option_4']

    if opd != '':
        n_choice = 4
    elif opc != '':
        n_choice = 3
    elif opb != '':
        n_choice = 2

    options = [opa, opb, opc, opd][:n_choice]
    input = [f'question: {question}, answer: {option}, context: {context}' for option in [opa, opb, opc, opd][:n_choice]]

    tokenized = tokenizer(input, max_length=512, padding='max_length', truncation=True, return_tensors='pt')
    input_ids = tokenized['input_ids'].unsqueeze(dim=0)
    attention_mask = tokenized['attention_mask'].unsqueeze(dim=0)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    predicted_labels = torch.argmax(outputs.logits, dim=1)
    example['predicted_labels'] = predicted_labels.tolist()

    return example

prediction = test_ggtrans_context.map(predict_example)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
pd.DataFrame(prediction).head(5)

NameError: ignored

# Testing