# Loading and Preprocessing data

In [1]:
import os
import json

In [2]:
# ViNewsQA_path = "/content/drive/MyDrive/UIT-ViIR-ViNewsQA-6000"
# os.chdir(ViNewsQA_path)

In [3]:
ViQuAD_path = "/content/drive/MyDrive/UIT-ViIR-ViQuAD-6000/UIT-ViIR-ViQuAD"
os.chdir(ViQuAD_path)

In [4]:
questions = json.load(open('questions.json', encoding="utf-8"))
retrieved_results = json.load(open('retrieved_results.json', encoding="utf-8"))
texts = json.load(open('texts.json', encoding="utf-8"))

In [5]:
%cd "/content"
!pip3 install -q vncorenlp
!mkdir -p vncorenlp/models/wordsegmenter
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.1.1.jar
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/vi-vocab
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/wordsegmenter.rdr
!mv VnCoreNLP-1.1.1.jar vncorenlp/
!mv vi-vocab vncorenlp/models/wordsegmenter/
!mv wordsegmenter.rdr vncorenlp/models/wordsegmenter

/content
[K     |████████████████████████████████| 2.7MB 6.9MB/s 
[?25h  Building wheel for vncorenlp (setup.py) ... [?25l[?25hdone
--2021-06-25 01:07:45--  https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.1.1.jar
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 27412575 (26M) [application/octet-stream]
Saving to: ‘VnCoreNLP-1.1.1.jar’


2021-06-25 01:07:45 (84.3 MB/s) - ‘VnCoreNLP-1.1.1.jar’ saved [27412575/27412575]

--2021-06-25 01:07:45--  https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/vi-vocab
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.

In [6]:
from vncorenlp import VnCoreNLP
from nltk import flatten

root_path = os.getcwd()
VnCoreNLP_path = root_path + "/vncorenlp/VnCoreNLP-1.1.1.jar"
rdrsegmenter = VnCoreNLP(
    VnCoreNLP_path, annotators="wseg", max_heap_size="-Xmx500m")


def tokenize(sentence):
    return ' '.join(i for i in flatten(rdrsegmenter.tokenize(sentence)))

In [7]:
import string


def tokenizer(text):
    tokenized_doc = []
    text = tokenize(text)
    for token in text.lower().split():
        token = token.strip(string.punctuation)

        if len(token) > 0:
            tokenized_doc.append(token)
    return tokenized_doc

In [8]:
from tqdm.auto import tqdm

tokenized_corpus = []
for text in tqdm(list(texts.values())):
    tokenized_corpus.append(tokenizer(text))

HBox(children=(FloatProgress(value=0.0, max=1441.0), HTML(value='')))




# TF-IDF

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizor = TfidfVectorizer(decode_error='replace', strip_accents='unicode', analyzer='word',
                                   ngram_range=(1, 1), max_features=5000,
                                   norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=True,
                                   max_df=0.95, min_df=2)
X = tfidf_vectorizor.fit_transform(
    [" ".join(sent) for sent in tokenized_corpus])

In [10]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


def search(query, top_k):
    query_vec = tfidf_vectorizor.transform([query])
    cosine_similarities = cosine_similarity(X, query_vec).flatten()
    most_similar_doc_indices = np.argsort(
        cosine_similarities, axis=0)[:-top_k-1:-1]
    results = []
    for index in most_similar_doc_indices:
        results.append(list(texts.values())[index])
    return results

In [11]:
MRR = 0
for query_idx in tqdm(list(questions.keys())):
    retrieved_answers = search(
        questions[query_idx], top_k=len(list(questions.keys())))
    if texts[retrieved_results[query_idx]] in retrieved_answers:
        MRR += (1 /
                (retrieved_answers.index(texts[retrieved_results[query_idx]]) + 1))
print(f"Model: TF-IDF MRR: {round(MRR / len(list(questions.keys())), 5)}")

HBox(children=(FloatProgress(value=0.0, max=5992.0), HTML(value='')))


Model: TF-IDF MRR: 0.29417


In [12]:
for top_k in tqdm([1, 5, 10]):
    RECALL = 0
    for query_idx in tqdm(list(questions.keys())):
        retrieved_answers = search(questions[query_idx], top_k=top_k)
        if texts[retrieved_results[query_idx]] in retrieved_answers:
            RECALL += 1
    print(
        f"Model: IF-IDF Recall@{top_k}: {round(RECALL / len(list(questions.keys())), 5)}")

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=5992.0), HTML(value='')))


Model: IF-IDF Recall@1: 0.21195


HBox(children=(FloatProgress(value=0.0, max=5992.0), HTML(value='')))


Model: IF-IDF Recall@5: 0.3785


HBox(children=(FloatProgress(value=0.0, max=5992.0), HTML(value='')))


Model: IF-IDF Recall@10: 0.44893



# BM25

In [9]:
!pip3 install -q rank_bm25
from rank_bm25 import BM25Okapi, BM25L, BM25Plus

bm25 = BM25Okapi(tokenized_corpus)
bm25l = BM25L(tokenized_corpus)
bm25plus = BM25Plus(tokenized_corpus)

In [10]:
import numpy as np


def search(model, query, top_k):
    scores = model.get_scores(tokenizer(query))
    top_n = np.argpartition(scores, -top_k)[-top_k:]
    bm25_hits = [{'corpus_id': idx, 'score': scores[idx]} for idx in top_n]
    bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
    results = []
    for hit in bm25_hits[0:top_k]:
        results.append(list(texts.values())[hit['corpus_id']])
    return results

In [None]:
models = [bm25, bm25l, bm25plus]
model_names = ["Okapi BM25", "BM25L", "BM25+"]
for model, model_name in tqdm(zip(models, model_names)):
    MRR = 0
    for query_idx in tqdm(list(questions.keys())):
        retrieved_answers = search(
            model, questions[query_idx], top_k=1000)
        if texts[retrieved_results[query_idx]] in retrieved_answers:
            MRR += (1 /
                    (retrieved_answers.index(texts[retrieved_results[query_idx]]) + 1))
    print(
        f"Model: {model_name} MRR: {round(MRR / len(list(questions.keys())), 5)}")

In [17]:
for model, model_name in tqdm(zip(models, model_names)):
    for top_k in tqdm([1, 5, 10]):
        RECALL = 0
        for query_idx in tqdm(list(questions.keys())):
            retrieved_answers = search(
                model, questions[query_idx], top_k=top_k)
            if texts[retrieved_results[query_idx]] in retrieved_answers:
                RECALL += 1
        print(
            f"Model: {model_name} Recall@{top_k}: {round(RECALL / len(list(questions.keys())), 5)}")

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=5992.0), HTML(value='')))


Model: Okapi BM25 Recall@1: 0.65454


HBox(children=(FloatProgress(value=0.0, max=5992.0), HTML(value='')))


Model: Okapi BM25 Recall@5: 0.81192


HBox(children=(FloatProgress(value=0.0, max=5992.0), HTML(value='')))


Model: Okapi BM25 Recall@10: 0.85397



HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=5992.0), HTML(value='')))


Model: BM25L Recall@1: 0.35681


HBox(children=(FloatProgress(value=0.0, max=5992.0), HTML(value='')))


Model: BM25L Recall@5: 0.61816


HBox(children=(FloatProgress(value=0.0, max=5992.0), HTML(value='')))


Model: BM25L Recall@10: 0.70895



HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=5992.0), HTML(value='')))


Model: BM25+ Recall@1: 0.67223


HBox(children=(FloatProgress(value=0.0, max=5992.0), HTML(value='')))


Model: BM25+ Recall@5: 0.82644


HBox(children=(FloatProgress(value=0.0, max=5992.0), HTML(value='')))


Model: BM25+ Recall@10: 0.87066




In [24]:
top_k = 1
query_idx = list(questions.keys())[1526]
query = questions[query_idx]
print("Query: ", query)
retrieved_answers = search(bm25plus, query, top_k=top_k)
true_answer = texts[retrieved_results[query_idx]]

print("\nRetrived answers: ")
for retrieved_answer in retrieved_answers:
    print(retrieved_answer)
print(f"\n\nTrue answers:\n{true_answer}")

Query:  Tổng diện tích mặt sàn của thư viện Arsenal là bao nhiêu?

Retrived answers: 
Thư viện Arsenal hiện nay có tổng diện tích sàn khoảng 10 nghìn mét vuông, trong đó 7.484 mét vuông sử dụng. Đón tiếp một số lượng độc giả không lớn, thư viện chỉ có 119 mét vuông cho phòng đọc với 48 chỗ ngồi. Công chúng độc giả của thư viện Arsenal phần đông là sinh viên hoặc giới nghiên cứu, cùng với khoảng 10% là giới hưu trí. Tổng cộng, mỗi năm thư viện đón tiếp khoảng từ 18 đến 19 nghìn độc giả. Bộ sưu tập tài liệu của thư viện vẫn chủ yếu về hai lĩnh vực lịch sử và văn học, trong đó có một số lượng lớn sách in, được phần chia thành trước và sau năm 1880, khoảng 12 nghìn bản viết tay từ thời Trung Cổ cho tới ngày nay, khoảng 100 nghìn bản in gồm cả chân dung, tranh biếm họa, bản đồ, cuối cùng, một lượng lớn những bản chép nhạc và các tạp chí.


True answers:
Thư viện Arsenal hiện nay có tổng diện tích sàn khoảng 10 nghìn mét vuông, trong đó 7.484 mét vuông sử dụng. Đón tiếp một số lượng độc giả 

# BERT

In [9]:
!pip3 install -q sentence-transformers
import torch
from sentence_transformers import SentenceTransformer, CrossEncoder, util


def search(query, top_k, model_name, cross_encoder_flag, bi_encoder, cross_encoder):
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    question_embedding = question_embedding.cuda()
    hits = util.semantic_search(
        question_embedding, corpus_embeddings, top_k=top_k)
    hits = hits[0]
    if cross_encoder_flag == True:
        cross_inp = [[query, list(texts.values())[hit['corpus_id']]]
                     for hit in hits]
        cross_scores = cross_encoder.predict(cross_inp)
        for idx in range(len(cross_scores)):
            hits[idx]['cross-score'] = cross_scores[idx]
        hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
    else:
        hits = sorted(hits, key=lambda x: x['score'], reverse=True)
    results = []
    for hit in hits[0:top_k]:
        results.append(list(texts.values())[hit['corpus_id']])
    return results

[K     |████████████████████████████████| 92kB 6.0MB/s 
[K     |████████████████████████████████| 2.5MB 14.2MB/s 
[K     |████████████████████████████████| 1.2MB 53.5MB/s 
[K     |████████████████████████████████| 901kB 51.0MB/s 
[K     |████████████████████████████████| 3.3MB 30.3MB/s 
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone


In [13]:
model_names = ["distiluse-base-multilingual-cased-v2",
               "paraphrase-multilingual-MiniLM-L12-v2"]
cross_encoder_flags = [False, True]

In [14]:
for model_name in tqdm(model_names):
    bi_encoder = SentenceTransformer(model_name)
    cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
    corpus_embeddings = bi_encoder.encode([" ".join(
        sent) for sent in tokenized_corpus], convert_to_tensor=True, show_progress_bar=True)
    for cross_encoder_flag in tqdm(cross_encoder_flags):
        for top_k in tqdm([1, 5, 10]):
            RECALL = 0
            for query_idx in tqdm(list(questions.keys())):
                retrieved_answers = search(questions[query_idx], top_k=top_k, model_name=model_name, cross_encoder_flag=cross_encoder_flag, bi_encoder=bi_encoder, cross_encoder=cross_encoder)
                if texts[retrieved_results[query_idx]] in retrieved_answers:
                    RECALL += 1
            print(
                f"Model: {model_name} Cross-Encoder: {cross_encoder_flag} Recall@{top_k}: {round(RECALL / len(list(questions.keys())), 5)}")

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Batches', max=38.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=5992.0), HTML(value='')))


Model: distiluse-base-multilingual-cased-v2 Cross-Encoder: False Recall@1: 0.18558


HBox(children=(FloatProgress(value=0.0, max=5992.0), HTML(value='')))


Model: distiluse-base-multilingual-cased-v2 Cross-Encoder: False Recall@5: 0.33027


HBox(children=(FloatProgress(value=0.0, max=5992.0), HTML(value='')))


Model: distiluse-base-multilingual-cased-v2 Cross-Encoder: False Recall@10: 0.39386



HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=5992.0), HTML(value='')))


Model: distiluse-base-multilingual-cased-v2 Cross-Encoder: True Recall@1: 0.18558


HBox(children=(FloatProgress(value=0.0, max=5992.0), HTML(value='')))


Model: distiluse-base-multilingual-cased-v2 Cross-Encoder: True Recall@5: 0.33027


HBox(children=(FloatProgress(value=0.0, max=5992.0), HTML(value='')))


Model: distiluse-base-multilingual-cased-v2 Cross-Encoder: True Recall@10: 0.39386




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=968.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=3743.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=645.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=122.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=229.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=470693617.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=53.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5069051.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=239.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=9081518.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=480.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=14763234.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=190.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Batches', max=38.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=5992.0), HTML(value='')))


Model: paraphrase-multilingual-MiniLM-L12-v2 Cross-Encoder: False Recall@1: 0.15037


HBox(children=(FloatProgress(value=0.0, max=5992.0), HTML(value='')))


Model: paraphrase-multilingual-MiniLM-L12-v2 Cross-Encoder: False Recall@5: 0.27036


HBox(children=(FloatProgress(value=0.0, max=5992.0), HTML(value='')))


Model: paraphrase-multilingual-MiniLM-L12-v2 Cross-Encoder: False Recall@10: 0.33495



HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=5992.0), HTML(value='')))


Model: paraphrase-multilingual-MiniLM-L12-v2 Cross-Encoder: True Recall@1: 0.15037


HBox(children=(FloatProgress(value=0.0, max=5992.0), HTML(value='')))

KeyboardInterrupt: ignored

In [15]:
cross_encoder_flags = [False]
for model_name in tqdm(model_names):
    bi_encoder = SentenceTransformer(model_name)
    cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
    corpus_embeddings = bi_encoder.encode([" ".join(
        sent) for sent in tokenized_corpus], convert_to_tensor=True, show_progress_bar=True)
    for cross_encoder_flag in tqdm(cross_encoder_flags):
        MRR = 0
        for query_idx in tqdm(list(questions.keys())):
            retrieved_answers = search(questions[query_idx], top_k=1000, model_name=model_name, cross_encoder_flag=cross_encoder_flag, bi_encoder=bi_encoder, cross_encoder=cross_encoder)
            if texts[retrieved_results[query_idx]] in retrieved_answers:
                MRR += (1 /
                        (retrieved_answers.index(texts[retrieved_results[query_idx]]) + 1))
        print(
            f"Model: {model_name} Cross-Encoder: {cross_encoder_flag} MRR: {round(MRR / len(list(questions.keys())), 5)}")

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Batches', max=38.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=5992.0), HTML(value='')))


Model: distiluse-base-multilingual-cased-v2 Cross-Encoder: False MRR: 0.25851



HBox(children=(FloatProgress(value=0.0, description='Batches', max=38.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=5992.0), HTML(value='')))


Model: paraphrase-multilingual-MiniLM-L12-v2 Cross-Encoder: False MRR: 0.21308


