In [None]:
import os 

def load_documents(directory, prefix, suffix=".txt"):
    """Read every matching text file under ``directory`` into lists of stripped lines.

    Args:
        directory (str): Root folder, walked recursively.
        prefix (str): Required start of the file name (e.g. ``"hypo"``).
        suffix (str): Required end of the file name. Defaults to ``".txt"``.

    Returns:
        list[list[str]]: One inner list per file containing that file's lines with
        surrounding whitespace removed. Files appear in sorted path order; a missing
        or empty directory yields an empty list.
    """
    # os.walk makes no ordering promise, so sort the paths explicitly: later cells
    # pair hypotheses with references purely by position.
    paths = sorted(
        os.path.join(root, name)
        for root, _dirs, names in os.walk(directory)
        for name in names
        if name.startswith(prefix) and name.endswith(suffix)
    )

    documents = []
    for path in paths:
        # Decode as UTF-8 instead of the platform default encoding
        # (assumes the text files are stored as UTF-8 -- TODO confirm).
        with open(path, encoding="utf-8") as handle:
            documents.append([line.strip() for line in handle])
    return documents


hypos = load_documents("hypos", "hypo")
ground_truths = load_documents("ground_truths", "gt")

In [None]:
# Sanity check: show how many hypothesis and reference documents were loaded.
len(hypos), len(ground_truths)

In [None]:
import unicodedata
import re

def read_vi_text(text):
    """Return ``text`` in Unicode NFC form with each whitespace run collapsed to one space.

    Leading and trailing whitespace is collapsed like any other run, not removed.
    """
    composed = unicodedata.normalize("NFC", text)
    return re.sub(r'\s+', ' ', composed)

# Clean every reference line, updating `ground_truths` in place, then keep a
# separate list (`read_vi`) that points at the same cleaned documents.
ground_truths[:] = [[read_vi_text(line) for line in doc] for doc in ground_truths]
read_vi = list(ground_truths)

In [None]:
# Display the cleaned reference documents (one list of lines per document).
read_vi

In [None]:
import textwrap

# Show the first reference document, each line wrapped at 120 columns and
# followed by a blank line.
for paragraph in read_vi[0]:
    print(textwrap.fill(paragraph, width=120), end="\n\n")

In [None]:
from unidecode import unidecode
import re

def normalize_text(text):
    """
    Fold Vietnamese text to plain lowercase ASCII.

    The text is transliterated with ``unidecode``, every run of whitespace is
    collapsed to a single space, surrounding whitespace is trimmed and the
    result is lower-cased.

    Args:
    text (str): Input Vietnamese text to be normalized.

    Returns:
    str: Lower-cased ASCII text with single spaces between words.
    """
    ascii_text = unidecode(text)
    collapsed = re.sub(r'\s+', ' ', ascii_text)
    return collapsed.strip().lower()


In [None]:
# One normalized string per hypothesis document, wrapped in a single-element list.
# Join the lines first and normalize the result: normalizing line by line and
# joining afterwards leaves runs of spaces (or leading/trailing spaces) wherever
# the source file contained blank lines.
normalized_hypos = [[normalize_text(' '.join(hypo))] for hypo in hypos]

In [None]:
# Number of normalized hypothesis documents.
len(normalized_hypos)

In [None]:
# One normalized string per reference document, wrapped in a single-element list.
# Join the lines first and normalize the result: normalizing line by line and
# joining afterwards leaves runs of spaces (or leading/trailing spaces) wherever
# the source file contained blank lines.
normalized_ground_truths = [[normalize_text(' '.join(ground_truth))] for ground_truth in ground_truths]

In [None]:
# Number of normalized reference documents.
len(normalized_ground_truths)

In [None]:
import textwrap

def wraptext(documents, width=120):
    """Print each text wrapped to ``width`` columns, followed by a dashed rule.

    Args:
        documents: Iterable of strings to print.
        width (int): Maximum line length and length of the rule. Defaults to 120.
    """
    rule = "-" * width
    for text in documents:
        chunks = textwrap.wrap(text, width=width)
        # Emit the wrapped lines and the closing rule in one write.
        print("\n".join(chunks + [rule]))

In [None]:
# Print every normalized reference document under a numbered heading.
for number, document in enumerate(normalized_ground_truths, start=1):
    print(f"Document {number}:")
    wraptext(document)

In [None]:
# Print every normalized hypothesis document under a numbered heading.
for number, document in enumerate(normalized_hypos, start=1):
    print(f"Document {number}:")
    wraptext(document)

### Using BERTScore for Evaluation

1. BERTScore is used to measure textual similarity between candidate texts and reference texts. It considers not only exact word matches but also the overall meaning, fluency, and order of the output.

2. BERTScore: Precision, Recall, F1
    * Precision measures how well the candidate texts avoid introducing irrelevant content.
    * Recall measures how well the candidate texts avoid omitting relevant content.
    * $F_1 = 2 \cdot \frac{P \cdot R}{P + R}$

In [None]:
# Hide the loading messages
import logging
import transformers
# Use the library's public verbosity switch instead of reaching into the loggers
# of individual internal modules, whose layout can change between releases.
# This limits all transformers log output to errors.
transformers.logging.set_verbosity_error()

In [None]:
from bert_score import score

In [None]:
def bert_score(hypos, refs, lang="vi"):
    """Compute BERTScore for each hypothesis against the reference at the same position.

    Args:
        hypos: Sequence of candidate documents; each item is passed to ``score``
            as the candidate list.
        refs: Sequence of reference documents aligned with ``hypos`` by position;
            each item is passed to ``score`` as the reference list.
        lang (str): Language code forwarded to ``score``. Defaults to ``"vi"``.

    Returns:
        list: One ``score(...)`` result per (hypothesis, reference) pair, in input
        order. Callers in this notebook index each result as
        (precision, recall, F1).

    Raises:
        ValueError: If ``hypos`` and ``refs`` differ in length.
    """
    hypos, refs = list(hypos), list(refs)
    if len(hypos) != len(refs):
        raise ValueError(
            f"hypos and refs must have the same length, got {len(hypos)} and {len(refs)}"
        )
    # Score aligned pairs only. Comparing every hypothesis with every reference
    # would mix in pairs that answer different questions and skew the averages.
    return [
        score(hypo, ref, lang=lang, verbose=False)
        for hypo, ref in zip(hypos, refs)
    ]

In [None]:
# Run BERTScore on the normalized hypothesis and reference documents.
bert_scores = bert_score(normalized_hypos, normalized_ground_truths)

In [None]:
import numpy as np

# Stack the per-pair results so that column 0/1/2 holds precision/recall/F1.
bert_scores = np.array(bert_scores)

print("BERTScore:")
for label, column in (("Precision", 0), ("Recall", 1), ("F1", 2)):
    print(f"{label}: {bert_scores[:, column].mean():.2f}")

### Using ROUGE score for Evaluation

In [None]:
from rouge_score import rouge_scorer

def rouge_score(hypos, refs):
    """Score each hypothesis against the reference at the same position.

    A ``RougeScorer`` for ROUGE-1, ROUGE-2 and ROUGE-L (stemming enabled) is
    built on every call. The reference is passed to ``scorer.score`` first and
    the hypothesis second.

    Args:
        hypos: Iterable of hypothesis strings.
        refs: Iterable of reference strings, aligned with ``hypos``.

    Returns:
        list: One ``scorer.score`` result per (reference, hypothesis) pair.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    results = []
    for reference, hypothesis in zip(refs, hypos):
        results.append(scorer.score(reference, hypothesis))
    return results

# Score each normalized hypothesis document against its positional reference.
rouge_scores = [
    rouge_score(hypo, ref)
    for hypo, ref in zip(normalized_hypos, normalized_ground_truths)
]

In [None]:
# Flatten the nested ROUGE results into one list per (variant, statistic).
rouge_1_precisions = []
rouge_1_recalls = []
rouge_1_fmeasures = []
rouge_2_precisions = []
rouge_2_recalls = []
rouge_2_fmeasures = []
rouge_L_precisions = []
rouge_L_recalls = []
rouge_L_fmeasures = []

# The loop variables must not be called `rouge_score`: that would rebind the
# function of the same name and break any re-run of the cell that calls it.
for document_scores in rouge_scores:
    for pair_scores in document_scores:
        rouge_1 = pair_scores['rouge1']
        rouge_2 = pair_scores['rouge2']
        rouge_L = pair_scores['rougeL']

        rouge_1_precisions.append(rouge_1.precision)
        rouge_1_recalls.append(rouge_1.recall)
        rouge_1_fmeasures.append(rouge_1.fmeasure)
        rouge_2_precisions.append(rouge_2.precision)
        rouge_2_recalls.append(rouge_2.recall)
        rouge_2_fmeasures.append(rouge_2.fmeasure)
        rouge_L_precisions.append(rouge_L.precision)
        rouge_L_recalls.append(rouge_L.recall)
        rouge_L_fmeasures.append(rouge_L.fmeasure)

In [None]:
# Convert every collected list to a NumPy array (the names are rebound to the arrays).
rouge_1_precisions = np.array(rouge_1_precisions)
rouge_1_recalls = np.array(rouge_1_recalls)
rouge_1_fmeasures = np.array(rouge_1_fmeasures)
rouge_2_precisions = np.array(rouge_2_precisions)
rouge_2_recalls = np.array(rouge_2_recalls)
rouge_2_fmeasures = np.array(rouge_2_fmeasures)
rouge_L_precisions = np.array(rouge_L_precisions)
rouge_L_recalls = np.array(rouge_L_recalls)
rouge_L_fmeasures = np.array(rouge_L_fmeasures)

# Report the mean of each statistic, grouped by ROUGE variant.
rouge_report = (
    ("ROUGE-1 Precision", rouge_1_precisions),
    ("ROUGE-1 Recall", rouge_1_recalls),
    ("ROUGE-1 F1", rouge_1_fmeasures),
    ("ROUGE-2 Precision", rouge_2_precisions),
    ("ROUGE-2 Recall", rouge_2_recalls),
    ("ROUGE-2 F1", rouge_2_fmeasures),
    ("ROUGE-L Precision", rouge_L_precisions),
    ("ROUGE-L Recall", rouge_L_recalls),
    ("ROUGE-L F1", rouge_L_fmeasures),
)
for label, values in rouge_report:
    print(f"{label}: {values.mean():.2f}")

### Using Perplexity for Evaluation

In [None]:
# Define the function to calculate Perplexity score
from transformers import BertTokenizer, BertForMaskedLM
import torch


def perplexity_score(hypos, tokenizer, model, max_length=512):
    """Score each hypothesis with ``exp`` of the model's language-modelling loss.

    Args:
        hypos: Iterable of documents; only the first element (``hypo[0]``) of
            each document is scored.
        tokenizer: Tokenizer whose ``encode`` accepts ``return_tensors``,
            ``truncation`` and ``max_length``.
        model: Callable as ``model(input_ids, labels=input_ids)`` whose first
            output is the loss.
        max_length (int): Maximum number of token ids fed to the model.
            Defaults to 512.

    Returns:
        list[float]: ``exp(loss)`` for every hypothesis, in input order.

    NOTE(review): the labels are the unmasked inputs themselves. With a masked
    language model this means the model sees every token it is asked to predict,
    so the value is not a conventional perplexity and will tend to sit close to 1.
    Consider a pseudo-perplexity (masking one token at a time) if a meaningful
    fluency score is needed -- confirm the intended metric.
    """
    perplexity_scores = []
    for hypo in hypos:
        # Truncate inside the tokenizer rather than slicing the tensor afterwards:
        # slicing cuts off the trailing special token of over-long inputs.
        input_ids = tokenizer.encode(
            hypo[0],
            return_tensors="pt",
            truncation=True,
            max_length=max_length,
        )
        with torch.no_grad():
            loss = model(input_ids, labels=input_ids)[0]
        perplexity_scores.append(torch.exp(loss).item())
    return perplexity_scores

# Load the tokenizer and the masked-LM model from the same pre-trained checkpoint.
PRETRAINED_NAME = "bert-base-multilingual-cased"
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_NAME)
model = BertForMaskedLM.from_pretrained(PRETRAINED_NAME)

# Score every normalized hypothesis document.
perplexity = perplexity_score(normalized_hypos, tokenizer, model)

In [None]:
# Convert the per-document scores to a NumPy array so they can be averaged.
perplexity = np.array(perplexity)

In [None]:
# Report the mean score over all hypothesis documents.
print(f"Perplexity: {perplexity.mean():.2f}")

### RESULTS

| METRIC   | Precision | Recall | F1 |
|---------|----------|-------|-------|
| BERTScore|0.81 |0.83 |0.82 |
| ROUGE-1| 0.82|0.50 |0.61 |
| ROUGE-2| 0.46|0.28 | 0.35|
| ROUGE-L| 0.40|0.25 |0.30 |

Perplexity = 1.06

### Total Execution Time

1. Câu hỏi 1: Tổng quan về ngành Điện tử - Viễn thông của Đại học Bách Khoa Hà Nội
2. Câu hỏi 2: Thời tiết ở Hà Nội trong 3 ngày tới như thế nào?
3. Câu hỏi 3: Sự phát triển của chíp bán dẫn ở Việt Nam như thế nào?

#### Đối với nhiệm vụ viết báo cáo nghiên cứu

|Time (s) | T1 | T2 | T3|
|----|----|----|----|
|    | 140.212| 106.878 | 140.509|

#### Đối với nhiệm vụ phân tích nguồn tham khảo

|Time (s) | T1 | T2 | T3|
|----|----|----| ----|
|    |151.931  | 145.522 | 185.061|

#### Đối với nhiệm vụ viết khung báo cáo

|Time (s) | T1 | T2 | T3|
|----|----|----| ----|
|    | 116.042 | 97.370 | 109.562|

#### Đối với nhiệm vụ viết câu trả lời cho câu hỏi

|Time (s) | T1 | T2 | T3|
|----|----|----| ----|
|    | 101.072 | 56.558 | 103.992|

### Review
1. Summarization is a complex task; even advanced models can struggle to accurately capture all the important information from a source text.

2. Metrics such as BERTScore and ROUGE focus on things like whether keywords from the source text are included, so they do not perfectly reflect human judgement of a good summary.

3. The chatbot's responses are drawn from many different sources.


### HUMAN EVALUATION IS THE BEST WAY TO EVALUATE THE QUALITY OF A SUMMARY