In [4]:
# import models
import sys
import os
!pip install transformers
sys.path.append(os.path.abspath(os.path.join(os.getcwd(),"..", "helper")))
from bart_limitations import generate_lims

Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.31.2-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Downloading transformers-4.51.3-py3-none-any.whl (10.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading huggingface_hub-0.31.2-py3-none-any.whl (484 kB)
Downloading regex-2024.11.6-cp311-cp311-m

In [2]:
import json
import torch
import os
import gc
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# One generation run per input variant. Each entry holds the keyword arguments
# handed to generate_lims (assumed to read `input_path` with the checkpoint at
# `model_path` and write results to `output_path` — confirm in the helper module).
generation_runs = [
    {
        "model_path": "../Generate_limitations/training_model/model_output_abstract/final",
        "input_path": "test_inputs/abstract_conclusion_test.json",
        "output_path": "test_outputs/generated_limitations_abstract.jsonl",
    },
    {
        "model_path": "../Generate_limitations/training_model/model_output_tokenized/final",
        "input_path": "test_inputs/tokenized_test.json",
        "output_path": "test_outputs/generated_limitations_tokenized.jsonl",
    },
    {
        "model_path": "../Generate_limitations/training_model/model_output_full/final",
        "input_path": "test_inputs/full_text_test.json",
        "output_path": "test_outputs/generated_limitations_full.jsonl",
    },
]

# Runs execute in list order: abstract, tokenized, full.
for run_kwargs in generation_runs:
    generate_lims(**run_kwargs)




In [36]:
import os
import json
from tqdm import tqdm 
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import os


# Expose no CUDA devices to this process (presumably to keep the metric
# computations below on CPU).
# NOTE(review): this variable is only honoured if it is set before CUDA is
# initialised in the kernel — confirm no earlier cell has already used the GPU.
os.environ["CUDA_VISIBLE_DEVICES"] = ""

# NLTK smoothing method 4; bleu_score passes it to sentence_bleu as
# `smoothing_function`.
smooth_function = SmoothingFunction().method4

def clean_bullet_points(bullets):
    """Normalise a bullet collection into a list of bullet strings.

    Args:
        bullets: Either a single string in which bullets are separated by
            " - ", or a list that is already split into bullets.

    Returns:
        For a string: the non-empty, whitespace-stripped pieces obtained by
        splitting on " - ". For a list: the list itself, unchanged. For any
        other value (including None): an empty list.
    """
    if isinstance(bullets, str):
        return [piece.strip() for piece in bullets.split(" - ") if piece.strip()]
    if isinstance(bullets, list):
        # Must test and return the `bullets` parameter; referring to any other
        # name here raises NameError for every non-string input.
        return bullets
    return []

def flat_bullets(text):
    """Collapse bullets into one string.

    A string is returned as-is, a list is joined with single spaces, and any
    other value (including None) yields the empty string.
    """
    if isinstance(text, str):
        return text
    if isinstance(text, list):
        return " ".join(text)
    return ""
    
# Reference limitations, keyed by paper name with any ".pdf" removed.
ground_truth_file = "limitations_2024/limitations_only_gpt4_nano.json"
ground_truth = {}
with open(ground_truth_file, "r", encoding="utf-8") as f:
    for entry in json.load(f):
        # Entries lacking either field are ignored.
        if "paper" not in entry or "target_bullets" not in entry:
            continue
        paper_id = entry["paper"].replace(".pdf", "")
        ground_truth[paper_id] = entry["target_bullets"]

# Generated-output file (JSON Lines) for each model variant.
generated_files = {
    "abstract": "test_outputs/generated_limitations_abstract.jsonl",
    "tokenized": "test_outputs/generated_limitations_tokenized.jsonl",
    "full": "test_outputs/generated_limitations_full.jsonl",
}

In [28]:
def bleu_score(ground, text):
    """Sentence-level BLEU of generated limitations against the reference.

    Both inputs are normalised with clean_bullet_points, their bullets are
    joined with spaces and split on whitespace into tokens, and the result is
    scored with NLTK's sentence_bleu using the notebook-level smooth_function.

    Args:
        ground: Reference limitations (string or list of bullets).
        text: Generated limitations (string or list of bullets).

    Returns:
        The BLEU score, or 0 when either side has no tokens.
    """
    ref_tokens = " ".join(clean_bullet_points(ground)).split()
    hyp_tokens = " ".join(clean_bullet_points(text)).split()
    # Check the token lists themselves: once the reference is wrapped in the
    # one-element list sentence_bleu expects, it is always truthy and an
    # emptiness test on it can never fire.
    if not ref_tokens or not hyp_tokens:
        return 0
    return sentence_bleu([ref_tokens], hyp_tokens, smoothing_function=smooth_function)

# Mean sentence-level BLEU for each model variant over its generated outputs.
for model, path in generated_files.items():
    bleu_scores = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline) in the JSONL file.
                continue
            item = json.loads(line)
            # ground_truth keys are stored without ".pdf"; normalise the id the
            # same way so the lookup cannot miss on the extension.
            paper = item["paper"].replace(".pdf", "")
            bleu_scores.append(bleu_score(ground_truth.get(paper), item["generated"]))
    mean = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0
    # ".4f" gives four decimals; "4f" would only set a minimum field width.
    print(f"Bleu score for {model}: {mean:.4f} (from {len(bleu_scores)} papers)")

Bleu score for abstract: 0.025879 (from 5 papers)
Bleu score for tokenized: 0.027108 (from 5 papers)
Bleu score for full: 0.033661 (from 5 papers)


In [29]:
!pip install rouge-score

from rouge_score import rouge_scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

for model, path in generated_files.items():
    r1 = []
    rl = []
    with open(path, "r", encoding="utf-8") as f:
        for l in f:
            item = json.loads(l)
            paper = item["paper"]
            ground = ground_truth.get(paper)
            generated = flat_bullets(item["generated"])
            
            score = scorer.score(ground, generated)
            r1.append(score["rouge1"].fmeasure)
            rl.append(score["rougeL"].fmeasure)
    mean_r1 = sum(r1) / len(r1) if r1 else 0
    mean_rl = sum(rl) / len(rl) if rl else 0
    print(f"Rouge score for {model}:")
    print(f"Rouge-1 F1: {mean_r1:.4f}")
    print(f"Rouge-L F1: {mean_rl:.4f}")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Rouge score for abstract:
Rouge-1 F1: 0.2996
Rouge-L F1: 0.1671
Rouge score for tokenized:
Rouge-1 F1: 0.2999
Rouge-L F1: 0.1749
Rouge score for full:
Rouge-1 F1: 0.3232
Rouge-L F1: 0.1820


In [40]:
# !pip install bert-score
from bert_score import score
import json

# BERTScore F1 for each model variant, computed on CPU.
for model, path in generated_files.items():
    with open(path, "r", encoding="utf-8") as f:
        records = [json.loads(line) for line in f]

    # Reference and candidate strings, aligned by position.
    references = [
        flat_bullets(ground_truth.get(record["paper"].replace(".pdf", "")))
        for record in records
    ]
    candidates = [flat_bullets(record["generated"]) for record in records]

    precision, recall, f1_per_paper = score(
        candidates, references, lang="en", device="cpu", verbose=False
    )
    # Per-paper F1 tensor first, then its mean.
    print(f1_per_paper)
    mean_f1 = f1_per_paper.mean().item()
    print(f"BERT score for {model.upper()}:")
    print(f" F1: {mean_f1:.4f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor([0.8764, 0.8754, 0.8717, 0.8693, 0.8624])
BERT score for ABSTRACT:
 F1: 0.8710


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor([0.8757, 0.8713, 0.8660, 0.8669, 0.8427])
BERT score for TOKENIZED:
 F1: 0.8645


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor([0.8792, 0.8639, 0.8584, 0.8678, 0.8764])
BERT score for FULL:
 F1: 0.8691
