In [1]:

import sys
import os

!pip install transformers --quiet
!pip install openai --quiet
!pip install more_itertools --quiet
sys.path.append(os.path.abspath(os.path.join(os.getcwd(),"..", "helper")))
from bart_limitations import generate_lims
print("Done")

Done


In [2]:
import json
import torch
import os
import gc
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

generate_lims(
    model_path="../Generate_limitations/training_model/model_output_abstract/final",
    input_path = "test_inputs/abstract_conclusion_test.json",
    output_path="test_outputs/generated_limitations_abstract.jsonl",
    check_limitations=False
)
generate_lims(
    model_path="../Generate_limitations/training_model/model_output_tokenized/final",
    input_path = "test_inputs/tokenized_test.json",
    output_path="test_outputs/generated_limitations_tokenized.jsonl",
    check_limitations=False
)
generate_lims(
    model_path="../Generate_limitations/training_model/model_output_full/final",
    input_path = "test_inputs/full_text_test.json",
    output_path="test_outputs/generated_limitations_full.jsonl",
    check_limitations=False
)


100%|██████████| 1/1 [00:08<00:00,  8.26s/it]


Loop Finished


100%|██████████| 1/1 [00:04<00:00,  4.54s/it]


Loop Finished


100%|██████████| 1/1 [00:05<00:00,  5.24s/it]

Loop Finished





In [8]:
!pip install nltk --quiet
import os
import json
from tqdm import tqdm 
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import os


os.environ["CUDA_VISIBLE_DEVICES"] = ""
generated_files = {}
smooth_function = SmoothingFunction().method4

def clean_bullet_points(bullets):
    if isinstance(bullets, str):
        return [b.strip() for b in bullets.split(" - ") if b.strip()]
    elif isinstance(bullets, list):
        return bullets
    return []

def flat_bullets(text):
    if isinstance(text, list):
        return " ".join(text)
    elif isinstance(text, str):
        return text
    return ""
    
ground_truth_file = "limitations_2024/limitations_only_gpt4_nano.json"
with open(ground_truth_file, "r", encoding="utf-8") as f:
    ground_truth = {
        i["paper"].replace(".pdf", ""): i["target_bullets"] 
        for i in json.load(f) 
        if "paper" in i and "target_bullets" in i}
    
generated_files = {
    "abstract": "test_outputs/generated_limitations_abstract.jsonl",
    "tokenized": "test_outputs/generated_limitations_tokenized.jsonl",
    "full": "test_outputs/generated_limitations_full.jsonl",
}

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [9]:
def bleu_score(ground, text):
    ref_bullets = clean_bullet_points(ground)
    hyp_bullets = clean_bullet_points(text)

    ref = [" ".join(ref_bullets).split()]
    hyp = "  ".join(hyp_bullets).split()
    if not ref or not hyp:
        return 0
    return sentence_bleu(ref, hyp, smoothing_function=smooth_function)

for model, path in generated_files.items():
    score = []
    with open(path, "r", encoding="utf-8") as f:
        for l in f:
            item = json.loads(l)
            paper = item["paper"]
            ground = ground_truth.get(paper)
            generated = item["generated"]
            bleu = bleu_score(ground, generated)
            score.append(bleu)
    mean = sum(score) / len(score) if score else 0
    print(f"Bleu score for {model}: {mean:4f} (from {len(score)} papers)")

Bleu score for abstract: 0.015465 (from 5 papers)
Bleu score for tokenized: 0.017546 (from 5 papers)
Bleu score for full: 0.017826 (from 5 papers)


In [10]:
!pip install rouge-score --quiet

from rouge_score import rouge_scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

for model, path in generated_files.items():
    r1 = []
    rl = []
    with open(path, "r", encoding="utf-8") as f:
        for l in f:
            item = json.loads(l)
            paper = item["paper"]
            ground = ground_truth.get(paper)
            generated = flat_bullets(item["generated"])
            
            score = scorer.score(ground, generated)
            r1.append(score["rouge1"].fmeasure)
            rl.append(score["rougeL"].fmeasure)
    mean_r1 = sum(r1) / len(r1) if r1 else 0
    mean_rl = sum(rl) / len(rl) if rl else 0
    print(f"Rouge score for {model}:")
    print(f"Rouge-1 F1: {mean_r1:.4f}")
    print(f"Rouge-L F1: {mean_rl:.4f}")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Rouge score for abstract:
Rouge-1 F1: 0.2909
Rouge-L F1: 0.1600
Rouge score for tokenized:
Rouge-1 F1: 0.2789
Rouge-L F1: 0.1581
Rouge score for full:
Rouge-1 F1: 0.3011
Rouge-L F1: 0.1800


In [12]:
!pip install bert-score
from bert_score import score
import json

for model, path in generated_files.items():
    refs = []
    hyps = []
    with open(path, "r", encoding="utf-8") as f:
        for l in f:
            item = json.loads(l)
            paper = item["paper"].replace(".pdf", "")
            ground = flat_bullets(ground_truth.get(paper))
            generated = flat_bullets(item["generated"])
            
            refs.append(ground)
            hyps.append(generated)
    P, R, F1 = score(hyps, refs, lang="en",  device="cpu", verbose=False)
    print(F1)
    mean_f1 = F1.mean().item()
    print(f"BERT score for {model.upper()}:")
    print(f" F1: {mean_f1:.4f}")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
Installing collected packages: bert-score
Successfully installed bert-score-0.3.13


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor([0.8660, 0.8717, 0.8609, 0.8737, 0.8512])
BERT score for ABSTRACT:
 F1: 0.8647


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor([0.8816, 0.8740, 0.8584, 0.8530, 0.8496])
BERT score for TOKENIZED:
 F1: 0.8633


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor([0.8712, 0.8643, 0.8645, 0.8741, 0.8546])
BERT score for FULL:
 F1: 0.8657
