In [1]:
from transformers import BertTokenizer, BertForMaskedLM
import torch
from datasets import load_dataset

def read_input_file(file_path):
    with open(file_path, 'r') as file:
        return file.read().strip()

def generate_summary(prompt, model_name="microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract", max_length=512):
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertForMaskedLM.from_pretrained(model_name)
    
    prompt = "Summary: " + prompt
    
    input_ids = tokenizer.encode(prompt, return_tensors='pt', truncation=True, max_length=max_length)
    
    # Generate a response using the model
    with torch.no_grad():
        outputs = model(input_ids)
        predictions = outputs.logits

    # Just get the predicted tokens for now, it's hacky but might provide a compressed form of the content
    predicted_ids = torch.argmax(predictions, dim=2)
    summary = tokenizer.decode(predicted_ids[0], skip_special_tokens=True)

    return summary

if __name__ == "__main__":
    file_path = "/kaggle/input/full-texts/Abstract-2529.txt"
    input_text = read_input_file(file_path)
    generated_summary = generate_summary(input_text)
    print(generated_summary)


caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/225k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


##s summary : we report a rare case of immunoglobulin g4 ( igg4 ) - related sclerosing cholangitis without other organ involvement. a 69 - year - old - man was referred for the evaluation of jaundice. computed tomography revealed thickening of the bile duct wall, compressing the right portal vein. endoscopic retrograde cholangiopancreatography showed a lesion extending from the proximal confluence of the common bile duct to the left and right hepatic ducts. intraductal ultrasonography showed a bile duct mass invading the portal vein. hilar bile duct cancer was initially diagnosed and percutaneous transhepatic portal vein embolization was performed, preceding a planned right hepatectomy. strictures persisted despite steroid therapy. therefore, partial resection of the common bile duct following choledochojejunostomy was performed. histologic examination showed diffuse and severe lymphoplasmacytic infiltration, and abundant plasma cells, which stained positive for anti - igg4 antibody. t

In [2]:
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24954 sha256=a8177857c55417188b02a4628239cee90c7f897d7f972904b6ce6e3e9c25aeaf
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [3]:
from rouge_score import rouge_scorer

def compute_rouge(reference, candidate):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL', 'rougeLsum'], use_stemmer=True)
    scores = scorer.score(reference, candidate)
    return scores

if __name__ == '__main__':
    file_path = "/kaggle/input/full-texts/Abstract-2529.txt"
    input_abstract = read_input_file(file_path)
    reference =  input_abstract  
    candidate = generated_summary

    scores = compute_rouge(reference, candidate)
    for key, score in scores.items():
        print(f"{key.upper()}:")
        print(f"  Precision: {score.precision:.4f}")
        print(f"  Recall: {score.recall:.4f}")
        print(f"  F1 Score: {score.fmeasure:.4f}\n")

ROUGE1:
  Precision: 0.9880
  Recall: 1.0000
  F1 Score: 0.9940

ROUGE2:
  Precision: 0.9880
  Recall: 1.0000
  F1 Score: 0.9939

ROUGEL:
  Precision: 0.9880
  Recall: 1.0000
  F1 Score: 0.9940

ROUGELSUM:
  Precision: 0.9880
  Recall: 1.0000
  F1 Score: 0.9940



In [4]:
from transformers import BertTokenizer, BertForQuestionAnswering

MODEL_NAME_OR_PATH = "dmis-lab/biobert-base-cased-v1.1-squad"

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME_OR_PATH)
model = BertForQuestionAnswering.from_pretrained(MODEL_NAME_OR_PATH)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/477 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

In [5]:
# 3. Define a function to perform question-answering using BioBERT.
def answer_question(question, context):
    inputs = tokenizer.encode_plus(question, context, return_tensors="pt")
    input_ids = inputs["input_ids"].tolist()[0]

    outputs = model(**inputs)
    answer_start_scores = outputs.start_logits
    answer_end_scores = outputs.end_logits

    answer_start = torch.argmax(answer_start_scores)  
    answer_end = torch.argmax(answer_end_scores) + 1 

    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))

    return answer

# 4. Use the function to ask questions.
context = generated_summary
question = "give diagnosis"

answer= answer_question(question, context)

In [6]:
print("Question: "+question)
print("Context: "+ context)
print("Answer: "+ answer )

Question: give diagnosis
Context: ##s summary : we report a rare case of immunoglobulin g4 ( igg4 ) - related sclerosing cholangitis without other organ involvement. a 69 - year - old - man was referred for the evaluation of jaundice. computed tomography revealed thickening of the bile duct wall, compressing the right portal vein. endoscopic retrograde cholangiopancreatography showed a lesion extending from the proximal confluence of the common bile duct to the left and right hepatic ducts. intraductal ultrasonography showed a bile duct mass invading the portal vein. hilar bile duct cancer was initially diagnosed and percutaneous transhepatic portal vein embolization was performed, preceding a planned right hepatectomy. strictures persisted despite steroid therapy. therefore, partial resection of the common bile duct following choledochojejunostomy was performed. histologic examination showed diffuse and severe lymphoplasmacytic infiltration, and abundant plasma cells, which stained po

In [7]:
dataset = load_dataset('pubmed_qa', 'pqa_labeled')

Downloading builder script:   0%|          | 0.00/3.00k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.25k [00:00<?, ?B/s]

Downloading and preparing dataset pubmed_qa/pqa_labeled (download: 656.02 MiB, generated: 1.99 MiB, post-processed: Unknown size, total: 658.01 MiB) to /root/.cache/huggingface/datasets/pubmed_qa/pqa_labeled/1.0.0/2e65addecca4197502cd10ab8ef1919a47c28672f62d7abac7cc9afdcf24fb2d...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/709k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/152M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/533M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset pubmed_qa downloaded and prepared to /root/.cache/huggingface/datasets/pubmed_qa/pqa_labeled/1.0.0/2e65addecca4197502cd10ab8ef1919a47c28672f62d7abac7cc9afdcf24fb2d. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [13]:
total_f1, total_exact = 0, 0

for sample in dataset['train']:
    question = sample['question']
    context = sample['context']['contexts']
    correct_answer = sample['long_answer']

    # Encode and get model's prediction
    inputs = tokenizer.encode_plus(question, context, return_tensors="pt", max_length=512, truncation=True)
    input_ids = inputs["input_ids"].tolist()[0]
    
    with torch.no_grad():
        output = model(**inputs)
    
    # Extract answer
    answer_start = torch.argmax(output.start_logits)
    answer_end = torch.argmax(output.end_logits) + 1
    pred_answer = tokenizer.decode(input_ids[answer_start:answer_end])

    # Compute F1 and Exact match
    common = set(pred_answer.lower().split()) & set(correct_answer.lower().split())
    f1 = 2 * len(common) / (len(pred_answer.split()) + len(correct_answer.split()))
    total_f1 += f1

    if pred_answer.lower() == correct_answer.lower():
        total_exact += 1

# Average F1 and Exact match
avg_f1 = total_f1 / len(dataset['train'])
avg_exact = total_exact / len(dataset['train'])

print(f"Average F1 Score: {avg_f1}")
print(f"Exact Match: {avg_exact}")

Average F1 Score: 0.08838785016297217
Exact Match: 0.0
