In [8]:
from transformers import BertTokenizer, BertForMaskedLM
import torch

def read_input_file(file_path):
    with open(file_path, 'r') as file:
        return file.read().strip()

def generate_summary(prompt, model_name="microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract", max_length=512):
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertForMaskedLM.from_pretrained(model_name)
    
    prompt = "Summary: " + prompt
    
    input_ids = tokenizer.encode(prompt, return_tensors='pt', truncation=True, max_length=max_length)
    
    # Generate a response using the model
    with torch.no_grad():
        outputs = model(input_ids)
        predictions = outputs.logits

    # Just get the predicted tokens for now, it's hacky but might provide a compressed form of the content
    predicted_ids = torch.argmax(predictions, dim=2)
    summary = tokenizer.decode(predicted_ids[0], skip_special_tokens=True)

    return summary

if __name__ == "__main__":
    file_path = "/kaggle/input/full-texts/Abstract-2529.txt"
    input_text = read_input_file(file_path)
    generated_summary = generate_summary(input_text)
    print(generated_summary)


Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


##s summary : we report a rare case of immunoglobulin g4 ( igg4 ) - related sclerosing cholangitis without other organ involvement. a 69 - year - old - man was referred for the evaluation of jaundice. computed tomography revealed thickening of the bile duct wall, compressing the right portal vein. endoscopic retrograde cholangiopancreatography showed a lesion extending from the proximal confluence of the common bile duct to the left and right hepatic ducts. intraductal ultrasonography showed a bile duct mass invading the portal vein. hilar bile duct cancer was initially diagnosed and percutaneous transhepatic portal vein embolization was performed, preceding a planned right hepatectomy. strictures persisted despite steroid therapy. therefore, partial resection of the common bile duct following choledochojejunostomy was performed. histologic examination showed diffuse and severe lymphoplasmacytic infiltration, and abundant plasma cells, which stained positive for anti - igg4 antibody. t

In [9]:
!pip install rouge-score



In [10]:
from rouge_score import rouge_scorer

def compute_rouge(reference, candidate):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL', 'rougeLsum'], use_stemmer=True)
    scores = scorer.score(reference, candidate)
    return scores

if __name__ == '__main__':
    file_path = "/kaggle/input/full-texts/Abstract-2529.txt"
    input_abstract = read_input_file(file_path)
    reference =  input_abstract  
    candidate = generated_summary

    scores = compute_rouge(reference, candidate)
    for key, score in scores.items():
        print(f"{key.upper()}:")
        print(f"  Precision: {score.precision:.4f}")
        print(f"  Recall: {score.recall:.4f}")
        print(f"  F1 Score: {score.fmeasure:.4f}\n")

ROUGE1:
  Precision: 0.9880
  Recall: 1.0000
  F1 Score: 0.9940

ROUGE2:
  Precision: 0.9880
  Recall: 1.0000
  F1 Score: 0.9939

ROUGEL:
  Precision: 0.9880
  Recall: 1.0000
  F1 Score: 0.9940

ROUGELSUM:
  Precision: 0.9880
  Recall: 1.0000
  F1 Score: 0.9940



In [1]:
from transformers import BertTokenizer, BertForQuestionAnswering

MODEL_NAME_OR_PATH = "dmis-lab/biobert-base-cased-v1.1-squad"

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME_OR_PATH)
model = BertForQuestionAnswering.from_pretrained(MODEL_NAME_OR_PATH)

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/477 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

In [12]:
# 3. Define a function to perform question-answering using BioBERT.
def answer_question(question, context):
    inputs = tokenizer.encode_plus(question, context, return_tensors="pt")
    input_ids = inputs["input_ids"].tolist()[0]

    outputs = model(**inputs)
    answer_start_scores = outputs.start_logits
    answer_end_scores = outputs.end_logits

    answer_start = torch.argmax(answer_start_scores)  
    answer_end = torch.argmax(answer_end_scores) + 1 

    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))

    return answer

# 4. Use the function to ask questions.
context = generated_summary
question = "what is the age of the man"

print(answer_question(question, context))

69
