In [1]:
from google.colab import drive

In [2]:
drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
!pip install transformers



In [5]:
import pandas as pd

# Load the noteevents.csv data
data = pd.read_csv('/content/drive/MyDrive/student projects/medical summary/noteevents.csv')
data.head(10)

Unnamed: 0.1,Unnamed: 0,subject_id,chartdate,category,text
0,0,0,01/01/2086,Urology,"CHIEF COMPLAINT: , Blood in urine.,HISTORY OF ..."
1,1,0,01/01/2086,Emergency Room Reports,"CHIEF COMPLAINT: , Blood in urine.,HISTORY OF ..."
2,2,0,01/01/2086,General Medicine,"CHIEF COMPLAINT: , Blood in urine.,HISTORY OF ..."
3,3,0,01/01/2086,General Medicine,"CHIEF COMPLAINT:, Followup on hypertension an..."
4,4,0,01/01/2086,Consult - History and Phy.,"CHIEF COMPLAINT: , Blood in urine.,HISTORY OF ..."
5,5,0,01/01/2086,Consult - History and Phy.,"CHIEF COMPLAINT: , Blood in urine.,HISTORY OF ..."
6,6,1,01/01/2079,General Medicine,"HISTORY OF PRESENT ILLNESS:, The patient is a ..."
7,7,1,01/01/2079,Rheumatology,"HISTORY OF PRESENT ILLNESS: , A 71-year-old fe..."
8,8,1,01/01/2079,Consult - History and Phy.,"HISTORY OF PRESENT ILLNESS:, The patient is a ..."
9,9,2,01/01/2037,Consult - History and Phy.,"CHIEF COMPLAINT:,1. Infection.,2. Pelvic pai..."


In [6]:
# Extract the text from the first row
text = data.loc[0, 'text']

# Remove unwanted characters and replace commas with line breaks
text = text.replace('CHIEF COMPLAINT: ,', 'CHIEF COMPLAINT:')
text = text.replace(',HISTORY OF', 'HISTORY OF')
text = text.replace(',', '\n')

# Print the formatted report text
print(text)

CHIEF COMPLAINT: Blood in urine.HISTORY OF PRESENT ILLNESS:  
This is a 78-year-old male who has prostate cancer with metastatic disease to his bladder and in several locations throughout the skeletal system including the spine and shoulder.  The patient has had problems with hematuria in the past
 but the patient noted that this episode began yesterday
 and today he has been passing principally blood with very little urine.  The patient states that there is no change in his chronic lower back pain and denies any incontinence of urine or stool.  The patient has not had any fever.  There is no abdominal pain and the patient is still able to pass urine.  The patient has not had any melena or hematochezia.  There is no nausea or vomiting.  The patient has already completed chemotherapy and is beyond treatment for his cancer at this time.  The patient is receiving radiation therapy
 but it is targeted to the bones and intended to give symptomatic relief of his skeletal pain and not intende

In [7]:
# Split the text into sections based on section headers
sections = {}
current_section = None
for line in text.split("\n"):
    line = line.strip()
    if line.endswith(":"):
        current_section = line
        sections[current_section] = []
    elif current_section and line:
        sections[current_section].append(line)

# Save the grouped report sections to a .txt file
output_filename = 'clinical_report.txt'
with open(output_filename, 'w') as file:
    for section, sentences in sections.items():
        file.write(section + '\n')
        file.write('\n'.join(sentences) + '\n\n')

print("Clinical report saved as 'clinical_report.txt'")

Clinical report saved as 'clinical_report.txt'


In [8]:
from transformers import BertTokenizer, BertModel
import torch

# Load pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
model = BertModel.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

# Load clinical reports from a text file
clinical_reports = []
with open('clinical_report.txt', 'r') as file:
    clinical_reports = [line.strip() for line in file]

# Tokenize the clinical reports
tokenized_reports = [tokenizer.encode(report, add_special_tokens=True) for report in clinical_reports]

# Pad tokens to the same length
max_length = max(len(tokens) for tokens in tokenized_reports)
padded_reports = [tokens + [tokenizer.pad_token_id] * (max_length - len(tokens)) for tokens in tokenized_reports]

# Convert to PyTorch tensors
input_ids = torch.tensor(padded_reports)

# Create attention masks
attention_masks = torch.ones_like(input_ids)
attention_masks[input_ids == tokenizer.pad_token_id] = 0

# Create a dictionary for mapping tokenized reports to embeddings
report_embeddings = {}
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_masks)
    embeddings = outputs.last_hidden_state
    for i, tokens in enumerate(tokenized_reports):
        report_embeddings[clinical_reports[i]] = embeddings[i]




config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [9]:
# Calculate sentence importance scores (simplified, using sum of embeddings)
sentence_scores = [embedding.sum().item() for embedding in embeddings]

# Select top sentences based on scores
num_sentences_in_summary = 1
selected_indices = sorted(range(len(sentence_scores)), key=lambda i: sentence_scores[i], reverse=False)[:num_sentences_in_summary]
summary = [clinical_reports[i] for i in selected_indices]
summary_text="\n".join(summary)
# Print the summary
print(summary_text)

and Dr. X said he would be happy to care for the patient in the hospital and do urologic scopes if necessary and surgery if necessary and blood transfusion.  It was all a matter of what the patient wished to do given the advanced stage of his cancer.  Dr. X was willing to assist in any way the patient wished him to.  I spoke with the patient and his son about what he would like to do and what the options were from doing nothing from keeping him comfortable with pain medicines to admitting him to the hospital with the possibility of scopes and even surgery being done as well as the blood transfusion.  The patient decided to choose a middle ground in which he would be transfused with 2 units of blood here in the emergency room and go home tonight.  The patient's son felt comfortable with his father's choice.  This was done.  The patient was transfused 2 units of packed red blood cells after appropriately typed and match.  The patient did not have any adverse reaction at any point with hi

In [10]:
pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=74fce8b3315fb80a305afde079986b0ea0d523edf47ea766235e1f239b598343
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [11]:
from rouge_score import rouge_scorer

# Your generated summaries
generated_summaries = [
    ["and Dr. X said he would be happy to care for the patient in the hospital and do urologic scopes if necessary and surgery if necessary and blood transfusion.  It was all a matter of what the patient wished to do given the advanced stage of his cancer.  Dr. X was willing to assist in any way the patient wished him to.  I spoke with the patient and his son about what he would like to do and what the options were from doing nothing from keeping him comfortable with pain medicines to admitting him to the hospital with the possibility of scopes and even surgery being done as well as the blood transfusion.  The patient decided to choose a middle ground in which he would be transfused with 2 units of blood here in the emergency room and go home tonight.  The patient's son felt comfortable with his father's choice.  This was done.  The patient was transfused 2 units of packed red blood cells after appropriately typed and match.  The patient did not have any adverse reaction at any point with his transfusion.  There was no fever"],
] # List of generated summaries

# Reference summaries
reference_summaries = [
    ["Dr. X offered a range of care options to the patient, given the advanced stage of his cancer, including hospitalization, urologic procedures, surgery, and blood transfusion. After discussing these options with the patient and his son, they chose a middle-ground approach. The patient received 2 units of blood transfusion in the emergency room and went home without any adverse reactions or fever."],
]

# Initialize the ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'])

# Calculate ROUGE scores
total_rouge1 = 0
total_rouge2 = 0
total_rougeL = 0

for gen_summary, ref_summary in zip(generated_summaries, reference_summaries):
    scores = scorer.score(' '.join(gen_summary), ' '.join(ref_summary))
    total_rouge1 += scores['rouge1'].fmeasure
    total_rouge2 += scores['rouge2'].fmeasure
    total_rougeL += scores['rougeL'].fmeasure

average_rouge1 = total_rouge1 / len(generated_summaries)
average_rouge2 = total_rouge2 / len(generated_summaries)
average_rougeL = total_rougeL / len(generated_summaries)

print("Average ROUGE-1 F1:", average_rouge1)
print("Average ROUGE-2 F1:", average_rouge2)
print("Average ROUGE-L F1:", average_rougeL)


Average ROUGE-1 F1: 0.37154150197628455
Average ROUGE-2 F1: 0.2231075697211155
Average ROUGE-L F1: 0.27667984189723316


In [12]:
# Install the necessary library
!pip install bert-extractive-summarizer

Collecting bert-extractive-summarizer
  Downloading bert_extractive_summarizer-0.10.1-py3-none-any.whl.metadata (15 kB)
Downloading bert_extractive_summarizer-0.10.1-py3-none-any.whl (25 kB)
Installing collected packages: bert-extractive-summarizer
Successfully installed bert-extractive-summarizer-0.10.1


In [13]:
# Import libraries
#bert-extractive-summarizer
from summarizer import Summarizer

# Load the BERT model
model = Summarizer()


# Set the desired number of sentences in the summary
num_sentences = 3  # Adjust this number as needed

# Generate the summary with the specified number of sentences
summary = model(text, num_sentences=num_sentences)

# Print the summary
print(summary)


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

HISTORY OF PRESENT ILLNESS:  
This is a 78-year-old male who has prostate cancer with metastatic disease to his bladder and in several locations throughout the skeletal system including the spine and shoulder. NEUROLOGIC:  Motor and sensory are intact to the extremities. Over the course of the patient's several-hour stay in the emergency room
 the patient did end up developing enough problems with clotted blood in his bladder that he had a urinary obstruction.


  super()._check_params_vs_input(X, default_n_init=10)


In [14]:
from rouge_score import rouge_scorer

# Your generated summaries
generated_summaries = [
    ['''HISTORY OF PRESENT ILLNESS:
This is a 78-year-old male who has prostate cancer with metastatic disease to his bladder and in several locations throughout the skeletal system including the spine and shoulder. NEUROLOGIC:  Motor and sensory are intact to the extremities. Over the course of the patient's several-hour stay in the emergency room
 the patient did end up developing enough problems with clotted blood in his bladder that he had a urinary obstruction.''']
] # List of generated summaries

# Reference summaries
reference_summaries = [
    ["A 78-year-old male with prostate cancer and metastases to his bladder and skeletal system, including the spine and shoulder, presented. Neurologically, motor and sensory functions were intact. During his stay in the emergency room, he experienced urinary obstruction due to clotted blood in the bladder."],
]

# Initialize the ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'])

# Calculate ROUGE scores
total_rouge1 = 0
total_rouge2 = 0
total_rougeL = 0

for gen_summary, ref_summary in zip(generated_summaries, reference_summaries):
    scores = scorer.score(' '.join(gen_summary), ' '.join(ref_summary))
    total_rouge1 += scores['rouge1'].fmeasure
    total_rouge2 += scores['rouge2'].fmeasure
    total_rougeL += scores['rougeL'].fmeasure

average_rouge1 = total_rouge1 / len(generated_summaries)
average_rouge2 = total_rouge2 / len(generated_summaries)
average_rougeL = total_rougeL / len(generated_summaries)

print("Average ROUGE-1 F1:", average_rouge1)
print("Average ROUGE-2 F1:", average_rouge2)
print("Average ROUGE-L F1:", average_rougeL)

Average ROUGE-1 F1: 0.6129032258064516
Average ROUGE-2 F1: 0.3770491803278689
Average ROUGE-L F1: 0.5


In [15]:
pip install sentencepiece



In [16]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the pre-trained T5 model and tokenizer
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Tokenize and generate summary
input_text = "summarize: " + text
input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
summary_ids = model.generate(input_ids, max_length=150, num_beams=4, length_penalty=2.0, early_stopping=True)
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print("Generated Summary:")
print(summary)


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Generated Summary:
the patient has had problems with hematuria in the past but the patient noted that this episode began yesterday and today he has been passing principally blood with very little urine. the patient states that there is no change in his chronic lower back pain and denies any incontinence of urine or stool. the patient has already completed chemotherapy and is beyond treatment for his cancer at this time.


In [17]:
from rouge_score import rouge_scorer

# Your generated summaries
generated_summaries = [
    ["the patient has had problems with hematuria in the past but the patient noted that this episode began yesterday and today he has been passing principally blood with very little urine. the patient states that there is no change in his chronic lower back pain and denies any incontinence of urine or stool. the patient has already completed chemotherapy and is beyond treatment for his cancer at this time."]
] # List of generated summaries

# Reference summaries
reference_summaries = [
    ["the patient experienced problems with hematuria previously but the patient observed that this episode started  yesterday and today he has been passing principally blood with very little urine. the patient states that there is no change in his chronic lower back discomfort and denies any incontinence of urine or stool. the patient has already completed chemotherapy and is beyond treatment for his cancer at this time."]
]

# Initialize the ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'])

# Calculate ROUGE scores
total_rouge1 = 0
total_rouge2 = 0
total_rougeL = 0

for gen_summary, ref_summary in zip(generated_summaries, reference_summaries):
    scores = scorer.score(' '.join(gen_summary), ' '.join(ref_summary))
    total_rouge1 += scores['rouge1'].fmeasure
    total_rouge2 += scores['rouge2'].fmeasure
    total_rougeL += scores['rougeL'].fmeasure

average_rouge1 = total_rouge1 / len(generated_summaries)
average_rouge2 = total_rouge2 / len(generated_summaries)
average_rougeL = total_rougeL / len(generated_summaries)

print("Average ROUGE-1 F1:", average_rouge1)
print("Average ROUGE-2 F1:", average_rouge2)
print("Average ROUGE-L F1:", average_rougeL)

Average ROUGE-1 F1: 0.9037037037037037
Average ROUGE-2 F1: 0.8270676691729324
Average ROUGE-L F1: 0.9037037037037037
