In [28]:
import os
import re

import numpy as np
import pandas as pd
import spacy
import textstat
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

In [38]:
# Load the preprocessed radiology notes. The file name ends in ".xls", but it is
# parsed with read_csv, i.e. treated as delimited text rather than an Excel workbook.
df = pd.read_csv("Preprocessed_radiology_data.xls")
# Preview the first rows and list the available column names.
print(df.head())
print(df.columns)

          note_id  subject_id     hadm_id note_type  note_seq  \
0  10000032-RR-14    10000032  22595853.0        RR        14   
1  10000032-RR-15    10000032  22595853.0        RR        15   
2  10000032-RR-16    10000032  22595853.0        RR        16   
3  10000032-RR-18    10000032         NaN        RR        18   
4  10000032-RR-20    10000032         NaN        RR        20   

             charttime            storetime   Examination  \
0  2180-05-06 21:19:00  2180-05-06 23:32:00         CHEST   
1  2180-05-06 23:00:00  2180-05-06 23:26:00         LIVER   
2  2180-05-07 09:55:00  2180-05-07 11:15:00           NaN   
3  2180-06-03 12:46:00  2180-06-03 14:01:00    Ultrasound   
4  2180-07-08 13:18:00  2180-07-08 14:15:00  Paracentesis   

                                          Indication  \
0      with new onset ascites  // eval for infection   
1          year-old female with cirrhosis, jaundice.   
2   HCV cirrhosis c/b ascites, hiv on ART, h/o IV...   
3   year old woman

In [39]:
# Show the combined report text next to its tokenized form for row 0,
# the note that every summarizer below is demonstrated on.
print(df.combined_text[0])
print(df.tokenized_text[0])

CHEST  with new onset ascites  // eval for infection Chest PA and lateral None. There is no focal consolidation, pleural effusion or pneumothorax.  Bilateral No acute cardiopulmonary process.
['chest', 'new', 'onset', 'ascites', 'eval', 'infection', 'chest', 'pa', 'lateral', 'focal', 'consolidation', 'pleural', 'effusion', 'pneumothorax', 'bilateral', 'acute', 'cardiopulmonary', 'process']


## TF-IDF Summary

In [15]:
def extractive_tfidf_summary(text, num_sentences=3):
    """Build an extractive summary by ranking sentences on TF-IDF similarity.

    Every sentence is vectorized with TF-IDF (English stop words removed) and
    scored by the sum of its cosine similarities to all sentences of the text,
    so sentences that share vocabulary with the rest of the note rank highest.

    Parameters
    ----------
    text : str
        Free text to summarize. Non-string values (e.g. ``None`` or a NaN cell
        coming from pandas) are treated as empty input.
    num_sentences : int, default 3
        Maximum number of sentences to keep.

    Returns
    -------
    str
        The selected sentences, highest score first, each with its original
        end punctuation, joined by single spaces. An empty string is returned
        when there is nothing to summarize or ``num_sentences <= 0``.
    """
    # Guard first: with num_sentences == 0 a slice like [-0:] would select
    # *every* sentence instead of none.
    if not isinstance(text, str) or num_sentences <= 0:
        return ""

    # Split after sentence-ending punctuation that is followed by whitespace,
    # so decimals such as "3.5 cm" stay intact (a plain split('.') cuts them).
    fragments = re.split(r"(?<=[.!?])\s+", text.strip())
    # Drop fragments without any letter or digit (empty strings, stray dots).
    sentences = [frag.strip() for frag in fragments if any(ch.isalnum() for ch in frag)]
    if not sentences:
        return ""

    try:
        tfidf_matrix = TfidfVectorizer(stop_words='english').fit_transform(sentences)
    except ValueError:
        # scikit-learn raises ValueError when the vocabulary is empty (e.g. the
        # text consists only of stop words). There is nothing to rank on, so
        # fall back to the leading sentences.
        return " ".join(sentences[:num_sentences])

    # Row sums of the pairwise similarity matrix = one centrality score per sentence.
    sentence_scores = cosine_similarity(tfidf_matrix).sum(axis=1)

    # Stable sort on the negated scores: best first, ties keep document order.
    top_sentence_indices = (-sentence_scores).argsort(kind="stable")[:num_sentences]

    return " ".join(sentences[i] for i in top_sentence_indices)

# Example: summarize the combined text of row 0 with the TF-IDF ranker.
# `example_text` is reused by the T5 example further down in the notebook.
example_text = df['combined_text'][0]
summary_tfidf = extractive_tfidf_summary(example_text)
print(summary_tfidf)


 There is no focal consolidation, pleural effusion or pneumothorax. CHEST  with new onset ascites  // eval for infection Chest PA and lateral None.   Bilateral No acute cardiopulmonary process


In [59]:
# Compute readability scores for the summary
fk_score = textstat.flesch_kincaid_grade(summary_tfidf)
smog_score = textstat.smog_index(summary_tfidf)

print("Flesch-Kincaid Grade Level:", fk_score)
print("SMOG Index:", smog_score)


Flesch-Kincaid Grade Level: 11.5
SMOG Index: 11.9


## TextRank

In [53]:
# Imported so that the "textrank" pipe name used below is available to spaCy;
# the module itself is not referenced directly in this cell.
import pytextrank

# Small English spaCy pipeline with the PyTextRank component appended.
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("textrank")

# Run the pipeline over the combined text of row 0.
text = df['combined_text'][0]
doc = nlp(text)

# Text of the top-ranked sentences: at most 5 sentences, scored from at most 15 phrases.
top_sentences = [
    sent.text
    for sent in doc._.textrank.summary(limit_phrases=15, limit_sentences=5)
]

# One selected sentence per line.
summary_textrank = "\n".join(top_sentences)
print("Extracted Summary:\n", summary_textrank)


Extracted Summary:
 CHEST  with new onset ascites  // eval for infection Chest PA and lateral None.
There is no focal consolidation, pleural effusion or pneumothorax.  
Bilateral No acute cardiopulmonary process.


In [54]:
# Compute readability scores for the summary
fk_score = textstat.flesch_kincaid_grade(summary_textrank)
smog_score = textstat.smog_index(summary_textrank)

print("Flesch-Kincaid Grade Level:", fk_score)
print("SMOG Index:", smog_score)


Flesch-Kincaid Grade Level: 11.5
SMOG Index: 11.9


## T5

In [16]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the pre-trained T5 model and tokenizer
model = T5ForConditionalGeneration.from_pretrained('t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small')

def abstractive_t5_summary(text, model=model, tokenizer=tokenizer):
    """Generate an abstractive summary of ``text`` with a T5 checkpoint.

    The ``model`` and ``tokenizer`` defaults are bound when this cell runs,
    i.e. to the 't5-small' objects loaded just above. Later cells in this
    notebook rebind the global names ``model`` and ``tokenizer`` to other
    checkpoints; binding them here keeps this function tied to t5-small no
    matter which cells were executed afterwards.

    Parameters
    ----------
    text : str
        Text to summarize. It is prefixed with "summarize: " and truncated to
        512 tokens before generation.
    model, tokenizer : optional
        Seq2seq model and matching tokenizer to use instead of the defaults.

    Returns
    -------
    str
        The decoded summary with special tokens removed.
    """
    # Prepend the task prefix and encode to a PyTorch tensor of token ids.
    input_text = f"summarize: {text}"
    input_ids = tokenizer.encode(input_text, return_tensors='pt', max_length=512, truncation=True)

    # Beam search with 4 beams; output capped at 150 tokens.
    summary_ids = model.generate(input_ids, max_length=150, num_beams=4, early_stopping=True)

    # Decode the first returned sequence back to plain text.
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Example: summarize `example_text` (the row-0 note also used for the TF-IDF example).
summary_t5 = abstractive_t5_summary(example_text)
print(summary_t5)


there is no focal consolidation, pleural effusion or pneumothorax.


In [48]:
# Compute readability scores for the summary
fk_score = textstat.flesch_kincaid_grade(summary_t5)
smog_score = textstat.smog_index(summary_t5)

print("Flesch-Kincaid Grade Level:", fk_score)
print("SMOG Index:", smog_score)

Flesch-Kincaid Grade Level: 13.9
SMOG Index: 0.0


## Clinical T5

In [None]:
# Location of the local Clinical-T5-Scratch checkpoint. Set the CLINICAL_T5_PATH
# environment variable to point at your own copy; the path below is only the
# fallback used when the variable is not set.
model_path = os.environ.get(
    "CLINICAL_T5_PATH",
    "C:/Users/naomi/physionet.org/files/clinical-t5/1.0.0/Clinical-T5-Scratch",
)

# Load the tokenizer and model. This rebinds the notebook-wide `tokenizer` and
# `model` names, which the next inference cell reads.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

print("Clinical-T5 Scratch model loaded successfully!")

Clinical-T5 Scratch model loaded successfully!


#### T5 from scratch

In [56]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Summarize the combined text of row 0 with whatever `tokenizer` / `model` are
# currently bound (the Clinical-T5-Scratch loading cell assigns them when the
# notebook is run top to bottom).
text = df['combined_text'][0]

# Tokenize to PyTorch tensors, truncating to at most 512 tokens.
# NOTE(review): no "summarize: " prefix is prepended here, unlike the other
# T5 cells in this notebook - confirm this is intended for this checkpoint.
inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True, padding=True)

# Beam search with 4 beams; output capped at 100 tokens.
summary_ids = model.generate(inputs["input_ids"], max_length=100, num_beams=4, early_stopping=True)

# Decode the first returned sequence, dropping special tokens.
summary_t5scratch = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print(f"Summary: {summary_t5scratch}")


Summary: eval for infection Chest PA and lateral None. There is no focal consolidation, pleural effusion or pneumothorax. Bilateral No acute cardiopulmonary process.


In [57]:
# Compute readability scores for the summary
fk_score = textstat.flesch_kincaid_grade(summary_t5scratch)
smog_score = textstat.smog_index(summary_t5scratch)

print("Flesch-Kincaid Grade Level:", fk_score)
print("SMOG Index:", smog_score)


Flesch-Kincaid Grade Level: 13.2
SMOG Index: 11.9


#### Small fine-tuned T5

In [60]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the fine-tuned small T5 checkpoint and its tokenizer.
model = T5ForConditionalGeneration.from_pretrained("dlyog/t5-small-finetuned")
tokenizer = T5Tokenizer.from_pretrained("dlyog/t5-small-finetuned")

def summarize(text, model=model, tokenizer=tokenizer):
    """Summarize ``text`` with the fine-tuned small T5 checkpoint.

    The ``model`` and ``tokenizer`` defaults are bound when this cell runs
    (to the "dlyog/t5-small-finetuned" objects loaded just above), so the
    function keeps using this checkpoint even if the global names are rebound
    elsewhere in the notebook.

    Encoding, generation and decoding settings match ``abstractive_t5_summary``
    in this notebook: the input is truncated to 512 tokens, generation uses
    4-beam search capped at 150 tokens, and special tokens are stripped from
    the decoded text so they do not leak into the summary or into readability
    scores computed on it.

    Parameters
    ----------
    text : str
        Text to summarize; it is prefixed with "summarize: ".
    model, tokenizer : optional
        Seq2seq model and matching tokenizer to use instead of the defaults.

    Returns
    -------
    str
        The decoded summary without special tokens.
    """
    input_text = "summarize: " + text
    # Truncate so that long notes cannot exceed the 512-token input limit.
    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
    # Explicit generation settings instead of the library's short default length.
    summary_ids = model.generate(input_ids, max_length=150, num_beams=4, early_stopping=True)
    # skip_special_tokens drops padding / end-of-sequence markers from the text.
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Example usage: summarize the combined text of row 0. The result is only
# stored in `summary`; this cell does not print it.
summary = summarize(df['combined_text'][0])

In [61]:
# Compute readability scores for the summary
fk_score = textstat.flesch_kincaid_grade(summary_t5scratch)
smog_score = textstat.smog_index(summary_t5scratch)

print("Flesch-Kincaid Grade Level:", fk_score)
print("SMOG Index:", smog_score)


Flesch-Kincaid Grade Level: 13.2
SMOG Index: 11.9
