In [None]:
!pip install transformers
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


# Import libraries and dataframe

In [None]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
from google.colab import drive
drive.mount('/content/drive')
import re
import matplotlib.pyplot as plt

Mounted at /content/drive


In [None]:
import os
# Make the project folder on the mounted Drive the working directory, so the
# relative paths used below ("mtsamples.csv", "data_*.csv") resolve inside it.
# NOTE(review): absolute Colab/Drive path — adjust when running elsewhere.
os.chdir('/content/drive/MyDrive/TMS_project/')

In [None]:
data = pd.read_csv("mtsamples.csv", encoding= 'utf-8')

In [None]:
# Keep only the two columns this task needs: the short description and the full transcription
columns_of_interest = ['description', 'transcription']
data = data[columns_of_interest]

In [None]:
# Give the columns task-oriented names: the description is the reference summary,
# the transcription is the text to be summarized
column_names = {'description': 'summary', 'transcription': 'text'}
data = data.rename(columns=column_names)

In [None]:
data.head()

Unnamed: 0,summary,text
0,A 23-year-old white female presents with comp...,"SUBJECTIVE:, This 23-year-old white female pr..."
1,Consult for laparoscopic gastric bypass.,"PAST MEDICAL HISTORY:, He has difficulty climb..."
2,Consult for laparoscopic gastric bypass.,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ..."
3,2-D M-Mode. Doppler.,"2-D M-MODE: , ,1. Left atrial enlargement wit..."
4,2-D Echocardiogram,1. The left ventricular cavity size and wall ...


# Exploratory Analysis

In [None]:
# Report the number of missing values per column, then drop incomplete rows.
print(data.isna().sum())
# .copy() detaches the result from the frame it was sliced from, so that adding
# columns in later cells does not raise SettingWithCopyWarning.
data = data.dropna().copy()
print(data.shape)

summary     0
text       33
dtype: int64
(4966, 2)


In [None]:
# Find the distribution of the number of words in the text column.
# Words are counted by splitting on single spaces. assign() returns a new frame,
# so no column is written into a slice (the cause of SettingWithCopyWarning).
data = data.assign(count_words=data["text"].str.split(" ").str.len())
# A bare expression in the middle of a cell is discarded; display() shows the preview.
display(data[["text", "count_words"]].head())

data["count_words"].describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["count_words"] = data["text"].apply(lambda n: len(str(n).split(" ")))


count    4966.000000
mean      495.906162
std       333.829275
min         1.000000
25%       258.000000
50%       425.000000
75%       656.000000
max      3032.000000
Name: count_words, dtype: float64

In [None]:
# Find the distribution of the number of words in the summary column.
# Words are counted by splitting on single spaces. assign() returns a new frame,
# so no column is written into a slice (the cause of SettingWithCopyWarning).
data = data.assign(count_words_s=data["summary"].str.split(" ").str.len())
# A bare expression in the middle of a cell is discarded; display() shows the preview.
display(data[["summary", "count_words_s"]].head())

data["count_words_s"].describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["count_words_s"] = data["summary"].apply(lambda n: len(str(n).split(" ")))


count    4966.000000
mean       20.482682
std        12.719992
min         2.000000
25%        10.000000
50%        17.000000
75%        28.000000
max        80.000000
Name: count_words_s, dtype: float64

As we can see, a summary has an average of about 20 words, so we choose to delete those texts that have 20 words or fewer.

In [None]:
# Keep only texts with more than 20 words (texts with 20 words or fewer are removed).
# .copy() detaches the filtered frame so that the column assignments in the
# normalization cells below do not raise SettingWithCopyWarning.
data = data[data["text"].str.split(' ').str.len() > 20].copy()

In [None]:
data.shape

(4906, 4)

# Text pre-processing

In [None]:
data

Unnamed: 0,summary,text,count_words,count_words_s
0,A 23-year-old white female presents with comp...,"SUBJECTIVE:, This 23-year-old white female pr...",226,10
1,Consult for laparoscopic gastric bypass.,"PAST MEDICAL HISTORY:, He has difficulty climb...",375,6
2,Consult for laparoscopic gastric bypass.,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...",774,6
3,2-D M-Mode. Doppler.,"2-D M-MODE: , ,1. Left atrial enlargement wit...",77,6
4,2-D Echocardiogram,1. The left ventricular cavity size and wall ...,246,3
...,...,...,...,...
4994,Patient having severe sinusitis about two to ...,"HISTORY:, I had the pleasure of meeting and e...",840,22
4995,This is a 14-month-old baby boy Caucasian who...,"ADMITTING DIAGNOSIS: , Kawasaki disease.,DISCH...",282,42
4996,A female for a complete physical and follow u...,"SUBJECTIVE: , This is a 42-year-old white fema...",787,15
4997,Mother states he has been wheezing and coughing.,"CHIEF COMPLAINT: , This 5-year-old male presen...",426,9


In [None]:
import regex as re
import string
# Helper that strips punctuation characters from a string
def remove_punctuation(text):
    """Return ``text`` with every character found in ``string.punctuation`` removed.

    Characters are dropped, not replaced by a space, so words separated only by
    punctuation are joined together (e.g. "2-D" becomes "2D").
    """
    punctuation_marks = string.punctuation
    kept_characters = (character for character in text if character not in punctuation_marks)
    return "".join(kept_characters)

def character_repeatation(text):
    """Collapse runs of repeated characters in ``text``.

    Two substitutions are applied in order:
      1. A run of two or more identical ASCII letters (case-sensitive) is
         reduced to exactly two of that letter ("soooo" -> "soo").
      2. A run of two or more identical punctuation marks from the listed set
         is reduced to a single mark ("!!!" -> "!").

    Returns the resulting string.
    """
    # ([A-Za-z]) captures one letter; \1{1,} requires at least one more copy of it.
    letters_collapsed = re.sub(r"([A-Za-z])\1{1,}", r"\1\1", text, flags=re.DOTALL)
    # Same idea for punctuation, but the whole run is replaced by one occurrence.
    return re.sub(r'([,/#!$%^&*?;:{}=_`~()+-])\1{1,}', r'\1', letters_collapsed)

In [None]:
# Text normalization: collapse repeated characters, then lowercase.
# Punctuation removal for the text column is left disabled in this cell.
#data['text']= data['text'].apply(lambda x: remove_punctuation(x))
data['text'] = data['text'].map(character_repeatation).str.lower()

# Summary normalization: strip punctuation, collapse repeated characters, then lowercase.
data['summary'] = (
    data['summary']
    .map(remove_punctuation)
    .map(character_repeatation)
    .str.lower()
)


In [None]:
data.head()

Unnamed: 0,summary,text,count_words,count_words_s
0,a 23yearold white female presents with compla...,"subjective:, this 23-year-old white female pr...",226,10
1,consult for laparoscopic gastric bypass,"past medical history:, he has difficulty climb...",375,6
2,consult for laparoscopic gastric bypass,"history of present illness: , i have seen abc ...",774,6
3,2d mmode doppler,"2-d m-mode: , ,1. left atrial enlargement wit...",77,6
4,2d echocardiogram,1. the left ventricular cavity size and wall ...,246,3


# Summarization model (extractive)

In [None]:
pip install sumy

Collecting sumy
  Downloading sumy-0.11.0-py2.py3-none-any.whl (97 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/97.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━[0m [32m61.4/97.3 kB[0m [31m1.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.3/97.3 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docopt<0.7,>=0.6.1 (from sumy)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting breadability>=0.1.20 (from sumy)
  Downloading breadability-0.1.20.tar.gz (32 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pycountry>=18.2.23 (from sumy)
  Downloading pycountry-23.12.11-py3-none-any.whl (6.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.2/6.2 MB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: breadabi

In [None]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer

In [None]:
import nltk

In [None]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## LEXRANK

In [None]:
# LexRank summarization applied to each row of the DataFrame.
# The tokenizer and summarizer are created once instead of once per row.
# NOTE(review): assumes sumy tokenizers/summarizers keep no per-document state — confirm.
_lexrank_tokenizer = Tokenizer("english")
_lexrank_summarizer = LexRankSummarizer()


def apply_lexrank(row):
    """Return a one-sentence LexRank summary of ``row['text']``.

    Args:
        row: a DataFrame row (or any mapping) with a 'text' entry holding the document.

    Returns:
        The selected sentence(s) joined by a single space, as a string.
    """
    parser = PlaintextParser.from_string(row['text'], _lexrank_tokenizer)
    # sentences_count controls the summary length; adjust as needed.
    selected_sentences = _lexrank_summarizer(parser.document, sentences_count=1)
    return ' '.join(str(sentence) for sentence in selected_sentences)


# Apply LexRank summarization and store the results in a new column
data['lexRank_summary'] = data.apply(apply_lexrank, axis=1)

# Preview the new column next to its source text instead of printing the whole frame
data[['text', 'lexRank_summary']].head()

                                                summary  \
0      a 23yearold white female presents with compla...   
1               consult for laparoscopic gastric bypass   
2               consult for laparoscopic gastric bypass   
3                                    2d mmode doppler     
4                                     2d echocardiogram   
...                                                 ...   
4994   patient having severe sinusitis about two to ...   
4995   this is a 14monthold baby boy caucasian who c...   
4996   a female for a complete physical and follow u...   
4997    mother states he has been wheezing and coughing   
4998   acute allergic reaction etiology uncertain ho...   

                                                   text  count_words  \
0     subjective:,  this 23-year-old white female pr...          226   
1     past medical history:, he has difficulty climb...          375   
2     history of present illness: , i have seen abc ...          774   
3  

## TEXTRANK

In [None]:
# TextRank summarization applied to each row of the DataFrame.
# The tokenizer and summarizer are created once instead of once per row.
# NOTE(review): assumes sumy tokenizers/summarizers keep no per-document state — confirm.
_textrank_tokenizer = Tokenizer("english")
_textrank_summarizer = TextRankSummarizer()


def apply_textrank(row):
    """Return a one-sentence TextRank summary of ``row['text']``.

    Args:
        row: a DataFrame row (or any mapping) with a 'text' entry holding the document.

    Returns:
        The selected sentence(s) joined by a single space, as a string.
    """
    parser = PlaintextParser.from_string(row['text'], _textrank_tokenizer)
    # sentences_count controls the summary length; adjust as needed.
    selected_sentences = _textrank_summarizer(parser.document, sentences_count=1)
    return ' '.join(str(sentence) for sentence in selected_sentences)


# Apply TextRank summarization and store the results in a new column
data['textRank_summary'] = data.apply(apply_textrank, axis=1)

# Preview the new column next to its source text instead of printing the whole frame
data[['text', 'textRank_summary']].head()

                                                summary  \
0      a 23yearold white female presents with compla...   
1               consult for laparoscopic gastric bypass   
2               consult for laparoscopic gastric bypass   
3                                    2d mmode doppler     
4                                     2d echocardiogram   
...                                                 ...   
4994   patient having severe sinusitis about two to ...   
4995   this is a 14monthold baby boy caucasian who c...   
4996   a female for a complete physical and follow u...   
4997    mother states he has been wheezing and coughing   
4998   acute allergic reaction etiology uncertain ho...   

                                                   text  count_words  \
0     subjective:,  this 23-year-old white female pr...          226   
1     past medical history:, he has difficulty climb...          375   
2     history of present illness: , i have seen abc ...          774   
3  

## LUHN

In [None]:
# Luhn summarization applied to each row of the DataFrame.
# The tokenizer and summarizer are created once instead of once per row.
# NOTE(review): assumes sumy tokenizers/summarizers keep no per-document state — confirm.
_luhn_tokenizer = Tokenizer("english")
_luhn_summarizer = LuhnSummarizer()


def luhn(row):
    """Return a one-sentence Luhn summary of ``row['text']``.

    Args:
        row: a DataFrame row (or any mapping) with a 'text' entry holding the document.

    Returns:
        The selected sentence(s) joined by a single space, as a string.
    """
    parser = PlaintextParser.from_string(row['text'], _luhn_tokenizer)
    # sentences_count controls the summary length; adjust as needed.
    selected_sentences = _luhn_summarizer(parser.document, sentences_count=1)
    return ' '.join(str(sentence) for sentence in selected_sentences)


# Apply Luhn summarization and store the results in a new column
data['luhn_summary'] = data.apply(luhn, axis=1)

# Preview the new column next to its source text instead of printing the whole frame
data[['text', 'luhn_summary']].head()

                                                summary  \
0      a 23yearold white female presents with compla...   
1               consult for laparoscopic gastric bypass   
2               consult for laparoscopic gastric bypass   
3                                    2d mmode doppler     
4                                     2d echocardiogram   
...                                                 ...   
4994   patient having severe sinusitis about two to ...   
4995   this is a 14monthold baby boy caucasian who c...   
4996   a female for a complete physical and follow u...   
4997    mother states he has been wheezing and coughing   
4998   acute allergic reaction etiology uncertain ho...   

                                                   text  count_words  \
0     subjective:,  this 23-year-old white female pr...          226   
1     past medical history:, he has difficulty climb...          375   
2     history of present illness: , i have seen abc ...          774   
3  

# Summarization model (abstractive)

## GPT

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

In [None]:

# Load the pre-trained 'gpt2' tokenizer and language-model head from the Hugging Face Hub.
# Note: the GPT summarization cell further down loads both objects again and moves the model to `device`.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
# import torch
# from transformers import GPT2Tokenizer, GPT2LMHeadModel
# from tqdm import tqdm

# # Check if GPU is available and set device accordingly
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # Load tokenizer and pre-trained GPT-2 model onto the chosen device
# tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)

# def generate_summary(text):
#     input_text = "summarize: " + text
#       # Limit the maximum text length
#     max_text_length = 512
#     if len(text) > max_text_length:
#         text = text[:max_text_length]
#     # Tokenize the text
#     input_ids = tokenizer.encode(input_text, return_tensors='pt', max_length=max_text_length, truncation=True).to(device)

#     # Ensure the attention_mask is correctly configured
#     attention_mask = torch.ones_like(input_ids).to(device)

#     # Generate a summary using the GPT model with approximate token count
#     summary_ids = model.generate(input_ids, max_length=576, num_beams=2, length_penalty=2.0, early_stopping=True, attention_mask =attention_mask )

#     # Decode the summary
#     summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

#     return summary



# # Generate summaries for each row in the 'text' column
# tqdm.pandas(desc="Generazione riassunti GPT")
# data['summary_gpt'] = data['text'].progress_apply(generate_summary)




In [None]:
# Strip punctuation from the text column before it is fed to the abstractive models
data['text'] = data['text'].map(remove_punctuation)

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import pandas as pd
from tqdm import tqdm
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load tokenizer and pre-trained GPT-2 model; eval() switches off dropout for inference
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)
model.eval()

MAX_INPUT_CHARS = 512   # character budget applied to the raw text before tokenization
MAX_INPUT_TOKENS = 512  # token budget enforced by the tokenizer
MAX_NEW_TOKENS = 20     # number of tokens GPT-2 may generate per row

# NOTE(review): the text is passed without any summarization cue, so GPT-2 simply
# continues it; consider appending a prompt such as " TL;DR:" — confirm with the authors.

# Generate one continuation per row; results are collected in a list and written
# to the frame in a single assignment after the loop.
gpt_summaries = []
for text in tqdm(data['text'], total=len(data), desc="Summarization Progress"):
    # Cap the raw text length (slicing is a no-op for shorter texts)
    text = text[:MAX_INPUT_CHARS]

    # Tokenization
    input_ids = tokenizer.encode(
        text, return_tensors='pt', max_length=MAX_INPUT_TOKENS, truncation=True
    ).to(device)

    # Single unpadded sequence: every position is a real token
    attention_mask = torch.ones_like(input_ids)

    # Generate. Only max_new_tokens is passed (it took precedence over max_length
    # anyway), and pad_token_id is set explicitly so transformers does not log a
    # warning for every row.
    with torch.no_grad():
        output_ids = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=MAX_NEW_TOKENS,
            num_beams=2,
            length_penalty=2.0,
            early_stopping=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # generate() returns prompt + continuation; keep only the newly generated
    # tokens, otherwise the stored "summary" is mostly a copy of the input text.
    new_token_ids = output_ids[0][input_ids.shape[1]:]
    gpt_summaries.append(tokenizer.decode(new_token_ids, skip_special_tokens=True))

# Store the generated summaries in the new 'summary_gpt' column
data['summary_gpt'] = gpt_summaries


[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
Summarization Progress:  49%|████▉     | 2407/4906 [12:30<12:22,  3.36it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=20) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Summarization Progress:  49%|████▉     | 2408/4906 [12:30<12:34,  3.31it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=20) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Summarization Progress:  49%|████▉     | 2409/4906 [12:31<12:25,  3.35it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generatio

In [None]:
# Write the frame (including the 'summary_gpt' column) to data_gpt.csv in the current working directory.
data.to_csv('data_gpt.csv')
# NOTE(review): the recorded output of this cell shows cp reporting that source and
# destination "are the same file", so this copy appears to have no effect here —
# confirm the intended target path.
!cp data_gpt.csv '/content/drive/MyDrive/TMS_project/Summaries'

cp: 'data_gpt.csv' and '/content/drive/MyDrive/TMS_project/data_gpt.csv' are the same file


## TXTAI

In [None]:
pip install txtai

Collecting txtai
  Downloading txtai-6.3.0-py3-none-any.whl (205 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m205.2/205.2 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting faiss-cpu>=1.7.1.post2 (from txtai)
  Downloading faiss_cpu-1.7.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m77.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: faiss-cpu, txtai
Successfully installed faiss-cpu-1.7.4 txtai-6.3.0


In [None]:
import txtai.pipeline
import txtai

In [None]:
from txtai.pipeline import Summary

# Create the summarization pipeline with an explicitly named model.
# This is the model the pipeline fell back to when none was supplied (see the
# warning recorded in this cell's output); naming it keeps results reproducible
# if the library's default ever changes.
summary = Summary("sshleifer/distilbart-cnn-12-6")

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [None]:
# txtai-based summarization of a single text
def perform_summarization(text):
    """Summarize ``text`` with the module-level txtai ``summary`` pipeline.

    The pipeline is called with maxlength = 20 and its result is returned unchanged.
    """
    return summary(text, maxlength = 20)

# Summarize every entry of the 'text' column and store the results in the new 'txtai_summary' column
data['txtai_summary'] = data['text'].map(perform_summarization)



In [None]:
data

Unnamed: 0,summary,text,count_words,count_words_s,lexRank_summary,textRank_summary,luhn_summary,summary_gpt,txtai_summary
0,a 23yearold white female presents with compla...,subjective this 23yearold white female presen...,226,10,it does not appear to be working very well.,she does have asthma but doest not require dai...,she does have asthma but doest not require dai...,subjective this 23yearold white female presen...,23-year-old white female presents with complai...
1,consult for laparoscopic gastric bypass,past medical history he has difficulty climbin...,375,6,he now smokes less than three cigarettes a day...,denies obesity and hypertension in other famil...,denies obesity and hypertension in other famil...,past medical history he has difficulty climbin...,He exercises three times a week at home and do...
2,consult for laparoscopic gastric bypass,history of present illness i have seen abc to...,774,6,"he is 5'9"".",he has a bmi of 51. he has been overweight fo...,he has a bmi of 51. he has been overweight fo...,history of present illness i have seen abc to...,He has been overweight for ten years since the...
3,2d mmode doppler,2d mmode 1 left atrial enlargement with left...,77,6,"normal morphology of aortic valve, mitral valv...","normal morphology of aortic valve, mitral valv...","normal morphology of aortic valve, mitral valv...",2d mmode 1 left atrial enlargement with left...,Left atrial enlargement with left atrial diame...
4,2d echocardiogram,1 the left ventricular cavity size and wall t...,246,3,the aortic valve appears calcified with mild a...,the aortic valve appears calcified with mild a...,the aortic valve appears calcified with mild a...,1 the left ventricular cavity size and wall t...,The study was somewhat technically limited and...
...,...,...,...,...,...,...,...,...,...
4994,patient having severe sinusitis about two to ...,history i had the pleasure of meeting and eva...,840,22,she also has noted that she is having some pro...,in light of the patient's atypical dizziness s...,in light of the patient's atypical dizziness s...,history i had the pleasure of meeting and eva...,The patient is a pleasant 50yearold female who...
4995,this is a 14monthold baby boy caucasian who c...,admitting diagnosis kawasaki diseasedischarge...,282,42,; so with a very close followup and a cardiac ...,"when he was sent to the hospital, he had a fev...","when he was sent to the hospital, he had a fev...",admitting diagnosis kawasaki diseasedischarge...,14month-old baby boy caucasian with presumptiv...
4996,a female for a complete physical and follow u...,subjective this is a 42yearold white female w...,787,15,she is to call me if she is not improving.,she also notes that in the past she was on adv...,she also notes that in the past she was on adv...,subjective this is a 42yearold white female w...,Asthma seems to be worse than in the past and ...
4997,mother states he has been wheezing and coughing,chief complaint this 5yearold male presents t...,426,9,his peak flows on the morning are normal at 15...,also inclusive of frequent pneumonia by report...,his peak flows on the morning are normal at 15...,chief complaint this 5yearold male presents t...,5-year-old male presents to childrens hospital...


In [None]:
# Write the frame (including the 'txtai_summary' column) to data_txtai.csv in the current working directory.
data.to_csv('data_txtai.csv')
# NOTE(review): the recorded output of this cell shows cp reporting that source and
# destination "are the same file", so this copy appears to have no effect here —
# confirm the intended target path.
!cp data_txtai.csv '/content/drive/MyDrive/TMS_project/Summaries'

cp: 'data_txtai.csv' and '/content/drive/MyDrive/TMS_project/data_txtai.csv' are the same file


## T5

In [None]:
# data = pd.read_csv("data_txtai.csv", encoding= 'utf-8')

In [None]:
import sentencepiece
import transformers
from transformers import T5ForConditionalGeneration, T5Tokenizer

In [None]:
import torch
import locale


In [None]:
from tqdm.notebook import tqdm

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
# Load the pre-trained "t5-base" tokenizer and conditional-generation model from the Hugging Face Hub.
# The model is moved to `device` in a separate cell below.
tokenizer_t5 = T5Tokenizer.from_pretrained("t5-base")
model_t5 = T5ForConditionalGeneration.from_pretrained("t5-base")

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
model_t5 = model_t5.to(device)

In [None]:
from tqdm.notebook import tqdm

# Generate a summary for one text with T5
def generate_summary_t5_with_progress(text):
    """Return a T5-generated summary of ``text``.

    The text is prefixed with "summarize: ", tokenized with truncation at
    max_length=1024, and passed to ``model_t5.generate`` with max_length = 20.
    The first generated sequence is decoded with special tokens skipped.
    """
    encoded = tokenizer_t5(
        "summarize: " + text,
        return_tensors="pt",
        max_length=1024,
        truncation=True,
    ).to(device)
    generated_ids = model_t5.generate(**encoded, max_length = 20)
    return tokenizer_t5.decode(generated_ids[0], skip_special_tokens=True)

# Apply the function to every row of the DataFrame's 'text' column, with a tqdm progress bar
tqdm.pandas(desc="Generazione riassunti T5")
data['summary_t5'] = data['text'].progress_apply(generate_summary_t5_with_progress)

# NOTE(review): overrides locale.getpreferredencoding so that it always reports UTF-8;
# presumably a workaround needed for the `!cp` shell command below to run in Colab —
# confirm it is still required.
locale.getpreferredencoding = lambda: "UTF-8"

# Write the frame (including the 'summary_t5' column) to data_t5.csv in the current working directory.
data.to_csv('data_t5.csv')
# NOTE(review): the recorded output shows cp reporting that source and destination
# "are the same file" — confirm the intended target path.
!cp data_t5.csv '/content/drive/MyDrive/TMS_project/Summaries'




Generazione riassunti T5:   0%|          | 0/4906 [00:00<?, ?it/s]

cp: 'data_t5.csv' and '/content/drive/MyDrive/TMS_project/data_t5.csv' are the same file


In [None]:
data

Unnamed: 0,summary,text,count_words,count_words_s,lexRank_summary,textRank_summary,luhn_summary,summary_gpt,txtai_summary,summary_t5
0,a 23yearold white female presents with compla...,subjective this 23yearold white female presen...,226,10,it does not appear to be working very well.,she does have asthma but doest not require dai...,she does have asthma but doest not require dai...,subjective this 23yearold white female presen...,23-year-old white female presents with complai...,23-year-old seattle woman has complained of al...
1,consult for laparoscopic gastric bypass,past medical history he has difficulty climbin...,375,6,he now smokes less than three cigarettes a day...,denies obesity and hypertension in other famil...,denies obesity and hypertension in other famil...,past medical history he has difficulty climbin...,He exercises three times a week at home and do...,he has a history of heart disease in both gran...
2,consult for laparoscopic gastric bypass,history of present illness i have seen abc to...,774,6,"he is 5'9"".",he has a bmi of 51. he has been overweight fo...,he has a bmi of 51. he has been overweight fo...,history of present illness i have seen abc to...,He has been overweight for ten years since the...,he is 42 years old and weighs 344 pounds and 5...
3,2d mmode doppler,2d mmode 1 left atrial enlargement with left...,77,6,"normal morphology of aortic valve, mitral valv...","normal morphology of aortic valve, mitral valv...","normal morphology of aortic valve, mitral valv...",2d mmode 1 left atrial enlargement with left...,Left atrial enlargement with left atrial diame...,2d mmode 1 left atrial enlargement with left a...
4,2d echocardiogram,1 the left ventricular cavity size and wall t...,246,3,the aortic valve appears calcified with mild a...,the aortic valve appears calcified with mild a...,the aortic valve appears calcified with mild a...,1 the left ventricular cavity size and wall t...,The study was somewhat technically limited and...,the left ventricular cavity size and wall thic...
...,...,...,...,...,...,...,...,...,...,...
4994,patient having severe sinusitis about two to ...,history i had the pleasure of meeting and eva...,840,22,she also has noted that she is having some pro...,in light of the patient's atypical dizziness s...,in light of the patient's atypical dizziness s...,history i had the pleasure of meeting and eva...,The patient is a pleasant 50yearold female who...,the patient has been referred for evaluation a...
4995,this is a 14monthold baby boy caucasian who c...,admitting diagnosis kawasaki diseasedischarge...,282,42,; so with a very close followup and a cardiac ...,"when he was sent to the hospital, he had a fev...","when he was sent to the hospital, he had a fev...",admitting diagnosis kawasaki diseasedischarge...,14month-old baby boy caucasian with presumptiv...,14monthold boy came in with presumptive diagno...
4996,a female for a complete physical and follow u...,subjective this is a 42yearold white female w...,787,15,she is to call me if she is not improving.,she also notes that in the past she was on adv...,she also notes that in the past she was on adv...,subjective this is a 42yearold white female w...,Asthma seems to be worse than in the past and ...,a 42yearold white female who comes in today fo...
4997,mother states he has been wheezing and coughing,chief complaint this 5yearold male presents t...,426,9,his peak flows on the morning are normal at 15...,also inclusive of frequent pneumonia by report...,his peak flows on the morning are normal at 15...,chief complaint this 5yearold male presents t...,5-year-old male presents to childrens hospital...,5year old male presents to childrens hospital ...


In [None]:
# Write the frame with all generated summary columns to data_summarized.csv in the current working directory.
data.to_csv('data_summarized.csv')
# NOTE(review): the recorded output of this cell shows cp reporting that source and
# destination "are the same file", so this copy appears to have no effect here —
# confirm the intended target path.
!cp data_summarized.csv '/content/drive/MyDrive/TMS_project/Summaries'

cp: 'data_summarized.csv' and '/content/drive/MyDrive/TMS_project/data_summarized.csv' are the same file


## Evaluation - ROUGE

The following ROUGE metrics are used in this evaluation.

* ROUGE-N: Overlap of n-grams between the system and reference summaries.
* ROUGE-1 refers to the overlap of unigrams (each word) between the system and reference summaries.
* ROUGE-2 refers to the overlap of bigrams between the system and reference summaries.
* ROUGE-L: Longest Common Subsequence (LCS) based statistics. Longest common subsequence problem takes into account sentence-level structure similarity naturally and identifies longest co-occurring in sequence n-grams automatically.



Note: "f" stands for f1_score, "p" stands for precision, "r" stands for recall.

In [None]:
pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [None]:
from rouge import Rouge


rouge = Rouge()
reference_summaries = data['summary']


def _average_rouge(candidate_column):
    """Average ROUGE scores of ``data[candidate_column]`` against the reference 'summary' column."""
    return rouge.get_scores(data[candidate_column], reference_summaries, avg=True)


# One averaged score dictionary per summarization method
scores_lexRank = _average_rouge('lexRank_summary')
scores_textRank = _average_rouge('textRank_summary')
scores_luhn = _average_rouge('luhn_summary')
scores_gpt = _average_rouge('summary_gpt')
scores_txtai = _average_rouge('txtai_summary')
scores_t5 = _average_rouge('summary_t5')

In [None]:
from tabulate import tabulate


# Print one table row per summarization method with precision, recall and F1
# for ROUGE-1, ROUGE-2 and ROUGE-L, each formatted to four decimals.

# (column label, key in the score dict) pairs, in display order.
ROUGE_VARIANTS = (("ROUGE-1", "rouge-1"), ("ROUGE-2", "rouge-2"), ("ROUGE-L", "rouge-l"))
ROUGE_STATS = (("P", "p"), ("R", "r"), ("F", "f"))

# (row label, averaged score dict) pairs, in display order.
table_methods = [
    ("LexRank", scores_lexRank),
    ("TextRank", scores_textRank),
    ("Luhn", scores_luhn),
    ("GPT", scores_gpt),
    ("Txtai", scores_txtai),
    ("T5", scores_t5),
]

headers = ["Method"] + [
    f"{variant_label} ({stat_label})"
    for variant_label, _ in ROUGE_VARIANTS
    for stat_label, _ in ROUGE_STATS
]

# Build every row from the same two loops so the cells always line up with
# the headers instead of relying on 54 hand-copied f-strings.
data_rows = [
    [method_label] + [
        f"{method_scores[variant_key][stat_key]:.4f}"
        for _, variant_key in ROUGE_VARIANTS
        for _, stat_key in ROUGE_STATS
    ]
    for method_label, method_scores in table_methods
]

table = tabulate(data_rows, headers=headers, tablefmt="pretty")

print(table)


+----------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
|  Method  | ROUGE-1 (P) | ROUGE-1 (R) | ROUGE-1 (F) | ROUGE-2 (P) | ROUGE-2 (R) | ROUGE-2 (F) | ROUGE-L (P) | ROUGE-L (R) | ROUGE-L (F) |
+----------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+
| LexRank  |   0.1673    |   0.2734    |   0.1832    |   0.0662    |   0.1198    |   0.0734    |   0.1491    |   0.2491    |   0.1646    |
| TextRank |   0.1363    |   0.3186    |   0.1750    |   0.0531    |   0.1391    |   0.0692    |   0.1215    |   0.2914    |   0.1572    |
|   Luhn   |   0.1595    |   0.3551    |   0.2023    |   0.0733    |   0.1791    |   0.0937    |   0.1447    |   0.3291    |   0.1848    |
|   GPT    |   0.2250    |   0.7471    |   0.3271    |   0.1554    |   0.5914    |   0.2306    |   0.2099    |   0.6954    |   0.3049    |
|  Txtai   |   0.2730    | 

In [None]:
import pandas as pd
import altair as alt


# Bar charts comparing precision and F1 of every method for ROUGE-1, ROUGE-2
# and ROUGE-L (recall is not plotted).

# Averaged score dicts keyed by the label shown on the x axis.
scores_by_method = {
    'LexRank': scores_lexRank,
    'TextRank': scores_textRank,
    'Luhn': scores_luhn,
    'GPT': scores_gpt,
    'Txtai': scores_txtai,
    'T5': scores_t5,
}

# Kept under its own name: rebinding `data` here would replace the summaries
# DataFrame that the ROUGE scoring cell reads and break any re-run of it.
rouge_plot_data = {
    'Method': list(scores_by_method),
    **{
        f'{variant_label} ({stat_label})': [
            method_scores[variant_key][stat_key]
            for method_scores in scores_by_method.values()
        ]
        for variant_label, variant_key in (
            ('ROUGE-1', 'rouge-1'), ('ROUGE-2', 'rouge-2'), ('ROUGE-L', 'rouge-l'),
        )
        for stat_label, stat_key in (('P', 'p'), ('F', 'f'))
    },
}


df = pd.DataFrame(rouge_plot_data)

# Melt the DataFrame for easier plotting: one row per (Method, variable, value)
df_melted = df.melt('Method')

# One panel per metric column; the chart is the cell's last expression so it
# is rendered as the cell output.
alt.Chart(df_melted).mark_bar().encode(
    x='Method',
    y='value',
    column='variable',
    color='variable'
).properties(
    title='ROUGE Scores Comparison (Precision & F1)',
    width=200,
    height=300
)
