# Setup Development Environment

In [1]:
# This cell will authenticate you and mount your Drive in the Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install datasets
!pip install transformers
!pip install -q transformers==4.37.2
!pip install torch
from datasets import load_dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import pickle
import csv
import tensorflow as tf
from transformers import T5Tokenizer, TFT5Model, TFT5ForConditionalGeneration, AutoTokenizer, AutoModel
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from transformers import PegasusTokenizer, TFPegasusForConditionalGeneration

import pandas as pd
import torch
import gc  # For garbage collection

from pprint import pprint  # Makes output readable without horizontal scrolling

In [None]:
!pip install -q evaluate
import evaluate

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!pip install -q rouge_score

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


# Load and prepare the dataset

In [3]:
# Load the dataset
dataset = load_dataset("allenai/mslr2022", "ms2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/260M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/48.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/39.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/46.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14188 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1667 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2021 [00:00<?, ? examples/s]

In [4]:
train_dataset = load_dataset("allenai/mslr2022", "ms2", split='train')
test_dataset = load_dataset("allenai/mslr2022", "ms2", split='test')
validation_dataset = load_dataset("allenai/mslr2022", "ms2", split='validation')

In [5]:
tr_df = pd.DataFrame(train_dataset)
val_df = pd.DataFrame(validation_dataset)
tst_df = pd.DataFrame(test_dataset)

# DAE

In [6]:
print(len(tr_df))
print(len(val_df))
print(len(tst_df))

14188
2021
1667


In [7]:
x=14188+2021+1667
print(x)
train_dataset.column_names

17876


['review_id', 'pmid', 'title', 'abstract', 'target', 'background']

In [8]:
max_length_abstract = tr_df['abstract'].astype(str).apply(len).max()
avg_length_abstract = tr_df['abstract'].astype(str).apply(len).mean()

# Calculate maximum and average lengths for 'target'
max_length_target = tr_df['target'].astype(str).apply(len).max()
avg_length_target = tr_df['target'].astype(str).apply(len).mean()

max_length_back = tr_df['background'].astype(str).apply(len).max()
avg_length_back = tr_df['background'].astype(str).apply(len).mean()
# Print the results
print(f"Max length of 'abstract': {max_length_abstract}")
print(f"Average length of 'abstract': {avg_length_abstract}")
print('\n')
print(f"Max length of 'target': {max_length_target}")
print(f"Average length of 'target': {avg_length_target}")
print('\n')
print(f"Max length of 'background': {max_length_back}")
print(f"Average length of 'background': {avg_length_back}")

Max length of 'abstract': 731139
Average length of 'abstract': 39910.700380603324


Max length of 'target': 3677
Average length of 'target': 391.12905272060897


Max length of 'background': 5351
Average length of 'background': 454.85755568085705


In [9]:
#print(tr_df['abstract'].max)
tr_df['abstract_length'] = tr_df['abstract'].astype(str).apply(len)

# Find the row with the maximum length in the 'abstract' column
max_length_row = tr_df.loc[tr_df['abstract_length'].idxmax()]

# Print out the longest abstract and its length
print("The longest abstract is:", max_length_row['abstract'])
print('\n')
print("Length of the longest abstract:", max_length_row['abstract_length'])

The longest abstract is: ['Background Blunt head trauma is a common cause of death and disability in children worldwide . Cranial computed tomography ( CT ) , the reference st and ard for the diagnosis of traumatic brain injury ( TBI ) , exposes children to ionizing radiation which has been linked to the development of brain tumors , leukemia , and other cancers . We describe the methods used to develop and test the effectiveness of a decision aid to facilitate shared decision-making with parents regarding whether to obtain a head CT scan or to further observe their child at home . Methods / Design This is a protocol for a multicenter clinician-level parallel r and omized trial to compare an intervention group receiving a decision aid , ‘ Head CT Choice ’ , to a control group receiving usual care . The trial will be conducted at five diverse emergency departments ( EDs ) in Minnesota and California . Clinicians will be r and omized to decision aid or usual care . Parents visiting the E

# Baseline Model: Pegasus

In [None]:
# Check available devices and inform the user
if tf.config.list_physical_devices('GPU'):
    print("Using CUDA device.")
else:
    print("CUDA not available. Using CPU.")

# Initialize the Pegasus tokenizer and model for TensorFlow
ptokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
pmodel = TFPegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")

# tr_df contains the text to be summarized
inputs = ptokenizer(tr_df['abstract'].astype(str).tolist(), max_length=1024,
                   truncation=True, padding="max_length", return_tensors="tf")

# Convert tokenized data to a format suitable for saving
# Here we're converting the input_ids and attention_mask to lists of integers for easy CSV writing
# Note: .numpy() converts TensorFlow tensors to numpy arrays
input_ids = inputs['input_ids'].numpy().tolist()
attention_masks = inputs['attention_mask'].numpy().tolist()

# Create a new DataFrame to store tokenized data
tokenized_df = pd.DataFrame({
    'input_ids': [' '.join(map(str, input_id)) for input_id in input_ids],
    'attention_masks': [' '.join(map(str, mask)) for mask in attention_masks]
})

# Save the DataFrame to a CSV file
tokenized_df.to_csv('drive/MyDrive/pega-t5/tokenized_data.csv', index=False)

Using CUDA device.


tf_model.h5:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFPegasusForConditionalGeneration.

Some layers of TFPegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['final_logits_bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/259 [00:00<?, ?B/s]

In [None]:
# Use this to load from saved CSV.
summaries_pega_tr_df = pd.read_csv('drive/MyDrive/pega-t5/Pegasus_train-prediction_tf.csv')

In [None]:
# Define the batch size and initialize the results list
batch_size = 32
results = []

# Generate summaries in batches
print(len(inputs['input_ids']))
for i in range(0, len(inputs['input_ids']), batch_size):
    print(i)
    input_ids_batch = inputs['input_ids'][i:i+batch_size]
    review_ids_batch = tr_df['review_id'][i:i+batch_size].tolist()
    summary_ids = pmodel.generate(input_ids_batch,
                                 num_beams=2,
                                 no_repeat_ngram_size=2,
                                 min_length=10,
                                 max_length=512,
                                 early_stopping=True)

    batch_summaries = ptokenizer.batch_decode(summary_ids, skip_special_tokens=True)
    #batch_summaries = [ptokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]
    for review_id, summary in zip(review_ids_batch, batch_summaries):
        results.append({'review_id': review_id, 'Summary': summary})

# Convert the results to a DataFrame and save to a CSV file
summaries_pega_tr_df = pd.DataFrame(results)
output_file_tr = 'Pegasus_train-prediction_tf.csv'
summaries_pega_tr_df.to_csv('drive/MyDrive/pega-t5/'+output_file_tr, index=False)
print(f"Saved summaries to {output_file_tr}")

# Rouge Testing: Pegasus

In [None]:
# Use this to load from saved CSV.

#summaries_pega_tr_df = pd.read_csv('drive/MyDrive/pega-t5/Pegasus_train-prediction_tf.csv')

In [None]:
rouge = evaluate.load('rouge')
predictions = summaries_pega_tr_df['Summary']
references = tr_df['target']
rouge_results = rouge.compute(predictions=predictions,
                        references=references)
print(rouge_results)

{'rouge1': 0.1739883726903796, 'rouge2': 0.019690773116985447, 'rougeL': 0.127040990614671, 'rougeLsum': 0.1409497383867967}


# Baseline Model: T5

In [None]:
# Load T5 model and tokenizer
from transformers import T5Tokenizer, TFT5Model, TFT5ForConditionalGeneration
t5model = TFT5ForConditionalGeneration.from_pretrained("t5-base")
t5tokenizer = T5Tokenizer.from_pretrained("t5-base")

t5model.summary()

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Model: "tft5_for_conditional_generation"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 shared (Embedding)          multiple                  24674304  
                                                                 
 encoder (TFT5MainLayer)     multiple                  109628544 
                                                                 
 decoder (TFT5MainLayer)     multiple                  137949312 
                                                                 
Total params: 222903552 (850.31 MB)
Trainable params: 222903552 (850.31 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# tr_df contains the text to be summarized
inputs = t5tokenizer(tr_df['abstract'].astype(str).tolist(), max_length=1024,
                   truncation=True, padding="max_length", return_tensors="tf")

In [None]:
# Use this to load from checkpoint for model testing below. Also need to change start point below and comment out t5_results.

import pandas as pd
#summaries_tr_df_t5 = pd.read_csv('drive/MyDrive/pega-t5/T5_train-prediction_tf13440.csv')
#t5_results = summaries_tr_df_t5.to_dict(orient='records')
#len(t5_results)

In [None]:
# Use this to load from saved CSV.

#summaries_tr_df_t5 = pd.read_csv('drive/MyDrive/pega-t5/T5_train-prediction_tf.csv')

In [None]:
# Define the batch size and initialize the results list
batch_size = 32
#t5_results = []

# Generate summaries in batches
for i in range(13440, len(inputs['input_ids']), batch_size):
    if i % 960 == 0:
        summaries_tr_df_t5 = pd.DataFrame(t5_results)
        output_file_tr = f'drive/MyDrive/pega-t5/T5_train-prediction_tf{i}.csv'
        summaries_tr_df_t5.to_csv(output_file_tr, index=False)
    print(i)
    input_ids_batch = inputs['input_ids'][i:i+batch_size]
    attention_mask_batch = inputs['attention_mask'][i:i+batch_size]
    review_ids_batch = tr_df['review_id'][i:i+batch_size].tolist()
    summary_ids = t5model.generate(input_ids_batch,
                                  attention_mask=attention_mask_batch,
                                  num_beams=2,
                                  no_repeat_ngram_size=2,
                                  min_length=10,
                                  max_length=512,
                                  early_stopping=True)

    batch_summaries = t5tokenizer.batch_decode(summary_ids, skip_special_tokens=True)
    for review_id, summary in zip(review_ids_batch, batch_summaries):
        t5_results.append({'review_id': review_id, 'Summary': summary})

# Convert the results to a DataFrame and save to a CSV file
summaries_tr_df_t5 = pd.DataFrame(t5_results)
output_file_tr = 'drive/MyDrive/pega-t5/T5_train-prediction_tf.csv'
summaries_tr_df_t5.to_csv(output_file_tr, index=False)
print(f"Saved summaries to {output_file_tr}")

# Rouge Testing: T5

In [None]:
rouge = evaluate.load('rouge')
predictions = summaries_tr_df_t5['Summary']
references = tr_df['target']
rouge_results_t5 = rouge.compute(predictions=predictions,
                        references=references)
print(rouge_results_t5)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

{'rouge1': 0.17038833695850014, 'rouge2': 0.0171524182758342, 'rougeL': 0.10866151595889414, 'rougeLsum': 0.1263354911004681}


# BLEURT Evaluation

In [None]:
!pip install git+https://github.com/google-research/bleurt.git
from datasets import load_metric

# Load BLEURT from datasets
bleurt = load_metric('bleurt')

Collecting git+https://github.com/google-research/bleurt.git
  Cloning https://github.com/google-research/bleurt.git to /tmp/pip-req-build-j_k90wzi
  Running command git clone --filter=blob:none --quiet https://github.com/google-research/bleurt.git /tmp/pip-req-build-j_k90wzi
  Resolved https://github.com/google-research/bleurt.git to commit cebe7e6f996b40910cfaa520a63db47807e3bf5c
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: BLEURT
  Building wheel for BLEURT (setup.py) ... [?25l[?25hdone
  Created wheel for BLEURT: filename=BLEURT-0.0.2-py3-none-any.whl size=16456763 sha256=e6f36081f7e08ce8a255cf83810f8bb2e96d92d58e31f2bd2fdb0a6bc53ca167
  Stored in directory: /tmp/pip-ephem-wheel-cache-uh5zj9cd/wheels/64/f4/2c/509a6c31b8ebde891a81029fd94f199b1b92f0e7cfc20d417a
Successfully built BLEURT
Installing collected packages: BLEURT
Successfully installed BLEURT-0.0.2


ModuleNotFoundError: No module named 'datasets'

In [None]:
predictions = summaries_pega_tr_df['Summary']
references = tr_df['target']
# Compute BLEURT scores
bleurt_results = bleurt.compute(predictions=predictions, references=references)

# Print BLEURT scores
print('bleurt_results for Pegasus baseline:', bleurt_results)
print('Avg BLEURT Score:', str(sum(bleurt_results['scores'])/len(bleurt_results['scores'])))

bleurt_results for Pegasus baseline: {'scores': [-0.7313346862792969, -0.8204416632652283, -0.3612985610961914, -0.4866044819355011, -0.7147544026374817, -0.8730676770210266, -0.2171754240989685, -0.7800220251083374, -0.69108647108078, -0.7898827195167542, -0.5891683101654053, -0.9123894572257996, -0.4310087561607361, -0.7453576922416687, -0.9146400094032288, -0.7793182134628296, -0.49654555320739746, -0.5047228336334229, -0.695687472820282, -0.6109880208969116, -0.8213273882865906, -0.4527285695075989, -0.20300684869289398, -0.8349809646606445, -0.5100201368331909, -0.8987551331520081, -0.5970681309700012, -0.7036880850791931, -0.9837960600852966, -0.5641428828239441, -0.9462850093841553, -0.27968311309814453, -0.43080848455429077, -0.7756741046905518, -1.0356309413909912, -1.2010271549224854, -0.8412328362464905, -0.7645872235298157, -0.9078554511070251, -0.8107470273971558, -0.966853141784668, -0.8236139416694641, -0.9518963694572449, -0.41003262996673584, -0.9115213751792908, -0.56

In [None]:
predictions = summaries_tr_df_t5['Summary']
references = tr_df['target']

# Compute BLEURT scores
bleurt_results = bleurt.compute(predictions=predictions, references=references)

# Print BLEURT scores
print('bleurt_results for T5 baseline:', bleurt_results)
print('Avg BLEURT Score:', str(sum(bleurt_results['scores'])/len(bleurt_results['scores'])))

bleurt_results for T5 baseline: {'scores': [-1.2006129026412964, -0.9435255527496338, -0.936352550983429, -1.414511799812317, -0.831432580947876, -0.6439738273620605, -1.1162464618682861, -0.9291288256645203, -0.8656831383705139, -0.903872549533844, -0.9404348134994507, -1.0136876106262207, -0.7697102427482605, -1.008315920829773, -0.8365209102630615, -1.4814242124557495, -1.1927258968353271, -1.1031627655029297, -0.6249096989631653, -0.6617904901504517, -0.8950232863426208, -0.9987726211547852, -0.875510573387146, -0.885843813419342, -0.9808732867240906, -1.2499866485595703, -1.3744910955429077, -0.7963483929634094, -0.8548470139503479, -1.2401090860366821, -1.0337426662445068, -0.8856325745582581, -0.525475025177002, -0.9975308179855347, -1.2266862392425537, -0.9408525824546814, -1.2257403135299683, -1.1469807624816895, -1.0063204765319824, -1.024245262145996, -1.134507656097412, -1.1367124319076538, -0.9774568676948547, -1.259407639503479, -0.9057765603065491, -0.7970035672187805, -