In [1]:
from transformers import pipeline, PegasusForConditionalGeneration, PegasusTokenizer
import pandas as pd
from tqdm.auto import tqdm
import torch

In [2]:
df = pd.read_csv("/content/flu_data_summary_compare.csv")
df

Unnamed: 0,Year,Topic,Content,LexRank,TextRank
0,2006-2007,When and where did the 2006-07 flu season start?,The first report of regional flu activity came...,Regional flu activity is defined as increased ...,
1,2006-2007,How severe was the 2006-2007 flu season?,The 2006-07 flu season was generally mild comp...,"Flu activity increased during late December, p...","For example, the proportion of all deaths asso..."
2,2006-2007,What determines the severity of a flu season?,"The overall health impact (e.g., infections, h...",The severity of a flu season can be judged acc...,
3,2006-2007,Where did the most flu activity occur in the U...,Influenza viruses were identified in all state...,"From October 1, 2006 to May 19, 2007, widespre...",
4,2006-2007,When did the 2006-2007 flu season peak?,"During the 2006-2007 season, flu activity in t...",Although the timing of peak activity varies fr...,
...,...,...,...,...,...
136,2023-2024,Updates to the Advisory Committee on Immunizat...,A couple of things are different for the 2023-...,A couple of things are different for the 2023-...,
137,2023-2024,Updates to U.S. Flu Surveillance Methods for t...,"Starting with the 2023-2024 influenza season, ...",Flu vaccination is often available at no or lo...,Flu vaccination is often available at no or lo...
138,2023-2024,B/Yamagata and Flu Vaccines Summary,Quadrivalent flu vaccines protect against four...,CDC is not involved in regulatory decision-mak...,Quadrivalent flu vaccines protect against four...
139,2023-2024,Coinfection: Getting More than One Respiratory...,It is possible to get sick with more than one ...,It is also possible to be sick with multiple f...,


In [3]:
# Load model and tokenizer
model_name = "google/pegasus-large"
tokenizer = PegasusTokenizer.from_pretrained(model_name)

# Check if a GPU is available and set the device accordingly
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the model to the specified device
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)

def summarization_pegasus(text):
    # Create a summarization pipeline with the loaded model and tokenizer
    # Ensure the pipeline uses the correct device
    summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=0 if device == "cuda" else -1)

    # Truncate the input text if its length exceeds the maximum sequence length supported by the model
    max_input_length = tokenizer.model_max_length
    truncated_text = text[:max_input_length]

    # Calculate a reasonable max_length based on input length
    inputs = tokenizer(truncated_text, return_tensors="pt", truncation=True).to(device)  # Move tensor to the correct device
    input_length = inputs.input_ids.size(1)
    max_summary_length = max(10, input_length // 2)  # Adjust the divisor based on desired summary brevity

    # Generate summary
    summary = summarizer(truncated_text, max_length=max_summary_length, min_length=5)
    return summary[0]['summary_text']

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
tqdm.pandas(desc="Summarizing texts")
df["Pegasus"] = df["Content"].progress_apply(summarization_pegasus)

Summarizing texts:   0%|          | 0/141 [00:00<?, ?it/s]

In [21]:
df

Unnamed: 0,Year,Topic,Content,LexRank,TextRank,Pegasus
0,2006-2007,When and where did the 2006-07 flu season start?,The first report of regional flu activity came...,Regional flu activity is defined as increased ...,,Regional flu activity is defined as increased ...
1,2006-2007,How severe was the 2006-2007 flu season?,The 2006-07 flu season was generally mild comp...,"Flu activity increased during late December, p...","For example, the proportion of all deaths asso...","For example, the proportion of all deaths asso..."
2,2006-2007,What determines the severity of a flu season?,"The overall health impact (e.g., infections, h...",The severity of a flu season can be judged acc...,,and within each state;nThe proportion of influ...
3,2006-2007,Where did the most flu activity occur in the U...,Influenza viruses were identified in all state...,"From October 1, 2006 to May 19, 2007, widespre...",,"From October 1, 2006 to May 19, 2007, widespre..."
4,2006-2007,When did the 2006-2007 flu season peak?,"During the 2006-2007 season, flu activity in t...",Although the timing of peak activity varies fr...,,"During the past 31 years, flu activity in the ..."
...,...,...,...,...,...,...
136,2023-2024,Updates to the Advisory Committee on Immunizat...,A couple of things are different for the 2023-...,A couple of things are different for the 2023-...,,A couple of things are different for the
137,2023-2024,Updates to U.S. Flu Surveillance Methods for t...,"Starting with the 2023-2024 influenza season, ...",Flu vaccination is often available at no or lo...,Flu vaccination is often available at no or lo...,Although monitoring influenza-only coded death...
138,2023-2024,B/Yamagata and Flu Vaccines Summary,Quadrivalent flu vaccines protect against four...,CDC is not involved in regulatory decision-mak...,Quadrivalent flu vaccines protect against four...,Quadrivalent flu vaccines protect against four...
139,2023-2024,Coinfection: Getting More than One Respiratory...,It is possible to get sick with more than one ...,It is also possible to be sick with multiple f...,,It is also possible to be sick with multiple f...


In [20]:
df.to_csv('/content/flu_data_summary_compare_pegasus.csv', index=False)

In [22]:
from transformers import BartTokenizer, BartForConditionalGeneration, pipeline
import torch

# Load model and tokenizer
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)

# Check if a GPU is available and set the device accordingly
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the model to the specified device
model = BartForConditionalGeneration.from_pretrained(model_name).to(device)

def summarization_bart(text):
    # Create a summarization pipeline with the loaded model and tokenizer
    # Ensure the pipeline uses the correct device
    summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=0 if device == "cuda" else -1)

    # Truncate the input text if its length exceeds the maximum sequence length supported by the model
    max_input_length = tokenizer.model_max_length
    truncated_text = text[:max_input_length]

    # Calculate a reasonable max_length based on input length
    inputs = tokenizer(truncated_text, return_tensors="pt", truncation=True).to(device)  # Move tensor to the correct device
    input_length = inputs.input_ids.size(1)
    max_summary_length = max(10, input_length // 2)  # Adjust the divisor based on desired summary brevity

    # Generate summary
    summary = summarizer(truncated_text, max_length=max_summary_length, min_length=5)
    return summary[0]['summary_text']

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

In [23]:
tqdm.pandas(desc="Summarizing texts")
df["Bart"] = df["Content"].progress_apply(summarization_bart)

Summarizing texts:   0%|          | 0/141 [00:00<?, ?it/s]

In [24]:
df

Unnamed: 0,Year,Topic,Content,LexRank,TextRank,Pegasus,Bart
0,2006-2007,When and where did the 2006-07 flu season start?,The first report of regional flu activity came...,Regional flu activity is defined as increased ...,,Regional flu activity is defined as increased ...,The first report of regional flu activity came...
1,2006-2007,How severe was the 2006-2007 flu season?,The 2006-07 flu season was generally mild comp...,"Flu activity increased during late December, p...","For example, the proportion of all deaths asso...","For example, the proportion of all deaths asso...",The 2006-07 flu season was generally mild comp...
2,2006-2007,What determines the severity of a flu season?,"The overall health impact (e.g., infections, h...",The severity of a flu season can be judged acc...,,and within each state;nThe proportion of influ...,"The overall health impact (e.g., infections, h..."
3,2006-2007,Where did the most flu activity occur in the U...,Influenza viruses were identified in all state...,"From October 1, 2006 to May 19, 2007, widespre...",,"From October 1, 2006 to May 19, 2007, widespre...","From October 1, 2006 to May 19, 2007, widespre..."
4,2006-2007,When did the 2006-2007 flu season peak?,"During the 2006-2007 season, flu activity in t...",Although the timing of peak activity varies fr...,,"During the past 31 years, flu activity in the ...","During the past 31 years, flu activity in the ..."
...,...,...,...,...,...,...,...
136,2023-2024,Updates to the Advisory Committee on Immunizat...,A couple of things are different for the 2023-...,A couple of things are different for the 2023-...,,A couple of things are different for the,The flu season of 2023-
137,2023-2024,Updates to U.S. Flu Surveillance Methods for t...,"Starting with the 2023-2024 influenza season, ...",Flu vaccination is often available at no or lo...,Flu vaccination is often available at no or lo...,Although monitoring influenza-only coded death...,"Starting with the 2023-2024 influenza season, ..."
138,2023-2024,B/Yamagata and Flu Vaccines Summary,Quadrivalent flu vaccines protect against four...,CDC is not involved in regulatory decision-mak...,Quadrivalent flu vaccines protect against four...,Quadrivalent flu vaccines protect against four...,All current flu vaccines in the United States ...
139,2023-2024,Coinfection: Getting More than One Respiratory...,It is possible to get sick with more than one ...,It is also possible to be sick with multiple f...,,It is also possible to be sick with multiple f...,It is possible to get sick with more than one ...


In [25]:
df.to_csv('/content/flu_data_summary_compare_abstractive.csv', index=False)