In [None]:
# Install NLP dependencies into the running kernel's environment
# (%pip targets the kernel's interpreter; a bare `!pip3` may resolve to a different one).
%pip install transformers datasets==2.0.0 sentencepiece rouge_score sacrebleu

In [None]:
# Install the pinned CUDA 11.3 PyTorch wheels into the running kernel's environment.
%pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio==0.11.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html

In [3]:
# Filesystem locations used by this notebook (relative to the working directory).
# NOTE(review): DATA_CACHE_PATH is not referenced in the visible cells -- presumably
# intended as a cache directory for downloaded datasets; confirm or remove.
DATA_CACHE_PATH = './cached_data'
# Passed as `cache_dir` when tokenizers and models are loaded with `from_pretrained`.
MODEL_CACHE_PATH = './cached_models'
# Same value as the root of the `./processed/<model>/<dataset>` folders in which the
# tokenized datasets are saved.
PROCESSED_PATH = './processed'

In [4]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import datasets
from datasets import load_dataset, load_metric, load_from_disk
import sentencepiece

from torch.utils.data import Dataset

from tqdm.auto import tqdm
from tqdm import tqdm

import pandas as pd
import numpy as np

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

Using device: cpu


## Tokenize Data
- To improve runtime in the future, pre-tokenize the data and write it to disk
- The whole split is tokenized as a single batch so that padding is applied uniformly
- Since each model requires a unique tokenizer, each dataset and model needs its own file (3 datasets x 3 models = 9 files)

In [5]:
def tokenize(batch, tokenizer, text_column="article", max_length=512):
    """
    Tokenize one text column of a batch.

    Args:
        batch: mapping from column name to the values of that column
            (what `Dataset.map(..., batched=True)` hands to the function).
        tokenizer: callable that accepts the texts plus the `max_length`,
            `padding` and `truncation` keyword arguments.
        text_column: name of the column holding the text to tokenize.
            Defaults to 'article'.
        max_length: length every sequence is padded or truncated to.
            Defaults to 512.

    Returns:
        Whatever `tokenizer` returns for the selected column.

    Raises:
        KeyError: if `text_column` is not present in `batch`.
    """
    return tokenizer(
        batch[text_column],
        max_length=max_length,
        padding="max_length",  # fixed-length padding, independent of batch content
        truncation=True)

In [6]:
def write_tokenize(ckpt, ds_args):
    """
    Tokenize a dataset with a checkpoint's own tokenizer and save the result to disk.

    Args:
        ckpt: Hugging Face model identifier (e.g. "t5-small"); its tokenizer is
            loaded with `AutoTokenizer.from_pretrained`.
        ds_args: keyword arguments forwarded verbatim to `load_dataset`.
            Must contain a 'path' entry, which is also used in the output folder name.

    Returns:
        None. The tokenized dataset is written to
        f"{PROCESSED_PATH}/{nickname}/{ds_args['path']}", where `nickname` is `ckpt`
        with "/" and "_" replaced by "-".

    Note:
        The reference summaries are read from the 'abstract' column, so the dataset
        must provide one.
    """
    # Filesystem-friendly version of the checkpoint name.
    nickname = ckpt.replace("/", "-").replace("_", "-")
    ds = load_dataset(**ds_args)

    # Load the tokenizer that belongs to this checkpoint.
    tokenizer = AutoTokenizer.from_pretrained(
        ckpt, cache_dir=MODEL_CACHE_PATH)

    # Tokenize the data.
    # (The alternative would be to tokenize smaller batches and use a data collator.)
    df_encoded = ds.map(tokenize,
                        fn_kwargs={'tokenizer': tokenizer},
                        batched=True,
                        batch_size=None,  # hand the whole split to the tokenizer in one call
                        load_from_cache_file=False)

    # `add_column` returns a NEW dataset rather than modifying in place, so the result
    # must be re-assigned or the column is silently lost.
    df_encoded = df_encoded.add_column('ground_truth', list(ds['abstract']))

    # Write the tokenized dataset to disk.
    df_encoded.save_to_disk(f"{PROCESSED_PATH}/{nickname}/{ds_args['path']}")

    return None

In [7]:
# Hugging Face checkpoints to evaluate; each one gets its own tokenized copy of the dataset.
ckpts = [
    "google/bigbird-pegasus-large-bigpatent",
    "sshleifer/distill-pegasus-xsum-16-4",
    "sshleifer/distilbart-xsum-12-1",
    "t5-small",
]

# Keyword arguments forwarded verbatim to `load_dataset`.
PUBMED_ARGS = dict(path='scientific_papers', name="pubmed", split='test')

# Alternative datasets (currently disabled):
# PATENT_ARGS = dict(path='big_patent', name="h", split='test')
# CNN_DAILYMAIL_ARGS = dict(path='ccdv/cnn_dailymail', name="3.0.0", split='test')

# Tokenize the selected dataset once per checkpoint.
# To switch datasets, change the second argument below.
for ckpt in ckpts:
    write_tokenize(ckpt, PUBMED_ARGS)

Downloading builder script:   0%|          | 0.00/2.03k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

Downloading and preparing dataset scientific_papers/pubmed (download: 4.20 GiB, generated: 2.33 GiB, post-processed: Unknown size, total: 6.53 GiB) to /root/.cache/huggingface/datasets/scientific_papers/pubmed/1.1.1/306757013fb6f37089b6a75469e6638a553bd9f009484938d8f75a4c5e84206f...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/3.62G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/880M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/119924 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/6633 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6658 [00:00<?, ? examples/s]

Dataset scientific_papers downloaded and prepared to /root/.cache/huggingface/datasets/scientific_papers/pubmed/1.1.1/306757013fb6f37089b6a75469e6638a553bd9f009484938d8f75a4c5e84206f. Subsequent calls will reuse this data.


Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

## Summarization

In [8]:
import string
import re

def clean(s):
    """
    Normalize a string for post-processing.

    Removes every character in `string.punctuation` (ASCII punctuation only),
    collapses each run of whitespace (including newlines and tabs) into a single
    space, and strips leading/trailing whitespace.

    Args:
        s: the string to clean.

    Returns:
        The cleaned string.
    """
    # Third argument of maketrans lists the characters to delete.
    without_punctuation = s.translate(str.maketrans('', '', string.punctuation))
    # Raw string avoids the invalid-escape warning for '\s'; newlines are already
    # covered by \s, so no separate newline removal is needed.
    return re.sub(r'\s+', ' ', without_punctuation).strip()

def chunks(list_of_elements, batch_size=5):
    """
    Lazily split a sliceable collection into consecutive pieces.

    Each piece is `list_of_elements[start : start + batch_size]`; the final piece
    may be shorter than `batch_size`.
    Code adapted from NLP with transformers
    """
    offsets = range(0, len(list_of_elements), batch_size)
    yield from (list_of_elements[start : start + batch_size] for start in offsets)

def create_summaries(model_ckpt, total_articles, ds_args, batch_size=5):
    """
    Summarize a pre-tokenized dataset with a pre-trained seq2seq checkpoint.

    The tokenized dataset is loaded from
    f"{PROCESSED_PATH}/{nickname}/{ds_args['path']}" (where `nickname` is `model_ckpt`
    with "/" and "_" replaced by "-"), the checkpoint's model and tokenizer are loaded,
    and summaries are generated batch by batch with beam search.

    Args:
        model_ckpt: Hugging Face model identifier whose weights and tokenizer are used.
        total_articles: number of leading articles to summarize. A falsy value
            (None or 0) means the whole dataset; values larger than the dataset
            are capped at its length.
        ds_args: dataset arguments; only the 'path' entry is used, to locate the
            tokenized dataset on disk.
        batch_size: number of articles passed to `model.generate` at once.

    Returns:
        dict with two equally long lists: 'summaries' (decoded model outputs) and
        'ground_truth' (the 'abstract' column of the same articles).

    Raises:
        ValueError: if `batch_size` is smaller than 1 or `total_articles` is negative.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if total_articles is not None and total_articles < 0:
        raise ValueError(f"total_articles must be >= 0 or None, got {total_articles}")

    nickname = model_ckpt.replace("/", "-").replace("_", "-")
    out_path = f"{PROCESSED_PATH}/{nickname}/{ds_args['path']}"

    # Load the tokenized dataset from disk.
    data = load_from_disk(out_path)

    # Restrict the dataset itself (not only the references) so that summaries and
    # ground truths always describe the same articles and have the same length.
    if total_articles:
        data = data.select(range(min(total_articles, len(data))))
    ground_truths = list(data['abstract'])

    # From here on, indexing the dataset yields torch tensors for the model inputs only.
    data.set_format("torch", columns=["input_ids", "attention_mask"])
    torch.cuda.empty_cache()
    all_summaries = []

    # Load model based on supplied checkpoint (i.e. distilbart-cnn-dailymail).
    model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt, cache_dir=MODEL_CACHE_PATH)
    tokenizer = AutoTokenizer.from_pretrained(model_ckpt, cache_dir=MODEL_CACHE_PATH)
    model.to(device)

    # Iterate over the batches lazily instead of materializing all of them up front;
    # the batch count is computed separately so the progress bar still has a total.
    n_batches = (len(data) + batch_size - 1) // batch_size
    for article_batch in tqdm(chunks(data, batch_size=batch_size), total=n_batches):

        with torch.no_grad():
            # Generate summaries. As discussed in paper, we played with many of these hyperparameters
            summaries = model.generate(
                input_ids=article_batch["input_ids"].to(device),
                attention_mask=article_batch["attention_mask"].to(device),
                length_penalty=2.5,
                num_beams=8)

        # Decode the whole batch into plain strings and drop newline markers.
        decoded_batch = tokenizer.batch_decode(
            summaries, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        all_summaries.extend(
            text.replace('\n', '').replace('<n>', '') for text in decoded_batch)

    # Drop the model reference BEFORE emptying the cache; while the model is still
    # referenced its memory cannot be released.
    del model
    torch.cuda.empty_cache()

    return {'summaries': all_summaries, 'ground_truth': ground_truths}

In [9]:
#CHANGE DS ARGS
# For every checkpoint: summarize the dataset and write the summaries together with
# their ground truths to one CSV file per checkpoint.
for ckpt in ckpts:
    # total_articles=None requests the full dataset.
    s = create_summaries(ckpt, total_articles=None, ds_args=PUBMED_ARGS, batch_size=5)
    # reset_index() turns the row number into an explicit 'index' column of the CSV.
    output_df = pd.DataFrame(s).reset_index()
    # NOTE(review): the file name removes '/' from the checkpoint id, whereas the on-disk
    # folder nickname replaces it with '-' -- confirm the difference is intended.
    output_df.to_csv(f"output_{ckpt.replace('/','')}.csv", index=False)

Downloading:   0%|          | 0.00/231M [00:00<?, ?B/s]

  2%|▏         | 21/1332 [00:47<49:33,  2.27s/it]


KeyboardInterrupt: ignored