# Import libraries

In [34]:
from datasets import load_dataset
from transformers import (AutoModelForSeq2SeqLM, AutoTokenizer, 
                          Seq2SeqTrainingArguments, Seq2SeqTrainer)
import random
import torch

# Load Data

In [35]:
train_dataset = load_dataset("scientific_papers", "pubmed", split="train")

Found cached dataset scientific_papers (/home/u_51520750/.cache/huggingface/datasets/scientific_papers/pubmed/1.1.1/306757013fb6f37089b6a75469e6638a553bd9f009484938d8f75a4c5e84206f)


In [36]:
val_dataset = load_dataset("scientific_papers", "pubmed", split="validation")

Found cached dataset scientific_papers (/home/u_51520750/.cache/huggingface/datasets/scientific_papers/pubmed/1.1.1/306757013fb6f37089b6a75469e6638a553bd9f009484938d8f75a4c5e84206f)


In [37]:
test_dataset = load_dataset("scientific_papers", "pubmed", split="test")

Found cached dataset scientific_papers (/home/u_51520750/.cache/huggingface/datasets/scientific_papers/pubmed/1.1.1/306757013fb6f37089b6a75469e6638a553bd9f009484938d8f75a4c5e84206f)


# Preprocess Data

## Load Tokenizer

In [43]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [44]:
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

1

## Set Params

In [45]:
max_input_length = 8192
max_output_length = 512
batch_size = 2

# Pre-processing Function

In [46]:
def process_data_to_model_inputs(batch):
    inputs = tokenizer(batch["article"], padding="max_length", truncation=True, max_length=max_input_length)
    outputs = tokenizer(batch["abstract"], padding="max_length", truncation=True, max_length=max_output_length)

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask
    batch["global_attention_mask"] = len(batch["input_ids"]) * [[0 for _ in range(len(batch["input_ids"][0]))]]
    batch["global_attention_mask"][0][0] = 1
    batch["labels"] = outputs.input_ids
    batch["labels"] = [[-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch["labels"]]

    return batch

## Downsample

In [47]:
num_shards = 1000
raw_sub_train_dataset = train_dataset.shard(num_shards=num_shards, index=random.randint(0, num_shards - 1))
raw_sub_val_dataset = val_dataset.shard(num_shards=num_shards, index=random.randint(0, num_shards - 1))

## Tokenize and Convert to Torch

In [48]:
sub_train_dataset = raw_sub_train_dataset.map(process_data_to_model_inputs, batched=True, batch_size=batch_size, remove_columns=["article", "abstract", "section_names"])
sub_val_dataset = raw_sub_val_dataset.map(process_data_to_model_inputs, batched=True, batch_size=batch_size, remove_columns=["article", "abstract", "section_names"])

                                                             

In [49]:
sub_train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "global_attention_mask", "labels"])
sub_val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "global_attention_mask", "labels"])

# Model 1

In [50]:
led = AutoModelForSeq2SeqLM.from_pretrained("allenai/led-base-16384", gradient_checkpointing=True, use_cache=False)

## Train Model 1

In [51]:
training_args = Seq2SeqTrainingArguments(
    output_dir="allenai/led-base-16384_finetuned",
    overwrite_output_dir=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    logging_dir="allenai/led-base-16384_logs",
    num_train_epochs=3,
    save_total_limit=1,
    load_best_model_at_end=True,
    report_to="none",
)

trainer = Seq2SeqTrainer(
    model=led,
    args=training_args,
    train_dataset=sub_train_dataset,
    eval_dataset=sub_val_dataset,
    tokenizer=tokenizer,
)

trainer.train()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,5.8705,5.495304
2,4.777,5.290465
3,4.3639,5.199156


TrainOutput(global_step=180, training_loss=5.003811815049914, metrics={'train_runtime': 41165.0255, 'train_samples_per_second': 0.009, 'train_steps_per_second': 0.004, 'total_flos': 1944147481067520.0, 'train_loss': 5.003811815049914, 'epoch': 3.0})

# Save Model 1

In [54]:
led.save_pretrained('model1_gpt2')


## Test Model 1

In [55]:
led.config.num_beams = 1
led.config.max_length = 512
led.config.min_length = 100
led.config.length_penalty = 2.0
led.config.early_stopping = True
led.config.no_repeat_ngram_size = 3

In [59]:
random_index = random.randint(0, len(sub_val_dataset) - 1)
sample = sub_val_dataset[random_index]

device = torch.device("cpu")
led.to(device)

input_ids = sample["input_ids"].unsqueeze(0).to(device)
attention_mask = sample["attention_mask"].unsqueeze(0).to(device)
global_attention_mask = sample["global_attention_mask"].unsqueeze(0).to(device)


In [60]:
# generate summary
with torch.no_grad():
    summary_ids = led.generate(input_ids=input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)


In [62]:
generated_summary = tokenizer.decode(summary_ids.squeeze(), skip_special_tokens=True)
print("Generated Summary:")
print(generated_summary)

Generated Summary:
# background : a ( 4.8% versus 2.8i, 2. ne%, canada. 
 the a lumbar spine surgery at increased risk of postoperative complications, as defined by reoperation, within 3 months of the index procedure.  
 a ( a ( ) ) 35 undergoing elective lumb them spine surgery, cans.  Arsenal a ( the a (mi ) 35onal elective and unanticipated reoperation ( the rate of reoperation and non - obese subjects.  per the a bmi 35, corresponding to class ii and iii obesity as per the world health organization classification system.  det a ( 3% ), and type of procedure performed ( decompression, decompression with instrumented fusion, deformity correction, or arthroplasty ) other confounding factors, such as medical comorbidities and smoking, were not available through this database.  the a all patients would be in this dataset, unless they traveled out of the province for care.  a  captive patient population allows for powerful large group analyses. in the a was no ability from the billing re

In [69]:
actual_summary = raw_sub_val_dataset[random_index]["abstract"]
print("Actual Summary:")
print(actual_summary)

Actual Summary:
 study design :  population - based retrospective cohort study.clinical question :  are patients with a body mass index ( bmi ) of 35 or more who undergo elective lumbar spine surgery at increased risk of post - surgical complications , as evidenced by reoperation within a 3-month period?methods :  the alberta health and wellness administrative database was queried to identify patients who underwent elective lumbar spine surgery over a 24-month period . 
 this same database was used to classify subjects as obese ( bmi 35 ) and non - obese ( bmi < 35 ) and to determine who underwent repeated surgical intervention . 
 the rate of reoperation was determined for both the obese and non - obese groups ; further analyses were performed to determine whether certain subjects were at increased risk of reoperation.results :  the point estimate for relative risk for requiring reoperation was 1.73 ( 95% confidence interval , 1.032.90 ) for obese subjects compared with non - obese su

In [65]:
sample["labels"]

tensor([ 2050,  1486,  1058,   220,  3265,   532,  1912, 41432, 20812,  2050,
           13, 47367,  1808,  1058,   220,   389,  3871,   351,   257,  1767,
         2347,  6376,   357,   275, 11632,  1267,   286,  3439,   393,   517,
          508, 17777,  1742,   425,   300,  2178,   283, 19656,  8185,   379,
         3220,  2526,   286,  1281,   532, 21998, 19481,   837,   355, 30204,
          416,   302, 27184,  1626,   257,   513,    12,  8424,  2278,    30,
        24396,    82,  1058,   220,   262,   435,  4835,    64,  1535,   290,
        42506, 11553,  6831,   373, 42517,   798,   284,  5911,  3871,   508,
        25289,  1742,   425,   300,  2178,   283, 19656,  8185,   625,   257,
         1987,    12,  8424,  2278,   764,   220,   198,   428,   976,  6831,
          373,   973,   284, 36509,  7481,   355, 20779,   357,   275, 11632,
         3439,  1267,   290,  1729,   532, 20779,   357,   275, 11632,  1279,
         3439,  1267,   290,   284,  5004,   508, 25289,  5100, 

In [63]:
actual_summary = tokenizer.decode(sample["labels"], skip_special_tokens=True)

# Print and compare both summaries
print("\nActual Summary:")
print(actual_summary)

OverflowError: out of range integral type conversion attempted

In [None]:
generated_summary = tokenizer.decode(summary_ids.squeeze(), skip_special_tokens=True)
actual_summary = tokenizer.decode(sample["labels"], skip_special_tokens=True)

# Print and compare both summaries
print("Generated Summary:")
print(generated_summary)
print("\nActual Summary:")
print(actual_summary)