In [59]:
import torch
import random
import numpy as np
from transformers import AutoTokenizer, AutoModelWithLMHead


In [60]:
from datasets import load_dataset

# Load data
train_dataset = load_dataset("scientific_papers", "pubmed", split="train")
val_dataset = load_dataset("scientific_papers", "pubmed", split="validation")
test_dataset = load_dataset("scientific_papers", "pubmed", split="test")


Found cached dataset scientific_papers (/home/u_51520750/.cache/huggingface/datasets/scientific_papers/pubmed/1.1.1/306757013fb6f37089b6a75469e6638a553bd9f009484938d8f75a4c5e84206f)
Found cached dataset scientific_papers (/home/u_51520750/.cache/huggingface/datasets/scientific_papers/pubmed/1.1.1/306757013fb6f37089b6a75469e6638a553bd9f009484938d8f75a4c5e84206f)
Found cached dataset scientific_papers (/home/u_51520750/.cache/huggingface/datasets/scientific_papers/pubmed/1.1.1/306757013fb6f37089b6a75469e6638a553bd9f009484938d8f75a4c5e84206f)


# downsample

In [63]:
num_shards = 1000
raw_sub_train_dataset = train_dataset.shard(num_shards=num_shards, index=random.randint(0, num_shards - 1))
raw_sub_val_dataset = val_dataset.shard(num_shards=num_shards, index=random.randint(0, num_shards - 1))

In [64]:
type(raw_sub_val_dataset)

datasets.arrow_dataset.Dataset

In [65]:
first_example = raw_sub_val_dataset[0]
#print(first_example)

# Initialize tokenizer and model

In [66]:
# Tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Model
model = AutoModelWithLMHead.from_pretrained("sshleifer/tiny-gpt2")


 # Preprocess

In [70]:
max_input_length = 8192
max_output_length = 512
batch_size = 2

In [71]:
def process_data_to_model_inputs(batch):
    inputs = tokenizer(batch["article"], padding="max_length", truncation=True, max_length=max_input_length)
    outputs = tokenizer(batch["abstract"], padding="max_length", truncation=True, max_length=max_output_length)

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask
    batch["global_attention_mask"] = len(batch["input_ids"]) * [[0 for _ in range(len(batch["input_ids"][0]))]]
    batch["global_attention_mask"][0][0] = 1
    batch["labels"] = outputs.input_ids
    batch["labels"] = [[-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch["labels"]]

    return batch

In [73]:
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

1

In [74]:
sub_train_dataset = raw_sub_train_dataset.map(process_data_to_model_inputs, batched=True, batch_size=batch_size, remove_columns=["article", "abstract", "section_names"])
sub_val_dataset = raw_sub_val_dataset.map(process_data_to_model_inputs, batched=True, batch_size=batch_size, remove_columns=["article", "abstract", "section_names"])

                                                             

In [81]:
type(sub_train_dataset)
first_example = sub_train_dataset[0]
print(first_example)

{'input_ids': tensor([37047,   457, 13730,  ..., 50257, 50257, 50257]), 'attention_mask': tensor([1, 1, 1,  ..., 0, 0, 0]), 'global_attention_mask': tensor([1, 0, 0,  ..., 0, 0, 0]), 'labels': tensor([15221,  4948,   457, 13730, 41899,   330,   291,  1221,   607,    77,
         3920,  1690,  4433,  6152, 21998,  3513,   284,  2948, 30232, 38495,
          290,  7748, 22729,   764,   220,   198, 32700, 10581,  2897,  1277,
        32704,   290,  1895,   284,   262,   607,  8461,   515,  1221,   290,
        32700,   288,  5330,   475,  2421,  1895, 38479,   290,   389,  1690,
         3917,   351, 11091,  1281, 27173,  2356,   290, 45105, 19481,   764,
          220,   198,   257, 21407,   351,  1262, 34319, 10581,   287,   262,
         4634,   286,  4318, 42302,  1431, 41899,   330,   291,  1221,   607,
           77,  3920,  2158,   837,   468,   587,   262,  3614, 32704,   286,
        32700,   288,  5330,   290,  8722,   284, 14351,  4659,   262,  6287,
          286, 38237,  2234

In [75]:
sub_train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "global_attention_mask", "labels"])
sub_val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "global_attention_mask", "labels"])

# Run Model

In [76]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer


In [83]:
# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="allenai/led-base-16384_finetuned",
    overwrite_output_dir=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    logging_dir="allenai/led-base-16384_logs",
    num_train_epochs=3,
    save_total_limit=1,
    load_best_model_at_end=True,
    report_to="none",
)


In [84]:

# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=sub_train_dataset,
    eval_dataset=sub_val_dataset,
    tokenizer=tokenizer,
)


In [86]:
# Train the model
trainer.train()


IndexError: index out of range in self