In [None]:
!pip install datasets transformers rouge-score nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Authenticate with the Hugging Face Hub; the training arguments defined
# further down set push_to_hub=True, so a saved token is needed.
from huggingface_hub import notebook_login
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token


In [None]:
# Install git-lfs (the Hub push output later in this notebook reports LFS file
# scanning, so it is presumably required for uploading the model weights).
!apt install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
git-lfs is already the newest version (2.3.4-1).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 4 not upgraded.


In [None]:
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import create_optimizer, AdamWeightDecay
from transformers import AutoConfig
from transformers import T5Model
import nltk
import numpy as np
# Sentence-splitting data needed by nltk.sent_tokenize in compute_metrics.
nltk.download('punkt')

# ROUGE scorer used by compute_metrics.
metric = load_metric("rouge")
# Placeholder until my_tokenize() loads the real tokenizer. None (rather than
# 0) makes an accidental early use fail as an obviously unset object.
tokenizer = None
# Task prefix prepended to every article before tokenization.
prefix = "summarize: "
# Truncation limits, in tokens, for the articles (inputs) and abstracts (targets).
max_input_length = 1024
max_target_length = 128


# Tokenizes and truncates the prefixed articles and their abstracts before training the model on the dataset (no padding is applied here)
def preprocess_function(examples):
    """Tokenize a batch of articles and abstracts for seq2seq training.

    Reads the module-level ``tokenizer``, ``prefix``, ``max_input_length`` and
    ``max_target_length``.

    Args:
        examples: Batch mapping with "article" and "abstract" entries, each a
            sequence of strings (this function is passed to ``map`` with
            ``batched=True``).

    Returns:
        The tokenizer output for the prefixed articles, with an added
        "labels" entry holding the token ids of the abstracts.
    """
    inputs = [prefix + doc for doc in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Tokenize the targets via `text_target`; this replaces the deprecated
    # `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=examples["abstract"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs



# Load the dataset and the tokenizer, then tokenize the dataset
def my_tokenize(model_checkpoint, dataset, subset):
  """Load a dataset and a tokenizer, then tokenize the dataset.

  Rebinds the module-level ``tokenizer`` so that ``preprocess_function`` and
  ``compute_metrics`` pick up the loaded instance.

  Args:
    model_checkpoint: Checkpoint name passed to ``AutoTokenizer.from_pretrained``.
    dataset: Dataset name passed to ``load_dataset``.
    subset: Dataset configuration/subset name passed to ``load_dataset``.

  Returns:
    A ``(tokenizer, tokenized_dataset)`` tuple.
  """
  global tokenizer

  raw_splits = load_dataset(dataset, subset)

  # The global must be bound before map() runs, because preprocess_function
  # reads it.
  tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

  return (tokenizer, raw_splits.map(preprocess_function, batched=True))


# Create a new summarization model
def get_model(model_checkpoint, tokenizer):
  """Build an untrained seq2seq model together with its data collator.

  Args:
    model_checkpoint: Checkpoint name whose configuration is loaded.
    tokenizer: Tokenizer handed to the data collator.

  Returns:
    A ``(model, data_collator)`` tuple.
  """
  # Only the configuration comes from the checkpoint; the model is built from
  # that config instead of loading pre-trained weights.
  model = AutoModelForSeq2SeqLM.from_config(AutoConfig.from_pretrained(model_checkpoint))
  # NOTE(review): confirm whether this explicit re-initialization is still
  # needed after from_config().
  model.init_weights()

  return (model, DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model))


def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for an evaluation run.

    Reads the module-level ``tokenizer`` and ``metric``.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays.

    Returns:
        Dict mapping each ROUGE key to its mid F-measure (scaled by 100) plus
        "gen_len", the mean count of non-padding prediction tokens; every
        value is rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    # -100 is a masking/padding sentinel, not a real token id. Map it to the
    # pad token in BOTH arrays so decoding cannot fail on it and so it is not
    # counted as generated content in gen_len below.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Keep only the mid F-measure of each score, as a percentage
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length (number of non-padding tokens per prediction)
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}


# Set the hyperparameters
# Change these hyperparameters to obtain a better-trained model
def get_my_hyper_params(model_checkpoint, my_epochs, floating_point):
  """Create the Seq2SeqTrainingArguments used for training.

  Reads the module-level ``max_target_length``.

  Args:
    model_checkpoint: Checkpoint name; used as the prefix of the output
      directory name ("<checkpoint>-science-papers").
    my_epochs: Number of training epochs.
    floating_point: Passed as ``fp16`` (True enables half-precision training).

  Returns:
    The configured ``Seq2SeqTrainingArguments`` instance.
  """
  batch_size = 16
  model_name = model_checkpoint
  args = Seq2SeqTrainingArguments(
      f"{model_name}-science-papers",
      evaluation_strategy = "epoch",
      learning_rate=2e-5,
      per_device_train_batch_size=batch_size,
      per_device_eval_batch_size=batch_size,
      weight_decay=0.01,
      save_total_limit=20,
      num_train_epochs=my_epochs,
      predict_with_generate=True,
      # Generate evaluation summaries up to the same length the targets are
      # truncated to. Without this the model's default generation length is
      # used; the logged Gen Len of 19.0 in every epoch suggests summaries
      # were being cut short before ROUGE was computed.
      generation_max_length=max_target_length,
      fp16=floating_point,
      push_to_hub=True,
  )

  return args

#make the trainer
def get_trainer(model, tokenizer, tokenized_sum, data_collator, training_args):
  """Assemble the Seq2SeqTrainer for the given model and tokenized dataset.

  Args:
    model: Model to train.
    tokenizer: Tokenizer passed through to the trainer.
    tokenized_sum: Tokenized dataset with "train" and "test" splits.
    data_collator: Collator used to batch examples.
    training_args: Training arguments for the run.

  Returns:
    The constructed ``Seq2SeqTrainer``; metrics come from ``compute_metrics``.
  """
  # NOTE(review): evaluation runs on the "test" split; confirm this is
  # intended rather than a validation split.
  return Seq2SeqTrainer(
      model=model,
      args=training_args,
      data_collator=data_collator,
      tokenizer=tokenizer,
      train_dataset=tokenized_sum["train"],
      eval_dataset=tokenized_sum["test"],
      compute_metrics=compute_metrics,
  )

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
def my_train_model():
  """Train t5-small from scratch on scientific_papers/arxiv.

  Runs the whole pipeline: tokenize the dataset, build the model and collator,
  create the training arguments (5 epochs, fp16 enabled), build the trainer
  and train.

  Returns:
    The trainer after ``train()`` has completed.
  """
  checkpoint = "t5-small"

  tok, tokenized = my_tokenize(checkpoint, "scientific_papers", "arxiv")
  model, collator = get_model(checkpoint, tok)
  training_args = get_my_hyper_params(checkpoint, my_epochs=5, floating_point=True)

  trainer = get_trainer(model, tok, tokenized, collator, training_args)
  trainer.train()

  return trainer

In [None]:
# Run the full pipeline; the returned trainer is reused by the push_to_hub cell below.
trainer = my_train_model()

Downloading builder script:   0%|          | 0.00/5.35k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/4.99k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.46k [00:00<?, ?B/s]

Downloading and preparing dataset scientific_papers/arxiv (download: 4.20 GiB, generated: 7.06 GiB, post-processed: Unknown size, total: 11.26 GiB) to /root/.cache/huggingface/datasets/scientific_papers/arxiv/1.1.1/306757013fb6f37089b6a75469e6638a553bd9f009484938d8f75a4c5e84206f...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/3.62G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/880M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/203037 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/6436 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6440 [00:00<?, ? examples/s]

Dataset scientific_papers downloaded and prepared to /root/.cache/huggingface/datasets/scientific_papers/arxiv/1.1.1/306757013fb6f37089b6a75469e6638a553bd9f009484938d8f75a4c5e84206f. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


  0%|          | 0/204 [00:00<?, ?ba/s]

  "`as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your "


  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

Cloning https://huggingface.co/Dagar/t5-small-science-papers into local empty directory.
Using cuda_amp half precision backend
The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: abstract, section_names, article. If abstract, section_names, article are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 203037
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 63450
  Number of trainable parameters = 60506624
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,4.4735,4.372655,9.9604,1.7641,8.6213,9.2779,19.0
2,4.0104,3.938435,11.4001,2.1474,9.6516,10.6602,19.0
3,3.8237,3.757975,11.1806,2.1229,9.3881,10.3853,19.0
4,3.7382,3.673797,11.9298,2.3222,9.9077,11.045,19.0


Saving model checkpoint to t5-small-science-papers/checkpoint-500
Configuration saved in t5-small-science-papers/checkpoint-500/config.json
Model weights saved in t5-small-science-papers/checkpoint-500/pytorch_model.bin
tokenizer config file saved in t5-small-science-papers/checkpoint-500/tokenizer_config.json
Special tokens file saved in t5-small-science-papers/checkpoint-500/special_tokens_map.json
tokenizer config file saved in t5-small-science-papers/tokenizer_config.json
Special tokens file saved in t5-small-science-papers/special_tokens_map.json
Saving model checkpoint to t5-small-science-papers/checkpoint-1000
Configuration saved in t5-small-science-papers/checkpoint-1000/config.json
Model weights saved in t5-small-science-papers/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in t5-small-science-papers/checkpoint-1000/tokenizer_config.json
Special tokens file saved in t5-small-science-papers/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to t5-sma

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,4.4735,4.372655,9.9604,1.7641,8.6213,9.2779,19.0
2,4.0104,3.938435,11.4001,2.1474,9.6516,10.6602,19.0
3,3.8237,3.757975,11.1806,2.1229,9.3881,10.3853,19.0
4,3.7382,3.673797,11.9298,2.3222,9.9077,11.045,19.0
5,3.6994,3.64055,12.3568,2.4449,10.2371,11.4209,19.0


Saving model checkpoint to t5-small-science-papers/checkpoint-56000
Configuration saved in t5-small-science-papers/checkpoint-56000/config.json
Model weights saved in t5-small-science-papers/checkpoint-56000/pytorch_model.bin
tokenizer config file saved in t5-small-science-papers/checkpoint-56000/tokenizer_config.json
Special tokens file saved in t5-small-science-papers/checkpoint-56000/special_tokens_map.json
Deleting older checkpoint [t5-small-science-papers/checkpoint-46000] due to args.save_total_limit
Saving model checkpoint to t5-small-science-papers/checkpoint-56500
Configuration saved in t5-small-science-papers/checkpoint-56500/config.json
Model weights saved in t5-small-science-papers/checkpoint-56500/pytorch_model.bin
tokenizer config file saved in t5-small-science-papers/checkpoint-56500/tokenizer_config.json
Special tokens file saved in t5-small-science-papers/checkpoint-56500/special_tokens_map.json
Deleting older checkpoint [t5-small-science-papers/checkpoint-46500] due t

In [None]:
# Save the final model and tokenizer files and push them to the Hub repository.
trainer.push_to_hub()

Saving model checkpoint to t5-small-science-papers
Configuration saved in t5-small-science-papers/config.json
Model weights saved in t5-small-science-papers/pytorch_model.bin
tokenizer config file saved in t5-small-science-papers/tokenizer_config.json
Special tokens file saved in t5-small-science-papers/special_tokens_map.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 3.34k/231M [00:00<?, ?B/s]

Upload file runs/Nov03_10-56-39_4b992b777c21/events.out.tfevents.1667473009.4b992b777c21.105.0:  12%|#2       …

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/Dagar/t5-small-science-papers
   f0f5873..c704b09  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/Dagar/t5-small-science-papers
   f0f5873..c704b09  main -> main

To https://huggingface.co/Dagar/t5-small-science-papers
   c704b09..f9d70bb  main -> main

   c704b09..f9d70bb  main -> main



'https://huggingface.co/Dagar/t5-small-science-papers/commit/c704b0962e7920e77f044fc2cbe2dd549cd88e3a'

In [None]:
# Archive the whole output directory (including the saved checkpoints) for download.
!zip -r /content/t5-small-science-papers.zip /content/t5-small-science-papers

  adding: content/t5-small-science-papers/ (stored 0%)
  adding: content/t5-small-science-papers/checkpoint-59000/ (stored 0%)
  adding: content/t5-small-science-papers/checkpoint-59000/scaler.pt (deflated 55%)
  adding: content/t5-small-science-papers/checkpoint-59000/training_args.bin (deflated 47%)
  adding: content/t5-small-science-papers/checkpoint-59000/scheduler.pt (deflated 49%)
  adding: content/t5-small-science-papers/checkpoint-59000/tokenizer_config.json (deflated 83%)
  adding: content/t5-small-science-papers/checkpoint-59000/pytorch_model.bin (deflated 7%)
  adding: content/t5-small-science-papers/checkpoint-59000/tokenizer.json (deflated 74%)
  adding: content/t5-small-science-papers/checkpoint-59000/config.json (deflated 62%)
  adding: content/t5-small-science-papers/checkpoint-59000/optimizer.pt (deflated 9%)
  adding: content/t5-small-science-papers/checkpoint-59000/trainer_state.json (deflated 82%)
  adding: content/t5-small-science-papers/checkpoint-59000/special_to

In [None]:
# Colab-only: trigger a browser download of the archive created in the previous cell.
from google.colab import files
files.download('/content/t5-small-science-papers.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>