In [None]:
!unzip ./NIPS_archive.zip

Archive:  ./NIPS_archive.zip
  inflating: authors.csv             
  inflating: papers.csv              


In [None]:
!pip install pandas
!pip install datasets
!pip install transformers
!pip install rouge-score nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 7.6 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting multiprocess
  Downloading multiprocess-0.70.14-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 85.3 MB/s 
[?25hCollecting xxhash
  Downloading xxhash-3.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 92.3 MB/s 
[?25hCollecting huggingface-hub<1.0.0,>=0.2.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 86.0 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3

In [None]:
import datasets
from datasets import load_dataset
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

In [None]:
# Load the NIPS papers table (papers.csv) that the Hugging Face dataset is
# built from, using each paper's title as the row index.
df = pd.read_csv(
    filepath_or_buffer='papers.csv',
    index_col="title",
)

In [None]:
# Keep only the papers that have both an abstract and a full text.
# dropna is vectorised and, unlike dropping label-by-label inside a loop,
# does not depend on the titles (the index) being unique.
df = df.dropna(subset=['abstract', 'full_text'])

In [None]:
# Display the filtered DataFrame to inspect the remaining papers.
df

Unnamed: 0_level_0,source_id,year,abstract,full_text
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Learning Generative Models with the Up Propagation Algorithm,5220,1997,Up-propagation is an algorithm for inverting ...,Learning Generative Models with the\n\nUp(cid:...
A Neural Network Based Head Tracking System,5221,1997,We have constructed an inexpensive video based...,A Neural Network Based\n\nHead Tracking System...
Algorithms for Non-negative Matrix Factorization,1861,2000,Non-negative matrix factorization (NMF) has pr...,Algorithms for Non-negative Matrix \n\nFactori...
Characterizing Neural Gain Control using Spike-triggered Covariance,1975,2001,Spike-triggered averaging techniques are effec...,Characterizing neural gain control using\n\nsp...
Compressed Regression,195,2007,Recent research has studied the role of sparsi...,Compressed Regression\n\nShuheng Zhou∗ John La...
...,...,...,...,...
Discrete Object Generation with Reversible Inductive Construction,5452,2019,The success of generative modeling in continuo...,Discrete Object Generation\n\nwith Reversible ...
Adaptively Aligned Image Captioning via Adaptive Attention Time,4799,2019,Recent neural models for image captioning usua...,Adaptively Aligned Image Captioning via\n\nAda...
Fully Dynamic Consistent Facility Location,1827,2019,We consider classic clustering problems in ful...,Fully Dynamic Consistent Facility Location\n\n...
Efficient Rematerialization for Deep Networks,8693,2019,"When training complex neural networks, memory ...",Efﬁcient Rematerialization for Deep Networks\n...


In [None]:
# Hold out 20% of the papers for evaluation; the fixed random_state keeps the
# split reproducible between runs.
train_set, test_set = train_test_split(df, random_state=0, test_size=0.2)

# Wrap each pandas split as a Hugging Face Dataset.
train, test = (Dataset.from_pandas(part) for part in (train_set, test_set))

# Bundle both splits under the keys the training code looks up.
dataset_tuning = DatasetDict({'train': train, 'test': test})

dataset_tuning

DatasetDict({
    train: Dataset({
        features: ['source_id', 'year', 'abstract', 'full_text', 'title'],
        num_rows: 5088
    })
    test: Dataset({
        features: ['source_id', 'year', 'abstract', 'full_text', 'title'],
        num_rows: 1272
    })
})

In [None]:
# Log in to the Hugging Face Hub from inside the notebook.
# NOTE(review): presumably required because training runs with push_to_hub=True
# and the model is pushed at the end of the notebook — confirm.
from huggingface_hub import notebook_login
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.huggingface/token
Login successful


In [None]:
# Make sure git-lfs is installed on the machine.
# NOTE(review): presumably needed to push the large model weights to the Hub — confirm.
!apt install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
git-lfs is already the newest version (2.3.4-1).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 5 not upgraded.


In [None]:
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import create_optimizer, AdamWeightDecay
from transformers import AutoConfig
from transformers import T5Model
import nltk
import numpy as np
# Sentence-splitting data used by nltk.sent_tokenize in compute_metrics.
nltk.download('punkt')

# ROUGE scorer shared by compute_metrics.
metric = load_metric("rouge")

# Placeholder until my_tokenize() loads the real tokenizer. None (rather than 0)
# is the conventional "not initialised yet" marker.
tokenizer = None

# Text prepended to every input document in preprocess_function.
prefix = "summarize: "
# Token limits used when truncating the papers (inputs) and abstracts (targets).
max_input_length = 1024
max_target_length = 128


#tokenizes (and truncates) the inputs and targets before training the model on the dataset
def preprocess_function(examples):
    """Tokenize a batch of papers for sequence-to-sequence training.

    Every full text gets ``prefix`` prepended and is truncated to
    ``max_input_length`` tokens. Every abstract is tokenized as the target and
    truncated to ``max_target_length`` tokens. No ``padding`` argument is
    passed, so padding is not requested here.

    Args:
        examples: batch mapping with "full_text" and "abstract" entries, each a
            list of strings (this is how ``map(..., batched=True)`` calls it).

    Returns:
        The tokenizer output for the inputs, with the target token ids stored
        under the "labels" key.
    """
    inputs = [prefix + doc for doc in examples["full_text"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # `text_target` tokenizes the labels directly and replaces the deprecated
    # `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=examples["abstract"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs



#load the tokenizer and tokenize the dataset
def my_tokenize(model_checkpoint):
  """Load the tokenizer for `model_checkpoint` and tokenize `dataset_tuning`.

  The tokenizer is also stored in the module-level `tokenizer`, which
  preprocess_function reads, so it has to be assigned before the mapping runs.

  Returns:
    (tokenizer, tokenized dataset) tuple.
  """
  global tokenizer

  raw_splits = dataset_tuning

  tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
  tokenized_splits = raw_splits.map(preprocess_function, batched=True)

  return tokenizer, tokenized_splits


#load the summarization model to fine-tune
def get_model(model_checkpoint, tokenizer, pretrained_checkpoint="Dagar/t5-small-science-papers"):
  """Load the seq2seq model to fine-tune together with a matching data collator.

  Args:
    model_checkpoint: not used by this function; kept so existing callers keep working.
    tokenizer: tokenizer handed to the data collator.
    pretrained_checkpoint: Hub id or local path of the weights to start from.
      Defaults to the checkpoint that was previously hard-coded.

  Returns:
    (model, data_collator) tuple.
  """
  # from_pretrained loads already-trained weights, so training continues from
  # that checkpoint rather than from a freshly initialised model.
  model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_checkpoint)

  data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
  return (model, data_collator)


def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for one evaluation pass.

    Args:
        eval_pred: (predictions, labels) pair of token-id arrays.

    Returns:
        Dict with each ROUGE variant's mid F-measure scaled to 0-100 plus
        "gen_len" (mean number of non-pad tokens per prediction), all rounded
        to 4 decimals.
    """
    predictions, labels = eval_pred
    # Predictions can arrive as a tuple; the generated token ids come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is a padding/ignore marker that cannot be decoded; replace it with
    # the pad token in both arrays so decoding works and gen_len only counts
    # real tokens.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Keep only the mid F-measure of each ROUGE variant, as a percentage
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}


#set hyper parameters
#change hyper parameters for a better trained model
def get_my_hyper_params(model_checkpoint, my_epochs, floating_point):
  """Build the Seq2SeqTrainingArguments for a fine-tuning run.

  Args:
    model_checkpoint: name used as the prefix of the output directory
      ("<model_checkpoint>-science-papers-NIPS").
    my_epochs: number of training epochs.
    floating_point: passed through as the `fp16` flag.

  Returns:
    The configured Seq2SeqTrainingArguments.
  """
  batch_size = 16
  model_name = model_checkpoint
  args = Seq2SeqTrainingArguments(
      f"{model_name}-science-papers-NIPS",
      evaluation_strategy = "epoch",
      learning_rate=2e-5,
      per_device_train_batch_size=batch_size,
      per_device_eval_batch_size=batch_size,
      weight_decay=0.01,
      save_total_limit=20,
      num_train_epochs=my_epochs,
      predict_with_generate=True,
      # Let evaluation generate summaries as long as the tokenized targets.
      # Without this the model's default generation length is used, which cut
      # every evaluated summary to about 19 tokens while the reference
      # abstracts keep up to max_target_length tokens.
      generation_max_length=max_target_length,
      fp16=floating_point,
      push_to_hub=True,
  )

  return args

#build the trainer
def get_trainer(model, tokenizer, tokenized_sum, data_collator, training_args):
  """Assemble the Seq2SeqTrainer: trains on the "train" split, evaluates on "test".

  ROUGE scoring is wired in through compute_metrics.
  """
  trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_sum["train"],
    eval_dataset=tokenized_sum["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
  )
  return Seq2SeqTrainer(**trainer_kwargs)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
  del sys.path[0]


Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

In [None]:
def my_train_model():
  """Run the whole fine-tuning pipeline and return the trained Seq2SeqTrainer."""
  # Checkpoint of the default tokenizer; also passed on to build the training arguments.
  checkpoint = "t5-small"

  tok, tokenized_sets = my_tokenize(checkpoint)
  model, collator = get_model(checkpoint, tok)
  training_args = get_my_hyper_params(checkpoint, 10, True)  # 10 epochs, fp16 enabled

  trainer = get_trainer(model, tok, tokenized_sets, collator, training_args)
  trainer.train()
  return trainer

In [None]:
# Run the full pipeline (tokenize, load model, train) and keep the trainer for the push below.
trainer = my_train_model()

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


  0%|          | 0/6 [00:00<?, ?ba/s]

  "`as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your "


  0%|          | 0/2 [00:00<?, ?ba/s]

Downloading:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/242M [00:00<?, ?B/s]

Cloning https://huggingface.co/Dagar/t5-small-science-papers-NIPS into local empty directory.
Using cuda_amp half precision backend
The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: year, full_text, abstract, title, source_id. If year, full_text, abstract, title, source_id are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 5088
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3180
  Number of trainable parameters = 60506624
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,5.185558,13.7172,2.0644,10.2189,12.838,19.0
2,5.452200,5.03834,15.6211,2.1808,11.3561,14.3054,19.0
3,5.452200,4.948597,15.1659,2.3308,11.1052,13.9456,19.0
4,5.125400,4.885084,15.716,2.4099,11.4954,14.5099,19.0
5,4.979400,4.845583,15.5507,2.4267,11.3867,14.3237,19.0
6,4.979400,4.807345,15.8406,2.4254,11.6878,14.6154,19.0
7,4.882300,4.787161,15.5554,2.4637,11.3401,14.3183,19.0
8,4.833800,4.768018,15.4783,2.4888,11.3364,14.2031,19.0
9,4.833800,4.762086,15.958,2.5662,11.6139,14.6576,19.0
10,4.783800,4.756572,15.7066,2.5654,11.4679,14.4017,19.0


The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: year, full_text, abstract, title, source_id. If year, full_text, abstract, title, source_id are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1272
  Batch size = 16
Saving model checkpoint to t5-small-science-papers-NIPS/checkpoint-500
Configuration saved in t5-small-science-papers-NIPS/checkpoint-500/config.json
Model weights saved in t5-small-science-papers-NIPS/checkpoint-500/pytorch_model.bin
tokenizer config file saved in t5-small-science-papers-NIPS/checkpoint-500/tokenizer_config.json
Special tokens file saved in t5-small-science-papers-NIPS/checkpoint-500/special_tokens_map.json
tokenizer config file saved in t5-small-science-papers-NIPS/tokenizer_config.json
Special tokens file saved in t5-small-science-papers-NIPS/special_tokens_map.json
The fo

In [None]:
# Upload the trained model and its accompanying files to the Hugging Face Hub repository.
trainer.push_to_hub()

Saving model checkpoint to t5-small-science-papers-NIPS
Configuration saved in t5-small-science-papers-NIPS/config.json
Model weights saved in t5-small-science-papers-NIPS/pytorch_model.bin
tokenizer config file saved in t5-small-science-papers-NIPS/tokenizer_config.json
Special tokens file saved in t5-small-science-papers-NIPS/special_tokens_map.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 3.34k/231M [00:00<?, ?B/s]

Upload file runs/Nov28_18-00-24_5ca121f841d6/events.out.tfevents.1669658434.5ca121f841d6.107.0:  30%|###      …

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/Dagar/t5-small-science-papers-NIPS
   5136336..9d1b2b8  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/Dagar/t5-small-science-papers-NIPS
   5136336..9d1b2b8  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Sequence-to-sequence Language Modeling', 'type': 'text2text-generation'}, 'metrics': [{'name': 'Rouge1', 'type': 'rouge', 'value': 15.7066}]}
To https://huggingface.co/Dagar/t5-small-science-papers-NIPS
   9d1b2b8..066d2d0  main -> main

   9d1b2b8..066d2d0  main -> main



'https://huggingface.co/Dagar/t5-small-science-papers-NIPS/commit/9d1b2b85e1e4e548f84423820d170e661fdc9675'