# Model Training

In [None]:
!pip install transformers datasets sacrebleu rouge_score torch accelerate torchvision -q -U

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m49.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m38.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m670.2/670.2 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.1/258.1 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m43.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m58.0 MB/s[0m 

In [None]:
from transformers import pipeline
from datasets import load_dataset
from transformers import AutoTokenizer, BartForConditionalGeneration, DataCollatorForSeq2Seq
from transformers import TrainingArguments, Trainer
from google.colab import drive
import torch
import random

In [None]:
# Use the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cpu'

In [None]:
# Checkpoint name shared by the tokenizer and the model weights.
pre_trained_model = "facebook/bart-large-cnn"

tokenizer = AutoTokenizer.from_pretrained(pre_trained_model)

# Load the weights first, then move the model onto the selected device.
model_bart = BartForConditionalGeneration.from_pretrained(pre_trained_model)
model_bart = model_bart.to(device)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

In [None]:
# Load the "arxiv" configuration of the "scientific_papers" dataset (all splits).
papers_dataset = load_dataset(
    path="scientific_papers",
    name="arxiv",
)

Downloading builder script:   0%|          | 0.00/5.35k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/4.99k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.27k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/3.62G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/880M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/203037 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/6436 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6440 [00:00<?, ? examples/s]

In [None]:
papers_dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'abstract', 'section_names'],
        num_rows: 203037
    })
    validation: Dataset({
        features: ['article', 'abstract', 'section_names'],
        num_rows: 6436
    })
    test: Dataset({
        features: ['article', 'abstract', 'section_names'],
        num_rows: 6440
    })
})

In [None]:
# Column that is dropped from every split before tokenization.
feature_to_remove = "section_names"

# Walk over a snapshot of the split names and replace each split in place.
for split in list(papers_dataset.keys()):
    papers_dataset[split] = papers_dataset[split].remove_columns(feature_to_remove)

In [None]:
papers_dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'abstract'],
        num_rows: 203037
    })
    validation: Dataset({
        features: ['article', 'abstract'],
        num_rows: 6436
    })
    test: Dataset({
        features: ['article', 'abstract'],
        num_rows: 6440
    })
})

In [None]:
def truncate_dataset(dataset, fraction_to_keep, seed=None):
    """Return a random subset of ``dataset`` containing a fraction of its rows.

    Args:
        dataset: Object supporting ``len()`` and ``select(indices)``.
        fraction_to_keep: Share of rows to keep, between 0 and 1 inclusive.
            The row count is rounded down, so a small dataset combined with
            a small fraction can produce an empty subset.
        seed: Optional seed for a private random generator. When given, the
            same dataset size, fraction and seed always pick the same rows.
            When ``None`` (the default) the module-level ``random`` generator
            is used, exactly as before.

    Returns:
        The result of ``dataset.select`` for the sampled indices, in the
        order they were sampled.

    Raises:
        ValueError: If ``fraction_to_keep`` is not within ``[0, 1]``.
    """
    # Fail with a clear message instead of the opaque error random.sample
    # raises for an oversized or negative sample (also rejects NaN).
    if not 0 <= fraction_to_keep <= 1:
        raise ValueError(
            f"fraction_to_keep must be between 0 and 1, got {fraction_to_keep!r}"
        )

    num_samples = len(dataset)
    num_samples_to_keep = int(num_samples * fraction_to_keep)

    # A dedicated generator keeps seeded sampling independent of global RNG state.
    rng = random if seed is None else random.Random(seed)
    indices_to_keep = rng.sample(range(num_samples), num_samples_to_keep)
    return dataset.select(indices_to_keep)

# Keep only a small random share of every split.
# The module-level random generator is seeded first so that re-running the
# notebook from a fresh kernel samples the same rows each time.
SAMPLING_SEED = 42
fraction_to_keep = 0.0005

random.seed(SAMPLING_SEED)
for split in papers_dataset.keys():
    papers_dataset[split] = truncate_dataset(papers_dataset[split], fraction_to_keep)

In [None]:
papers_dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'abstract'],
        num_rows: 101
    })
    validation: Dataset({
        features: ['article', 'abstract'],
        num_rows: 3
    })
    test: Dataset({
        features: ['article', 'abstract'],
        num_rows: 3
    })
})

In [None]:
# Remove records whose article or abstract is an empty string from the
# train and validation splits (the test split is left as it is).
def _has_article_and_abstract(example):
    """Return True when neither the article nor the abstract is an empty string."""
    return example["article"] != "" and example["abstract"] != ""


# One predicate-based filter per split replaces the manual index collection,
# so each row is read once and no index list has to be searched.
for split_name in ("train", "validation"):
    papers_dataset[split_name] = papers_dataset[split_name].filter(_has_article_and_abstract)

print("Records with blank data removed")

Filter:   0%|          | 0/101 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3 [00:00<?, ? examples/s]

Records with blank data removed


In [None]:
papers_dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'abstract'],
        num_rows: 101
    })
    validation: Dataset({
        features: ['article', 'abstract'],
        num_rows: 3
    })
    test: Dataset({
        features: ['article', 'abstract'],
        num_rows: 3
    })
})

In [None]:
def tokenize_function(examples, max_input_length=1024, max_target_length=128):
    """Tokenize a batch of articles (model inputs) and abstracts (labels).

    Sequences are truncated but deliberately NOT padded here. Padding to a
    fixed length would write the tokenizer's pad id into ``labels``, and those
    positions would then be scored by the loss. Leaving the sequences unpadded
    lets the ``DataCollatorForSeq2Seq`` used for batching pad each batch and
    mask the label padding.

    Args:
        examples: Batch mapping with ``"article"`` and ``"abstract"`` lists of
            strings.
        max_input_length: Maximum number of tokens kept per article.
        max_target_length: Maximum number of tokens kept per abstract.

    Returns:
        Dict with ``"input_ids"`` and ``"attention_mask"`` for the articles and
        ``"labels"`` holding the abstract token ids.
    """
    input_encodings = tokenizer(
        examples["article"],
        max_length=max_input_length,
        truncation=True,
    )

    # `text_target` tokenizes the abstracts as targets; it replaces the
    # deprecated `as_target_tokenizer()` context manager.
    target_encodings = tokenizer(
        text_target=examples["abstract"],
        max_length=max_target_length,
        truncation=True,
    )

    return {
        "input_ids": input_encodings["input_ids"],
        "attention_mask": input_encodings["attention_mask"],
        "labels": target_encodings["input_ids"],
    }

# Run the tokenizer over every split, passing whole batches of rows per call.
papers_dataset_processed = papers_dataset.map(
    function=tokenize_function,
    batched=True,
)

Map:   0%|          | 0/101 [00:00<?, ? examples/s]



Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

In [None]:
papers_dataset_processed

DatasetDict({
    train: Dataset({
        features: ['article', 'abstract', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 101
    })
    validation: Dataset({
        features: ['article', 'abstract', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3
    })
    test: Dataset({
        features: ['article', 'abstract', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3
    })
})

In [None]:
# Batch collator for sequence-to-sequence training, tied to the BART tokenizer and model.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model_bart,
)

In [None]:
# Training configuration collected in one mapping, then unpacked into TrainingArguments.
# NOTE(review): the recorded run in this notebook ended at global_step=101, so the
# 1000-step evaluation/checkpoint interval below was never reached in that run.
training_config = {
    "output_dir": "./bart-papers-trained-output",
    "num_train_epochs": 1,
    "per_device_train_batch_size": 1,
    "evaluation_strategy": "steps",
    "eval_steps": 1000,
    "save_steps": 1000,
    "save_total_limit": 2,
    "logging_dir": "./logs",
    "logging_steps": 100,
    "remove_unused_columns": True,  # Set to True to remove extra columns in the dataset
}
training_args = TrainingArguments(**training_config)

# Wire the model, the tokenized train/validation splits and the collator together.
trainer = Trainer(
    args=training_args,
    model=model_bart,
    train_dataset=papers_dataset_processed["train"],
    eval_dataset=papers_dataset_processed["validation"],
    data_collator=data_collator,
)

In [None]:
# Run the fine-tuning loop; the returned training summary is displayed as the cell output.
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=101, training_loss=3.4285319465221744, metrics={'train_runtime': 3664.7568, 'train_samples_per_second': 0.028, 'train_steps_per_second': 0.028, 'total_flos': 218877564813312.0, 'train_loss': 3.4285319465221744, 'epoch': 1.0})

In [None]:
# The Trainer above was created without a tokenizer, so save the tokenizer files
# explicitly; otherwise the exported directory holds only the model weights/config
# and cannot be reloaded on its own.
tokenizer.save_pretrained("./bart-papers-trained-model")

# Write the fine-tuned model into the same directory.
trainer.save_model("./bart-papers-trained-model")

In [None]:
# Archive the saved model directory recursively into /content/model.zip.
# NOTE(review): the model was saved to a relative path, so this absolute path
# assumes the notebook's working directory is /content — confirm when running elsewhere.
!zip -r /content/model.zip /content/bart-papers-trained-model

  adding: content/bart-papers-trained-model/ (stored 0%)
  adding: content/bart-papers-trained-model/generation_config.json (deflated 48%)
  adding: content/bart-papers-trained-model/training_args.bin (deflated 51%)
  adding: content/bart-papers-trained-model/pytorch_model.bin (deflated 7%)
  adding: content/bart-papers-trained-model/config.json (deflated 61%)


In [None]:
# Colab-only helper used to send a file from the runtime to the browser.
from google.colab import files
# Relative path: resolves to the archive created above only when the working
# directory is /content — TODO confirm.
files.download("model.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>