# Iterative Model Trainer



## Training Progress

100 Iterations Completed

In [None]:
!pip install transformers datasets sacrebleu rouge_score torch accelerate torchvision -q -U

In [None]:
from transformers import pipeline
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer, BartForConditionalGeneration, DataCollatorForSeq2Seq
from transformers import TrainingArguments, Trainer
from google.colab import drive
import torch
import random
import pandas as pd

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Bare expression so the chosen device is shown as the cell output.
device

In [None]:
# Mount Google Drive so that the ./drive/MyDrive/... paths used by this notebook
# resolve on a fresh runtime. Calling mount() again when the drive is already
# mounted is harmless.
# NOTE(review): the relative "./drive" paths assume the working directory is
# /content (the Colab default) — confirm if the runtime is configured differently.
drive.mount("/content/drive")

# First run only: start from the public pretrained checkpoint.
# model_path = "facebook/bart-large-cnn"

# Every later run: resume from the checkpoint saved by the previous iteration.
model_path = "./drive/MyDrive/Submission/BE Project Group No 31/Model/iter_trained_model"

# The tokenizer is always taken from the base checkpoint name; only the model
# weights come from `model_path`.
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = BartForConditionalGeneration.from_pretrained(model_path).to(device)

In [None]:
# Number of the first data file loaded in this run; files `i` and `i + 1` are read.
# NOTE(review): presumably advanced by hand between runs — confirm.
i = 101

train_frames = []
val_frames = []

# `current_data_iteration` is kept as a plain loop variable on purpose: the
# final cell of the notebook reads it after the loop has finished.
for current_data_iteration in range(i, i + 2):
    train_dataset_path = f"./drive/MyDrive/Submission/BE Project Group No 31/Data/training_data{current_data_iteration}.csv"
    val_dataset_path = f"./drive/MyDrive/Submission/BE Project Group No 31/Data/val_data{current_data_iteration}.csv"
    train_frames.append(pd.read_csv(train_dataset_path))
    val_frames.append(pd.read_csv(val_dataset_path))

# Concatenate once instead of growing a DataFrame inside the loop.
# ignore_index=True rebuilds a clean 0..n-1 index; without it the per-file
# indexes repeat and Dataset.from_pandas stores them as an extra
# `__index_level_0__` column.
train_df = pd.concat(train_frames, ignore_index=True)
val_df = pd.concat(val_frames, ignore_index=True)

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
papers_dataset = DatasetDict({"train": train_dataset, "validation": val_dataset})

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of article/abstract pairs for seq2seq fine-tuning.

    Args:
        examples: A batch of rows exposing the "article" and "abstract"
            columns (as passed by ``map(..., batched=True)``).

    Returns:
        A dict with "input_ids" and "attention_mask" for the articles
        (truncated to 1024 tokens) and "labels" holding the abstract token ids
        (truncated to 128 tokens).

    Sequences are deliberately left unpadded. Padding is done per batch by
    DataCollatorForSeq2Seq, which pads labels with -100 so that padded
    positions are ignored by the loss. Padding the labels here with the
    tokenizer's pad token would make the model train on padding.
    """
    input_encodings = tokenizer(examples["article"], max_length=1024, truncation=True)

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
    target_encodings = tokenizer(text_target=examples["abstract"], max_length=128, truncation=True)

    return {
        "input_ids": input_encodings["input_ids"],
        "attention_mask": input_encodings["attention_mask"],
        "labels": target_encodings["input_ids"],
    }

# Tokenize both splits; batched=True passes tokenize_function a batch of rows per call.
papers_dataset_processed = papers_dataset.map(
    tokenize_function,
    batched=True,
)

In [None]:
# Bare expression: shows the tokenized dataset object as the cell output for inspection.
papers_dataset_processed

In [None]:
# Collator used by the Trainer to assemble seq2seq batches from individual examples.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
)

In [None]:
import inspect

# Newer transformers releases renamed the `evaluation_strategy` argument to
# `eval_strategy`. This notebook installs with `-U`, so choose whichever
# spelling the installed version accepts instead of hard-coding one.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./bart-papers-trained-output",
    per_device_train_batch_size=1,
    save_total_limit=2,           # cap on the number of checkpoints kept in output_dir
    num_train_epochs=2,
    eval_steps=250,               # evaluate every 250 steps
    save_steps=1000,              # checkpoint every 1000 steps
    logging_dir="./logs",
    logging_steps=100,
    remove_unused_columns=True,   # drop dataset columns the model's forward() does not accept
    **{eval_strategy_key: "steps"},
)

# Pick the two tokenized splits out of the DatasetDict up front.
train_split = papers_dataset_processed["train"]
validation_split = papers_dataset_processed["validation"]

# Tie together the model, the training configuration, the batch collator and the data.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_split,
    eval_dataset=validation_split,
)

In [None]:
# Run fine-tuning with the Trainer built in the previous cell. As the last
# expression in the cell, the value returned by train() is shown as the output.
trainer.train()

Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


In [None]:
# Write the fine-tuned model to the Drive folder that the model-loading cell
# reads for the next iteration.
trained_model_dir = "./drive/MyDrive/Submission/BE Project Group No 31/Model/iter_trained_model"
trainer.save_model(trained_model_dir)

In [None]:
# Report the number of the last data file processed by the loading loop.
completion_message = f"{current_data_iteration}th Iteration Training Completed"
print(completion_message)