In [None]:
# Installations
!pip install transformers
!pip install datasets
!pip install openpyxl

In [None]:
# Imports
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import numpy as np
import pandas as pd
import openpyxl
import csv
from csv import reader

In [None]:
# Loading the dataset
dataset_file = 'dataset.csv'
split_seed = 42  # fixed so the train/validation split is identical on every run

# With split='train' the CSV is returned as a single Dataset holding every row.
dataset = load_dataset('csv', data_files=dataset_file, split='train')

# Hold out 10% of the rows for validation. The split shuffles the rows, so an
# explicit seed is required for the held-out set (and therefore every reported
# metric) to be reproducible after a kernel restart.
dataset = dataset.train_test_split(test_size=0.1, seed=split_seed)
train_dataset = dataset['train']
val_dataset = dataset['test']

In [None]:
# Tokenization
tokenizer = AutoTokenizer.from_pretrained('t5-base')


def tokenize(batch):
    """Encode one batch of rows into model features.

    Reads the 'source' and 'target' columns of `batch`. Both are padded or
    truncated to exactly 900 tokens. The tokenizer output for the sources is
    returned with the target token ids attached under the 'labels' key.
    """
    encoding_options = dict(padding='max_length', truncation=True, max_length=900)
    features = tokenizer(batch['source'], **encoding_options)
    target_ids = tokenizer(batch['target'], **encoding_options)['input_ids']
    features['labels'] = target_ids
    return features


# Encode both splits; the validation split is processed as one single batch.
train_dataset = train_dataset.map(
    tokenize,
    batched=True,
    batch_size=512,
)
val_dataset = val_dataset.map(
    tokenize,
    batched=True,
    batch_size=len(val_dataset),
)

In [None]:
# Training
model = T5ForConditionalGeneration.from_pretrained('t5-base')
output_dir = 'output'


def mask_label_padding(batch):
    """Return the batch's 'labels' with every padding id replaced by -100.

    The tokenization step pads each target to a fixed length, so most label
    positions hold the pad id. -100 is the index ignored by the model's
    cross-entropy loss, so masked positions stop contributing to the loss.
    Only the 'labels' column is returned; `Dataset.map` overwrites that
    column and keeps the others. Applying the function twice is harmless.
    """
    pad_id = tokenizer.pad_token_id
    return {
        'labels': [
            [-100 if token_id == pad_id else token_id for token_id in label_ids]
            for label_ids in batch['labels']
        ]
    }


# Loss-ready copies. The notebook-level train_dataset / val_dataset are left
# untouched so later cells can still decode their labels directly.
train_dataset_for_loss = train_dataset.map(mask_label_padding, batched=True)
val_dataset_for_loss = val_dataset.map(mask_label_padding, batched=True)

# Newer transformers releases name this argument `eval_strategy`; older ones
# only know `evaluation_strategy`. Use whichever the installed version has.
eval_strategy_arg = (
    'eval_strategy'
    if 'eval_strategy' in TrainingArguments.__dataclass_fields__
    else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    eval_accumulation_steps=1,
    learning_rate=0.001,
    remove_unused_columns=True,
    run_name='run_name',
    logging_steps=1000,
    eval_steps=1000,
    adam_beta1=0.6,
    adam_beta2=0.6,
    adam_epsilon=1.3e-8,
    **{eval_strategy_arg: 'steps'},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_for_loss,
    eval_dataset=val_dataset_for_loss
)

trainer.train()
# Persist the fine-tuned weights where the evaluation and generation cells
# expect to find them ('output/model').
trainer.save_model(output_dir + '/model')

In [None]:
# Evaluation
model_dir = 'output/model'
output_dir = 'output'
model = T5ForConditionalGeneration.from_pretrained(model_dir)


def compute_metrics(p):
    """Token-level accuracy and macro precision/recall/F1 over the whole eval set.

    Args:
        p: (predictions, label_ids) pair as handed over by `Trainer`.
            `predictions` is either the logits array or a tuple whose first
            element is the logits; `label_ids` holds the target token ids.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
        All values are 0.0 when no target token is present.
    """
    pred, labels = p
    # The logits are the first element when predictions arrive as a tuple.
    logits = pred[0] if isinstance(pred, (tuple, list)) else pred
    pred_ids = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    # Score every example, not just the first one, and only at positions that
    # hold a real target token: padding (pad id or the -100 ignore index)
    # would otherwise dominate the scores.
    is_target = (labels != tokenizer.pad_token_id) & (labels != -100)
    y_true = labels[is_target]
    y_pred = pred_ids[is_target]

    if y_true.size == 0:
        return {"accuracy": 0.0, "precision": 0.0, "recall": 0.0, "f1": 0.0}

    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    recall = recall_score(y_true=y_true, y_pred=y_pred, average='macro')
    precision = precision_score(y_true=y_true, y_pred=y_pred, average='macro')
    f1 = f1_score(y_true=y_true, y_pred=y_pred, average='macro')

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

pred_args = TrainingArguments(
    output_dir=output_dir,
    per_device_eval_batch_size=1,
    remove_unused_columns=True,
    eval_accumulation_steps=1
)

trainer = Trainer(model=model, args=pred_args, compute_metrics=compute_metrics)

# predict() returns (predictions, label_ids, metrics); preds[0] holds the
# logits, so the argmax over the last axis gives one token id per position.
preds, labels, metrics = trainer.predict(val_dataset)
preds_tokens = preds[0].argmax(axis=2)
print(metrics)

# Only positions that carry a real target token are meaningful. Everything
# else (pad id, or a negative ignore index that decode() cannot handle) is
# mapped to the pad id in both labels and predictions before decoding.
pad_id = tokenizer.pad_token_id
labels_array = np.asarray(labels)
is_target = (labels_array != pad_id) & (labels_array >= 0)
label_tokens = np.where(is_target, labels_array, pad_id)
target_pred_tokens = np.where(is_target, preds_tokens, pad_id)

# skip_special_tokens=True drops the padding/end-of-sequence markers from the
# exported text, matching how the beam-search cell decodes its generations.
decoded_sources = [
    tokenizer.decode(row['input_ids'], skip_special_tokens=True)
    for row in val_dataset
]
decoded_preds = [
    tokenizer.decode(pred, skip_special_tokens=True)
    for pred in target_pred_tokens
]
decoded_labels = [
    tokenizer.decode(label, skip_special_tokens=True)
    for label in label_tokens
]

# One spreadsheet row per validation example: input, reference, prediction.
output = pd.DataFrame({'Source Text': decoded_sources, 'Target Text': decoded_labels, 'Generated Text': decoded_preds})
output.to_excel(output_dir + "/predictions.xlsx")


In [None]:
# Generating with beam search
model_dir = 'output/model'
model = T5ForConditionalGeneration.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained('t5-base')

# Collects one [source, target, generated] triple per usable data row.
dev_outputs = []
# newline='' is how the csv module expects files to be opened, and the
# explicit UTF-8 encoding matches the one used when dev_outputs.csv is written.
with open('dev_dataset.csv', 'r', encoding='utf-8', newline='') as read_obj:
    csv_reader = reader(read_obj)
    # A default of None makes an empty file skip the loop instead of raising
    # StopIteration; the first row is treated as a header and discarded.
    header = next(csv_reader, None)
    if header is not None:
        for row in csv_reader:
            # Blank or truncated lines have fewer than two columns; skip them
            # rather than failing with an IndexError mid-run.
            if len(row) < 2:
                continue
            source = row[0]
            target = row[1]
            # A single sequence needs no padding. The attention mask is passed
            # explicitly so generate() never has to guess which positions are
            # real tokens.
            encoded = tokenizer(source, return_tensors="pt", truncation=True, max_length=900)
            input_ids = encoded.input_ids
            output = model.generate(
                input_ids=input_ids,
                attention_mask=encoded.attention_mask,
                num_beams=5,
                num_return_sequences=1,
                max_length=900,
            )
            generated = tokenizer.batch_decode(output, skip_special_tokens=True)
            dev_outputs.append([source, target, generated[0]])

In [None]:
# Persist the dev-set generations as a three-column CSV: header row first,
# then one row per [source, target, generated] triple in dev_outputs.
header = ['source', 'target', 'generated']
data = [[src, tgt, gen] for src, tgt, gen in dev_outputs]
with open(r'dev_outputs.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerows([header] + data)