In [None]:
# Installations
!pip install transformers
!pip install datasets
!pip install openpyxl

In [None]:
# Imports
from datasets import load_dataset
from transformers import T5Tokenizer
from transformers import T5ForConditionalGeneration, T5Config, Trainer, TrainingArguments, Seq2SeqTrainingArguments, Seq2SeqTrainer
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import numpy as np
import pandas as pd
import openpyxl
from transformers import AutoTokenizer
from csv import reader

In [None]:
# Loading the dataset
dataset_file = 'dataset.csv'

# Fixed seed so the 90/10 train/validation partition is identical on every run.
# Without it, re-running this cell (e.g. after a kernel restart, before the
# evaluation cell) reshuffles the rows and the saved model could be scored on
# examples it was trained on.
SPLIT_SEED = 42

# The CSV is read as a single 'train' split and then partitioned here.
dataset = load_dataset('csv', data_files=dataset_file, split='train')
dataset = dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)
train_dataset = dataset['train']
val_dataset = dataset['test']

In [None]:
# Tokenization
# Vocabulary / SentencePiece model of the public 't5-base' checkpoint.
tokenizer = T5Tokenizer.from_pretrained(pretrained_model_name_or_path='t5-base')

def tokenize(batch):
    """Encode a batch of source/target text pairs for seq2seq training.

    Args:
        batch: mapping with a 'source' and a 'target' entry, each a list of
            strings (the form `Dataset.map(..., batched=True)` passes in).

    Returns:
        The tokenizer output for the sources, with an extra 'labels' entry
        holding the token ids of the targets. Both sides are truncated and
        padded to exactly 900 tokens.
    """
    encode_options = dict(padding='max_length', truncation=True, max_length=900)
    encoded = tokenizer(batch['source'], **encode_options)
    # NOTE(review): padded label positions keep the pad token id rather than an
    # ignore index — confirm that scoring the padding is intended.
    encoded['labels'] = tokenizer(batch['target'], **encode_options)['input_ids']
    return encoded

# Tokenize both splits in a single pass over the DatasetDict and take the
# train/validation views from the result. Mapping `dataset` and then mapping
# each split again would tokenize every row twice for the same output.
dataset = dataset.map(tokenize, batched=True, batch_size=512)
train_dataset = dataset['train']
val_dataset = dataset['test']

In [None]:
# Training
# The model is built from a default T5Config; the only explicit setting is the
# decoder start token, which is the tokenizer's '<pad>' id.
# NOTE(review): no pretrained checkpoint is loaded for the model (only for the
# tokenizer) — confirm that training from freshly initialised weights is intended.
output_dir = 'output'
config = T5Config(decoder_start_token_id=tokenizer.convert_tokens_to_ids('<pad>'))
model = T5ForConditionalGeneration(config)

training_args = TrainingArguments(
    run_name='run_name',
    output_dir=output_dir,
    remove_unused_columns=True,
    # training schedule
    num_train_epochs=15,
    per_device_train_batch_size=1,
    # evaluate and log every 1000 steps
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy='steps',
    eval_steps=1000,
    logging_steps=1000,
    per_device_eval_batch_size=1,
    eval_accumulation_steps=1,
)

trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)
trainer.train()

# Final weights go to output/model, where the evaluation cells load them from.
trainer.save_model(f'{output_dir}/model')

In [None]:
# Evaluation
# Reload the weights from output/model, the directory the training cell saves to.
output_dir = 'output'
model_dir = f'{output_dir}/model'
model = T5ForConditionalGeneration.from_pretrained(model_dir)

def compute_metrics(p):
    """Token-level accuracy / precision / recall / F1 over the whole evaluation set.

    Every (example, position) pair is treated as one classification sample whose
    class is the token id. Earlier revisions indexed `labels[0]` / `pred[0]` and
    therefore scored only the first example; all examples are now flattened
    together.

    Args:
        p: pair `(predictions, label_ids)` as handed over by `Trainer`.
            `predictions` is either the logits array or a tuple/list whose first
            element is the logits array, with the vocabulary on the last axis;
            `label_ids` holds the target token ids for the same positions.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1";
        precision, recall and F1 are macro-averaged over token ids.
    """
    pred, labels = p
    logits = pred[0] if isinstance(pred, (tuple, list)) else pred

    # Most likely token at every position, flattened across all examples.
    y_pred = np.argmax(logits, axis=-1).reshape(-1)
    y_true = np.asarray(labels).reshape(-1)

    # Positions labelled -100 (the usual "ignore" marker) carry no target token.
    # Padding encoded with the pad token id is still counted.
    keep = y_true != -100
    y_true, y_pred = y_true[keep], y_pred[keep]

    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    recall = recall_score(y_true=y_true, y_pred=y_pred, average='macro')
    precision = precision_score(y_true=y_true, y_pred=y_pred, average='macro')
    f1 = f1_score(y_true=y_true, y_pred=y_pred, average='macro')
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

pred_args = TrainingArguments(
    output_dir=output_dir,
    per_device_eval_batch_size=1,
    remove_unused_columns=True,
    eval_accumulation_steps=1
)

trainer = Trainer(model=model, args=pred_args, compute_metrics=compute_metrics)

preds, labels, metrics = trainer.predict(val_dataset)
# preds[0] holds the logits; the arg-max over the vocabulary axis gives one
# predicted token id per position.
# NOTE(review): these come from a forward pass with labels supplied, not from
# generate() — confirm that is what "Generated Text" should contain.
preds_tokens = preds[0].argmax(axis=2)
print(metrics)

# Guard for decoding: -100 is not a vocabulary id, so map it to the pad token.
label_tokens = np.where(labels != -100, labels, tokenizer.pad_token_id)

# skip_special_tokens drops '<pad>' / '</s>'; every sequence is padded to a fixed
# length during tokenization, so without it each CSV cell is mostly padding.
decoded_sources = [tokenizer.decode(row['input_ids'], skip_special_tokens=True) for row in val_dataset]
decoded_preds = [tokenizer.decode(pred, skip_special_tokens=True) for pred in preds_tokens]
decoded_labels = [tokenizer.decode(label, skip_special_tokens=True) for label in label_tokens]

output = pd.DataFrame({'Source Text': decoded_sources, 'Target Text': decoded_labels, 'Generated Text': decoded_preds})
output.to_csv(output_dir + "/predictions.csv")

In [None]:
# Tokenizer for the dev-set cells (same 't5-base' vocabulary as for training).
tokenizer = T5Tokenizer.from_pretrained(pretrained_model_name_or_path='t5-base')

# Read the dev CSV; a lone CSV file is exposed by `load_dataset` as the 'train' split.
dataset_file = 'dev_dataset.csv'
dev_dataset = load_dataset(path='csv', data_files=dataset_file, split='train')

def tokenize(batch):
    """Encode a batch of source/target text pairs.

    Args:
        batch: mapping with a 'source' and a 'target' entry, each a list of
            strings (the form `Dataset.map(..., batched=True)` passes in).

    Returns:
        The tokenizer output for the sources, with an extra 'labels' entry
        holding the token ids of the targets. Both sides are truncated and
        padded to exactly 900 tokens.
    """
    encode_options = dict(padding='max_length', truncation=True, max_length=900)
    encoded = tokenizer(batch['source'], **encode_options)
    encoded['labels'] = tokenizer(batch['target'], **encode_options)['input_ids']
    return encoded

# Encode the dev set; batch_size equals the dataset length, so `tokenize`
# receives every row in a single call.
dev_dataset = dev_dataset.map(
    tokenize,
    batch_size=len(dev_dataset),
    batched=True,
)

In [None]:
#Predict with beam search
model_dir = 'output/model'
output_dir = 'output'
model = T5ForConditionalGeneration.from_pretrained(model_dir)

pred_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    per_device_eval_batch_size=1,
    remove_unused_columns=True,
    eval_accumulation_steps=1,
    # Makes predict() return generated token ids instead of logits.
    predict_with_generate=True
)

trainer = Seq2SeqTrainer(model=model, args=pred_args, tokenizer=AutoTokenizer.from_pretrained('t5-base'))

# Beam search with 3 beams, generating at most 900 tokens per example.
preds, labels, metrics = trainer.predict(dev_dataset, num_beams=3, max_length=900)

# The trainer can use -100 as filler when it joins id arrays of different
# lengths; -100 is not a vocabulary id and cannot be decoded, so map it to the
# pad token first.
pad_id = tokenizer.pad_token_id
preds_tokens = np.where(preds != -100, preds, pad_id)
label_tokens = np.where(labels != -100, labels, pad_id)

# skip_special_tokens drops '<pad>' / '</s>' so the CSV holds the text only
# rather than long runs of padding.
decoded_sources = [tokenizer.decode(row['input_ids'], skip_special_tokens=True) for row in dev_dataset]
decoded_preds = [tokenizer.decode(pred, skip_special_tokens=True) for pred in preds_tokens]
decoded_labels = [tokenizer.decode(label, skip_special_tokens=True) for label in label_tokens]

output = pd.DataFrame({'Source Text': decoded_sources, 'Target Text': decoded_labels, 'Generated Text': decoded_preds})
output.to_csv(output_dir + "/beam_predictions.csv")