In [None]:
# Install necessary libraries
!pip install transformers datasets nltk

In [None]:
# Import necessary libraries
# Model, tokenizer and training utilities used by the cells below.
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
# `Dataset` builds the in-memory email dataset.
# NOTE(review): `load_dataset` is not referenced in the visible cells — confirm it is still needed.
from datasets import load_dataset, Dataset
import torch
import nltk
# Downloads the NLTK 'punkt' resource (needs network access).
# NOTE(review): nothing in the visible cells calls nltk — confirm this download is required.
nltk.download('punkt')

In [None]:
# Load the pre-trained T5 model and tokenizer
# Both objects are loaded from the same checkpoint name so they stay in sync.
pretrained_checkpoint = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(pretrained_checkpoint)
model = T5ForConditionalGeneration.from_pretrained(pretrained_checkpoint)

In [None]:
# Example of email dataset
# Replace with your own email dataset (this is just a placeholder)
# Each record is a dict with a single "email_body" key holding the raw text.
_placeholder_bodies = (
    "Thank you for attending the meeting. Please find attached the proposal.",
    "It was a pleasure meeting you. Attached are the necessary documents for the next steps.",
)
emails = [{"email_body": body} for body in _placeholder_bodies]

In [None]:
# Tokenization function
def tokenize_function(examples):
    """Tokenize the ``email_body`` entries of ``examples``.

    Args:
        examples: Mapping with an ``"email_body"`` key holding the text(s)
            to encode (a list of strings when used with ``map(batched=True)``).

    Returns:
        Whatever the module-level ``tokenizer`` returns when called with
        max-length padding and truncation at 512 tokens.
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    email_texts = examples["email_body"]
    return tokenizer(email_texts, **tokenizer_options)

In [None]:
# Create dataset
# Collect the raw texts into one column, then wrap that column in a Dataset.
email_bodies = [record["email_body"] for record in emails]
email_dataset = Dataset.from_dict({"email_body": email_bodies})

In [None]:
# Tokenize the dataset
# batched=True hands tokenize_function a batch of rows per call instead of one row.
tokenized_emails = email_dataset.map(
    function=tokenize_function,
    batched=True,
)

In [None]:
# Prepare the dataset for T5 training by formatting it properly
def process_data_to_model_inputs(batch):
    """Add the encoder inputs and training labels T5 needs to ``batch``.

    The original version produced only ``input_ids``; without ``labels`` the
    model cannot return a loss and training fails. This version also keeps
    the attention mask so padding positions can be ignored by the encoder.

    The data only contains an ``email_body`` column (no separate target
    text), so the email body itself is used as the target sequence.
    NOTE(review): for real prompt -> email fine-tuning, tokenize a dedicated
    target column into ``labels`` instead.

    Args:
        batch: Mapping with an ``"email_body"`` key holding a list of strings.

    Returns:
        The same ``batch`` object with ``input_ids``, ``attention_mask`` and
        ``labels`` set. In ``labels`` every padding token id is replaced by
        -100, the value the loss ignores.
    """
    encoded = tokenizer(batch["email_body"], padding="max_length", truncation=True, max_length=512)
    batch["input_ids"] = encoded["input_ids"]
    batch["attention_mask"] = encoded["attention_mask"]

    # Mask padding in the targets so it does not contribute to the loss.
    pad_token_id = tokenizer.pad_token_id
    batch["labels"] = [
        [-100 if token_id == pad_token_id else token_id for token_id in sequence]
        for sequence in encoded["input_ids"]
    ]
    return batch

In [None]:
# Preprocess the dataset
# Apply the model-input formatting to the tokenized emails, one batch of rows per call.
tokenized_emails = tokenized_emails.map(
    function=process_data_to_model_inputs,
    batched=True,
)

In [None]:
# Set format for PyTorch
# Expose every model input that exists in the dataset, not just input_ids:
# hiding attention_mask makes the model attend to padding, and hiding labels
# leaves the Trainer with nothing to compute a loss from.
model_columns = [
    column_name
    for column_name in ("input_ids", "attention_mask", "labels")
    if column_name in tokenized_emails.column_names
]
tokenized_emails.set_format(type="torch", columns=model_columns)

In [None]:
# Define training arguments
# No evaluation strategy is configured: the Trainer in this notebook is built
# with a train_dataset only, and requesting per-epoch evaluation without an
# eval_dataset makes the Trainer raise. Add an eval_dataset to the Trainer
# before enabling evaluation here.
training_args = TrainingArguments(
    output_dir="./fine_tuned_t5",          # Output directory to save the model
    per_device_train_batch_size=4,         # Batch size per device during training
    num_train_epochs=3,                    # Number of training epochs
    save_steps=500,                        # How often to save the model
    save_total_limit=2,                    # Limit the number of saved checkpoints
    logging_dir="./logs",                  # Directory to save logs
    logging_steps=100,                     # Log training metrics every 100 steps
)

In [None]:
# Initialize the Trainer
# Gather the constructor arguments first so each one is easy to inspect or swap.
trainer_kwargs = dict(
    model=model,                     # the T5 model instance loaded above
    args=training_args,              # the TrainingArguments defined above
    train_dataset=tokenized_emails,  # the tokenized email dataset
)
trainer = Trainer(**trainer_kwargs)

In [None]:
# Fine-tune the model
# Keep the value returned by train() and leave it as the cell's last expression
# so the notebook still displays it.
training_output = trainer.train()
training_output

In [None]:
# Save the fine-tuned model and tokenizer
# Both are written to the same directory so they can be reloaded together.
fine_tuned_dir = "./fine_tuned_t5"
model.save_pretrained(fine_tuned_dir)
tokenizer.save_pretrained(fine_tuned_dir)

In [None]:
# Example of generating a new email template using the fine-tuned T5 model
def generate_email_t5(prompt, max_length=100, num_beams=4):
    """Generate text for ``prompt`` with the module-level model and tokenizer.

    Args:
        prompt: Input text to feed to the model.
        max_length: Maximum length passed to ``model.generate`` (default 100).
        num_beams: Beam-search width passed to ``model.generate`` (default 4).

    Returns:
        The first generated sequence decoded to a string, special tokens removed.
    """
    # Generation should run in eval mode; after training the model may still
    # be in training mode, where dropout is active.
    model.eval()

    encoded = tokenizer(prompt, return_tensors="pt")

    # The Trainer may have moved the model to an accelerator, while the
    # tokenizer returns CPU tensors; put the inputs on the model's device.
    device = model.device
    input_ids = encoded.input_ids.to(device)
    attention_mask = encoded.attention_mask.to(device)

    generated_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=True,
    )
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

In [None]:
# Generate a new email template with the fine-tuned T5 model
# The prompt is plain text; the helper handles tokenization and decoding.
prompt = "Generate a follow-up email after a meeting regarding a project proposal."
generated_email = generate_email_t5(prompt=prompt)

# print() joins the header and the email with its default single-space separator.
output_header = "T5 Generated Email:\n"
print(output_header, generated_email)