In [None]:
pip install transformers datasets torch

In [21]:
from datasets import load_dataset

# Read every record of the local JSON file into a single Dataset.
dataset = load_dataset("json", data_files="data.json", split='train')

# Split exactly ONCE, with a fixed seed. Calling train_test_split() twice
# shuffles twice, so the "train" part of one call and the "test" part of the
# other can share rows (evaluation leakage) and the split changes every run.
dataset_splits = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset_splits["train"]
eval_dataset = dataset_splits["test"]

In [None]:
from transformers import AutoTokenizer

# Checkpoint identifier; the same name is reused later when the model itself
# is loaded, so the tokenizer and the model always come from one checkpoint.
model_name = "t5-3b"
# Load the pre-trained tokenizer published with that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Preprocessing function to tokenize input and output
def preprocess_function(examples):
    """Tokenize a batch of source/target text pairs for seq2seq training.

    Args:
        examples: A batch mapping with an 'input' entry (source texts) and an
            'output' entry (target texts), as passed by ``Dataset.map`` with
            ``batched=True``.

    Returns:
        The tokenizer output for the source texts, with an extra ``"labels"``
        entry holding the target token ids. Padding positions in the labels
        are replaced by -100 so they are ignored by the loss.
    """
    # Encoder side: truncate/pad every source text to 512 tokens.
    model_inputs = tokenizer(examples['input'], truncation=True, padding="max_length", max_length=512)

    # Decoder side: `text_target` replaces the deprecated
    # `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=examples['output'], truncation=True, padding="max_length", max_length=512)

    # Targets are padded to max_length; without masking, the model would be
    # trained to predict pad tokens. -100 is the index the loss ignores.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

# Tokenize the training split first, then the evaluation split, in batches.
tokenized_train, tokenized_eval = (
    subset.map(preprocess_function, batched=True)
    for subset in (train_dataset, eval_dataset)
)

In [None]:
from transformers import Trainer, TrainingArguments, AutoModelForSeq2SeqLM
import torch
import inspect

# Load the pre-trained sequence-to-sequence weights for `model_name`.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Discover the visible GPUs instead of assuming that exactly four exist.
device = "cuda" if torch.cuda.is_available() else "cpu"
device_ids = list(range(torch.cuda.device_count()))

if device_ids:
    # Only touch the CUDA runtime when at least one GPU is present.
    torch.cuda.set_device(device_ids[0])
    print(f"Using {torch.cuda.device_count()} GPUs!")

# NOTE: the model is intentionally handed to Trainer unwrapped. Trainer does
# its own device placement / multi-GPU handling, and a manual
# torch.nn.DataParallel wrapper hides model methods such as `generate` and
# `gradient_checkpointing_enable` behind `.module`.

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases; use whichever keyword the installed version accepts.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=20,
    weight_decay=0.01,
    logging_dir="./logs",
    fp16=torch.cuda.is_available(),  # Mixed precision only makes sense on a GPU
    gradient_checkpointing=True,  # Enable gradient checkpointing to save memory
    # NOTE(review): the path is passed unconditionally, so ./ds_config.json
    # (and the deepspeed package) presumably have to be present - confirm.
    deepspeed="./ds_config.json",
    **{eval_strategy_key: "epoch"},  # Evaluate at the end of every epoch
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
)

trainer.train()

In [None]:
import torch
# For inference (prediction) with T5 model
model.eval()

# generate() is defined on the underlying model; if `model` is a wrapper that
# keeps the real model in `.module` (e.g. DataParallel), unwrap it first.
generation_model = getattr(model, "module", model)

# Send inputs to the device the weights actually live on, rather than to
# whatever the current CUDA device happens to be.
device = generation_model.device

print("Submit an empty query (or press Ctrl+C) to stop.")
while True:
    # Read one query; a blank line, EOF or an interrupt ends the session.
    try:
        input_text = input("Enter your query: ")
    except (EOFError, KeyboardInterrupt):
        break
    if not input_text.strip():
        break

    # Tokenize the input
    inputs = tokenizer(
        input_text, return_tensors="pt", padding=True, truncation=True, max_length=512,
    ).to(device)

    # Generate output from the model (T5 generates its own decoder_input_ids).
    # Unpacking `inputs` forwards the attention mask together with input_ids.
    outputs = generation_model.generate(
        **inputs, max_length=512,
        num_beams=4, no_repeat_ngram_size=2, early_stopping=True,
    )

    # Decode the output sequence (answer) into human-readable text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print(generated_text)