In [1]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import Dataset
import numpy as np

In [2]:
# Prepare example data
examples = [
    "Find all users who are 25 years old",
    "Find products with price greater than $100",
    "Find active users from New York with age between 20 and 30"
]

responses = [
    'db.users.find({\n    age: 25\n})',
    'db.products.find({\n    price: { $gt: 100 }\n})',
    'db.users.find({\n    status: "active",\n    location: "New York",\n    age: {\n        $gte: 20,\n        $lte: 30\n    }\n})'
]

In [3]:
# Load tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [5]:
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

In [6]:
# Create dataset
def create_prompt(example):
    return f"Generate NoSQL query for: {example}"

dataset = Dataset.from_dict({
    'prompt': [create_prompt(ex) for ex in examples],
    'response': responses
})

# Tokenization function
def tokenize_function(examples):
    prompts = examples['prompt']
    responses = examples['response']
    
    # Combine prompt and response
    combined = [f"{prompt}\n{response}" for prompt, response in zip(prompts, responses)]
    
    return tokenizer(
        combined,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )

# Tokenize dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Training arguments
training_args = TrainingArguments(
    output_dir="llama-nosql-trainer",
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    warmup_steps=100,
    logging_steps=10,
    save_steps=100,
    fp16=True
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

# Start training
trainer.train()

# Save model
model.save_pretrained("model-finetuned")
tokenizer.save_pretrained("tokenizer-finetuned")

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


ValueError: You have to specify either decoder_input_ids or decoder_inputs_embeds