# Notebook for fine-tuning a T5 small model that converts natural language into MongoDB queries


## **1. Setup**
Install HuggingFace transformers and dataset libs



In [None]:
# Quietly (-q) install the Hugging Face libraries and helpers; fsspec is pinned to an exact version.
!pip install -q transformers datasets sentencepiece accelerate huggingface_hub fsspec==2025.3.2

## **2. Hugging Face Login**
This step requires an access token; for more details see https://huggingface.co/docs/hub/en/security-tokens


In [None]:
from huggingface_hub import login
from google.colab import userdata

print("Log in to Hugging Face Hub to upload the model (required to upload model during training)")

# Read the access token from the Colab secret named HF_TOKEN.
hf_token = userdata.get('HF_TOKEN')
# notebook_login() has no token parameter (its arguments are new_session /
# write_permission), so a token passed to it is never used to authenticate.
# login(token=...) authenticates with the stored token directly.
login(token=hf_token)

## **3. Imports**



In [None]:
import torch
from datasets import Dataset, DatasetDict
from transformers import (
    T5ForConditionalGeneration,
    T5TokenizerFast,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
import json

# Report the installed torch version together with the CUDA version torch exposes.
print(f"Using torch version: {torch.__version__} and cuda version {torch.version.cuda}")

## **4. Set up the device-agnostic code**

In [None]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
    !nvidia-smi
else:
    device = torch.device("cpu")
    print("GPU not available, using CPU.")

## **5. Mount Google Drive**

In [None]:
from google.colab import drive

# Project folder on Drive and the two files read from it by later cells.
base_dir = '/content/drive/MyDrive/NL2MQL'
training_data_json = 'nlq2mlq_find_op_data_1138_linearized.json'
database_schema_txt = 'database_schema.txt'

# Mount (or re-mount) Drive so the paths above become readable.
drive.mount('/content/drive', force_remount=True)

## **6. Define path and load training data**
The training data contains pairs of inputs (nlq) and targets (mlq). The tokenizer has trouble with the nested structure of MongoDB queries (curly brackets {}, $, escaped quotes): during inference the model generated the <unk> token instead of curly brackets. Therefore the targets have to be written in a linear form.
E.g.

`{ "c": "users", "op": "find", "q": { "country": "Canada" } }`

is translated to

`c=users; op=find; q=LCB country=Canada RCB`

In [None]:
# Build the path to the training JSON and load it into `training_data`.
json_file_path = f"{base_dir}/{training_data_json}"
print(f"Loading the training data from {json_file_path}")

# Decode explicitly as UTF-8 so the result does not depend on the runtime's
# locale default encoding.
with open(json_file_path, 'r', encoding='utf-8') as f:
    training_data = json.load(f)
print(f"Successfully loaded {len(training_data)} examples from {json_file_path}")
print(f"Let's check first two examples: {training_data[:2]}")

## **7. Define and load DB schema**

In [None]:
# Build the path to the schema text file and read it whole into `database_schema`.
database_schema_path = f"{base_dir}/{database_schema_txt}"
print(f"Loading database schema from {database_schema_path}")

# Decode explicitly as UTF-8 so the result does not depend on the runtime's
# locale default encoding.
with open(database_schema_path, 'r', encoding='utf-8') as f:
    database_schema = f.read()
print(f"Successfully loaded database schema from {database_schema_path}")
print(f"Let's check first 10 lines: {database_schema.strip().splitlines()[:10]}")

## **8. Prepare data for T5**



In [None]:
# Task prefix prepended to every model input.
prefix = "translate Natural Language to MongoDB Query: "

# The stripped schema is identical for every example, so compute it once.
schema_text = database_schema.strip()

# Each input is prefix + schema + the example's natural-language query;
# each target is the example's (already linearized) MongoDB query.
inputs = [
    f"{prefix}Schema: {schema_text} NLQ: {example['nlq'].strip()}"
    for example in training_data
]
targets = [example['mlq'].strip() for example in training_data]
print(f"Training data consists of {len(inputs)} inputs and {len(targets)} targets")

## **9. Preprocess data to be compatible with HF dataset**




In [None]:
# Column-oriented mapping: one list per column, aligned by index.
data_dict = dict(input_text=inputs, target_text=targets)
raw_dataset = Dataset.from_dict(data_dict)
print(f"The dataset has {raw_dataset.num_rows} examples")

## **10. Split data into training - validation data 90% - 10%**


In [None]:
# Hold out 10% of the examples for validation. The seed is fixed so the split
# is reproducible between runs; drop the `seed` argument for a random split.
train_test_split = raw_dataset.train_test_split(test_size=0.1, seed=42)
dataset_dict = DatasetDict({
    'train': train_test_split['train'],
    'validation': train_test_split['test']
})
print(f"Training data has {len(dataset_dict['train'])} data examples and validation data has {len(dataset_dict['validation'])}")
print("\nPrepared input data one example:")
print(dataset_dict['train'][0]['input_text'])
# The original string began with "\P", an invalid escape that printed a
# literal backslash; a newline was intended, matching the other headings.
print("\nPrepared target data one example:")
print(dataset_dict['train'][0]['target_text'])
print("\nPrepared dataset:")
print(dataset_dict)

## **11. Setup tokenization**

In [None]:
from transformers import T5TokenizerFast

# Token budgets applied when tokenizing.
max_input_length = 1024  # Input = DB schema + NLQ
max_target_length = 256  # Target = linearized MLQ

# Load the fast tokenizer that belongs to the base checkpoint.
model_checkpoint = "t5-small"
tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint)
print(f"Tokenizer for '{model_checkpoint}' loaded successfully.")

def tokenize_function(samples):
    """Tokenize a batch of samples into model inputs and loss-ready labels.

    Args:
        samples: Batch mapping with "input_text" and "target_text" entries,
            each a list of strings.

    Returns:
        The tokenizer output for the inputs (truncated/padded to
        ``max_input_length``) with an added "labels" entry: the target token
        ids truncated/padded to ``max_target_length``, where every padding
        position is set to -100.
    """
    model_inputs = tokenizer(
        samples["input_text"],
        max_length=max_input_length,
        truncation=True,
        padding="max_length"
    )

    labels = tokenizer(
        text_target=samples["target_text"],
        max_length=max_target_length,
        truncation=True,
        padding="max_length"
    )

    # Labels are padded here, so the pad ids must be replaced with -100 (the
    # index the loss ignores); otherwise the model is also trained and scored
    # on predicting padding after the end of each target.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]

    return model_inputs


## **12. Apply Tokenization**



In [None]:
# Tokenize every split in batches and drop the raw text columns afterwards.
text_columns = ["input_text", "target_text"]
tokenized_datasets = dataset_dict.map(tokenize_function, batched=True, remove_columns=text_columns)
print("Tokenization complete.")

print("\nTokenized dataset structure:")
print(tokenized_datasets)

# Slice a single training row to list the columns the tokenizer produced.
print("\nKeys in one tokenized training example:")
first_train_rows = tokenized_datasets["train"][:1]
print(first_train_rows.keys())

## **13. Initialize T5 model and Seq2Seq trainer**
https://huggingface.co/docs/transformers/en/model_doc/t5?usage=AutoModel#t5

In [None]:
import datetime
from google.colab import userdata

# Load the pretrained base model and place it on the selected device so the
# message below is accurate.
model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
model.to(device)
print(f"\nModel '{model_checkpoint}' loaded successfully and moved to {device}.")

# Derive the Hub repo id (from the HF_USERNAME Colab secret) and a
# timestamped local output directory from the base checkpoint name.
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
hf_username = userdata.get('HF_USERNAME')
base_model_name = model_checkpoint.split('/')[-1]
hub_model_id = f"{hf_username}/{hf_username}-nl-query-2-mongo-query-{base_model_name}"
print(f"Model will be pushed to Hugging Face Hub repo: {hub_model_id}")
output_directory = f"{base_model_name}-nl2mongo-local-{timestamp}"

training_args = Seq2SeqTrainingArguments(
    output_dir=output_directory,
    push_to_hub=True,
    hub_model_id=hub_model_id,
    hub_strategy="end",
    num_train_epochs=15,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    weight_decay=0.01,
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    predict_with_generate=True,
    # Mixed precision is only requested when a CUDA device is present.
    fp16=torch.cuda.is_available(),
)

print("\nTraining arguments defined and set to upload to Hub")
print(f"Mixed precision (fp16) enabled: {training_args.fp16}")

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding="longest",
    # Any label padding added by the collator must use -100 so the loss
    # ignores it; padding with the pad token id would train on padding.
    label_pad_token_id=-100
)
print("\nData collator initialized.")

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,
    data_collator=data_collator
)

print("\nSeq2SeqTrainer initialized successfully and ready for training and uploading to Hub.")




## **14. Start fine-tuning**

In [None]:
# Run training with the configured trainer, then record the metrics and
# persist the final model. The calls below are order-dependent.
print("Starting the fine-tuning process...")
print(f"Model checkpoints and logs will be saved in: {training_args.output_dir}")

# train() returns a result object; its .metrics are logged and saved below.
train_result = trainer.train()
print("\nFine-tuning finished!")

metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)

# Persist the final model and the trainer state after training has finished.
# NOTE(review): the inference cell reloads the tokenizer from this same
# directory, so save_model() is presumably expected to write it too — confirm.
print("\nSaving the final model and tokenizer...")
trainer.save_model()
trainer.save_state()
print(f"Final model saved to {training_args.output_dir}")

## **15. Inference**

In [None]:
from transformers import T5TokenizerFast, T5ForConditionalGeneration

# Reload the fine-tuned model and tokenizer from the local output directory
# and switch the model to evaluation mode on the selected device.
print("Assuming T5/FlanT5 model type for loading.")
tokenizer = T5TokenizerFast.from_pretrained(output_directory)
model = T5ForConditionalGeneration.from_pretrained(output_directory)

model.to(device)
model.eval()
print("Model and tokenizer loaded successfully.")

test_nlqs = [
    "Get pending bookings lasting > 1 night",
    "Find failed payment bookings with price > $500",
    "Find users from France",
    "Get me the bookings that are cancelled",
    "Get me all users that have more than 10 cancelled bookings",
    "Which users are from Germany and UK and have > 10 bookings so far",
    "Get all users from Mexico and Spain"
]
print(f"\nTesting with {len(test_nlqs)} examples...")
print("\n" + "="*80)

# One record per query, for successes as well as failures.
local_predictions = []
# The schema part of the prompt is the same for every query.
schema_text = database_schema.strip()
for nlq in test_nlqs:
    print(f"NLQ: {nlq}")
    # Build the prompt exactly as during training. Local names differ from the
    # training cells' `inputs` / `input_text` so those are not overwritten.
    prompt_text = f"{prefix}Schema: {schema_text} NLQ: {nlq.strip()}"
    # Use the same input limit as training: the NLQ sits at the end of the
    # prompt, so a smaller limit here could truncate the question itself.
    encoded = tokenizer(prompt_text, return_tensors="pt", max_length=max_input_length, truncation=True, padding=True)
    input_ids = encoded.input_ids.to(device)
    attention_mask = encoded.attention_mask.to(device)

    try:
        with torch.no_grad():  # No gradients are needed for generation
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=max_target_length,  # Same target limit as training
                num_beams=5,
                early_stopping=True,
            )
        # Decode the raw generated string
        generated_linear_mlq = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(f"Generated Linear MLQ (Local): {generated_linear_mlq}")
        # Successful generations are recorded too, not only the failures.
        local_predictions.append({"nlq": nlq, "predicted_linear_mlq": generated_linear_mlq, "parsed_output": None})
    except Exception as e:
        # Best effort: report the failure for this query and continue with the rest.
        print(f"Error during generation for NLQ '{nlq}': {e}")
        local_predictions.append({"nlq": nlq, "predicted_linear_mlq": f"GENERATION ERROR: {e}", "parsed_output": None})
    print("-"*80)

print("Local testing finished.")
