In [1]:
# Cell 1: Install Dependencies
!pip install pandas torch datasets peft transformers trl wandb bitsandbytes scipy



In [None]:
# Cell 3: The Complete T4-Safe Training Pipeline
import os
import gc
import torch
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTConfig, SFTTrainer

# --- CONFIGURATION ---
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"
OUTPUT_DIR = "mistral_nosql_final"
BATCH_SIZE = 1          # T4 Safe limit
GRAD_ACCUM = 16         # Simulates batch size 16
LR = 2e-5
N_EPOCHS = 3
SAVE_DIR = "output/" + OUTPUT_DIR

# --- 1. PREPARE DATA ---
print("üìä Processing Data...")
def generate_prompt(prompt, label=""):
    return f"<s>[INST] {prompt} [/INST] {label}".strip()

def generate_text(data_point):
    text = generate_prompt(data_point["Prompt"], label=data_point['target'])
    return {"text": text}

def process_dataset(data: Dataset):
    return data.shuffle(seed=42).map(generate_text).remove_columns(["Prompt", "target", "database", "query_id", "hardness", "gold_sql", "Query"])

# Load Data (Assuming files exist in data/)
train_df = pd.read_csv("data/spider_nosql_train.csv", sep=";", encoding="utf-8")
test_df = pd.read_csv("data/spider_nosql_dev.csv", sep=";", encoding="utf-8")
test_df = test_df.drop('gpt3.5 answer', axis=1, errors='ignore')

dataset_train = Dataset.from_pandas(train_df).train_test_split(test_size=0.1, seed=42)
dataset = DatasetDict({"train": dataset_train["train"], "eval": dataset_train["test"]})

dataset["train"] = process_dataset(dataset["train"])
dataset["eval"] = process_dataset(dataset["eval"])

# --- 2. LOAD MODEL (STRICT T4 SETTINGS) ---
print("üèóÔ∏è Loading Model (Float16 Compute)...")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16, # Compute in Float16
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.float16, # Force base model to Float16
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model.config.use_cache = False
model = prepare_model_for_kbit_training(model)

# --- 3. APPLY LORA ---
print("üîß Applying LoRA...")
peft_config = LoraConfig(
    r=32, lora_alpha=32, lora_dropout=0.1,
    target_modules=["q_proj", "up_proj", "o_proj", "k_proj", "down_proj", "gate_proj", "v_proj"],
    bias="none", task_type="CAUSAL_LM",
)
model = get_peft_model(model, peft_config)

# --- 4. SAFETY CAST (THE FIX) ---
print("üõ°Ô∏è Casting BFloat16 layers to Float32...")
count = 0
for name, param in model.named_parameters():
    if param.dtype == torch.bfloat16:
        param.data = param.data.to(torch.float32)
        count += 1
print(f"   Fixed {count} layers.")

# --- 5. TRAIN (AMP DISABLED) ---
print("üöÄ Starting Training...")

sft_config = SFTConfig(
    output_dir=OUTPUT_DIR,
    dataset_text_field="text",
    max_length=2048,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM,
    optim="paged_adamw_32bit",
    learning_rate=LR,
    num_train_epochs=N_EPOCHS,
    fp16=False,             # <--- CRITICAL: DISABLES THE CRASHING SCALER
    bf16=False,             # <--- CRITICAL: T4 CANNOT DO BF16
    logging_steps=1,
    save_strategy="epoch",
    report_to="none",       # No WandB
)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["eval"],
    args=sft_config,
    processing_class=tokenizer,
)

trainer.train()
trainer.save_model()
print("‚úÖ DONE!")

üìä Processing Data...


Map:   0%|          | 0/1812 [00:00<?, ? examples/s]

Map:   0%|          | 0/202 [00:00<?, ? examples/s]

üèóÔ∏è Loading Model (Float16 Compute)...


`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

üîß Applying LoRA...
üõ°Ô∏è Casting BFloat16 layers to Float32...
   Fixed 0 layers.
üöÄ Starting Training...


Adding EOS to train dataset:   0%|          | 0/1812 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1812 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/1812 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/202 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/202 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/202 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 2}.


Step,Training Loss
1,1.7345
2,1.6061
3,1.321
4,1.2911
5,1.3131
6,1.3051
7,1.2164
8,1.2289
9,1.1622
10,1.0466
