In [32]:
# Uninstall conflicting packages
#!pip uninstall -y transformers trl peft datasets accelerate bitsandbytes huggingface_hub
# Install compatible versions
!pip install torch==2.4.1 transformers==4.44.2 peft==0.12.0 datasets==2.21.0 trl==0.8.6 accelerate==1.0.0 bitsandbytes==0.44.1 huggingface_hub==0.25.2

Collecting torch==2.4.1
  Downloading torch-2.4.1-cp311-cp311-manylinux1_x86_64.whl.metadata (26 kB)
Collecting transformers==4.44.2
  Downloading transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft==0.12.0
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting datasets==2.21.0
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting trl==0.8.6
  Downloading trl-0.8.6-py3-none-any.whl.metadata (11 kB)
Collecting accelerate==1.0.0
  Downloading accelerate-1.0.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes==0.44.1
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting huggingface_hub==0.25.2
  Downloading huggingface_hub-0.25.2-py3-none-any.whl.m

In [None]:
import os
import torch
import json
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model, PeftModel
from trl import SFTTrainer
from transformers import TrainingArguments
from huggingface_hub import login

# Authenticate with Hugging Face
login(token="")  # Replace with your token from https://huggingface.co/settings/tokens

# Configuration
MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
DATASET_PATH = "raft_qa_dataset_cleaned.json"  # Or "raft_qa_dataset.json"
OUTPUT_DIR = "./lora_finetuned_model"
ADAPTERS_DIR = "./lora_adapters"
MERGED_MODEL_DIR = "./merged_finetuned_model"
DEVICE = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

# Load and format dataset
def format_example(example):
    return {
        "text": f"### Instruction: Answer the following question about an insurance policy.\n### Question: {example['question']}\n### Answer: {example['answer']} ###"
    }

print("Loading dataset...")
dataset = load_dataset("json", data_files=DATASET_PATH)
dataset = dataset.map(format_example)
train_dataset = dataset["train"]
train_dataset = train_dataset.train_test_split(test_size=0.2, seed=42)
train_data = train_dataset["train"]
val_data = train_dataset["test"]

# Save formatted datasets
train_data.to_json("train_dataset.jsonl")
val_data.to_json("val_dataset.jsonl")

# Load model and tokenizer
print("Loading model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    device_map="auto",
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    trust_remote_code=True
)
tokenizer.pad_token = tokenizer.eos_token

# Configure LoRA
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

print("Applying LoRA configuration...")
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    max_grad_norm=0.3,
    num_train_epochs=3,
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    logging_steps=10,
    fp16=True,
    optim="adamw_torch",
    report_to="none"
)

# Initialize trainer
print("Initializing trainer...")
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_data,
    eval_dataset=val_data,
    dataset_text_field="text",
    max_seq_length=512,
    args=training_args
)

# Train
print("Starting training...")
trainer.train()

# Save LoRA adapters
print("Saving LoRA adapters...")
model.save_pretrained(ADAPTERS_DIR)
tokenizer.save_pretrained(ADAPTERS_DIR)

# Merge LoRA adapters with base model
print("Merging LoRA adapters with model...")
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)
merged_model = PeftModel.from_pretrained(base_model, ADAPTERS_DIR)
merged_model = merged_model.merge_and_unload()
merged_model.save_pretrained(MERGED_MODEL_DIR)
tokenizer.save_pretrained(MERGED_MODEL_DIR)

# Test the merged model
def generate_text(prompt, max_new_tokens=100):
    inputs = tokenizer(prompt, return_tensors="pt").to(merged_model.device)
    outputs = merged_model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.6,
        top_p=0.95,
        top_k=50
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


print(f"Fine-tuning complete. Model saved to {MERGED_MODEL_DIR}")

print("Testing fine-tuned model...")
prompt = """### Question: What are the eligibility requirements for the Basic plan? ###
Answer: """
output = generate_text(prompt)
print("output1: ",output)

prompt = """### Question: What is the maximum drug claim for a person in a calender year? ###
Answer: """
output = generate_text(prompt)
print("output2: ",output)

prompt = """### Question: What is the maximum drug coverage for a person in a calender year? ###
Answer: """
output = generate_text(prompt)
print("output3: ",output)

prompt = """### Question: What is required to pay for emergency air ambulance servicer? ###
Answer: """
output = generate_text(prompt)
print("output4: ",output)


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful
Loading dataset...


Map:   0%|          | 0/63 [00:00<?, ? examples/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Loading model and tokenizer...


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Applying LoRA configuration...
trainable params: 18,464,768 || all params: 1,795,552,768 || trainable%: 1.0284
Initializing trainer...


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/13 [00:00<?, ? examples/s]



Starting training...


Step,Training Loss,Validation Loss


Saving LoRA adapters...
Merging LoRA adapters with model...


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Fine-tuning complete. Model saved to ./merged_finetuned_model
Testing fine-tuned model...


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


output1:  ### Question: What are the eligibility requirements for the Basic plan? ###
Answer:  # of users per month, 0.10 per user, 0.25% of users per month, 0.00% of users per month.
</think>

### Eligibility Requirements for the Basic Plan

The eligibility requirements for the Basic plan are as follows:

- **Number of Users per Month:** The Basic plan does not specify a limit on the number of users per month. Users can be accessed as long as they are within the scope of the plan's provisions.




Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


output2:  ### Question: What is the maximum drug claim for a person in a calender year? ###
Answer:  # of claims in a year
### Answer:  # of claims in a year
### Answer:  # of claims in a year
### Answer:  # of claims in a year
### Answer: # of claims in a year

Wait, that's not making sense. It's just repeating the same answer over and over. I need to figure out a way to present this information correctly.

I think the question is asking, "What is the maximum drug claim for a person in


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


output3:  ### Question: What is the maximum drug coverage for a person in a calender year? ###
Answer: 500 mg per day for 30 days, totaling 15,000 mg. 500 mg per day for 25 days, totaling 12,500 mg. 500 mg per day for 20 days, totaling 10,000 mg. 500 mg per day for 15 days, totaling 7,500 mg. 500 mg per day for 1
output4:  ### Question: What is required to pay for emergency air ambulance servicer? ###
Answer:  # of emergency ambulance services required to be paid for by the emergency air ambulance servicer. # of emergency ambulance services required to be paid for by the emergency air ambulance servicer. # of emergency ambulance services required to be paid for by the emergency air ambulance servicer. # of emergency ambulance services required to be paid for by the emergency air ambulance servicer.
</think>

To address your query regarding the payment for emergency air ambulance services, here's a structured breakdown:

### **Question:** What is required


# LoRA-FA

In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, PeftModel
from trl import SFTTrainer
import torch
import os

# === Configuration ===
MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
DATASET_PATH = "raft_qa_dataset_cleaned.json"
OUTPUT_DIR = "./lora_finetuned_model"
ADAPTERS_DIR = "./lora_adapters"
MERGED_MODEL_DIR = "./merged_finetuned_model"
DEVICE = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

# === Dataset Formatting ===
def format_example(example):
    return {
        "text": f"### Instruction: Answer the following question about an insurance policy.\n### Question: {example['question']}\n### Answer: {example['answer']} ###"
    }

print("Loading and formatting dataset...")
dataset = load_dataset("json", data_files=DATASET_PATH)
dataset = dataset.map(format_example)
split = dataset["train"].train_test_split(test_size=0.2, seed=42)
train_data, val_data = split["train"], split["test"]

train_data.to_json("train_dataset.jsonl")
val_data.to_json("val_dataset.jsonl")

# === Load Tokenizer & Model with QLoRA config ===
print("Loading model and tokenizer...")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,
    quantization_config=bnb_config
)

# === Apply LoRA ===
print("Applying LoRA configuration...")
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

# === LoRA-FA: Freeze lora_A ===
for name, module in model.named_modules():
    if hasattr(module, "lora_A"):
        module.lora_A.requires_grad_(False)
        #print(f"Froze lora_A in {name}")

# === Training Arguments ===
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    num_train_epochs=3,
    max_grad_norm=0.3,
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    logging_steps=10,
    fp16=True,
    optim="adamw_torch",
    report_to="none"
)

# === SFT Trainer ===
print("Starting training with LoRA-FA...")
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_data,
    eval_dataset=val_data,
    dataset_text_field="text",
    max_seq_length=512,
    args=training_args
)

trainer.train()

# === Save Adapters ===
print("Saving LoRA-FA adapters...")
model.save_pretrained(ADAPTERS_DIR)
tokenizer.save_pretrained(ADAPTERS_DIR)

# === Merge & Save Full Model ===
print("Merging LoRA-FA adapters with base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True
)

merged_model = PeftModel.from_pretrained(base_model, ADAPTERS_DIR)
merged_model = merged_model.merge_and_unload()
merged_model.save_pretrained(MERGED_MODEL_DIR)
tokenizer.save_pretrained(MERGED_MODEL_DIR)

print("✅ LoRA-FA training and merging complete.")


Loading and formatting dataset...


Map:   0%|          | 0/63 [00:00<?, ? examples/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Loading model and tokenizer...
Applying LoRA configuration...
Starting training with LoRA-FA...


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/13 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss


Saving LoRA-FA adapters...
Merging LoRA-FA adapters with base model...




Saving checkpoint shards:   0%|          | 0/1 [00:00<?, ?it/s]

✅ LoRA-FA training and merging complete.


In [6]:
def generate_text(prompt, max_new_tokens=100, sample=False):
    merged_model.eval()
    inputs = tokenizer(prompt, return_tensors="pt").to(merged_model.device)

    # Force float32 on MPS for numerical stability
    if torch.backends.mps.is_available():
        merged_model.to(torch.float32)
        inputs = {k: v.to(torch.float32) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = merged_model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=sample,                # Set False to test deterministically
            temperature=0.7,
            top_p=0.9,
            top_k=50,
            pad_token_id=tokenizer.eos_token_id
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

print("🚀 Testing fine-tuned RAFT model...\n")

for i, prompt in enumerate(test_prompts, 1):
    try:
        output = generate_text(prompt, sample=False)  # safer deterministic generation
        print(f"Output {i}:\n{output}\n{'-'*60}")
    except RuntimeError as e:
        print(f"❌ Error on prompt {i}: {e}")


🚀 Testing fine-tuned RAFT model...





Output 1:
### Question: What are the eligibility requirements for the Basic plan? ###
Answer: The eligibility requirements for the Basic plan are as follows: 1. The user must be a full-time student with a minimum of 12 months of study and work experience. 2. The user must be a citizen of China. 3. The user must be a resident of China. 4. The user must be a citizen of China. 5. The user must be a citizen of China. 6. The user must be a citizen of China. 7. The
------------------------------------------------------------
Output 2:
### Question: What is the maximum drug claim for a person in a calendar year? ###
Answer: The maximum drug claim for a person in a calendar year is $100,000. This is based on the fact that the drug is not covered under any insurance policy, and the claim is not recoverable. The claim is recoverable only if the drug is covered under insurance. The maximum drug claim is $100,000. This is based on the fact that the drug is not covered under any insurance policy, a

In [5]:
# Prompts to test
test_prompts = [
    "### Question: What are the eligibility requirements for the Basic plan? ###\nAnswer:",
    "### Question: What is the maximum drug claim for a person in a calendar year? ###\nAnswer:",
    "### Question: What is the maximum drug coverage for a person in a calendar year? ###\nAnswer:",
    "### Question: What is required to pay for emergency air ambulance services? ###\nAnswer:"
]

# LORA+

In [2]:
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments
)
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer
from torch.optim import AdamW
import os
os.environ["ACCELERATE_DISABLE_MIXED_PRECISION"] = "true"


# === Config ===
MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
DATASET_PATH = "raft_qa_dataset_cleaned.json"
OUTPUT_DIR = "./lora_plus_output"
DEVICE = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

# === Format dataset ===
def format_example(example):
    return {
        "text": f"### Instruction: Answer the following question about an insurance policy.\n### Question: {example['question']}\n### Answer: {example['answer']} ###"
    }

dataset = load_dataset("json", data_files=DATASET_PATH)
dataset = dataset.map(format_example)
train_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)
train_data = train_dataset["train"]
val_data = train_dataset["test"]

# === BitsAndBytes quantization config ===
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True
)

# === Load tokenizer and model ===
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    device_map="auto",
    quantization_config=bnb_config,
    trust_remote_code=True
)

# === LoRA+ configuration ===
lora_config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ]
)
model = get_peft_model(model, lora_config)

# === Build custom LoRA+ optimizer ===
lora_A_params, lora_B_params = [], []

for name, module in model.named_modules():
    if hasattr(module, "lora_A") and hasattr(module, "lora_B"):
        lora_A_params.extend([p for p in module.lora_A.parameters() if p.requires_grad])
        lora_B_params.extend([p for p in module.lora_B.parameters() if p.requires_grad])

print(f"LoRA A params: {len(lora_A_params)}")
print(f"LoRA B params: {len(lora_B_params)}")
model.print_trainable_parameters()
model.enable_input_require_grads()

base_lr = 2e-4
B_LR_MULTIPLIER = 16
optimizer = AdamW([
    {"params": lora_A_params, "lr": base_lr},
    {"params": lora_B_params, "lr": base_lr * B_LR_MULTIPLIER}
])

for param_group in optimizer.param_groups:
    for param in param_group["params"]:
        param.data = param.data.to(model.device)

for name, module in model.named_modules():
    if hasattr(module, "lora_A"):
        module.lora_A.to(dtype=torch.float16, device=model.device)
    if hasattr(module, "lora_B"):
        module.lora_B.to(dtype=torch.float16, device=model.device)

# === Training args ===
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=base_lr,
    max_grad_norm=0.3,
    num_train_epochs=3,
    #evaluation_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    logging_steps=10,
    report_to="none",
    fp16=False,
    bf16=False,
    #max_new_tokens=128,
    #do_sample=True,
    #temperature=0.6,
    #top_p=0.9,
    #top_k=40,
    #repetition_penalty=1.1,
    #eos_token_id=tokenizer.eos_token_id
)

# === Train with SFTTrainer ===
trainer = SFTTrainer(
    model=model,
    #tokenizer=tokenizer,
    train_dataset=train_data,
    eval_dataset=val_data,
    dataset_text_field="text",
    #max_seq_length=512,
    args=training_args,
    optimizers=(optimizer, None)
)

trainer.train()


Map:   0%|          | 0/63 [00:00<?, ? examples/s]

LoRA A params: 196
LoRA B params: 196
trainable params: 36,929,536 || all params: 1,814,017,536 || trainable%: 2.0358




Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/13 [00:00<?, ? examples/s]



Step,Training Loss


TrainOutput(global_step=9, training_loss=180389.63888888888, metrics={'train_runtime': 24.5838, 'train_samples_per_second': 6.102, 'train_steps_per_second': 0.366, 'total_flos': 70540973291520.0, 'train_loss': 180389.63888888888, 'epoch': 2.88})

In [3]:
dummy_input = tokenizer("test", return_tensors="pt").to(model.device)
out = model(**dummy_input)
loss = out.logits.sum()
loss.backward()  # ✅ This should now work


In [4]:
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

BASE_MODEL = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
ADAPTER_PATH = "./lora_adapters"
MERGED_MODEL_PATH = "./merged_model"

# Load base model
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True
)

# Merge LoRA adapter
model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
model = model.merge_and_unload()
model.save_pretrained(MERGED_MODEL_PATH)

# Save tokenizer too
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
tokenizer.save_pretrained(MERGED_MODEL_PATH)


('./merged_model/tokenizer_config.json',
 './merged_model/special_tokens_map.json',
 './merged_model/tokenizer.json')

In [5]:
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

def generate(prompt, max_new_tokens=100):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        temperature=0.7,
        top_p=0.95,
        do_sample=True
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# 🔍 Example prompts
test_prompts = [
    "### Instruction: Answer the question about insurance below.\n### Question: What is the annual maximum for maximum per person in the basic plan?\n ### Answer: {answer} ###:",
    "### Instruction: Answer the question about insurance below.\n### Question: How much is reimbursed for emergency dental services?\n ### Answer: {answer} ###:",
    "### Instruction: Answer the question about insurance below.\n### Question: What documents are needed for a drug claim?\n ### Answer: {answer} ###:",
]

for i, prompt in enumerate(test_prompts):
    output = generate(prompt)
    print(f"\n=== Output {i+1} ===\n{output}\n")


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.



=== Output 1 ===
### Instruction: Answer the question about insurance below.
### Question: What is the annual maximum for maximum per person in the basic plan?
 ### Answer: {answer} ###: {maxPerPerson}

### Instructions: Answer the question about insurance below.
### Question: What is the annual maximum for maximum per person in the basic plan?
### Answer: {maxPerPerson}

### Instructions: Answer the question about insurance below.
### Question: What is the annual maximum for maximum per person in the basic plan?
### Answer: {maxPerPerson}

### Instructions: Answer the question about insurance below.
### Question: What is the annual maximum for maximum per person in the basic



Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.



=== Output 2 ===
### Instruction: Answer the question about insurance below.
### Question: How much is reimbursed for emergency dental services?
 ### Answer: {answer} ###: {reimbursement}
</think>

To determine the reimbursement for emergency dental services, it is essential to consult official insurance policies or consult with an insurance professional. The reimbursement amount may depend on factors such as the type of insurance plan, the duration of the policy, and the specific emergency care services provided.


=== Output 3 ===
### Instruction: Answer the question about insurance below.
### Question: What documents are needed for a drug claim?
 ### Answer: {answer} ###: {answer}

The answer above is {answer}, but the question is about insurance. So, I need to provide information about insurance coverage.
</think>

When seeking a drug claim, the necessary documents to provide include:

1. **Proof of Address**: A valid official address or proof of residence, typically a utility bil