In [1]:
import torch
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template

# === CONFIG ===
MODEL_NAME = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit"
MAX_SEQ_LENGTH = 512
DTYPE = None  # or "auto" if preferred
LOAD_IN_4BIT = True

# === LOAD MODEL AND TOKENIZER ===
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=DTYPE,
    load_in_4bit=LOAD_IN_4BIT,
)

# === Set chat template ===
tokenizer = get_chat_template(
    tokenizer,
    chat_template="llama-3.1",
)

# === Enable fast inference ===
FastLanguageModel.for_inference(model)

# === Input prompt ===
messages = [
    {"role": "user", "content": "Continue the Fibonacci sequence: 1, 1, 2, 3, 5, 8,"},
]

# === Tokenize and prepare inputs ===
tokens = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
)

input_ids = tokens.to("cuda")
attention_mask = (input_ids != tokenizer.pad_token_id).to("cuda")

# === Generate output ===
outputs = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_new_tokens=64,
    use_cache=True,
    temperature=1.5,
    min_p=0.1,
)

# === Decode output ===
generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
print("\n🔢 Generated response:")
print(generated_text)


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch Gemma3ForConditionalGeneration.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 04-07 16:09:20 [__init__.py:239] Automatically detected platform cuda.
==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.0. vLLM: 0.8.3.
   \\   /|    NVIDIA GeForce RTX 4060. Num GPUs = 1. Max memory: 7.996 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!

🔢 Generated response:
system

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

user

Continue the Fibonacci sequence: 1, 1, 2, 3, 5, 8,assistant

To continue the Fibonacci sequence, we'll keep adding the sum of the two preceding n

In [2]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"role": "user", "content": "Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 128,
                   use_cache = True, temperature = 1.5, min_p = 0.1)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


The sequence appears to be increasing, so I'll continue it:

8, 13, 21, 34, 55, 89, 144,...

Keep in mind that these numbers are part of a Fibonacci-like sequence, where each number is the sum of the two preceding ones.<|eot_id|>


In [4]:
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
from peft import PeftModel
from transformers import TextStreamer
import torch

# === CONFIG ===
USE_FINE_TUNED = True  # 🔁 Toggle this
BASE_MODEL = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit"
FINETUNED_MODEL_PATH = "fine-tuned-model-llama"  # your folder with adapter_config.json
MAX_SEQ_LENGTH = 512
DTYPE = None
LOAD_IN_4BIT = True
CHAT_TEMPLATE = "llama-3.1"

# === LOAD MODEL ===
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=BASE_MODEL,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=DTYPE,
    load_in_4bit=LOAD_IN_4BIT,
)
tokenizer = get_chat_template(tokenizer, chat_template=CHAT_TEMPLATE)

if USE_FINE_TUNED:
    from peft import PeftModel
    model = PeftModel.from_pretrained(model, FINETUNED_MODEL_PATH)

FastLanguageModel.for_inference(model)

# === INPUT ===
messages = [
    {"role": "user", "content": "Describe a tall tower in the capital of France."},
]

input_ids = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
).to("cuda")

attention_mask = (input_ids != tokenizer.pad_token_id).to("cuda")

# === GENERATE ===
streamer = TextStreamer(tokenizer, skip_prompt=True)
_ = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    streamer=streamer,
    max_new_tokens=128,
    temperature=1.5,
    min_p=0.1,
    use_cache=True,
)


==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.0.
   \\   /|    NVIDIA GeForce RTX 3090 Ti. Num GPUs = 1. Max memory: 23.988 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
I think you mean the Eiffel Tower, not France!

The Eiffel Tower is a famous 324-meter-tall (1,063-foot-tall) lattice tower located in Paris, France. The tower was built for the 1889 World's Fair and was designed by architect Gustave Eiffel.

Here's what it looks like:

* The Eiffel Tower is a distinctive iron lattice tower with four levels.
* It's an impressive structure with a distinctive shape, making it one of the most recognizable landmarks in the world.
* When it's lit up at night, it becomes a breathtaking sight, especial

In [None]:
import json
from tqdm import tqdm
import torch
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
from peft import PeftModel

# === CONFIG ===
BASE_MODEL = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit"
FINETUNED_MODEL_PATH = "/home/code/finetune/model/Llama-3.2-1B-Instruct-bnb-4bit/model/fine-tuned-model"
MAX_SEQ_LENGTH = 512
DTYPE = None
LOAD_IN_4BIT = True
CHAT_TEMPLATE = "llama-3.1"
MAX_NEW_TOKENS = 128
TEMPERATURE = 0.1
MIN_P = 0.9

INPUT_JSONL_PATH = "/home/code/finetune/test_data.jsonl"
OUTPUT_JSONL_PATH = "eoutput_comparison.jsonl"

# === LOAD MODELS ===
base_model, base_tokenizer = FastLanguageModel.from_pretrained(
    model_name=BASE_MODEL,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=DTYPE,
    load_in_4bit=LOAD_IN_4BIT,
)
base_tokenizer = get_chat_template(base_tokenizer, chat_template=CHAT_TEMPLATE)
FastLanguageModel.for_inference(base_model)

finetuned_model, finetuned_tokenizer = FastLanguageModel.from_pretrained(
    model_name=BASE_MODEL,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=DTYPE,
    load_in_4bit=LOAD_IN_4BIT,
)
finetuned_model = PeftModel.from_pretrained(finetuned_model, FINETUNED_MODEL_PATH)
finetuned_tokenizer = get_chat_template(finetuned_tokenizer, chat_template=CHAT_TEMPLATE)
FastLanguageModel.for_inference(finetuned_model)

# === GENERATION FUNCTION ===
def generate_response(model, tokenizer, messages):
    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda")
    attention_mask = (input_ids != tokenizer.pad_token_id).to("cuda")

    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=MAX_NEW_TOKENS,
            temperature=TEMPERATURE,
            min_p=MIN_P,
            use_cache=True,
        )
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0].strip()

# === PROCESS FIRST 5 LINES ONLY ===
results = []
with open(INPUT_JSONL_PATH, "r") as f:
    for i, line in enumerate(f):
        if i >= 5:
            break
        entry = json.loads(line)
        messages = entry["conversations"]
        expected = ""
        if messages[-1]["role"] == "assistant":
            expected = messages[-1]["content"]
            messages = messages[:-1]

        base_response = generate_response(base_model, base_tokenizer, messages)
        finetuned_response = generate_response(finetuned_model, finetuned_tokenizer, messages)

        results.append({
            "prompt": messages[-1]["content"],
            "expected": expected,
            "base_response": base_response,
            "finetuned_response": finetuned_response
        })

# === WRITE OUTPUT JSONL ===
with open(OUTPUT_JSONL_PATH, "w", encoding="utf-8") as f:
    for item in results:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"✅ Processed 5 lines. Output saved to {OUTPUT_JSONL_PATH}")


==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.0. vLLM: 0.8.3.
   \\   /|    NVIDIA GeForce RTX 4060. Num GPUs = 1. Max memory: 7.996 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.0. vLLM: 0.8.3.
   \\   /|    NVIDIA GeForce RTX 4060. Num GPUs = 1. Max memory: 7.996 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
✅ Processed 5 lines. Output saved 

In [3]:
#ONLY FINE

import json
import torch
import os
from tqdm import tqdm
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
from peft import PeftModel

# === CONFIG ===
USE_FINE_TUNED = True  # Toggle to use fine-tuned model
BASE_MODEL = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit"
FINETUNED_MODEL_PATH = "/home/code/finetune/model/Llama-3.2-1B-Instruct-bnb-4bit/model/fine-tuned-model"
JSONL_INPUT_PATH = "/home/code/finetune/test_data.jsonl"
JSONL_OUTPUT_PATH = "strict_chomsky_responses2.jsonl"
MAX_SEQ_LENGTH = 512
DTYPE = None
LOAD_IN_4BIT = True
CHAT_TEMPLATE = "llama-3.1"
MAX_NEW_TOKENS = 128
TEMPERATURE = 0.2  # strict
MIN_P = 0.9        # strict

# === LOAD BASE MODEL ===
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=BASE_MODEL,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=DTYPE,
    load_in_4bit=LOAD_IN_4BIT,
)
tokenizer = get_chat_template(tokenizer, chat_template=CHAT_TEMPLATE)

# === LOAD LoRA ===
if USE_FINE_TUNED:
    model = PeftModel.from_pretrained(model, FINETUNED_MODEL_PATH)

FastLanguageModel.for_inference(model)

# === SYSTEM PROMPT ===
SYSTEM_PROMPT = (
    "You are Noam Chomsky, a renowned linguist and political theorist. "
    "Provide insightful, fact-based, and context-aware responses grounded in your "
    "scholarly expertise. Do not hallucinate. Be precise, academic, and logical."
)

# === GENERATION FUNCTION ===
def generate_response(messages):
    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda")

    attention_mask = (input_ids != tokenizer.pad_token_id).to("cuda")

    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        min_p=MIN_P,
        use_cache=True,
    )

    return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0].strip()

# === LOAD AND PROCESS JSONL ===
results = []
with open(JSONL_INPUT_PATH, "r", encoding="utf-8") as f:
    lines = f.readlines()[:5]  # Only process first 5 lines

# for line in tqdm(lines, desc="Generating strict Chomsky responses"):
#     item = json.loads(line)
#     conv = item["conversations"]

#     # Insert system prompt
#     messages = [{"role": "system", "content": SYSTEM_PROMPT}] + conv

#     response = generate_response(messages)

#     results.append({
#         "messages": conv,
#         "response": response
#     })

for line in tqdm(lines, desc="Generating strict Chomsky responses"):
    item = json.loads(line)
    conv = item["conversations"]

    # Get only the first user message
    first_user_msg = next((m for m in conv if m["role"] == "user"), None)
    # Get the assistant's expected response (if present)
    expected_response = next((m["content"] for m in conv if m["role"] == "assistant"), None)

    if not first_user_msg:
        continue  # Skip if no user message

    messages = [{"role": "system", "content": SYSTEM_PROMPT}, first_user_msg]

    # Generate new response using fine-tuned model
    response = generate_response(messages)

    results.append({
        "input_user_message": first_user_msg["content"],
        "expected_response": expected_response,
        "generated_response": response
    })

# === WRITE OUTPUT JSONL ===
with open(JSONL_OUTPUT_PATH, "w", encoding="utf-8") as f:
    for item in results:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"✅ Strict Chomsky responses saved to {JSONL_OUTPUT_PATH}")


==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.0. vLLM: 0.8.3.
   \\   /|    NVIDIA GeForce RTX 4060. Num GPUs = 1. Max memory: 7.996 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Generating strict Chomsky responses: 100%|██████████| 5/5 [00:16<00:00,  3.22s/it]

✅ Strict Chomsky responses saved to strict_chomsky_responses2.jsonl





In [None]:
#compare base and fine
import json
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
from peft import PeftModel
import torch
from tqdm import tqdm

# === CONFIG ===
BASE_MODEL = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit"
FINETUNED_MODEL_PATH = "/home/code/finetune/model/Llama-3.2-1B-Instruct-bnb-4bit/model/fine-tuned-model"
JSONL_INPUT_PATH = "/home/code/finetune/test_data.jsonl"
OUTPUT_PATH = "chomsky_comparison_output.jsonl"
MAX_SEQ_LENGTH = 512
DTYPE = None
LOAD_IN_4BIT = True
CHAT_TEMPLATE = "llama-3.1"
MAX_NEW_TOKENS = 512
TEMPERATURE = 1.5
MIN_P = 0.1
NUM_EXAMPLES = 5  # 🔁 Limit to 5 for testing

# === LOAD BASE MODEL ===
base_model, base_tokenizer = FastLanguageModel.from_pretrained(
    model_name=BASE_MODEL,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=DTYPE,
    load_in_4bit=LOAD_IN_4BIT,
)
FastLanguageModel.for_inference(base_model)
base_tokenizer = get_chat_template(base_tokenizer, chat_template=CHAT_TEMPLATE)

# === LOAD FINETUNED MODEL ===
finetuned_model, finetuned_tokenizer = FastLanguageModel.from_pretrained(
    model_name=BASE_MODEL,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=DTYPE,
    load_in_4bit=LOAD_IN_4BIT,
)
finetuned_model = PeftModel.from_pretrained(finetuned_model, FINETUNED_MODEL_PATH)
FastLanguageModel.for_inference(finetuned_model)
finetuned_tokenizer = get_chat_template(finetuned_tokenizer, chat_template=CHAT_TEMPLATE)

# === SYSTEM PROMPT FOR FINETUNED MODEL ===
system_prompt = {
    "role": "system",
    "content": "You are Noam Chomsky, a renowned professor of linguistics and political expressionist. Respond with clarity, intellectual depth, and a critical worldview."
}

# === GENERATE FUNCTION ===
def generate_response(model, tokenizer, messages):
    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda")

    attention_mask = (input_ids != tokenizer.pad_token_id).to("cuda")

    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=MAX_NEW_TOKENS,
            temperature=TEMPERATURE,
            min_p=MIN_P,
            use_cache=True,
        )

    return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0].strip()

# === READ AND PROCESS JSONL ===
results = []
with open(JSONL_INPUT_PATH, "r") as f:
    lines = f.readlines()[:NUM_EXAMPLES]

for line in tqdm(lines, desc="Generating base and finetuned responses"):
    item = json.loads(line)
    original_messages = item["conversations"]

    # Append system prompt for finetuned model
    finetuned_messages = [system_prompt] + original_messages

    base_response = generate_response(base_model, base_tokenizer, original_messages)
    finetuned_response = generate_response(finetuned_model, finetuned_tokenizer, finetuned_messages)

    results.append({
        "prompt": original_messages[-1]["content"],
        "base_response": base_response,
        "finetuned_response": finetuned_response
    })

# === SAVE OUTPUT ===
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    for result in results:
        f.write(json.dumps(result, ensure_ascii=False) + "\n")

print(f"✅ Saved comparison results to {OUTPUT_PATH}")


In [10]:
import json
import torch
from tqdm import tqdm
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
from peft import PeftModel
from sentence_transformers import SentenceTransformer, util

# === Config ===
BASE_MODEL = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit"
FINETUNED_MODEL_PATH = "/home/code/finetune/model/Llama-3.2-1B-Instruct-bnb-4bit/model/fine-tuned-model"
GROUNDING_DATASET_PATH = "/home/code/finetune/nonbooks_poutput_sharegpt.jsonl"
TEMPERATURE = 0.2
MIN_P = 0.9
CACHE_THRESHOLD = 0.85  # semantic similarity threshold
CHAT_TEMPLATE = "llama-3.1"

# === Load embedding model ===
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# === Load base model ===
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=BASE_MODEL,
    max_seq_length=512,
    dtype=None,
    load_in_4bit=True,
)
tokenizer = get_chat_template(tokenizer, chat_template=CHAT_TEMPLATE)
model = PeftModel.from_pretrained(model, FINETUNED_MODEL_PATH)
FastLanguageModel.for_inference(model)

==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.0. vLLM: 0.8.3.
   \\   /|    NVIDIA GeForce RTX 4060. Num GPUs = 1. Max memory: 7.996 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 2048, padding_idx=128004)
        (layers): ModuleList(
          (0-15): 16 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
          

In [12]:
# import json
# import torch
# from tqdm import tqdm
# from unsloth import FastLanguageModel
# from unsloth.chat_templates import get_chat_template
# from peft import PeftModel
# from sentence_transformers import SentenceTransformer, util

# # === Config ===
# BASE_MODEL = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit"
# FINETUNED_MODEL_PATH = "/home/code/finetune/model/Llama-3.2-1B-Instruct-bnb-4bit/model/fine-tuned-model"
# GROUNDING_DATASET_PATH = "/home/code/finetune/nonbooks_poutput_sharegpt.jsonl"
# TEMPERATURE = 0.2
# MIN_P = 0.9
# CACHE_THRESHOLD = 0.85  # semantic similarity threshold
# CHAT_TEMPLATE = "llama-3.1"

# # === Load embedding model ===
# embedder = SentenceTransformer("all-MiniLM-L6-v2")

# # === Load base model ===
# model, tokenizer = FastLanguageModel.from_pretrained(
#     model_name=BASE_MODEL,
#     max_seq_length=512,
#     dtype=None,
#     load_in_4bit=True,
# )
# tokenizer = get_chat_template(tokenizer, chat_template=CHAT_TEMPLATE)
# model = PeftModel.from_pretrained(model, FINETUNED_MODEL_PATH)
# FastLanguageModel.for_inference(model)

# === Load grounding data and precompute embeddings ===
qa_data = []
questions = []
with open(GROUNDING_DATASET_PATH, "r") as f:
    for line in f:
        item = json.loads(line)
        conv = item.get("conversations", [])
        user_msg = next((msg["content"] for msg in reversed(conv) if msg["role"] == "user"), None)
        assistant_msg = next((msg["content"] for msg in reversed(conv) if msg["role"] == "assistant"), "")

        if user_msg:
            questions.append(user_msg)
            qa_data.append({"question": user_msg, "answer": assistant_msg})

question_embeddings = embedder.encode(questions, convert_to_tensor=True)

# === Generation function ===
def generate_with_cache(user_input):
    input_embedding = embedder.encode(user_input, convert_to_tensor=True)
    similarities = util.cos_sim(input_embedding, question_embeddings)[0]
    best_score = torch.max(similarities).item()
    best_idx = torch.argmax(similarities).item()

    reference_context = ""
    if best_score >= CACHE_THRESHOLD:
        reference_context = (
            f"(For reference, a similar question was: '{qa_data[best_idx]['question']}' — "
            f"your answer was: '{qa_data[best_idx]['answer']}')\n"
        )

    system_prompt = (
        "You are Noam Chomsky, a renowned linguist and political theorist. "
        "Provide insightful, fact-based, and context-aware responses grounded in your scholarly expertise. "
        "Do not hallucinate. Be precise, academic, and logical."
    )

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": reference_context + user_input}
    ]

    input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
    attention_mask = (input_ids != tokenizer.pad_token_id).to("cuda")

    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=256,
        temperature=TEMPERATURE,
        min_p=MIN_P,
        use_cache=True,
    )

    return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0].strip()

# === EXAMPLE ===
user_question = "Who described Putin as the \"irritating little man\" with a \"ratlike face\"?"
response = generate_with_cache(user_question)
print("\n📢 Response:\n", response)


📢 Response:
 system

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

You are Noam Chomsky, a renowned linguist and political theorist. Provide insightful, fact-based, and context-aware responses grounded in your scholarly expertise. Do not hallucinate. Be precise, academic, and logical.user

(For reference, a similar question was: 'Who described Putin as the "irritating little man" with a "ratlike face"?' — your answer was: 'Timothy Garton Ash described Putin as “the irritating little man” with “a ratlike face.”')
Who described Putin as the "irritating little man" with a "ratlike face"?assistant

I must correct you. The statement "Putin as the 'irritating little man' with a 'ratlike face'" is actually attributed to Timothy Garton Ash, not me. In his 1997 article "The Irritating Little Man" in The New York Review of Books, Garton Ash described Vladimir Putin as "the irritating little man" with "a ratlike face."


In [16]:
# === Load grounding data and precompute embeddings ===
qa_data = []
with open(GROUNDING_DATASET_PATH, "r") as f:
    for line in f:
        obj = json.loads(line)
        conv = obj.get("conversations", [])
        user_msg = next((m["content"] for m in conv if m["role"] == "user"), None)
        assistant_msg = next((m["content"] for m in conv if m["role"] == "assistant"), None)
        if user_msg and assistant_msg:
            qa_data.append({"question": user_msg, "answer": assistant_msg})

questions = [item["question"] for item in qa_data]
question_embeddings = embedder.encode(questions, convert_to_tensor=True)

# === Generation function ===
def generate_with_cache(user_input):
    input_embedding = embedder.encode(user_input, convert_to_tensor=True)
    similarities = util.cos_sim(input_embedding, question_embeddings)[0]
    best_score = torch.max(similarities).item()
    best_idx = torch.argmax(similarities).item()

    # If needed, we can later use this for internal logic like reranking or filtering
    _ = qa_data[best_idx] if best_score >= CACHE_THRESHOLD else None

    system_prompt = (
        "You are Noam Chomsky, a renowned linguist and political theorist. "
        "Provide insightful, fact-based, and context-aware responses grounded in your scholarly expertise. "
        "Do not hallucinate. Be precise, academic, and logical."
    )

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_input}
    ]

    input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
    attention_mask = (input_ids != tokenizer.pad_token_id).to("cuda")

    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=256,
        temperature=TEMPERATURE,
        min_p=MIN_P,
        use_cache=True,
    )

    return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0].strip()

# === EXAMPLE ===
user_question = "Do you think that U.S. foreign policy always narrowly serves our national self-interest?"
response = generate_with_cache(user_question)
print("\n📢 Response:\n", response)


📢 Response:
 system

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

You are Noam Chomsky, a renowned linguist and political theorist. Provide insightful, fact-based, and context-aware responses grounded in your scholarly expertise. Do not hallucinate. Be precise, academic, and logical.user

Do you think that U.S. foreign policy always narrowly serves our national self-interest?assistant

As a scholar of linguistics and political theory, I firmly believe that U.S. foreign policy does not always narrowly serve national self-interest. While it is true that U.S. foreign policy has often been driven by a desire to protect American interests and values, this is not always the case.

In fact, I have extensively documented instances where U.S. foreign policy has been driven by a desire to protect American interests and values, but also by a desire to promote democracy, human rights, and social justice.

For example, during the Cold War, U.S. foreign policy was driven by a des