In [1]:
! pip install bitsandbytes -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset, Dataset
import json

# Default to the CPU and switch to the GPU only when torch can see one.
device = 'cpu'
if torch.cuda.is_available():
  device = 'cuda'

In [3]:
# Benchmark workbook: the "faq" sheet holds the predefined questions, the
# "samples" sheet holds the user questions and their `gt_idx` reference ids.
sheet_path = 'Ka-ChatBot_BenchMark.xlsx'
faq_df = pd.read_excel(sheet_path, sheet_name="faq")
samples_df = pd.read_excel(sheet_path, sheet_name="samples")

# Parallel lists describing the FAQ entries, in sheet row order.
faqs = faq_df['faq'].tolist()
ids = faq_df['idx'].tolist()
categories = faq_df['category'].tolist()

# User questions and, as a tensor, the id each one is later scored against.
samples = samples_df['sample'].tolist()
labels = torch.tensor(samples_df['gt_idx'].tolist())

# Materialised as a list: a bare zip() is a one-shot iterator that would be
# silently empty on a second pass.
faq_list = list(zip(ids, faqs, categories))


In [4]:
# Checkpoint shared by the tokenizer and the model weights.
model_id = "Qwen/Qwen3-4B-Instruct-2507"

# 4-bit NF4 weights with nested (double) quantization; compute runs in float16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map=device,
)

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/99.6M [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/238 [00:00<?, ?B/s]

In [13]:
# Pad on the left so that, within a batch, every prompt ends at the same column
# and the generated tokens start right after it (the decoding cell slices the
# output at the padded prompt length).
tokenizer.padding_side = 'left'
# Disabled experiment, kept for reference: overriding the pad token with EOS.
# tokenizer.pad_token = tokenizer.eos_token
# model.generation_config.pad_token_id = tokenizer.pad_token_id

In [14]:
# Prompt variant: FAQ entries grouped by category, reasoning steps placed in the
# instruction. NOTE(review): `instruction`, `faq_list`, `rules` and `sysprompt`
# are reassigned by later prompt cells; whichever prompt cell ran last is the
# one used for generation.
instruction = """You are given a user question in Persian, a list of predefined questions with IDs, and the category each question belongs to. Your task is to find the single most semantically similar question.

Steps:
1. Determine which category the user question belongs to. Focus on meaning and intent.
2. Compare the user question only to questions within that category.
3. Choose the question whose intent or meaning best matches the user question.
"""

# Group (id, question) pairs by category, keeping first-seen category order.
# The loop variables deliberately avoid the name `id`, which would rebind the
# builtin for the rest of the kernel session.
category_group = {}
for faq_id, question, category in zip(ids, faqs, categories):
  category_group.setdefault(category, []).append((faq_id, question))

# One "Category:" section per group, listing its questions one per line.
faq_list = "Predefined questions:\n" + ''.join(
    f"\n\nCategory: {category}\n"
    + '\n'.join(f"id: {faq_id} - {question}" for faq_id, question in entries)
    for category, entries in category_group.items()
)

rules = """Rules:
- All questions are in Persian. Do not translate, only compare.
- Return only a JSON object in the format: {"id": "<id>"}
- Do not include explanations or any other text."""


sysprompt = f"{instruction}\n\n{faq_list}\n\n{rules}"

In [24]:
# Marked by the author as the best-performing variant: flat FAQ list, no
# category grouping.
# NOTE(review): a later cell reassigns `sysprompt`, so under Restart & Run All
# this variant is not the one used for generation.
instruction = """You are given a user’s question and a list of predefined questions with IDs.
Your task is to find the most semantically similar predefined question."""

# One "id: <id> - <question>" line per FAQ entry; the category only takes part
# in the zip and is not rendered.
faq_lines = [
    f"id: {faq_id} - {question}"
    for faq_id, question, _category in zip(ids, faqs, categories)
]
faq_list = "Predefined questions:\n" + '\n'.join(faq_lines)

rules = """Rules:
- Compare based on meaning, not just keywords.
- All questions are in Persian. Do not translate, only compare.
- Return only a JSON object in the format: {"id": "<id>"}.
- Do not include explanations or any other text."""


sysprompt = "\n\n".join((instruction, faq_list, rules))

In [29]:
# Prompt variant: FAQ entries grouped by category, with the reasoning steps and
# output rules merged into one numbered list after the FAQ list.
# NOTE(review): this is the last prompt cell in the notebook, so on a top-to-bottom
# run it is the `sysprompt` that reaches generation.
instruction = """You are given a user question in Persian, a list of predefined questions with IDs, and the category each question belongs to. Your task is to find the single most semantically similar question."""

# Group (id, question) pairs by category, keeping first-seen category order.
# The loop variables deliberately avoid the name `id`, which would rebind the
# builtin for the rest of the kernel session.
category_group = {}
for faq_id, question, category in zip(ids, faqs, categories):
  category_group.setdefault(category, []).append((faq_id, question))

# One "Category:" section per group, listing its questions one per line.
faq_list = "Predefined questions:\n" + ''.join(
    f"\n\nCategory: {category}\n"
    + '\n'.join(f"id: {faq_id} - {question}" for faq_id, question in entries)
    for category, entries in category_group.items()
)

rules = """

Steps:
0. All questions are in Persian. Do not translate, only compare.
1. Determine which category the user question belongs to. Focus on meaning and intent.
2. Compare the user question only to questions within that category.
3. Choose the question whose intent or meaning best matches the user question.
4. Return only a JSON object in the format: {"id": "<id>"}
5. Do not include explanations or any other text."""


sysprompt = f"{instruction}\n\n{faq_list}\n\n{rules}"

In [25]:
# Show the assembled system prompt; print() renders the embedded newlines,
# whereas a bare expression would display the escaped repr.
print(sysprompt)

You are given a user’s question and a list of predefined questions with IDs.
Your task is to find the most semantically similar predefined question.

Predefined questions:
id: 1 - چرا من در احراز هویت افتتاح حساب خطای عدم تطبیق تصویر می گیرم؟ 
id: 2 - جگونه افتتاح حساب غیر حضوری کنم ؟
id: 3 - چگونه میتوانم حساب خودم را ببندم؟
id: 4 - چگونه پرداخت قبض با حساب/کارت انجام دهم؟
id: 5 - چرا در زمان کارت به کارت اعلام می کند شماره در سامانه شاهکار ثبت نشده
id: 6 - چگونه کارت را از مسدودی در بیاورم؟
id: 7 - چگونه کارت خود را مسدود کنم؟ 
id: 8 - با سلام چه جوري قسط وام رو پرداخت کنيم ؟
id: 9 - چگونه رمز پویا را فعال کنم ؟
id: 10 - چرا من در باشگاه مشتریان نمی تونم تسهیلات بالاتر از 5 میلیون تومان را دریافت کنم ؟ 
id: 11 - نحوه امتیاز دهی در باشگاه مشتریان چگونه است؟
id: 12 - با چه رتبه ای، می‌توانم کدام تسهیلات را دریافت کنم ؟ 
id: 13 - چگونه رمز ورود خود را تغییر دهم ؟
id: 14 - چگونه رمز اول برای کارت خود انتخاب کنم ؟
id: 15 - چگونه رمز اول کارت خود را تغییر دهم ؟
id: 16 - من یک واریزی داشتم 

In [26]:
class QuestionDataset(Dataset):
  """Wraps user questions as chat conversations that share one system prompt.

  Each item is a two-message list (system turn, then user turn) of
  ``{"role": ..., "content": ...}`` dicts.
  """

  def __init__(self, questions, system_prompt):
    self.system_prompt = system_prompt
    self.questions = questions

  def __len__(self):
    n_questions = len(self.questions)
    return n_questions

  def __getitem__(self, idx):
    system_turn = dict(role="system", content=self.system_prompt)
    user_turn = dict(role="user", content=self.questions[idx])
    return [system_turn, user_turn]


In [27]:
def collate_fn(batch):
  """Render a batch of conversations with the chat template and tokenize it.

  Args:
    batch: list of conversations, each a list of role/content message dicts.

  Returns:
    The padded encoding returned by ``tokenizer.apply_chat_template`` (with
    ``return_dict=True`` and PyTorch tensors), moved to ``device``.
  """
  encoded = tokenizer.apply_chat_template(
      batch,
      return_dict=True,
      return_tensors='pt',
      add_generation_prompt=True,
      padding=True,
  )
  return encoded.to(device)

In [28]:
# One conversation per benchmark sample; shuffling stays off so the responses
# come back in the same order as `samples`.
userqs_ds = QuestionDataset(questions=samples, system_prompt=sysprompt)
dataloader = DataLoader(
    userqs_ds,
    batch_size=16,
    shuffle=False,
    collate_fn=collate_fn,
)

In [29]:
# Greedy-decode an answer for every benchmark question, one padded batch at a time.
allresponses = []
with torch.no_grad():
  for inputs in dataloader:
    outputs = model.generate(**inputs, max_new_tokens=50, do_sample=False)
    # `generate` returns prompt + continuation. With left padding every prompt
    # occupies the same number of columns, so one offset strips them all.
    prompt_len = inputs["input_ids"].shape[-1]
    # Keep every generated position: skip_special_tokens=True already removes
    # EOS/padding, whereas slicing off the last column would drop a real token
    # whenever a row stops at max_new_tokens instead of at EOS.
    new_tokens = outputs[:, prompt_len:]
    responses = tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
    # extend() appends in place instead of copying the whole list per batch.
    allresponses.extend(responses)


In [30]:
def _parse_predicted_id(resp):
  """Return the integer id from a ``{"id": ...}`` JSON reply, or 0 if unparseable.

  0 is the parse-failure sentinel (assumes 0 is never a real FAQ id — the ids
  listed in the prompt start at 1).
  """
  try:
    return int(json.loads(resp)['id'])
  # Only parse-related failures are mapped to the sentinel: invalid JSON or a
  # non-numeric id (ValueError), missing "id" key (KeyError), a non-object
  # payload or null id (TypeError), or an infinite id (OverflowError).
  # KeyboardInterrupt and other unrelated errors still propagate.
  except (ValueError, KeyError, TypeError, OverflowError):
    return 0


# One predicted id per response, in response order.
pids = torch.tensor([_parse_predicted_id(resp) for resp in allresponses])

In [31]:
# Number of responses that produced a non-zero id (0 marks a parse failure).
torch.count_nonzero(pids)

tensor(93)

In [32]:
# Accuracy: share of samples whose predicted id equals the reference id.
n_correct = torch.eq(pids, labels).sum()
n_correct / labels.shape[0]

tensor(0.8280)

In [34]:
# Dump the predicted ids (one per sample, in sample order) for manual inspection.
print(pids)

tensor([ 1,  1,  1,  1,  1,  1,  1,  1,  2,  2,  2,  3,  2,  2,  4,  4,  4,  5,
         5, 13,  5,  5, 20,  5,  5,  2,  1,  6,  6,  6,  6,  6,  6,  6,  6,  3,
         8,  8,  8,  8,  8,  8,  8,  9,  9, 13,  9,  9, 10, 10, 12, 12, 10, 10,
        12, 11, 11, 11, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 14, 16, 16, 16,
        17, 18, 18, 19, 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 22, 22, 23,
        23, 23, 24])


In [35]:
# Add this model's predictions as a new column next to the existing results
# and write the combined table to a separate file (results.csv is not modified).
resultdf = pd.read_csv('results.csv')
# Convert the tensor to plain Python ints so the column dtype and the CSV text
# do not depend on how pandas coerces torch tensors.
resultdf['qwen3-4b-instruct-2507'] = pids.tolist()
resultdf.to_csv('newresults.csv', index=False)