In [None]:
! rm -rf sample_data/

# pip installs

In [None]:
! pip install bitsandbytes -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25h

# imports

In [1]:
import pandas as pd
import torch
import time
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset, Dataset
import json

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

sheet_path = '/content/Ka-ChatBot_BenchMark.xlsx'

encoder_model_id = "mohammad-osoolian/arman-tooka2-finetuned"
llm_model_id = "Qwen/Qwen3-4B-Instruct-2507"

# Load Arman Bank sheet and process

In [3]:
faq_df = pd.read_excel(sheet_path, sheet_name="faq")
samples_df = pd.read_excel(sheet_path, sheet_name="samples")

faqs = faq_df['faq'].tolist()
ids = faq_df['idx'].tolist()
categories = faq_df['category'].tolist()

samples = samples_df['sample'].tolist()
labels = torch.tensor(samples_df['gt_idx'].tolist())

faq_list = zip(ids, faqs, categories)


# Load LLM model and sentence-transformer model

In [4]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,    # nested quantization (saves memory)
    bnb_4bit_quant_type="nf4",         # recommended quantization data type
    bnb_4bit_compute_dtype=torch.float16,  # computations in float16
)


tokenizer = AutoTokenizer.from_pretrained(llm_model_id)
llm_model = AutoModelForCausalLM.from_pretrained(
    llm_model_id,
    device_map=device,
    quantization_config=bnb_config,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
tokenizer.padding_side = 'left'
# tokenizer.pad_token = tokenizer.eos_token # not used for qwen3
# model.generation_config.pad_token_id = tokenizer.pad_token_id # not used for qwen3

In [6]:
encoder_model = SentenceTransformer(encoder_model_id)

# utils and functions

In [None]:
# make system prompt
# best performace for qwen3
instruction = """You are given a user’s question and a list of predefined questions with IDs.
Your task is to find the most semantically similar predefined question."""

faq_list = "Predefined questions:\n" + '\n'.join(f"id: {id} - {q}" for (id, q, cat) in zip(ids, faqs, categories))

rules = """Rules:
- Compare based on meaning, not just keywords.
- All questions are in Persian. Do not translate, only compare.
- Return only a JSON object in the format: {"id": "<id>"}.
- Do not include explanations or any other text."""


sysprompt = f"{instruction}\n\n{faq_list}\n\n{rules}"
print(sysprompt)

You are given a user’s question and a list of predefined questions with IDs.
Your task is to find the most semantically similar predefined question.

Predefined questions:
id: 1 - چرا من در احراز هویت افتتاح حساب خطای عدم تطبیق تصویر می گیرم؟ 
id: 2 - جگونه افتتاح حساب غیر حضوری کنم ؟
id: 3 - چگونه میتوانم حساب خودم را ببندم؟
id: 4 - چگونه پرداخت قبض با حساب/کارت انجام دهم؟
id: 5 - چرا در زمان کارت به کارت اعلام می کند شماره در سامانه شاهکار ثبت نشده
id: 6 - چگونه کارت را از مسدودی در بیاورم؟
id: 7 - چگونه کارت خود را مسدود کنم؟ 
id: 8 - با سلام چه جوري قسط وام رو پرداخت کنيم ؟
id: 9 - چگونه رمز پویا را فعال کنم ؟
id: 10 - چرا من در باشگاه مشتریان نمی تونم تسهیلات بالاتر از 5 میلیون تومان را دریافت کنم ؟ 
id: 11 - نحوه امتیاز دهی در باشگاه مشتریان چگونه است؟
id: 12 - با چه رتبه ای، می‌توانم کدام تسهیلات را دریافت کنم ؟ 
id: 13 - چگونه رمز ورود خود را تغییر دهم ؟
id: 14 - چگونه رمز اول برای کارت خود انتخاب کنم ؟
id: 15 - چگونه رمز اول کارت خود را تغییر دهم ؟
id: 16 - من یک واریزی داشتم 

In [8]:
# custom dataset that convert samples to chat message format for llm inference

class QuestionDataset(Dataset):
  def __init__(self, questions, system_prompt):
    self.questions = questions
    self.system_prompt = system_prompt

  def __len__(self):
    return len(self.questions)

  def __getitem__(self, idx):
    messages = [
      {"role": "system", "content": self.system_prompt},
      {"role": "user", "content": self.questions[idx]},
    ]
    return messages


In [9]:
# collator function that does the tokenizing and padding

def collate_fn(batch):
  return tokenizer.apply_chat_template(
      batch, padding=True,
      add_generation_prompt=True,
      return_tensors='pt',
      return_dict=True).to(device)

# Prediction

## LLM and prompt

In [10]:
userqs_ds = QuestionDataset(samples, sysprompt)
dataloader = DataLoader(userqs_ds, shuffle=False, batch_size=16, collate_fn=collate_fn)

In [11]:
allresponses = []
start = time.time()
with torch.no_grad():
  for inputs in dataloader:
    outputs = llm_model.generate(**inputs, max_new_tokens=50, do_sample=False) # do_sample False -> deterministic results
    new_tokens = outputs[:, inputs["input_ids"].shape[-1]:-1]
    responses = tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
    allresponses = allresponses + responses
end = time.time()

llm_predict_time = (end - start) * 1000 # in ms

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


In [12]:
print(f"LLM Inference Time for 93 Samples in 6 batchs: {llm_predict_time:.0f} ms")

LLM Inference Time for 93 Samples in 6 batchs: 61925 ms


In [13]:
llm_predicts = []
for resp in allresponses:
  try:
    llm_predicts.append(int(json.loads(resp)['id']))
  except:
    llm_predicts.append(0)
llm_predicts = torch.tensor(llm_predicts)
llm_acc = ((llm_predicts == labels).sum() / len(labels)).item()

In [14]:
print(f"LLM Accuracy: {llm_acc:0.3f}")

LLM Accuracy: 0.828


## sentence encoder

In [23]:
start = time.time()
faqs_emb = encoder_model.encode(faqs)
samples_emb = encoder_model.encode(samples)
similarities = encoder_model.similarity(faqs_emb, samples_emb)

predict_indices = torch.max(similarities, axis=0)[1]
encoder_predicts = predict_indices + 1
end = time.time()

encoder_predict_time = (end - start) * 1000 # in ms

In [16]:
print(f"Sentence Encoder Inference Time for 93 Samples in one batch: {encoder_predict_time:.0f} ms")

Sentence Encoder Inference Time for 93 Samples in one batch: 1007 ms


In [24]:
encoder_acc = ((encoder_predicts == labels).sum() / len(labels)).item()

In [25]:
print(f"Sentence Encoder Accuracy: {encoder_acc:0.3f}")

Sentence Encoder Accuracy: 0.817


# Analysis

## inference time

In [183]:
batch_sizes = [1, 2, 4, 8, 16, 32]
llm_predict_times = []
for size in batch_sizes:
  dl = DataLoader(userqs_ds, shuffle=False, batch_size=size, collate_fn=collate_fn)
  batch = next(iter(dl))
  torch.cuda.empty_cache()

  start = time.time()
  with torch.no_grad():
    outputs = llm_model.generate(**batch, max_new_tokens=50, do_sample=False) # do_sample False -> deterministic results
    new_tokens = outputs[:, inputs["input_ids"].shape[-1]:-1]
    responses = tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
  end = time.time()
  del outputs, new_tokens, responses
  torch.cuda.empty_cache()


  llm_predict_times.append((end - start) * 1000) # in ms


In [187]:
batch_sizes = [1, 2, 4, 8, 16, 32]
encoder_predict_times = []
for size in batch_sizes:
  batch = samples[:size]
  faqs_emb = encoder_model.encode(faqs)

  start = time.time()
  samples_emb = encoder_model.encode(batch)
  similarities = encoder_model.similarity(faqs_emb, samples_emb)

  predict_indices = torch.max(similarities, axis=0)[1]
  predicts = predict_indices + 1
  end = time.time()

  encoder_predict_times.append((end - start) * 1000) # in ms



In [202]:
print("#################### REPORT ########################\n")
print("LLM Inference Time for different batch sizes:")
print('\n'.join([f"batch size: {size} - time: {dur:.0f} ms" for size, dur in zip(batch_sizes, llm_predict_times)]))
print('-----------------------')
print("Sentence Encoder Inference Time for different batch sizes:")
print('\n'.join([f"batch size: {size} - time: {dur:.0f} ms" for (size, dur) in zip(batch_sizes, encoder_predict_times)]))
print('\n####################################################')

#################### REPORT ########################

LLM Inference Time for different batch sizes:
batch size: 1 - time: 1455 ms
batch size: 2 - time: 1848 ms
batch size: 4 - time: 3048 ms
batch size: 8 - time: 5113 ms
batch size: 16 - time: 10049 ms
batch size: 32 - time: 21567 ms
-----------------------
Sentence Encoder Inference Time for different batch sizes:
batch size: 1 - time: 33 ms
batch size: 2 - time: 33 ms
batch size: 4 - time: 55 ms
batch size: 8 - time: 97 ms
batch size: 16 - time: 189 ms
batch size: 32 - time: 374 ms

####################################################


## GPU needed

In [192]:
def get_model_size_gb(model: torch.nn.Module):
    """Rough GPU memory usage of a model’s parameters + buffers in MB."""
    param_size = sum(p.nelement() * p.element_size() for p in model.parameters())
    buffer_size = sum(b.nelement() * b.element_size() for b in model.buffers())
    total_size = (param_size + buffer_size) / (1024 ** 3)
    return total_size

In [193]:
llm_model_size = get_model_size_gb(llm_model)
encoder_model_size = get_model_size_gb(encoder_model)

In [198]:
print("#################### REPORT ########################\n")
print(f"Load LLM Model GPU Needed: {llm_model_size:0.2f} GB")
print(f"Load Sentence Encoder Model GPU Needed: {encoder_model_size:0.2f} GB")
print('\n####################################################')

#################### REPORT ########################

Load LLM Model GPU Needed: 2.42 GB
Load Sentence Encoder Model GPU Needed: 1.32 GB

####################################################


In [171]:
def measure_inference_memory_gb(model, sample_input, device="cuda"):
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats(device)
    before = torch.cuda.max_memory_reserved(device) / (1024 ** 3)  # GB

    model = model.to(device)
    model.eval()
    with torch.no_grad():
        if type(model) == type(llm_model):
            _ = model.generate(**sample_input, do_sample=False, max_new_tokens=50)
        elif type(model) == type(encoder_model):
            _ = model.encode(sample_input)
    after = torch.cuda.max_memory_reserved(device) / (1024 ** 3)  # GB
    return after - before

In [158]:
batch_sizes = [1, 2, 4, 8, 16, 32]
llm_inference_memories = []
for size in batch_sizes:
  dl = DataLoader(userqs_ds, shuffle=False, batch_size=size, collate_fn=collate_fn)
  batch = next(iter(dl))
  torch.cuda.empty_cache()
  inference_memory = measure_inference_memory_gb(llm_model, batch)
  llm_inference_memories.append(inference_memory)
  torch.cuda.empty_cache()


In [174]:
batch_sizes = [1, 2, 4, 8, 16, 32]
encoder_inference_memories = []
for size in batch_sizes:
  batch = samples[:size]
  torch.cuda.empty_cache()
  inference_memory = measure_inference_memory_gb(encoder_model, batch)
  encoder_inference_memories.append(inference_memory)
  torch.cuda.empty_cache()




In [199]:
print("#################### REPORT ########################\n")
print("LLM Model Inference Memory:")
print('\n'.join([f"batch size: {size} - memory: {memo*1000:.0f} MB" for size, memo in zip(batch_sizes, llm_inference_memories)]))
print('--------------------------------------')
print("Sentence Encoder Model Inference Memory:")
print('\n'.join([f"batch size: {size} - memory: {memo*1000:.0f} MB" for size, memo in zip(batch_sizes, encoder_inference_memories)]))
print('\n####################################################')

#################### REPORT ########################

LLM Model Inference Memory:
batch size: 1 - memory: 514 MB
batch size: 2 - memory: 541 MB
batch size: 4 - memory: 1051 MB
batch size: 8 - memory: 2221 MB
batch size: 16 - memory: 4451 MB
batch size: 32 - memory: 8859 MB
--------------------------------------
Sentence Encoder Model Inference Memory:
batch size: 1 - memory: 2 MB
batch size: 2 - memory: 2 MB
batch size: 4 - memory: 6 MB
batch size: 8 - memory: 20 MB
batch size: 16 - memory: 51 MB
batch size: 32 - memory: 135 MB

####################################################


## case study

In [26]:
def category_accuracy(predics):
  true_categories = 0
  for i in range(len(labels)):
    if categories[labels[i] - 1] == categories[predicts[i] - 1]:
      true_categories += 1
  return true_categories / len(predicts)


In [36]:
llm_cat_acc = category_accuracy(llm_predicts)
llm_error_parsing = len(llm_predicts) - llm_predicts.count_nonzero()
encoder_cat_acc = category_accuracy(encoder_predicts)

In [200]:
print("#################### REPORT ########################\n")
print(f"LLM Accuracy: {llm_acc:.3f} - LLM Category Accuracy: {llm_cat_acc:.3f} - LLM Error Parsing Json: {llm_error_parsing}")
print(f"Sentence Encoder Accuracy: {encoder_acc:.3f} - Sentence Encoder Category Accuracy: {encoder_cat_acc:.3f}")
print('\n####################################################')

#################### REPORT ########################

LLM Accuracy: 0.828 - LLM Category Accuracy: 0.914 - LLM Error Parsing Json: 0
Sentence Encoder Accuracy: 0.817 - Sentence Encoder Category Accuracy: 0.914

####################################################


In [46]:
llm_predicts_mask = llm_predicts == labels
encoder_predicts_mask = encoder_predicts == labels
both_true = llm_predicts_mask & encoder_predicts_mask
both_wrong = ~(llm_predicts_mask | encoder_predicts_mask)
only_llm_true = llm_predicts_mask & (~encoder_predicts_mask)
only_encoder_true = encoder_predicts_mask & (~llm_predicts_mask)

In [201]:
print("#################### REPORT ########################\n")
print(f"Num of Easy Samples: {both_true.sum()}")
print(f"Num of Hard Samples: {both_wrong.sum()}")

print(f"LLM Only: {only_llm_true.sum()}")
print(f"Sentence Encoder Only: {only_encoder_true.sum()}")
print('\n####################################################')

#################### REPORT ########################

Num of Easy Samples: 67
Num of Hard Samples: 7
LLM Only: 10
Sentence Encoder Only: 9

####################################################


In [55]:
print("Hard Samples")
samples_df[both_wrong.tolist()]

Hard Samples


Unnamed: 0,sample,gt_idx
10,خب نمیشه کلا درخواست افتتاح حساب رو لغو کنید؟,3
12,سلام وقت به خیر من درخواست افتتاح حساب ثبت کرد...,3
13,سلام درخواست افتتاح حساب جدید دادم ولی منصرف ش...,3
34,سلام وقت بخیر\nکارت من به خاطر رمز اشتباه مسدو...,7
54,سلام صبحتون بخیر . چرا من با امتیاز ۹۰۰ هزار ت...,10
59,سلام وقت بخیر \nبرای حساب جاری هم امتیاز میدین...,12
71,سلام وقتتون بخیر چطوری میتونم واریزای ب کارتخو...,17


In [58]:
print("LLM Only")
samples_df[only_llm_true.tolist()]

LLM Only


Unnamed: 0,sample,gt_idx
11,سلام وقت بخیر بنده قبلا ثبت نام کردم ولی الان ...,3
20,سلام چرانیمتونم کارت به کارت کنم رمزپویابرام ن...,5
32,سلام \nرمز کارت را از طریق تخصیص انتخاب کردم ...,6
38,درون برنامه هم پرداخت قسط رو میزنم ولی برداشت ...,8
56,سلام چرا از وقتی واریز به حساب داشتم، امتیاز م...,11
58,سلام خسته نباشید امتیاز تسهیلات من چرا زیاد نم...,11
65,سلام کارتمو گرفتم رمز ندارم چجوری رمزشو بگیرم,14
67,با سلام برای تخصیص وتغییر رمز گزینه اش فعال ن...,15
73,سلام روزبخیر\nچرا دسترسی به فایل حسابهای سالها...,18
74,سلام\nمیخواستم تراکنش یک سال گذشته رو دریافت ک...,18


In [59]:
print("Sentence Encoder Only")
samples_df[only_encoder_true.tolist()]

Sentence Encoder Only


Unnamed: 0,sample,gt_idx
19,سلام \nشماره همراهی که به بانک معرفی کرده بودن...,5
22,سلام وقت بخیر. میخام پول کارت ب کارت کنم مینوی...,5
25,سلام لطفا ج بدید، چطوری میتونم بدون مراجعه ب ب...,5
26,چگونه شماره موبایل خود را تایید کنم,5
35,می خواهم کارتم را بسوزانم؟,7
45,سلام وقت بخیر \nبرای فعالسازی رمز دوم راهنمایی...,9
50,چرا با اینکه امتیازم 2/800/000 بیشتر از پنج می...,10
51,باسلام، من در باشگاه مشتریان دارای امتیاز دومی...,10
68,سلام رمز اول خود را فراموش کردم چطور میشه باز ...,15
