In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2,3,4,5,6,7"
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import time
print(torch.__version__)

# qwen2-7B: Qwen/Qwen2-7B
# qwen2-7B-Instruct: Qwen/Qwen2-7B-Instruct
# qwen3-4B: Qwen/Qwen3-4B
# Llama: meta-llama/Meta-Llama-3-8B
# baichuan: baichuan-inc/Baichuan2-13B
# yi: 01ai/Yi-1.5-6B
# yi: 01ai/Yi-6B

# Checkpoint to evaluate: `model_path` is what gets loaded, `model_name` is the
# short label used for output file names in the evaluation cells.
model_path = 'Qwen/Qwen3-4B'
model_name = 'Qwen3-4B'

# model_path = '01ai/Yi-1.5-34B-Chat'
# model_name = 'Yi-1.5-34B-Chat'

# model_path = 'baichuan-inc/Baichuan2-13B-Base'
# model_name = 'Baichuan2-13B'

start_time = time.time()

# NOTE(review): trust_remote_code=True allows the checkpoint repository to run
# its own Python code at load time; only use it with repositories you trust.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
    trust_remote_code=True,
    attn_implementation="flash_attention_2",
    device_map="auto",
    low_cpu_mem_usage=True
)

# Inference only: switch off dropout and other training-time behaviour.
model.eval()

# Left padding keeps the end of every prompt adjacent to the generated tokens
# when prompts of different lengths are batched together.
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, padding_side='left')

# The evaluation cells tokenize with `padding=True`, which requires a pad token.
# Some tokenizers ship without one (presumably some of the checkpoints listed
# at the top of this cell - verify per model), so fall back to EOS, which is
# also the `pad_token_id` the evaluation cells hand to `generate`.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

end_time = time.time()
print(f"Model loaded in {end_time - start_time:.2f} seconds")
print(f'Model name is {model_name}')

In [2]:
import os
import subprocess

# Run `nvidia-smi` through the shell for a quick look at the GPUs; the exit
# status returned by os.system is discarded.
os.system('nvidia-smi')
def memory_display():
    """Print the total of nvidia-smi's per-GPU ``memory.used`` values.

    Runs ``nvidia-smi`` with a headerless, unit-less CSV query, adds up the one
    integer it prints per line, divides the sum by 1024 and prints the result
    labelled as GB.

    Returns:
        None. The figure is only printed.

    Raises:
        subprocess.CalledProcessError: if the nvidia-smi command exits non-zero.
        ValueError: if a line of the output is not an integer.
    """
    query_cmd = "nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits"
    raw_text = subprocess.check_output(query_cmd, shell=True).decode('utf-8')

    used_total = 0
    for entry in raw_text.strip().split('\n'):
        used_total += int(entry)

    total_memory = used_total / 1024
    print(f"Total GPU Memory Used: {total_memory: .2f} GB")

# Report the summed GPU memory figure once, right after this cell is run.
memory_display()

In [16]:
def split_list_into_batches(lst, batch_size):
    """Cut ``lst`` into consecutive slices of at most ``batch_size`` items.

    Only ``len()`` and slicing are used, so any sliceable sequence works.

    Args:
        lst: sequence to split.
        batch_size: maximum number of items per slice.

    Returns:
        A list of slices in original order; the last one may be shorter.
        An empty input yields an empty list.
    """
    chunks = []
    for start in range(0, len(lst), batch_size):
        chunks.append(lst[start:start + batch_size])
    return chunks

In [3]:
# Overall Evaluation
import os
import json
import warnings

# Overall accuracy evaluation: greedy-decode one short answer per benchmark
# question, write every answer to a JSONL file, and report accuracy for
# 'dim_1', for all other dims, and overall.

# few-shot
few_shot = True
# few_shot = ''

fs_prompt = ''
# Alternative few-shot instruction (kept verbatim so it can be re-enabled):
# fs_prompt = '模仿#示例1#和#示例2#，给出#示例3#的答案。'
fs_1 = "*句子*：快速加热（功能）对于___（品类）很重要\n*选项*：['微波炉', '手电筒', '电风扇', '音响']\n*答案*：微波炉"
fs_2 = "*句子*：保暖（功能）对于___（品类）很重要\n*选项*：['短袖衫', '羽绒服', '拖鞋', '太阳帽']\n*答案*：羽绒服"
suffix = "选择*选项*中相对来说最合适的一个作为*答案*填入空格处来补全以下*句子*，使其符合电子商务和生活消费领域中的逻辑和知识，输出选择的*选项*即可。括号内是词语的标签。"

# if 'Qwen' in model_name:
#     new_eos_token_id = 151645 # <|im_end|>
# #     new_eos_token_id = 151643 # <|endoftext|>
#     tokenizer.eos_token_id = new_eos_token_id

input_path = './Datasets/ECKGBench.jsonl'

system_prompt = '直接输出答案。'


if few_shot:
    output_path = f'./outputs/{model_name}_fs'
else:
    output_path = f'./outputs/{model_name}'

# Opening the output file fails if its directory does not exist yet.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Decide once whether prompts go through the chat template.
# The previous test, `'instruct' or 'chat' in model_name.lower()`, was always
# true because the non-empty string 'instruct' is truthy, so every model was
# given the chat template regardless of its name.
use_chat_template = any(tag in model_name.lower() for tag in ('instruct', 'chat'))


def _safe_pct(hits, count):
    """Return 100*hits/count rounded to 2 decimals, or NaN when count is 0."""
    return round(100 * hits / count, 2) if count else float('nan')


# Read with an explicit encoding (the data contains Chinese text) and close the
# file immediately; blank lines would make json.loads raise, so drop them.
with open(input_path, encoding='utf-8') as f:
    lines = [line for line in f if line.strip()]

total_time = 0
batch_size = 100
batches = split_list_into_batches(lines, batch_size)
acc_1, acc_2, num_1, num_2 = 0, 0, 0, 0

# The context manager closes the output file even if a batch raises.
with open(output_path, 'w', encoding='utf-8') as f_out:
    for i, batch in enumerate(batches):
        prompts = []
        ids = []
        gts = []
        dims = []
        choices = []
        print('-'*100)
        print('Num batch %s'%i)
        # record batch start time
        start_time = time.time()
        for line in batch:
            cur_dict = json.loads(line)
            uniq_id = cur_dict['id']
            question, gt, dim = cur_dict['question'], cur_dict['gt'], cur_dict['dim']
            # Everything after the options marker; used below to detect answers
            # that merely echo the whole option list.
            choice = question.split('*选项*：')[1]

            if few_shot:
                # Keep only the sentence/options part and wrap it as the third
                # example after the two worked ones.
                _, example = cur_dict['question'].split('\n*句子*：')
                question = suffix + fs_prompt + '\n#示例1#\n' + fs_1 + '\n#示例2#\n' + fs_2 + '\n#示例3#\n*句子*：' + example + '\n*答案*：'

            if use_chat_template:
                messages = [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": question}
                ]
                question = tokenizer.apply_chat_template(
                    messages,
                    tokenize=False,
                    add_generation_prompt=True
                )

            ids.append(uniq_id)
            prompts.append(question)
            gts.append(gt)
            dims.append(dim)
            choices.append(choice)

        model_inputs = tokenizer(prompts, return_tensors="pt", padding=True).to("cuda")

        # Prompts in a batch are padded to a common length, so the attention
        # mask must be passed for the model to ignore the padding positions.
        generated_ids = model.generate(
            model_inputs.input_ids,
            attention_mask=model_inputs.attention_mask,
            max_new_tokens=8,
            repetition_penalty=1.1,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            early_stopping=True
        )

        # Every row of the padded batch has the same prompt length, so a single
        # column slice removes the prompt from all outputs.
        prompt_len = model_inputs.input_ids.shape[1]
        batch_response = tokenizer.batch_decode(generated_ids[:, prompt_len:], skip_special_tokens=True)

        torch.cuda.empty_cache()

        for uniq_id, response, gt, dim, choice in zip(ids, batch_response, gts, dims, choices):
            # Keep only the text before the first blank line.
            response = response.split('\n\n')[0]
            # Correct when the ground truth appears and the answer is not just
            # a copy of the full option list.
            correct = gt in response and choice not in response
            if choice in response:
                print(response)
                print(correct)

            if dim == 'dim_1':
                acc_1 += correct
                num_1 += 1
            else:
                acc_2 += correct
                num_2 += 1
            output_info = {"uniq_id": uniq_id, "content": response, "gt": gt}
            json.dump(output_info, f_out, ensure_ascii=False)
            f_out.write('\n')

        # record batch finish time
        end_time = time.time()
        total_time += end_time - start_time
        print(f"Batch finished in {end_time - start_time:.2f} seconds")
        torch.cuda.empty_cache()

print(f"finished in {total_time:.2f} seconds")
memory_display()
print('num_1 = ', num_1)
print('acc_1 = ', _safe_pct(acc_1, num_1))
print('num_2 = ', num_2)
print('acc_2 = ', _safe_pct(acc_2, num_2))
print('num = ', num_1+num_2)
print('acc = ', _safe_pct(acc_1 + acc_2, num_1 + num_2))

In [4]:
# Knowledge Boundary

import json
import warnings

# Knowledge-boundary evaluation: sample several answers per question and
# measure how often the ground truth shows up (hit rate), whether it shows up
# at all (recall), and whether it shows up in every sample (strict correct).
import os

# # few-shot
# fs_prompt = '模仿#示例1#和#示例2#，给出#示例3#的答案。'
# fs_1 = "*句子*：快速加热（功能）对于___（品类）很重要\n*选项*：['微波炉', '手电筒', '电风扇', '音响']\n*答案*：微波炉"
# fs_2 = "*句子*：保暖（功能）对于___（品类）很重要\n*选项*：['短袖衫', '羽绒服', '拖鞋', '太阳帽']\n*答案*：羽绒服"
# suffix = "选择*选项*中相对来说最合适的一个作为*答案*填入空格处来补全以下*句子*，使其符合电子商务和生活消费领域中的逻辑和知识，输出选择的*选项*即可。括号内是词语的标签。"

# if 'Qwen' in model_name:
#     new_eos_token_id = 151645 # <|im_end|>
# #     new_eos_token_id = 151643 # <|endoftext|>
#     tokenizer.eos_token_id = new_eos_token_id

input_path = './Datasets/ECKGBench.jsonl'
system_prompt = '直接输出答案。'

output_path = f'./outputs/{model_name}_sample'

# Opening the output file fails if its directory does not exist yet.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Sampling is stochastic; fix the seed so the reported metrics can be reproduced.
seed = 0
torch.manual_seed(seed)


def _safe_pct(total, count):
    """Return 100*total/count rounded to 2 decimals, or NaN when count is 0."""
    return round(100 * total / count, 2) if count else float('nan')


# `width` is the number of sequences generated per batch in total:
# batch_size prompts times num_return_sequences samples each.
width = 200
num_return_sequences = 5
batch_size = int(width/num_return_sequences)

# Read with an explicit encoding (the data contains Chinese text) and close the
# file immediately; blank lines would make json.loads raise, so drop them.
with open(input_path, encoding='utf-8') as f:
    lines = [line for line in f if line.strip()]

batches = split_list_into_batches(lines, batch_size)

hit_1, hit_2, recall_1, recall_2, corr_1, corr_2, num_1, num_2 = 0, 0, 0, 0, 0, 0, 0, 0
highlyknown, maybeknown, unknown = 0, 0, 0

# The context manager closes the output file even if a batch raises.
with open(output_path, 'w', encoding='utf-8') as f_out:
    for i, batch in enumerate(batches):
        prompts = []
        ids = []
        gts = []
        dims = []
        choices = []
        print('-'*100)
        print('Num batch %s'%i)
        # record batch start time
        start_time = time.time()
        for line in batch:
            cur_dict = json.loads(line)
            uniq_id = cur_dict['id']
            question, gt, dim = cur_dict['question'], cur_dict['gt'], cur_dict['dim']
            # Everything after the options marker; used below to detect answers
            # that merely echo the whole option list.
            choice = question.split('*选项*：')[1]

            ids.append(uniq_id)
            prompts.append(question)
            gts.append(gt)
            dims.append(dim)
            choices.append(choice)

        model_inputs = tokenizer(prompts, return_tensors="pt", padding=True).to("cuda")

        # Prompts in a batch are padded to a common length, so the attention
        # mask must be passed for the model to ignore the padding positions.
        generated_ids = model.generate(
            model_inputs.input_ids,
            attention_mask=model_inputs.attention_mask,
            max_new_tokens=8,
#             repetition_penalty=1.1,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,   # set the EOS token id
            do_sample=True,  # enable sampling
#             top_p=0.95,      # set the top-p parameter
            temperature=0.2,
            num_return_sequences=num_return_sequences,
        )

        # Every row of the padded batch has the same prompt length, so a single
        # column slice removes the prompt from all outputs. The samples for one
        # prompt are consecutive rows, so chunking by num_return_sequences
        # groups them per prompt.
        prompt_len = model_inputs.input_ids.shape[1]
        new_tokens = generated_ids[:, prompt_len:]
        batch_response = [
            tokenizer.batch_decode(sample_group, skip_special_tokens=True)
            for sample_group in split_list_into_batches(new_tokens, num_return_sequences)
        ]

        torch.cuda.empty_cache()

        for uniq_id, response, gt, dim, choice in zip(ids, batch_response, gts, dims, choices):
            # One flag per sample: ground truth present and the answer is not
            # just a copy of the full option list.
            flags = [gt in answer and choice not in answer for answer in response]
            # hit rate
            hit = sum(flags)/len(flags)
            # recall
            recall = any(flags)
            # strict correct
            corr = all(flags)

            if corr:
                highlyknown += 1
            elif recall:
                maybeknown += 1
            else:
                unknown += 1

            if dim == 'dim_1':
                hit_1 += hit
                recall_1 += recall
                corr_1 += corr
                num_1 += 1
            else:
                hit_2 += hit
                recall_2 += recall
                corr_2 += corr
                num_2 += 1
            output_info = {"uniq_id": uniq_id, "content": response, "gt": gt}
            json.dump(output_info, f_out, ensure_ascii=False)
            f_out.write('\n')

        # record batch finish time
        end_time = time.time()
        print(f"Batch finished in {end_time - start_time:.2f} seconds")
        torch.cuda.empty_cache()

num_total = num_1 + num_2

print('HR_1 = ', _safe_pct(hit_1, num_1))
print('HR_2 = ', _safe_pct(hit_2, num_2))
print('HR = ', _safe_pct(hit_1 + hit_2, num_total))

print('Recall_1 = ', _safe_pct(recall_1, num_1))
print('Recall_2 = ', _safe_pct(recall_2, num_2))
print('Recall = ', _safe_pct(recall_1 + recall_2, num_total))

print('Corr_1 = ', _safe_pct(corr_1, num_1))
print('Corr_2 = ', _safe_pct(corr_2, num_2))
print('Corr = ', _safe_pct(corr_1 + corr_2, num_total))


print('know_level_1 = ', _safe_pct(highlyknown, num_total))
print('know_level_2 = ', _safe_pct(maybeknown, num_total))
print('know_level_3 = ', _safe_pct(unknown, num_total))