In [None]:
import requests
import json
from tqdm import tqdm
from transformers import  AutoTokenizer, AutoModelForCausalLM
import pandas as pd
from openai import OpenAI
import time
from http import HTTPStatus
import dashscope

url = "http://localhost:4399/evaluate"  # Evaluation-service endpoint; if the service runs on a different machine, replace localhost with that server's IP

def request_evaluate(query, content):
    """Send a query/answer pair to the evaluation service and return its verdict.

    Posts ``{"query": ..., "content": ...}`` as a JSON body to the module-level
    ``url`` and reads the ``reasoning``, ``ans`` and ``score`` fields from the
    JSON reply.

    Args:
        query: The question that was put to the model under test.
        content: The model's answer to ``query``.

    Returns:
        A ``(reasoning, ans, score)`` tuple taken from the service response.

    Raises:
        RuntimeError: If the reply cannot be decoded as JSON or lacks one of
            the expected fields. The raw response body is included in the
            message and is also printed, as before.
    """
    payload = {
        "query": query,
        "content": content
    }

    headers = {
        "Content-Type": "application/json"
    }

    response = requests.post(url, data=json.dumps(payload), headers=headers)
    try:
        # Decode the body once instead of re-parsing it for every field.
        result = response.json()
        reasoning = result['reasoning']
        ans = result['ans']
        score = result['score']
    except (ValueError, KeyError, TypeError) as err:
        # Previously a bare ``except`` printed the body and then fell through
        # to the return statement with unbound names (UnboundLocalError).
        # Fail here instead, with the reason attached.
        print(response.text)
        raise RuntimeError(
            f"Unexpected response from evaluation service: {response.text}"
        ) from err

    return (reasoning, ans, score)


class chat_local_model():
    """Wrapper around a locally loaded causal language model and its tokenizer."""

    def __init__(self, model_name, generate_cuda="cuda:0"):
        """Load the model and tokenizer identified by ``model_name``.

        Args:
            model_name: Path or identifier passed to ``from_pretrained``.
            generate_cuda: Accepted for compatibility; not used by this
                constructor (device placement is left to ``device_map="auto"``).
        """
        self.model_name = model_name

        # Plain (non-quantisation-specific) loading: dtype and device placement
        # are both resolved automatically.
        load_options = {
            "torch_dtype": "auto",
            "trust_remote_code": True,
            "device_map": "auto",
        }
        self.model = AutoModelForCausalLM.from_pretrained(model_name, **load_options)

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def chat_transformers_model(self,content, system_content="你是一个有用的助手"):
        """Generate a single reply to ``content`` using the chat template.

        Args:
            content: The user message.
            system_content: The system prompt placed before the user message.

        Returns:
            The decoded text of the newly generated tokens (prompt excluded,
            special tokens skipped).
        """
        conversation = [
            {"role": "system", "content": system_content},
            {"role": "user", "content": content},
        ]
        prompt = self.tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=True,
        )
        encoded = self.tokenizer([prompt], return_tensors="pt").to(self.model.device)

        sequences = self.model.generate(**encoded, max_new_tokens=1024)

        # Each generated sequence starts with its prompt tokens; slice them off
        # so only the continuation is decoded.
        continuations = []
        for prompt_ids, full_ids in zip(encoded.input_ids, sequences):
            continuations.append(full_ids[len(prompt_ids):])

        decoded = self.tokenizer.batch_decode(continuations, skip_special_tokens=True)
        return decoded[0]
    


def chat_qwen(content, system_content = "你是一个有用的助手!", model_name = "qwen2.5-14b-instruct"):
    """Ask a model served through the DashScope OpenAI-compatible endpoint.

    Args:
        content: The user message.
        system_content: The system prompt.
        model_name: Model identifier sent to the endpoint.

    Returns:
        The text content of the first choice in the completion.
    """
    conversation = [
        {'role': 'system', 'content': system_content},
        {'role': 'user', 'content': content},
    ]

    # NOTE(review): api_key is blank in this file — presumably a real key has
    # to be filled in before the request can authenticate.
    client = OpenAI(
        api_key="",
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
    )

    completion = client.chat.completions.create(
        model=model_name,
        messages=conversation,
    )
    return completion.choices[0].message.content



def chat_volces(content, syetem_content = "你是一个有用的助手!", model_name = "doubao-1-5-lite-32k-250115"):
    """Ask a model through the Volces Ark chat-completions HTTP endpoint.

    Args:
        content: The user message.
        syetem_content: The system prompt. (The parameter name is misspelled
            but kept so existing keyword callers keep working.)
        model_name: Model identifier placed in the request body.

    Returns:
        The message content of the first choice in the JSON response.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
    """
    # Named ``endpoint`` so it does not shadow the module-level ``url`` used
    # by the evaluation-service client.
    endpoint = 'https://ark.cn-beijing.volces.com/api/v3/chat/completions'
    headers = {
        # NOTE(review): the Authorization value is blank in this file —
        # presumably a credential has to be supplied here.
        'Authorization': "",
        # Tell the server explicitly that the body is JSON.
        'Content-Type': 'application/json'
    }
    payload = {
        "model": model_name,
        "messages": [
            {
                "role": "system",
                "content": syetem_content
            },
            {
                "role": "user",
                "content": content
            }
        ],
        "stream": False
    }

    response = requests.post(endpoint, headers=headers, json=payload)
    # Surface HTTP failures as such rather than as an opaque KeyError when
    # the error body has no 'choices' field.
    response.raise_for_status()
    return response.json()['choices'][0]['message']['content']

def chat_deepseek(content, syetem_content = "你是一个有用的助手!", model_name = "deepseek-chat"):
    """Ask a model through the DeepSeek OpenAI-compatible endpoint.

    Args:
        content: The user message.
        syetem_content: The system prompt (parameter name kept as spelled).
        model_name: Model identifier sent to the endpoint.

    Returns:
        The text content of the first choice in the completion.
    """
    conversation = [
        {"role": "system", "content": syetem_content},
        {"role": "user", "content": content},
    ]

    # NOTE(review): api_key is blank in this file — presumably to be filled in.
    client = OpenAI(api_key="", base_url="https://api.deepseek.com")

    # Streaming is disabled: the full reply is returned in a single response.
    completion = client.chat.completions.create(
        model=model_name,
        messages=conversation,
        stream=False,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

def chat_qianfan(content, system_content="你是一个有用的助手", model_name="ernie-x1-32k-preview"):
    """Ask a model through the Qianfan OpenAI-compatible endpoint.

    The request carries a ``web_search`` extra-body section with search,
    citation and trace all switched off.

    Args:
        content: The user message.
        system_content: The system prompt.
        model_name: Model identifier sent to the endpoint.

    Returns:
        The text content of the first choice in the completion.
    """
    conversation = [
        {"role": "system", "content": system_content},
        {"role": "user", "content": content},
    ]
    web_search_options = {
        "enable": False,
        "enable_citation": False,
        "enable_trace": False,
    }

    # NOTE(review): api_key is blank in this file — presumably to be filled in.
    client = OpenAI(
        base_url='https://qianfan.baidubce.com/v2',
        api_key=''
    )
    completion = client.chat.completions.create(
        model=model_name,
        messages=conversation,
        extra_body={"web_search": web_search_options},
    )
    return completion.choices[0].message.content

def chat_xiaoai(content, system_content = "你是一个有用的助手!", model_name = "gpt-3.5-turbo"):
    """Ask a model through the OpenAI-compatible endpoint at pro.xiaoai.plus.

    Args:
        content: The user message.
        system_content: The system prompt.
        model_name: Model identifier sent to the endpoint.

    Returns:
        The text content of the first choice in the completion.
    """
    conversation = [
        {'role': 'system', 'content': system_content},
        {'role': 'user', 'content': content},
    ]

    # NOTE(review): api_key is blank in this file — presumably to be filled in.
    client = OpenAI(
        api_key="",
        base_url="https://pro.xiaoai.plus/v1",
    )

    completion = client.chat.completions.create(
        model=model_name,
        messages=conversation,
    )
    return completion.choices[0].message.content

def chat_dashscope(content, system_content="你是一个有用的助手", model_name='llama3.3-70b-instruct'):
    """Ask a model through the DashScope SDK, retrying on failure.

    Up to three attempts are made. After a failed attempt the function waits
    (3 seconds first, doubling each time) before trying again.

    Args:
        content: The user message.
        system_content: The system prompt.
        model_name: Model identifier passed to ``dashscope.Generation.call``.

    Returns:
        The message content of the first choice of a successful response.

    Raises:
        Exception: When every attempt failed; the message carries the text of
            the last error.
    """
    max_retries = 3
    retry_delay = 3  # seconds before the first retry; doubled after each failure

    conversation = [
        {'role': 'system', 'content': system_content},
        {'role': 'user', 'content': content},
    ]

    attempt_no = 0
    while attempt_no < max_retries:
        attempt_no += 1
        try:
            # NOTE(review): api_key is blank in this file — presumably to be
            # filled in.
            response = dashscope.Generation.call(
                api_key="",
                model=model_name,
                messages=conversation,
                result_format='message',
            )

            # A non-OK status is routed through the same retry path as any
            # other error by raising inside the try block.
            if response.status_code != HTTPStatus.OK:
                raise Exception(f"API request failed with status code: {response.status_code}")
            return response.output.choices[0].message.content

        except Exception as e:
            out_of_attempts = attempt_no == max_retries
            if out_of_attempts:
                raise Exception(f"Failed after {max_retries} attempts. Last error: {str(e)}")

            print(f"Attempt {attempt_no} failed. Retrying in {retry_delay} seconds... Error: {str(e)}")
            time.sleep(retry_delay)
            retry_delay *= 2  # exponential back-off

def chat_qwen3(content, system_content="你是一个有用的助手", model_name="qwen3-30b-a3b"):
    """Ask a Qwen3 model through the DashScope endpoint using streaming.

    The reply is requested as a stream and the streamed text fragments are
    concatenated into one string.

    Args:
        content: The user message.
        system_content: The system prompt.
        model_name: Model identifier sent to the endpoint.

    Returns:
        The full reply text assembled from all streamed chunks.
    """
    # NOTE(review): api_key is blank in this file — presumably to be filled in.
    api_key=""

    client = OpenAI(
        api_key=api_key,
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
    )

    completion = client.chat.completions.create(
        model=model_name,
        messages=[
            {"role": "system", "content": system_content},
            {"role": "user", "content": content}
        ],
        stream=True,
        # Qwen3 models toggle their thinking phase through enable_thinking.
        # Per the original note, open-source variants default to True and
        # raise an error unless it is switched off explicitly here.
        extra_body={"enable_thinking": False},
    )

    pieces = []
    for chunk in completion:
        # With stream_options.include_usage enabled the final chunk has an
        # empty choices list (usage only); it carries no text.
        if not chunk.choices:
            continue
        piece = chunk.choices[0].delta.content
        # A delta may carry no text (content is None, e.g. a role-only or
        # finish chunk); concatenating None to a str raised TypeError before.
        if piece:
            pieces.append(piece)
    return "".join(pieces)

def chat_manager(query, model_type, model_name, test_model):
    """Route ``query`` to the backend selected by ``model_type``.

    Args:
        query: The user message to send.
        model_type: One of ``"local_request"``, ``"qwen_api"``,
            ``"volces_api"``, ``"deepseek_api"``, ``"qianfan_api"``,
            ``"xiaoai_api"``, ``"dashscope_api"`` or ``"qwen3_api"``.
        model_name: Model identifier forwarded to the API backends
            (ignored for ``"local_request"``).
        test_model: Object exposing ``chat_transformers_model(query)``;
            only used for ``"local_request"``.

    Returns:
        The reply text produced by the selected backend.

    Raises:
        ValueError: If ``model_type`` is not one of the supported values.
    """
    if model_type == "local_request":
        return test_model.chat_transformers_model(query)
    if model_type == "qwen_api":
        return chat_qwen(query, model_name=model_name)
    if model_type == "volces_api":
        return chat_volces(query, model_name=model_name)
    if model_type == "deepseek_api":
        return chat_deepseek(query, model_name=model_name)
    if model_type == "qianfan_api":
        return chat_qianfan(query, model_name=model_name)
    if model_type == "xiaoai_api":
        return chat_xiaoai(query, model_name=model_name)
    if model_type == "dashscope_api":
        return chat_dashscope(query, model_name=model_name)
    if model_type == "qwen3_api":
        return chat_qwen3(query, model_name=model_name)

    # Previously this case printed a message and then failed on an unbound
    # local at ``return ans``; report the real problem instead.
    raise ValueError(f"model_type error, {model_type}")

# Load the evaluation queries. The encoding is given explicitly so the read
# does not depend on the platform's default text encoding.
with open('../data/evaluate_data/random_sampled_query_evaluation_60.json', 'r', encoding='utf-8') as f:
    eval_data = json.load(f)
print(f"开始评估, 评估数据集长度为 {len(eval_data)}")

开始评估, 评估数据集长度为 60


In [None]:
def main_evaluation(key_word, model_type, model_path,
                    result_file_path="/data/home/shizeyang/lyh_workzone/LLM_Scores/V5/evaluation_data/generate_eval_data_test60.json"):
    """Evaluate one model on ``eval_data`` and store its results under ``key_word``.

    For every entry of the module-level ``eval_data`` the model is asked the
    entry's ``query`` and the answer is scored through ``request_evaluate``.
    The collected answers, reasoning, verdicts and scores are merged into the
    JSON results file, and summary statistics of the scores are printed.

    A model whose stored results already cover every evaluation entry is
    skipped.

    Args:
        key_word: Key under which this model's results are stored.
        model_type: Backend selector understood by ``chat_manager``.
        model_path: Local model path (for ``"local_request"``) or remote
            model name (for the API backends).
        result_file_path: JSON file the results are loaded from and written
            back to. Defaults to the file this function has always written.

    Returns:
        None.
    """
    import os

    # Load earlier results from the same file that is written below, so the
    # skip check sees previous runs and saving does not discard other models'
    # entries. (The original called ``isinstance`` with one argument, left the
    # dict undefined when the file was missing, and loaded from a different
    # path than it saved to.)
    if os.path.isfile(result_file_path):
        with open(result_file_path, 'r', encoding='utf-8') as f:
            generate_eval_data_dict = json.load(f)
    else:
        generate_eval_data_dict = {}

    print(f"开始评估 {model_path.split('/')[-1]}, 访问方式:{model_type}, 保存key_word: {key_word}")
    previous = generate_eval_data_dict.get(key_word)
    if previous is not None and len(previous['score']) == len(eval_data):
        print("数据已存在, 跳过")
        return None

    # Only the local backend needs a model object; API backends are stateless.
    test_model = chat_local_model(model_path) if model_type == "local_request" else None

    reasoning_list = []
    ans_list = []
    score_list = []
    content_list = []

    for sample in tqdm(eval_data):
        query = sample['query']
        content = chat_manager(query, model_type, model_name=model_path, test_model=test_model)
        reasoning, ans, score = request_evaluate(query, content)
        reasoning_list.append(reasoning)
        ans_list.append(ans)
        score_list.append(score)
        content_list.append(content)

    # Print count/mean/std/min/quartiles/max of the scores for a quick look.
    df = pd.DataFrame(score_list, columns=["values"])
    summary = df.describe(percentiles=[0.25, 0.5, 0.75])
    print(summary)

    generate_eval_data_dict[key_word] = {
        "content" : content_list,
        "reasoning" : reasoning_list,
        "ans" : ans_list,
        "score" : score_list
    }
    with open(result_file_path, 'w', encoding='utf-8') as f:
        json.dump(generate_eval_data_dict, f, ensure_ascii=False, indent=4)
    print("数据已保存!")

    if test_model is not None:
        # NOTE(review): this only drops the local reference to the model;
        # whether accelerator memory is actually returned before the next
        # model loads should be confirmed.
        del test_model
        print("已经清除模型缓存")


# Model configurations to evaluate. Each entry is a
# (key_word, model_type, model_path) tuple: the key the results are stored
# under, the backend selector, and the local model path or remote model name.
model_configs = [
    ("qwen_05B_int4","local_request", "/data/home/shizeyang/shared/models/Qwen/Qwen2___5-0___5B-Instruct-GPTQ-Int4"),
    ("qwen_05B_int8","local_request", "/data/home/shizeyang/shared/models/Qwen/Qwen2___5-0___5B-Instruct-GPTQ-Int8"),
    ("qwen_05B","local_request", "/data/home/shizeyang/shared/models/Qwen/Qwen2___5-0___5B-Instruct"),
    ("qwen_15B","local_request", "/data/home/shizeyang/shared/models/Qwen/Qwen2___5-1___5B-Instruct"),
    ("qwen_3B","local_request", "/data/home/shizeyang/shared/models/Qwen/Qwen2___5-3B-Instruct"),
    ("qwen_7B","local_request", "/data/home/shizeyang/shared/models/Qwen/Qwen2___5-7B-Instruct"),
    ("qwen_7B_int4","local_request", "/data/home/shizeyang/shared/models/Qwen/Qwen2___5-7B-Instruct-GPTQ-Int4"),
    ("qwen_7B_int8","local_request", "/data/home/shizeyang/shared/models/Qwen/Qwen2___5-7B-Instruct-GPTQ-Int8"),
    ("qwen_math_15B","local_request", '/data/home/shizeyang/shared/models/Qwen/Qwen2.5-Math-1.5B-Instruct'),
    ("qwen_math_7B","local_request", '/data/home/shizeyang/shared/models/Qwen/Qwen2.5-Math-7B-Instruct'),
    ("MiniMind2_small","local_request", "/data/home/shizeyang/shared/models/MiniMind2-Small"),
    ("qwen_14B","qwen_api", "qwen2.5-14b-instruct"),
    ("qwen_32B","qwen_api", "qwen2.5-32b-instruct"),
    ("qwen_72B","qwen_api", "qwen2.5-72b-instruct"),
    ("qwen-max","qwen_api", "qwen-max"),
    ("qwen-plus","qwen_api", "qwen-plus"),
    ("qwen-turbo","qwen_api", "qwen-turbo"),
    ("llama4_scout_17B","qwen_api", "llama-4-scout-17b-16e-instruct"),
    ("llama4_maverick-17B","qwen_api", "llama-4-maverick-17b-128e-instruct"),
    ("llama33_70B","qwen_api", "llama3.3-70b-instruct"),
    ("deepseek_v3","qwen_api", "deepseek-v3"),
    ("volces_lite","volces_api", "doubao-1-5-lite-32k-250115"),
    ("volces_pro","volces_api", "doubao-1-5-pro-32k-250115"),
    ("kimi","volces_api", "moonshot-v1-8k"),
    ("kimi_32k","volces_api", "moonshot-v1-32k"),
    ("kimi_128k","volces_api", "moonshot-v1-128k"),
    ("deepseek_v3_0324","deepseek_api", "deepseek-chat"),
    ("deepseek_r1","deepseek_api", "deepseek-reasoner"),
    ("ERNIE_X1","qianfan_api", "ernie-x1-32k-preview"),
    ("ERNIE_45","qianfan_api", "ernie-4.5-8k-preview"),
    ("ERNIE_40","qianfan_api", "ernie-4.0-8k"),
    ("ERNIE_speed","qianfan_api", "ernie-speed-8k"),
    ("Qwen_ds_distill_15B","qianfan_api", "deepseek-r1-distill-qwen-1.5b"),
    ("Qwen_ds_distill_7B","qianfan_api", "deepseek-r1-distill-qwen-7b"),
    ("Qwen_ds_distill_14B","qianfan_api", "deepseek-r1-distill-qwen-14b"),
    ("Qwen_ds_distill_32B","qianfan_api", "deepseek-r1-distill-qwen-32b"),
    ("Qwen_qwq_32B","qianfan_api", "qwq-32b"),
    ("Chatgpt_35_turbo","xiaoai_api", "gpt-3.5-turbo"),
    ("Chatgpt_4o","xiaoai_api", "gpt-4o"),
    ("llama32_1B","dashscope_api", "llama3.2-1b-instruct"),
    ("llama32_3B","dashscope_api", "llama3.2-3b-instruct"),
    ("Qwen_farui","dashscope_api", "farui-plus"),
    ("qwen3-0.6b","qwen3_api","qwen3-0.6b"),
    ("qwen3-235b-a22b","qwen3_api","qwen3-235b-a22b"),
    ("qwen3-32b","qwen3_api","qwen3-32b"),
    ("qwen3-30b-a3b","qwen3_api","qwen3-30b-a3b"),
    ("qwen3-14b","qwen3_api","qwen3-14b"),
    ("qwen3-4b","qwen3_api","qwen3-4b"),
    ("qwen3-1.7b","qwen3_api","qwen3-1.7b")
]

# Evaluate every configured model in turn. main_evaluation saves its own
# results and returns None, so its return value is not kept.
for key_word, model_type, model_path in model_configs:
    main_evaluation(key_word, model_type, model_path)

开始评估 Qwen2___5-0___5B-Instruct-GPTQ-Int4, 访问方式:local_request, 保存key_word: qwen_05B_int4
数据已存在, 跳过
开始评估 Qwen2___5-0___5B-Instruct-GPTQ-Int8, 访问方式:local_request, 保存key_word: qwen_05B_int8
数据已存在, 跳过
开始评估 Qwen2___5-0___5B-Instruct, 访问方式:local_request, 保存key_word: qwen_05B
数据已存在, 跳过
开始评估 Qwen2___5-1___5B-Instruct, 访问方式:local_request, 保存key_word: qwen_15B
数据已存在, 跳过
开始评估 Qwen2___5-3B-Instruct, 访问方式:local_request, 保存key_word: qwen_3B
数据已存在, 跳过
开始评估 Qwen2___5-7B-Instruct, 访问方式:local_request, 保存key_word: qwen_7B
数据已存在, 跳过
开始评估 Qwen2___5-7B-Instruct-GPTQ-Int4, 访问方式:local_request, 保存key_word: qwen_7B_int4
数据已存在, 跳过
开始评估 Qwen2___5-7B-Instruct-GPTQ-Int8, 访问方式:local_request, 保存key_word: qwen_7B_int8
数据已存在, 跳过
开始评估 Qwen2.5-Math-1.5B-Instruct, 访问方式:local_request, 保存key_word: qwen_math_15B
数据已存在, 跳过
开始评估 Qwen2.5-Math-7B-Instruct, 访问方式:local_request, 保存key_word: qwen_math_7B
数据已存在, 跳过
开始评估 MiniMind2-Small, 访问方式:local_request, 保存key_word: MiniMind2_small
数据已存在, 跳过
开始评估 qwen2.5-14b-instruct, 访问方式:qwen_api,

 33%|███▎      | 20/60 [06:46<09:55, 14.89s/it]

In [3]:
import numpy as np

# Reload the saved results. They are written as UTF-8 with non-ASCII text
# left unescaped, so the file is read back with the same encoding.
with open("/data/home/shizeyang/lyh_workzone/LLM_Scores/V5/evaluation_data/generate_eval_data_test60.json", 'r', encoding='utf-8') as f:
    generate_eval_data_dict = json.load(f)

# Report mean ± standard deviation of the scores for each stored model.
print("独立测试集")
for k, v in generate_eval_data_dict.items():
    scores = v['score']
    print(f"Model: {k}, Score: {np.mean(scores):.2f} ± {np.std(scores):.2f}")


独立测试集
Model: qwen_05B, Score: 55.31 ± 12.43
Model: qwen_15B, Score: 68.08 ± 11.15
Model: qwen_3B, Score: 76.83 ± 7.15
Model: qwen_7B, Score: 79.60 ± 7.60
Model: qwen_14B, Score: 80.05 ± 7.01
Model: qwen_32B, Score: 80.18 ± 8.05
Model: qwen_72B, Score: 82.29 ± 6.73
Model: volces_lite, Score: 84.84 ± 12.14
Model: volces_pro, Score: 89.59 ± 5.73
Model: deepseek_v3, Score: 87.27 ± 7.08
Model: deepseek_v3_0324, Score: 94.09 ± 0.93
Model: deepseek_r1, Score: 93.76 ± 1.23
Model: qwen-max, Score: 84.65 ± 5.72
Model: qwen-plus, Score: 89.78 ± 5.77
Model: qwen-turbo, Score: 84.59 ± 10.04
Model: llama4_scout_17B, Score: 75.32 ± 10.59
Model: llama4_maverick-17B, Score: 75.77 ± 8.91
Model: llama33_70B, Score: 69.80 ± 9.62
Model: ERNIE_X1, Score: 92.95 ± 1.12
Model: ERNIE_45, Score: 89.07 ± 4.99
Model: ERNIE_40, Score: 74.07 ± 7.72
Model: ERNIE_speed, Score: 70.10 ± 9.11
Model: Qwen_ds_distill_15B, Score: 21.69 ± 11.73
Model: Qwen_ds_distill_7B, Score: 53.43 ± 18.53
Model: Qwen_ds_distill_14B, Score

In [6]:
# Spot check: show the first stored answer generated by the qwen_05B model.
print(generate_eval_data_dict['qwen_05B']['content'][0])

《刑法》第一百三十七条和第一百三十八条分别规定了“重大责任事故罪”和“重大劳动安全事故罪”。这两种犯罪在性质、主体、客体等方面有显著差异。

### 一、性质

1. **重大责任事故罪**：主要针对的是生产经营单位的管理人员，特别是负责安全生产管理工作的人员。
2. **重大劳动安全事故罪**：主要针对的是直接从事生产、作业的劳动者（包括无雇工的个体工商户等）。

### 二、主体

1. **重大责任事故罪**：主体为一般主体，即任何自然人或法人组织均可构成。
2. **重大劳动安全事故罪**：主体则为特殊主体，主要是指直接从事生产、作业的劳动者，主要包括从业人员。

### 三、客体

1. **重大责任事故罪**：侵犯的是公共安全的行为，具体表现为造成重大伤亡或者其他严重后果的情形。
2. **重大劳动安全事故罪**：侵犯的是国家法律和社会秩序的行为，具体表现为造成重大伤亡或者对社会公私财产造成重大损失的情形。

### 四、危害程度

1. **重大责任事故罪**：其危害程度通常较轻，可能仅导致轻微的人身伤害或财产损失。
2. **重大劳动安全事故罪**：其危害程度相对较大，可能导致严重的人员伤亡、经济损失甚至社会影响。

### 共同点

1. **均以危害公共安全为要件**：两罪都要求行为人违反了有关安全生产的规定，导致了人身伤亡或其他严重后果。
2. **均需具备违法性**：两罪都需要行为人实施了违反法律法规的行为，但具体是否属于违法行为需要根据案件具体情况来判断。
3. **均具有主观故意**：两罪都要求行为人明知自己的行为会危及公共安全，从而放任这种危险的发生。

总的来说，《刑法》第一百三十七条和第一百三十八条虽然名称不同，但在性质、主体、客体以及危害程度上都有明显的区别，体现了法律对于保障人民生命财产安全的重要作用。
