[Sync] sync with internal implements #488

Merged
4 changes: 3 additions & 1 deletion .pre-commit-config-zh-cn.yaml
@@ -3,7 +3,9 @@ exclude: |
    tests/data/|
    opencompass/models/internal/|
    opencompass/utils/internal/|
    opencompass/openicl/icl_evaluator/hf_metrics/|
    opencompass/datasets/lawbench/utils|
    opencompass/datasets/lawbench/evaluation_functions/
)
repos:
  - repo: https://gitee.com/openmmlab/mirrors-flake8
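
For context: pre-commit treats the top-level exclude value as a single Python regular expression and skips every file whose repository-relative path matches it, so the two new alternatives stop the hooks from touching the vendored LawBench code. A minimal sketch of that matching behaviour, with the verbose-regex wrapper reconstructed hypothetically (only the alternatives above appear in this hunk):

import re

# Hypothetical reconstruction of the exclude pattern, for illustration only;
# the wrapper around the alternatives is not part of the hunk shown above.
EXCLUDE = re.compile(r"""(?x)^(
    tests/data/|
    opencompass/models/internal/|
    opencompass/utils/internal/|
    opencompass/openicl/icl_evaluator/hf_metrics/|
    opencompass/datasets/lawbench/utils|
    opencompass/datasets/lawbench/evaluation_functions/
)""")

# Files under the newly excluded lawbench directories are skipped by the hooks:
print(bool(EXCLUDE.search("opencompass/datasets/lawbench/evaluation_functions/cjft.py")))  # True
print(bool(EXCLUDE.search("opencompass/datasets/lawbench/lawbench.py")))                   # False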
38 changes: 19 additions & 19 deletions opencompass/datasets/lawbench/evaluation_functions/cjft.py
@@ -1,19 +1,19 @@
from ..utils.function_utils import compute_rouge

# 情景法条识别 (scenario-based statute identification)

def compute_cjft(data_dict):
    """
    Compute the ROUGE-L score between the prediction and the reference
    """
    references, predictions = [], []
    for example in data_dict:
        question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"]
        predictions.append(prediction)
        references.append(answer)

    # average the ROUGE-L F1 scores over all examples
    rouge_scores = compute_rouge(predictions, references)
    rouge_ls = [score["rouge-l"]["f"] for score in rouge_scores]
    average_rouge_l = sum(rouge_ls) / len(rouge_ls)
    return {"score": average_rouge_l}
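
A minimal usage sketch for the evaluator above. The record fields (origin_prompt, prediction, refr) match what the function reads; the example content itself is made up:

# Illustrative only: data_dict is a list of prediction records despite its name.
data_dict = [
    {
        "origin_prompt": "请根据情景描述给出相关法条。",
        "prediction": "本案可适用《中华人民共和国民法典》第五百七十七条。",
        "refr": "《中华人民共和国民法典》第五百七十七条。",
    },
]
result = compute_cjft(data_dict)
# result == {"score": <average ROUGE-L F1 over all records>}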
36 changes: 18 additions & 18 deletions opencompass/datasets/lawbench/evaluation_functions/flzx.py
@@ -1,18 +1,18 @@
from ..utils.function_utils import compute_rouge

# 法律咨询 (legal consultation)
def compute_flzx(data_dict):
    """
    Compute the ROUGE-L score between the prediction and the reference
    """
    references, predictions = [], []
    for example in data_dict:
        question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"]
        predictions.append(prediction)
        references.append(answer)

    # average the ROUGE-L F1 scores over all examples
    rouge_scores = compute_rouge(predictions, references)
    rouge_ls = [score["rouge-l"]["f"] for score in rouge_scores]
    average_rouge_l = sum(rouge_ls) / len(rouge_ls)
    return {"score": average_rouge_l}
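
The generation-style tasks in this PR (cjft, flzx, ftcs) all delegate to compute_rouge from opencompass/datasets/lawbench/utils/function_utils.py and index the result as score["rouge-l"]["f"]. A compatible helper might look like the sketch below, assuming the rouge PyPI package; the actual LawBench utility may differ, for example by tokenizing Chinese text before scoring:

from rouge import Rouge

def compute_rouge_sketch(predictions, references):
    """Return one ROUGE dict per (prediction, reference) pair, shaped like
    {"rouge-1": {...}, "rouge-2": {...}, "rouge-l": {"r": ..., "p": ..., "f": ...}},
    which is what the evaluators above index via score["rouge-l"]["f"]."""
    rouge = Rouge()
    # avg=False keeps a per-pair list rather than a single averaged dict,
    # matching the list comprehension used by the callers.
    return rouge.get_scores(predictions, references, avg=False)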
38 changes: 19 additions & 19 deletions opencompass/datasets/lawbench/evaluation_functions/ftcs.py
@@ -1,19 +1,19 @@
from ..utils.function_utils import compute_rouge

# 法条记忆问答 (statute recall question answering)
def compute_ftcs(data_dict):
    """
    Compute the ROUGE-L score between the prediction and the reference
    """
    references, predictions = [], []
    for example in data_dict:
        question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"]
        answer = answer.replace("答案:", "")
        predictions.append(prediction)
        references.append(answer)

    # average the ROUGE-L F1 scores over all examples
    rouge_scores = compute_rouge(predictions, references)
    rouge_ls = [score["rouge-l"]["f"] for score in rouge_scores]
    average_rouge_l = sum(rouge_ls) / len(rouge_ls)
    return {"score": average_rouge_l}
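
The only difference from the two ROUGE tasks above is that ftcs references carry an "答案:" ("Answer:") prefix, which is stripped before scoring. A made-up record illustrating the expected format:

# Illustrative record only.
data_dict = [{
    "origin_prompt": "请写出盗窃罪对应的法条。",
    "prediction": "《中华人民共和国刑法》第二百六十四条规定了盗窃罪。",
    "refr": "答案:《中华人民共和国刑法》第二百六十四条规定了盗窃罪。",
}]
result = compute_ftcs(data_dict)  # the reference is scored without the "答案:" prefix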
72 changes: 36 additions & 36 deletions opencompass/datasets/lawbench/evaluation_functions/jdzy.py
@@ -1,36 +1,36 @@
from ..utils.function_utils import multi_choice_judge

"""
multi-choice single-label selection
metric: accuracy
争议焦点:识别案件涉及的争议焦点 (dispute focus: identify the focal points of dispute in a case)
"""

def compute_jdzy(data_dict):
    """
    Compute the Accuracy
    The jdzy task has 16 candidate dispute-focus labels for each question, stored in option_list
    A prediction is correct if
    1. The correct answer appears in the prediction, and
    2. Options other than the answer do not appear in the prediction.
    """

    score_list, abstentions = [], 0
    option_list = ["诉讼主体", "租金情况", "利息", "本金争议", "责任认定", "责任划分", "损失认定及处理",
                   "原审判决是否适当", "合同效力", "财产分割", "责任承担", "鉴定结论采信问题", "诉讼时效", "违约", "合同解除", "肇事逃逸"]
    for example in data_dict:
        question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"]
        if answer[7:-1] == "赔偿":
            # todo: dataset imperfection
            continue
        assert answer.startswith("争议焦点类别:") and answer[7:-1] in option_list, \
            f"answer: {answer} \n question: {question}"

        answer_letter = answer[7:-1]
        judge = multi_choice_judge(prediction, option_list, answer_letter)
        score_list.append(judge["score"])
        abstentions += judge["abstention"]

    # compute the accuracy of score_list
    accuracy = sum(score_list) / len(score_list)
    return {"score": accuracy, "abstention_rate": abstentions / len(data_dict)}
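
The multiple-choice tasks (jdzy, jec_ac, jec_kd) rely on multi_choice_judge from the same utils module. Going by the docstrings above (a prediction is correct only if the gold option appears and no other option does), a compatible sketch could look like this; the real helper may define abstention or tie handling differently:

def multi_choice_judge_sketch(prediction, option_list, answer_token):
    """Return {"score": 0 or 1, "abstention": 0 or 1} for a single prediction."""
    mentioned = [option for option in option_list if option in prediction]
    if not mentioned:
        # none of the candidate options appear in the prediction: count as abstention
        return {"score": 0, "abstention": 1}
    # correct only when the gold option is the single option mentioned
    score = 1 if mentioned == [answer_token] else 0
    return {"score": score, "abstention": 0}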
58 changes: 29 additions & 29 deletions opencompass/datasets/lawbench/evaluation_functions/jec_ac.py
@@ -1,29 +1,29 @@
from ..utils.function_utils import multi_choice_judge

"""
Task: multi-choice selection
Metric: Accuracy
司法考试-案例分析 (judicial examination: case analysis)
"""
def compute_jec_ac(data_dict):
    """
    Compute the Accuracy
    The JEC dataset has 4 options for each question: A, B, C, D
    A prediction is correct if
    1. The correct answer appears in the prediction, and
    2. Options other than the answer do not appear in the prediction.
    """
    score_list, abstentions = [], 0
    option_list = ["A", "B", "C", "D"]
    for example in data_dict:
        question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"]
        assert answer.startswith("正确答案:") and answer[5] in option_list, f"answer[5]: {answer}, question: {question}"

        answer_letter = answer[5]
        judge = multi_choice_judge(prediction, option_list, answer_letter)
        score_list.append(judge["score"])
        abstentions += judge["abstention"]

    # compute the accuracy of score_list
    accuracy = sum(score_list) / len(score_list)
    return {"score": accuracy, "abstention_rate": abstentions / len(data_dict)}
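
A made-up invocation of compute_jec_ac, showing the reference format the assertion expects (the literal prefix 正确答案: followed by the option letter):

# Illustrative records only; real data comes from the LawBench judicial-exam split.
data_dict = [
    {"origin_prompt": "...", "prediction": "本题选 B。", "refr": "正确答案:B"},
    {"origin_prompt": "...", "prediction": "无法确定。", "refr": "正确答案:C"},
]
result = compute_jec_ac(data_dict)
# result["score"] is the fraction of correct predictions;
# result["abstention_rate"] is the fraction where no option letter was detected.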
58 changes: 29 additions & 29 deletions opencompass/datasets/lawbench/evaluation_functions/jec_kd.py
@@ -1,29 +1,29 @@
from ..utils.function_utils import multi_choice_judge

"""
Task: multi-choice selection
Metric: Accuracy
司法考试 (judicial examination)
"""
def compute_jec_kd(data_dict):
    """
    Compute the Accuracy
    The JEC_KD dataset has 4 options for each question: A, B, C, D
    A prediction is correct if
    1. The correct answer appears in the prediction, and
    2. Options other than the answer do not appear in the prediction.
    """
    score_list, abstentions = [], 0
    option_list = ["A", "B", "C", "D"]
    for example in data_dict:
        question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"]
        assert answer.startswith("正确答案:") and answer[5] in option_list, f"answer[5]: {answer}, question: {question}"

        answer_letter = answer[5]
        judge = multi_choice_judge(prediction, option_list, answer_letter)
        score_list.append(judge["score"])
        abstentions += judge["abstention"]

    # compute the accuracy of score_list
    accuracy = sum(score_list) / len(score_list)
    return {"score": accuracy, "abstention_rate": abstentions / len(data_dict)}