[Sync] sync with internal implements #488

Merged
4 changes: 3 additions & 1 deletion .pre-commit-config-zh-cn.yaml
@@ -3,7 +3,9 @@ exclude: |
    tests/data/|
    opencompass/models/internal/|
    opencompass/utils/internal/|
    opencompass/openicl/icl_evaluator/hf_metrics/|
    opencompass/datasets/lawbench/utils|
    opencompass/datasets/lawbench/evaluation_functions/
)
repos:
  - repo: https://gitee.com/openmmlab/mirrors-flake8
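
For context: pre-commit treats the top-level exclude value as a single Python regular expression and skips every file whose repository-relative path matches it, so the two new alternatives stop the hooks from touching the vendored LawBench code. A minimal sketch of that matching behaviour, with the verbose-regex wrapper reconstructed hypothetically (only the alternatives above appear in this hunk):

import re

# Hypothetical reconstruction of the exclude pattern, for illustration only;
# the wrapper around the alternatives is not part of the hunk shown above.
EXCLUDE = re.compile(r"""(?x)^(
    tests/data/|
    opencompass/models/internal/|
    opencompass/utils/internal/|
    opencompass/openicl/icl_evaluator/hf_metrics/|
    opencompass/datasets/lawbench/utils|
    opencompass/datasets/lawbench/evaluation_functions/
)""")

# Files under the newly excluded lawbench directories are skipped by the hooks:
print(bool(EXCLUDE.search("opencompass/datasets/lawbench/evaluation_functions/cjft.py")))  # True
print(bool(EXCLUDE.search("opencompass/datasets/lawbench/lawbench.py")))                   # False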
38 changes: 19 additions & 19 deletions opencompass/datasets/lawbench/evaluation_functions/cjft.py
@@ -1,19 +1,19 @@
from ..utils.function_utils import compute_rouge

# 情景法条识别 (scenario-based statute identification)

def compute_cjft(data_dict):
    """
    Compute the ROUGE-L score between the prediction and the reference
    """
    references, predictions = [], []
    for example in data_dict:
        question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"]
        predictions.append(prediction)
        references.append(answer)

    # average the ROUGE-L F1 scores over all examples
    rouge_scores = compute_rouge(predictions, references)
    rouge_ls = [score["rouge-l"]["f"] for score in rouge_scores]
    average_rouge_l = sum(rouge_ls) / len(rouge_ls)
    return {"score": average_rouge_l}
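
A minimal usage sketch for the evaluator above. The record fields (origin_prompt, prediction, refr) match what the function reads; the example content itself is made up:

# Illustrative only: data_dict is a list of prediction records despite its name.
data_dict = [
    {
        "origin_prompt": "请根据情景描述给出相关法条。",
        "prediction": "本案可适用《中华人民共和国民法典》第五百七十七条。",
        "refr": "《中华人民共和国民法典》第五百七十七条。",
    },
]
result = compute_cjft(data_dict)
# result == {"score": <average ROUGE-L F1 over all records>}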
36 changes: 18 additions & 18 deletions opencompass/datasets/lawbench/evaluation_functions/flzx.py
@@ -1,18 +1,18 @@
from ..utils.function_utils import compute_rouge

# 法律咨询 (legal consultation)
def compute_flzx(data_dict):
    """
    Compute the ROUGE-L score between the prediction and the reference
    """
    references, predictions = [], []
    for example in data_dict:
        question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"]
        predictions.append(prediction)
        references.append(answer)

    # average the ROUGE-L F1 scores over all examples
    rouge_scores = compute_rouge(predictions, references)
    rouge_ls = [score["rouge-l"]["f"] for score in rouge_scores]
    average_rouge_l = sum(rouge_ls) / len(rouge_ls)
    return {"score": average_rouge_l}
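
The generation-style tasks in this PR (cjft, flzx, ftcs) all delegate to compute_rouge from opencompass/datasets/lawbench/utils/function_utils.py and index the result as score["rouge-l"]["f"]. A compatible helper might look like the sketch below, assuming the rouge PyPI package; the actual LawBench utility may differ, for example by tokenizing Chinese text before scoring:

from rouge import Rouge

def compute_rouge_sketch(predictions, references):
    """Return one ROUGE dict per (prediction, reference) pair, shaped like
    {"rouge-1": {...}, "rouge-2": {...}, "rouge-l": {"r": ..., "p": ..., "f": ...}},
    which is what the evaluators above index via score["rouge-l"]["f"]."""
    rouge = Rouge()
    # avg=False keeps a per-pair list rather than a single averaged dict,
    # matching the list comprehension used by the callers.
    return rouge.get_scores(predictions, references, avg=False)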
38 changes: 19 additions & 19 deletions opencompass/datasets/lawbench/evaluation_functions/ftcs.py
@@ -1,19 +1,19 @@
from ..utils.function_utils import compute_rouge

# 法条记忆问答 (statute recall question answering)
def compute_ftcs(data_dict):
    """
    Compute the ROUGE-L score between the prediction and the reference
    """
    references, predictions = [], []
    for example in data_dict:
        question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"]
        answer = answer.replace("答案:", "")
        predictions.append(prediction)
        references.append(answer)

    # average the ROUGE-L F1 scores over all examples
    rouge_scores = compute_rouge(predictions, references)
    rouge_ls = [score["rouge-l"]["f"] for score in rouge_scores]
    average_rouge_l = sum(rouge_ls) / len(rouge_ls)
    return {"score": average_rouge_l}
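
The only difference from the two ROUGE tasks above is that ftcs references carry an "答案:" ("Answer:") prefix, which is stripped before scoring. A made-up record illustrating the expected format:

# Illustrative record only.
data_dict = [{
    "origin_prompt": "请写出盗窃罪对应的法条。",
    "prediction": "《中华人民共和国刑法》第二百六十四条规定了盗窃罪。",
    "refr": "答案:《中华人民共和国刑法》第二百六十四条规定了盗窃罪。",
}]
result = compute_ftcs(data_dict)  # the reference is scored without the "答案:" prefix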
72 changes: 36 additions & 36 deletions opencompass/datasets/lawbench/evaluation_functions/jdzy.py
@@ -1,36 +1,36 @@
from ..utils.function_utils import multi_choice_judge

"""
multi-choice single-label selection
metric: accuracy
争议焦点:识别案件涉及的争议焦点 (dispute focus: identify the focal points of dispute in a case)
"""

def compute_jdzy(data_dict):
    """
    Compute the Accuracy
    The jdzy task has 16 candidate dispute-focus labels for each question, stored in option_list
    A prediction is correct if
    1. The correct answer appears in the prediction, and
    2. Options other than the answer do not appear in the prediction.
    """

    score_list, abstentions = [], 0
    option_list = ["诉讼主体", "租金情况", "利息", "本金争议", "责任认定", "责任划分", "损失认定及处理",
                   "原审判决是否适当", "合同效力", "财产分割", "责任承担", "鉴定结论采信问题", "诉讼时效", "违约", "合同解除", "肇事逃逸"]
    for example in data_dict:
        question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"]
        if answer[7:-1] == "赔偿":
            # todo: dataset imperfection
            continue
        assert answer.startswith("争议焦点类别:") and answer[7:-1] in option_list, \
            f"answer: {answer} \n question: {question}"

        answer_letter = answer[7:-1]
        judge = multi_choice_judge(prediction, option_list, answer_letter)
        score_list.append(judge["score"])
        abstentions += judge["abstention"]

    # compute the accuracy of score_list
    accuracy = sum(score_list) / len(score_list)
    return {"score": accuracy, "abstention_rate": abstentions / len(data_dict)}
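
The multiple-choice tasks (jdzy, jec_ac, jec_kd) rely on multi_choice_judge from the same utils module. Going by the docstrings above (a prediction is correct only if the gold option appears and no other option does), a compatible sketch could look like this; the real helper may define abstention or tie handling differently:

def multi_choice_judge_sketch(prediction, option_list, answer_token):
    """Return {"score": 0 or 1, "abstention": 0 or 1} for a single prediction."""
    mentioned = [option for option in option_list if option in prediction]
    if not mentioned:
        # none of the candidate options appear in the prediction: count as abstention
        return {"score": 0, "abstention": 1}
    # correct only when the gold option is the single option mentioned
    score = 1 if mentioned == [answer_token] else 0
    return {"score": score, "abstention": 0}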
58 changes: 29 additions & 29 deletions opencompass/datasets/lawbench/evaluation_functions/jec_ac.py
@@ -1,29 +1,29 @@
from ..utils.function_utils import multi_choice_judge

"""
Task: multi-choice selection
Metric: Accuracy
司法考试-案例分析 (judicial examination: case analysis)
"""
def compute_jec_ac(data_dict):
    """
    Compute the Accuracy
    The JEC dataset has 4 options for each question: A, B, C, D
    A prediction is correct if
    1. The correct answer appears in the prediction, and
    2. Options other than the answer do not appear in the prediction.
    """
    score_list, abstentions = [], 0
    option_list = ["A", "B", "C", "D"]
    for example in data_dict:
        question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"]
        assert answer.startswith("正确答案:") and answer[5] in option_list, f"answer[5]: {answer}, question: {question}"

        answer_letter = answer[5]
        judge = multi_choice_judge(prediction, option_list, answer_letter)
        score_list.append(judge["score"])
        abstentions += judge["abstention"]

    # compute the accuracy of score_list
    accuracy = sum(score_list) / len(score_list)
    return {"score": accuracy, "abstention_rate": abstentions / len(data_dict)}
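
A made-up invocation of compute_jec_ac, showing the reference format the assertion expects (the literal prefix 正确答案: followed by the option letter):

# Illustrative records only; real data comes from the LawBench judicial-exam split.
data_dict = [
    {"origin_prompt": "...", "prediction": "本题选 B。", "refr": "正确答案:B"},
    {"origin_prompt": "...", "prediction": "无法确定。", "refr": "正确答案:C"},
]
result = compute_jec_ac(data_dict)
# result["score"] is the fraction of correct predictions;
# result["abstention_rate"] is the fraction where no option letter was detected.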
58 changes: 29 additions & 29 deletions opencompass/datasets/lawbench/evaluation_functions/jec_kd.py
@@ -1,29 +1,29 @@
from ..utils.function_utils import multi_choice_judge

"""
Task: multi-choice selection
Metric: Accuracy
司法考试 (judicial examination)
"""
def compute_jec_kd(data_dict):
    """
    Compute the Accuracy
    The JEC_KD dataset has 4 options for each question: A, B, C, D
    A prediction is correct if
    1. The correct answer appears in the prediction, and
    2. Options other than the answer do not appear in the prediction.
    """
    score_list, abstentions = [], 0
    option_list = ["A", "B", "C", "D"]
    for example in data_dict:
        question, prediction, answer = example["origin_prompt"], example["prediction"], example["refr"]
        assert answer.startswith("正确答案:") and answer[5] in option_list, f"answer[5]: {answer}, question: {question}"

        answer_letter = answer[5]
        judge = multi_choice_judge(prediction, option_list, answer_letter)
        score_list.append(judge["score"])
        abstentions += judge["abstention"]

    # compute the accuracy of score_list
    accuracy = sum(score_list) / len(score_list)
    return {"score": accuracy, "abstention_rate": abstentions / len(data_dict)}