In [3]:
import os
import json
import argparse
import numpy as np
import pandas as pd

from metrics import (
    qa_f1_score,
    rouge_zh_score,
    qa_f1_zh_score,
    rouge_score,
    classification_score,
    retrieval_score,
    retrieval_zh_score,
    count_score,
    code_sim_score,
)


dataset2metric = {
    "narrativeqa": qa_f1_score,
    "qasper": qa_f1_score,
    "multifieldqa_en": qa_f1_score,
    "multifieldqa_zh": qa_f1_zh_score,
    "hotpotqa": qa_f1_score,
    "2wikimqa": qa_f1_score,
    "musique": qa_f1_score,
    "dureader": rouge_zh_score,
    "gov_report": rouge_score,
    "qmsum": rouge_score,
    "multi_news": rouge_score,
    "vcsum": rouge_zh_score,
    "trec": classification_score,
    "triviaqa": qa_f1_score,
    "samsum": rouge_score,
    "lsht": classification_score,
    "passage_retrieval_en": retrieval_score,
    "passage_count": count_score,
    "passage_retrieval_zh": retrieval_zh_score,
    "lcc": code_sim_score,
    "repobench-p": code_sim_score,
}

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--models_dir', type=str, required=True, help="Directory containing model directories")
    parser.add_argument('--output_dir', type=str, required=True, help="Directory to save evaluation results")
    return parser.parse_args()


def scorer(dataset, predictions, answers, all_classes):
    total_score = 0.
    for (prediction, ground_truths) in zip(predictions, answers):
        score = 0.
        if dataset in ["trec", "triviaqa", "samsum", "lsht"]:
            prediction = prediction.lstrip('\n').split('\n')[0]
        for ground_truth in ground_truths:
            score = max(score, dataset2metric[dataset](prediction.lower(), ground_truth.lower(), all_classes=all_classes))
        total_score += score
    return round(100 * total_score / len(predictions), 2)

def eval(path):
    scores = dict()
    if not path.endswith('/'):
        path += '/'
    
    all_files = os.listdir(path)
    print("Evaluating on:", all_files)

    for filename in all_files:
        if not filename.endswith("jsonl"):
            continue
        
        predictions, answers, lengths, all_classes = [], [], [], []
        dataset = filename.split('.')[0]
        
        with open(f"{path}{filename}", "r", encoding="utf-8") as f:
            for line in f:
                data = json.loads(line)
                if "context_length" in data and data["context_length"] < 35000:
                    predictions.append(data["pred"])
                    answers.append(data["answers"])
                    if "all_classes" in data:
                        all_classes = data["all_classes"]
                    if "length" in data:
                        lengths.append(data["length"])
                elif data["length"] <= 20000:
                    predictions.append(data["pred"])
                    answers.append(data["answers"])
                    if "all_classes" in data:
                        all_classes = data["all_classes"]
                    if "length" in data:
                        lengths.append(data["length"])

        score = scorer(dataset, predictions, answers, all_classes)
        scores[dataset] = score
        print(f"{dataset}: {score}")
    
    return scores





In [5]:
# 设定模型目录和输出目录
models_dir = "/home/yuhao/FILM/real_world_long/LongBench/pred/"  # 替换为你的模型目录路径
output_dir = "/home/yuhao/FILM/real_world_long/LongBench/pred/"  # 替换为你的输出目录路径

results = {}
model_dirs = [os.path.join(models_dir, model_dir) for model_dir in os.listdir(models_dir) if os.path.isdir(os.path.join(models_dir, model_dir))]

for model_dir in model_dirs:
    model_name = os.path.basename(model_dir)
    print(f"Evaluating model in {model_dir}...")
    model_scores = eval(model_dir)
    results[model_name] = model_scores

# 将字典转换为 DataFrame 并填充缺失值
df = pd.DataFrame.from_dict(results, orient='index').fillna(0)

# 计算平均分数并添加为新列
df['Average Score'] = df.mean(axis=1)

# 按照平均分数从高到低排序 DataFrame
df_sorted = df.sort_values(by='Average Score', ascending=False)

# 保存结果到 CSV 文件
df_sorted.to_csv(os.path.join(output_dir, 'model_scores_sorted.csv'))
print("Sorted scores saved to CSV.")


Evaluating model in /home/yuhao/FILM/real_world_long/LongBench/pred/eval/checkpoint-500-fuyao...
Evaluating on: ['narrativeqa.jsonl', 'musique.jsonl', 'multifieldqa_en.jsonl', 'hotpotqa.jsonl', '2wikimqa.jsonl', 'qasper.jsonl']
narrativeqa: 4.7
musique: 4.38
multifieldqa_en: 15.24
hotpotqa: 9.17
2wikimqa: 9.39
qasper: 10.37
Evaluating model in /home/yuhao/FILM/real_world_long/LongBench/pred/eval/250_short_Orca...
Evaluating on: ['narrativeqa.jsonl', 'musique.jsonl', 'multifieldqa_en.jsonl', 'hotpotqa.jsonl', '2wikimqa.jsonl', 'qasper.jsonl']
narrativeqa: 3.31
musique: 3.88
multifieldqa_en: 17.99
hotpotqa: 7.48
2wikimqa: 10.65
qasper: 7.81
Evaluating model in /home/yuhao/FILM/real_world_long/LongBench/pred/eval/token_version_6.7_checkpoint-250_dolly...
Evaluating on: ['narrativeqa.jsonl', 'musique.jsonl', 'multifieldqa_en.jsonl', 'hotpotqa.jsonl', '2wikimqa.jsonl', 'qasper.jsonl']
narrativeqa: 15.55
musique: 18.28
multifieldqa_en: 28.56
hotpotqa: 27.62
2wikimqa: 25.25
qasper: 21.92
Eval

In [73]:
eval('pred/tinyllama-context-version_6_4k-pretrain-dolly','pred_e/TinyLlama/version_4')

TypeError: eval() takes 1 positional argument but 2 were given

In [1]:
import os
import json
import argparse
import numpy as np
import pandas as pd

from metrics import (
    qa_f1_score,
    rouge_zh_score,
    qa_f1_zh_score,
    rouge_score,
    classification_score,
    retrieval_score,
    retrieval_zh_score,
    count_score,
    code_sim_score,
)


dataset2metric = {
    "narrativeqa": qa_f1_score,
    "qasper": qa_f1_score,
    "multifieldqa_en": qa_f1_score,
    "multifieldqa_zh": qa_f1_zh_score,
    "hotpotqa": qa_f1_score,
    "2wikimqa": qa_f1_score,
    "musique": qa_f1_score,
    "dureader": rouge_zh_score,
    "gov_report": rouge_score,
    "qmsum": rouge_score,
    "multi_news": rouge_score,
    "vcsum": rouge_zh_score,
    "trec": classification_score,
    "triviaqa": qa_f1_score,
    "samsum": rouge_score,
    "lsht": classification_score,
    "passage_retrieval_en": retrieval_score,
    "passage_count": count_score,
    "passage_retrieval_zh": retrieval_zh_score,
    "lcc": code_sim_score,
    "repobench-p": code_sim_score,
}

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--models_dir', type=str, required=True, help="Directory containing model directories")
    parser.add_argument('--output_dir', type=str, required=True, help="Directory to save evaluation results")
    return parser.parse_args()


def scorer(dataset, predictions, answers, all_classes):
    total_score = 0.
    for (prediction, ground_truths) in zip(predictions, answers):
        score = 0.
        if dataset in ["trec", "triviaqa", "samsum", "lsht"]:
            prediction = prediction.lstrip('\n').split('\n')[0]
        for ground_truth in ground_truths:
            score = max(score, dataset2metric[dataset](prediction.lower(), ground_truth.lower(), all_classes=all_classes))
        total_score += score
    return round(100 * total_score / len(predictions), 2)

def eval(path):
    scores = dict()
    if not path.endswith('/'):
        path += '/'
    
    all_files = os.listdir(path)
    print("Evaluating on:", all_files)

    for filename in all_files:
        if not filename.endswith("jsonl"):
            continue
        
        predictions, answers, lengths, all_classes = [], [], [], []
        dataset = filename.split('.')[0]
        
        with open(f"{path}{filename}", "r", encoding="utf-8") as f:
            for line in f:
                data = json.loads(line)
                if "context_length" in data and data["context_length"] < 28000:
                    predictions.append(data["pred"])
                    answers.append(data["answers"])
                    if "all_classes" in data:
                        all_classes = data["all_classes"]
                    if "length" in data:
                        lengths.append(data["length"])
                elif data["length"] <= 2000:
                    print(1)
                    predictions.append(data["pred"])
                    answers.append(data["answers"])
                    if "all_classes" in data:
                        all_classes = data["all_classes"]
                    if "length" in data:
                        lengths.append(data["length"])

        score = scorer(dataset, predictions, answers, all_classes)
        scores[dataset] = score
        print(f"{dataset}: {score}")
    
    return scores

# 设定模型目录和输出目录






1


CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python pred.py --model /home/yuhao/gsm8k-ScRel/Save_model/dolly/token_version_6.4_Orca_dolly

tinyllama context + instruction
67.5
23.09

tinyllama context

{
    "triviaqa": 64.95,
    "hotpotqa": 4.48
}


tinyllama base
2.45
0.23