-
Notifications
You must be signed in to change notification settings - Fork 1
/
run_evaluation.py
104 lines (87 loc) · 5.2 KB
/
run_evaluation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import os
import pandas as pd
from parsee.datasets.evaluation.main import *
from parsee.cloud.api import ParseeCloud
from parsee.datasets.readers.disk_reader import SimpleCsvDiskReader
from parsee.extraction.models.helpers import *
from parsee.datasets.writers.disk_writer import CsvDiskWriter
from parsee.storage.in_memory_storage import InMemoryStorageManager
from parsee.converters.image_creation import DiskImageReader
from parsee.extraction.extractor_dataclasses import ParseeAnswer
"""
We are using this custom evaluation function for the results,
where we basically multiply all results with their units and then check if the values deviate not more than 0.1%
"""
def results_compare_function(answer1: ParseeAnswer, answer2: ParseeAnswer) -> bool:
if answer1.class_value == "n/a" and answer2.class_value == "n/a":
return True
elif answer1.class_value == "n/a" or answer2.class_value == "n/a":
return False
mults = {
"none": 1,
"thousands": 1000,
"millions": 1000000,
"billions": 1000000000
}
unit1 = [x for x in answer1.meta if x.class_id == 'unit']
unit_mult1 = 1 if len(unit1) == 0 else (mults[unit1[0].class_value] if unit1[0].class_value in mults else 1)
val1 = int(float(answer1.class_value)*unit_mult1)
unit2 = [x for x in answer2.meta if x.class_id == 'unit']
unit_mult2 = 1 if len(unit2) == 0 else (mults[unit2[0].class_value] if unit2[0].class_value in mults else 1)
val2 = int(float(answer2.class_value) * unit_mult2)
diff_rel = (abs(val1-val2) / val1)
return diff_rel < 0.001
"""
requires the following variables set in .env in order to re-run the evaluation (for new files, if not the cached answers are being used):
PARSEE_API_KEY -> api key for parsee cloud
"""
def run_eval_by_file(template_id: str, csv_file_path: str, writer_path: Optional[str], models: List[MlModelSpecification], use_cached_only: bool = False, images_dir: Optional[str] = None):
custom_storage = None if not models[0].multimodal else InMemoryStorageManager(models, DiskImageReader(images_dir))
cloud = ParseeCloud(os.getenv('PARSEE_API_KEY') if not use_cached_only else None)
template = cloud.get_template(template_id)
reader = SimpleCsvDiskReader(csv_file_path)
writer = None
writer_file_name = None
if writer_path is not None and not use_cached_only:
writer = CsvDiskWriter(writer_path, False)
writer_file_name = os.path.basename(csv_file_path).split(".")[0] + "_with_answers"
performance = evaluate_llm_performance(template, reader, models, custom_storage, writer, True, writer_file_name, {"rev_meta": results_compare_function, "rev23_meta": results_compare_function, "rev22_meta": results_compare_function , "rev23_thousands_no_hint": results_compare_function, "rev22_millions_no_hint": results_compare_function, "rev23_thousands_hint": results_compare_function, "rev23_millions_no_hint": results_compare_function}, ["unit"], not use_cached_only)
return performance
def make_df_single_dataset(template_id: str, dataset_path: str, models: List[MlModelSpecification]) -> pd.DataFrame:
stats = run_eval_by_file(template_id, dataset_path, None, models, use_cached_only=True)
all_values = []
for model, values in stats.items():
if model != "assigned":
values = {"model": model, **values}
all_values.append(values)
return pd.DataFrame(all_values)
def make_df(template_id: str, folder_path: str, models: List[MlModelSpecification]) -> Tuple[pd.DataFrame, List[str]]:
all_files = [x for x in os.listdir(folder_path) if x.endswith(".csv")]
all_dataset_names = [x.replace("_with_answers.csv", "") for x in all_files]
relevant_keys = {"total_correct_percent", "total_correct_meta_found_percent", "total_correct_percent_ex_missing", "total_correct_meta_found_percent_ex_missing", "total"}
# collect all results
all_results = {}
total_by_model = {}
for file_idx, file in enumerate(all_files):
full_path = os.path.join(folder_path, file)
stats = run_eval_by_file(template_id, full_path, None, models, use_cached_only=True)
dataset_name = all_dataset_names[file_idx]
for model_name, results in stats.items():
# we are putting a weight to the meta info of 1/4 (3/4 to the 'main' question) as the actual numbers are more important
results["total"] = results["total_correct_percent"] if results["total_correct_meta_found_percent"] is None else (
(results["total_correct_percent"]*3 + results["total_correct_meta_found_percent"]) / 4)
if model_name not in all_results:
all_results[model_name] = {}
total_by_model[model_name] = 0
for key in relevant_keys:
final_key = f"{dataset_name}_{key}"
all_results[model_name][final_key] = results[key]
if key == "total":
total_by_model[model_name] += results[key]
arr = []
for model_name, values in all_results.items():
if model_name != "assigned":
values = {"model": model_name, "total": total_by_model[model_name] / len(all_files), **values}
arr.append(values)
df = pd.DataFrame(arr)
return df, all_dataset_names