### 1. 数据清洗

清洗输出，有以下几种类型：

1. 一些明显不合理的输出，如 'Material in cell culture: scaffold', 'Material in cell culture: not mentioned'

In [5]:
import json
import re

schema = {
    "Seeded cell density": [],
    "Number of cells": [],
    "Material in cell culture": [],
    "Concentration of material": [],
    "Chip material": [],
    "Cross-linking agent": [],
    "Pore size": [],
    "Diameter": [],
    "Manufacturing method": [],
    "Perfusion rate": [],
    "Channel width": [],
    "Channel height": []
}
def extract_json_between_markers(llm_output):
    json_start_marker = "{"
    json_end_marker = "}"

    # Find the start and end indices of the JSON string
    start_index = llm_output.find(json_start_marker)
    if start_index != -1:
        end_index = llm_output.find(json_end_marker, start_index)
    else:
        return schema  # JSON markers not found

    if end_index == -1:
        return schema  # End marker not found

    # Extract the JSON string
    json_string = llm_output[start_index:end_index+1].strip()
    json_string = json_string.replace("'", "\"")
    try:
        parsed_json = json.loads(json_string)
        return parsed_json
    except json.JSONDecodeError:
        return schema  # Invalid JSON format

In [17]:
# 清洗 json格式中键为 key 的数据， 数据集
input_path_gpt = 'data/inference_data/gpt_shot.json'
input_path_glm = 'data/inference_data/glm_shot.json'
input_path_llama = 'data/inference_data/llama_shot.json'
output_path = 'data/eval/eval.json'

def filter_empty_json(json_list):
    return {entity:value for entity,value in json_list.items() if len(value)}

with open(input_path_gpt,'r',encoding='utf-8') as f:
    gpt_data = json.load(f)

with open(input_path_glm,'r',encoding='utf-8') as f:
    glm_data = json.load(f)

with open(input_path_llama,'r',encoding='utf-8') as f:
    llama_data = json.load(f) 

final_data = [{} for i in range(len(glm_data))]
for i in range(len(glm_data)):
    # print(glm_data[i]['sentence'])
    final_data[i]['sentence'] = glm_data[i]['sentence']
    final_data[i]['target'] = filter_empty_json(gpt_data[i]['target'])
    final_data[i]['gpt'] = filter_empty_json(extract_json_between_markers(gpt_data[i]['output']))
    final_data[i]['glm'] = filter_empty_json(extract_json_between_markers(glm_data[i]['output']))
    final_data[i]['llama'] = filter_empty_json(extract_json_between_markers(llama_data[i]['output']))

with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(final_data, f, ensure_ascii=False, indent=4)

In [16]:
import copy
# 清洗 json格式中键为 key 的数据， txt文件
json_path = 'data/inference_data/final_res_part_1.jsonl'
json_path2 = 'data/inference_data/final_res_part_2.jsonl'
output_path = 'data/eval/final_res.json'

# 过滤函数：过滤掉所有空的JSON对象
def filter_empty_json(json_list):
    return {entity:value for entity,value in json_list.items() if len(value)}

with open(json_path, 'r', encoding='utf-8') as f:
    data1 = [json.loads(line.strip()) for line in f]
with open(json_path2, 'r', encoding='utf-8') as f:
    data2 = [json.loads(line.strip()) for line in f]

data = data1 + data2
filtered_data = []
for entry in data:
    txt_path = entry.get('doi')
    txt_output = entry.get('output', [])
    filtered_txt_output = []
    
    for item in txt_output:
        formatted_output = extract_json_between_markers(item['output'])
        # print(formatted_output)
        filtered_output = filter_empty_json(formatted_output)
        
        # 如果过滤后的output不是空列表，则保留该条目
        if filtered_output:
            filtered_txt_output.append(filtered_output)

    temp = copy.deepcopy(schema)
    for item in filtered_txt_output:
        for entity,val in item.items():
            if entity in temp:
                temp[entity].extend(val)

    filtered_data.append({'doi': txt_path, 'output': {k:list(set(v)) for k,v in temp.items()}})
    
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(filtered_data, f, ensure_ascii=False, indent=4)

### 2. TP,FP,FN 指标统计

1. TP: target + , output +
2. FP: target - , output +
3. FN: target + , output -
4. TN: target - , output -

In [31]:
# "Seeded cell density",'Number of cells','Channel height'

entities = ["Seeded cell density",
    "Material in cell culture", "Concentration of material", "Chip material", 
    "Cross-linking agent",
    "Pore size", "Diameter", "Manufacturing method", "Perfusion rate", 
    "Channel width",'Channel height'
]

In [30]:
import json

def convert_output(data):
    """
    Convert the given data (target or output) into a set of key-value pairs
    for comparison. Only include key-value pairs where the value is non-empty.
    """
    res = set()
    if not data:
        return res
    for key, value in data.items():
        if value and key in entities:  # 只处理值非空的字段
            for v in value:
                res.add((key, v))  # 将每个(key, value)对加入到集合中
    return res

def calculate_f1_components(clean_target, clean_output):
    """
    Calculate TP, FP, FN based on the target and output data.
    """
    tp = []
    fp = []
    fn = []
    
        
    # Convert target and output to sets of (key, value) pairs
    target_set = convert_output(clean_target)
    output_set = convert_output(clean_output)

    # Calculate TP, FP, FN
    tp_set = target_set.intersection(output_set)
    fp_set = output_set.difference(target_set)
    fn_set = target_set.difference(output_set)

    # Store the results
    tp.extend(tp_set)
    fp.extend(fp_set)
    fn.extend(fn_set)

    return [k for k,v in tp], [k for k,v in fp], [k for k,v in fn]


In [40]:
# 统计三个指标
json_path = 'data/eval/eval.json'
json_path = 'data/eval/sen_eval.json'
with open(json_path,'r',encoding='utf-8') as f:
    data = json.load(f)

for item in data:
    item['gpt_eval'],item['glm_eval'],item['llama_eval'] = {},{},{}
    item['gpt_eval']['TP'], item['gpt_eval']['FP'], item['gpt_eval']['FN'] =  calculate_f1_components(item["target"], item["gpt"])
    item['glm_eval']['TP'], item['glm_eval']['FP'], item['glm_eval']['FN'] =  calculate_f1_components(item["target"], item["glm"])
    item['llama_eval']['TP'], item['llama_eval']['FP'], item['llama_eval']['FN'] =  calculate_f1_components(item["target"], item["llama"])
with open(json_path, 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

### 3. 计算各实体的 F1 分数 

由于之前的指标统计只是进行简单匹配，有的地方可能会判断失误，需进行人工校准。

信息统计：结果存储在字典中，与excel统计一致：{entity1: [TP, FP, FN, TN, Accuracy, Precision, Recall, F1]}

In [26]:
def compute_f1(res,data,filed_name):
    for item in data:
        for entity in item[filed_name]['TP']:
            res[entity][0] += 1
        for entity in item[filed_name]['FP']:
            res[entity][1] += 1
        for entity in item[filed_name]['FN']:
            res[entity][2] += 1
        for entity in entities:
            if entity not in item[filed_name]['TP'] and entity not in item[filed_name]['FP'] and entity not in item[filed_name]['FN']:
                res[entity][3] += 1
    for entity, counts in res.items():
        TP, FP, FN, TN = counts[0], counts[1], counts[2], counts[3]
        # Accuracy
        res[entity][4] = round((TP + TN) / (TP + FP + FN + TN), 3) if (TP + FP + FN + TN) > 0 else -1
        # Precision
        res[entity][5] = round(TP / (TP + FP), 3) if (TP + FP) > 0 else -1
        # Recall
        res[entity][6] = round(TP / (TP + FN), 3) if (TP + FN) > 0 else -1
        # F1-Score
        res[entity][7] = round((2 * res[entity][5] * res[entity][6]) / (res[entity][5] + res[entity][6]), 3) if (res[entity][5] + res[entity][6]) > 0 else -1
    return res

In [48]:
with open(json_path,'r',encoding='utf-8') as f:
    data = json.load(f)
# ['TP', 'FP', 'FN', 'TN', 'Accuracy', 'Precision', 'Recall', 'F1-Score']
gpt_res = {entity: [0, 0, 0, 0, 0, 0, 0, 0] for entity in entities}
glm_res = {entity: [0, 0, 0, 0, 0, 0, 0, 0] for entity in entities}
llama_res = {entity: [0, 0, 0, 0, 0, 0, 0, 0] for entity in entities}

In [49]:
gpt_res = compute_f1(gpt_res,data,'gpt_eval')
glm_res = compute_f1(glm_res,data,'glm_eval')
llama_res = compute_f1(llama_res,data,'llama_eval')

In [50]:
final_data = {}
for entity in entities:
    final_data[entity] = [gpt_res[entity][-1],glm_res[entity][-1],llama_res[entity][-1]]
final_data

{'Seeded cell density': [0.725, 0.675, 0.58],
 'Material in cell culture': [0.887, 0.867, 0.705],
 'Concentration of material': [0.811, 0.454, 0.39],
 'Chip material': [0.909, 0.96, 0.891],
 'Cross-linking agent': [0.95, 0.74, 0.687],
 'Pore size': [0.75, 0.551, 0.692],
 'Diameter': [0.906, 0.793, 0.745],
 'Manufacturing method': [0.938, 0.656, 0.326],
 'Perfusion rate': [0.844, 0.82, 0.731],
 'Channel width': [0.909, 0.632, 0.5],
 'Channel height': [0.903, 0.593, 0.815]}

In [51]:
import pandas as pd

print('----  llama  ----')
df = pd.DataFrame.from_dict(final_data, orient='index',columns=['gpt','glm','llama'])
df

----  llama  ----


Unnamed: 0,gpt,glm,llama
Seeded cell density,0.725,0.675,0.58
Material in cell culture,0.887,0.867,0.705
Concentration of material,0.811,0.454,0.39
Chip material,0.909,0.96,0.891
Cross-linking agent,0.95,0.74,0.687
Pore size,0.75,0.551,0.692
Diameter,0.906,0.793,0.745
Manufacturing method,0.938,0.656,0.326
Perfusion rate,0.844,0.82,0.731
Channel width,0.909,0.632,0.5


In [54]:
glm_res

{'Seeded cell density': [28, 11, 16, 293, 0.922, 0.718, 0.636, 0.675],
 'Material in cell culture': [147, 8, 37, 205, 0.887, 0.948, 0.799, 0.867],
 'Concentration of material': [10, 1, 23, 308, 0.93, 0.909, 0.303, 0.454],
 'Chip material': [48, 1, 3, 290, 0.988, 0.98, 0.941, 0.96],
 'Cross-linking agent': [37, 0, 26, 288, 0.926, 1.0, 0.587, 0.74],
 'Pore size': [8, 6, 7, 323, 0.962, 0.571, 0.533, 0.551],
 'Diameter': [23, 3, 9, 309, 0.965, 0.885, 0.719, 0.793],
 'Manufacturing method': [21, 2, 20, 298, 0.935, 0.913, 0.512, 0.656],
 'Perfusion rate': [25, 4, 7, 308, 0.968, 0.862, 0.781, 0.82],
 'Channel width': [12, 2, 12, 317, 0.959, 0.857, 0.5, 0.632],
 'Channel height': [8, 4, 7, 323, 0.968, 0.667, 0.533, 0.593]}

In [55]:
# 计算 Macro F1
res =glm_res
f1_scores = [counts[7] for counts in res.values() if counts[7] != -1]
macro_f1 = round(sum(f1_scores) / len(f1_scores), 3) if f1_scores else -1

# 计算 Micro F1
total_tp = sum(counts[0] for counts in res.values())
total_fp = sum(counts[1] for counts in res.values())
total_fn = sum(counts[2] for counts in res.values())

micro_precision = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else -1
micro_recall = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else -1
micro_f1 = round((2 * micro_precision * micro_recall) / (micro_precision + micro_recall), 3) if (micro_precision + micro_recall) > 0 else -1

print(f"Macro F1: {macro_f1}")
print(f"Micro F1: {micro_f1}")

Macro F1: 0.704
Micro F1: 0.778
