In [1]:
import json
import os

# NOTE(review): absolute, machine-specific directory — change DATA_DIR when
# running this notebook anywhere else.
DATA_DIR = 'C:\\Users\\PC\\CODE\\WDM-AI-TEMIS\\data-finetune'


def _load_json(path):
    """Parse the UTF-8 JSON file at ``path`` and close the file afterwards."""
    with open(path, 'r', encoding='utf-8') as fh:
        return json.load(fh)


# `labels` supplies the ground-truth markdown and `vlm_res` the predicted
# markdown that the evaluation cells below compare, matched on 'image_path'.
labels = _load_json(os.path.join(DATA_DIR, 'final_data', 'final.json'))
vlm_res = _load_json(os.path.join(DATA_DIR, 'qwen_results_final_updated.json'))

In [2]:
import os

from dotenv import load_dotenv
from langchain.output_parsers import PydanticOutputParser
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from pydantic import BaseModel

# Pull variables from a local .env file (if any) into the process environment,
# then read the OpenAI key from there.
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Bind `llm` up front so the name always exists. Without this, a missing key
# leaves `llm` undefined and the evaluation cell fails later with a NameError
# that hides the real cause.
llm = None

if not OPENAI_API_KEY:
    print("Lỗi: OPENAI_API_KEY không được tìm thấy.")
else:
    llm = ChatOpenAI(
        api_key=OPENAI_API_KEY,
        model="gpt-4o-mini",
        temperature=0,
        model_kwargs={
            # Forwarded to the API call; requests JSON-object formatted replies.
            "response_format": {"type": "json_object"}
        }
    )
    print("DEBUG: LLM initialized with JSON mode.")
    

from langchain_core.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel
from langchain_core.output_parsers import JsonOutputParser

def evaluate_table_extraction(ground_truth: str, predict: str, llm_model=None, debug: bool = False) -> "dict | str":
    """Ask an LLM to rate how similar a predicted markdown table is to a reference.

    Args:
        ground_truth: Text substituted for ``{ground_truth}`` in the prompt.
        predict: Text substituted for ``{predict}`` in the prompt.
        llm_model: Chat model piped between the prompt and the JSON parser.
            When ``None`` no chain can be built and the failure sentinel is
            returned immediately.
        debug: When true, print progress messages and, on failure, the error
            together with its traceback.

    Returns:
        Whatever ``JsonOutputParser`` yields for the model reply (the prompt
        asks for an object with a single ``"score"`` field), or the empty
        string ``""`` when anything goes wrong. Callers must check for that
        sentinel before indexing the result.
    """
    if llm_model is None:
        # Without a model there is nothing to pipe the prompt into; return the
        # same sentinel as the error path instead of failing inside the chain.
        if debug:
            print("ERROR in evaluate_table_extraction: llm_model is None")
        return ""

    parser = JsonOutputParser()

    # NOTE(review): the prompt asks for "a brief explanation" and, two lines
    # later, for a JSON object with only a score field. The text is left
    # untouched so scores stay comparable with earlier runs.
    prompt_template_str = """
   You are provided with two markdown tables: a ground truth table and a predicted table.

Ground Truth Table:

{ground_truth}

Predicted Table:

{predict}

Evaluate the similarity between the two tables on a scale of 0 to 1, where 0 indicates no similarity and 1 indicates perfect match. For table fragments (tables without headers), prioritize the content and structure of the tables over the header values.

Provide a similarity score and a brief explanation of your reasoning.
You must respond with a JSON object containing only a score field with a float value between 0 and 1.
Remember: Your entire response must be a valid JSON object with only a "score" field containing a number between 0 and 1.
   """
    prompt = PromptTemplate.from_template(template=prompt_template_str)

    try:
        chain = prompt | llm_model | parser
        if debug:
            print("DEBUG: Chain created successfully.")

        invocation_payload = {
            "ground_truth": ground_truth,
            "predict": predict
        }
        if debug:
            print(f"DEBUG: Invoking chain with payload (first 100 chars of each): \nGround truth: {ground_truth[:100]}...\nPredict: {predict[:100]}...")

        # To inspect the raw model reply, invoke `prompt` and `llm_model`
        # separately and call `parser.parse` on the output by hand.
        res = chain.invoke(invocation_payload)
        if debug:
            print(f"DEBUG: Chain invocation successful. Result (res): {res}")
            print(f"DEBUG: Type of res: {type(res)}")
        return res

    except Exception as e:
        # Best-effort by design: any failure (network, parsing, ...) collapses
        # to the "" sentinel so a long batch run is not aborted.
        if debug:
            import traceback
            print(f"ERROR in evaluate_table_extraction: {e}")
            traceback.print_exc()
        return ""
    

DEBUG: LLM initialized with JSON mode.


In [3]:
from tqdm import tqdm

# Index predictions by image path once instead of rescanning `vlm_res` for
# every label. If a path occurs more than once, the last entry wins.
predictions_by_image = {vlm_sample['image_path']: vlm_sample for vlm_sample in vlm_res}

total_score = 0
successful_evaluations = 0
for sample in tqdm(labels):
    image_path = sample['image_path']
    vlm_res_sample = predictions_by_image.get(image_path)
    if vlm_res_sample is None:
        print(f"Image {image_path} not found in vlm_res")
        continue

    # Evaluate the prediction against the ground truth.
    ground_truth = sample['markdown_content']
    predict = vlm_res_sample['markdown_content']

    score = evaluate_table_extraction(ground_truth, predict, llm_model=llm, debug=False)
    # The evaluator returns "" on failure; indexing that with ['score'] would
    # raise a TypeError and abort the whole run, so skip such samples instead.
    if not isinstance(score, dict) or 'score' not in score:
        print(f"Evaluation failed for image {image_path}")
        continue
    total_score += score['score']
    successful_evaluations += 1

# Average over the samples that were actually scored, so skipped or failed
# samples do not drag the mean down and an empty run cannot divide by zero.
if successful_evaluations > 0:
    print(f"Total score: {total_score/successful_evaluations}")
else:
    print("No successful evaluations to calculate average score.")

100%|██████████| 163/163 [02:45<00:00,  1.02s/it]

Total score: 0.8314561530133526





In [2]:
from tqdm import tqdm
from gemini import *
from gemini import generate
import time

# Index predictions by image path once instead of rescanning `vlm_res` for
# every label. If a path occurs more than once, the last entry wins.
predictions_by_image = {vlm_sample['image_path']: vlm_sample for vlm_sample in vlm_res}

total_score = 0
successful_evaluations = 0
failed_evaluations = 0
max_retries = 3  # Maximum number of attempts per sample when the connection is reset

for sample in tqdm(labels):
    image_path = sample['image_path']
    vlm_res_sample = predictions_by_image.get(image_path)
    if vlm_res_sample is None:
        print(f"Image {image_path} not found in vlm_res")
        continue

    # Evaluate the prediction against the ground truth.
    ground_truth = sample['markdown_content']
    predict = vlm_res_sample['markdown_content']
    retries = 0
    while retries < max_retries:
        try:
            score = generate(ground_truth, predict)
            total_score += score
            successful_evaluations += 1
            break  # Exit the retry loop if successful
        except ConnectionResetError as e:
            print(f"Connection error: {e}. Retrying {retries + 1}/{max_retries}...")
            retries += 1
            time.sleep(2)  # Wait for 2 seconds before retrying
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            failed_evaluations += 1
            break  # Exit the retry loop on unexpected errors
    else:
        # Reached only when every attempt hit a connection reset (no `break`),
        # so the sample is reported instead of being dropped silently.
        print(f"Giving up on image {image_path} after {max_retries} connection errors.")
        failed_evaluations += 1

if successful_evaluations > 0:
    print(f"Total score: {total_score/successful_evaluations}")
else:
    print("No successful evaluations to calculate average score.")
if failed_evaluations:
    print(f"Failed evaluations: {failed_evaluations}")

  1%|          | 1/163 [00:00<00:57,  2.80it/s]

An unexpected error occurred: ('invalid_scope: Invalid OAuth scope or ID token audience provided.', {'error': 'invalid_scope', 'error_description': 'Invalid OAuth scope or ID token audience provided.'})


  1%|          | 2/163 [00:00<00:48,  3.31it/s]

An unexpected error occurred: ('invalid_scope: Invalid OAuth scope or ID token audience provided.', {'error': 'invalid_scope', 'error_description': 'Invalid OAuth scope or ID token audience provided.'})


  2%|▏         | 3/163 [00:00<00:46,  3.43it/s]

An unexpected error occurred: ('invalid_scope: Invalid OAuth scope or ID token audience provided.', {'error': 'invalid_scope', 'error_description': 'Invalid OAuth scope or ID token audience provided.'})


  2%|▏         | 4/163 [00:01<00:46,  3.41it/s]

An unexpected error occurred: ('invalid_scope: Invalid OAuth scope or ID token audience provided.', {'error': 'invalid_scope', 'error_description': 'Invalid OAuth scope or ID token audience provided.'})


  3%|▎         | 5/163 [00:01<00:45,  3.44it/s]

An unexpected error occurred: ('invalid_scope: Invalid OAuth scope or ID token audience provided.', {'error': 'invalid_scope', 'error_description': 'Invalid OAuth scope or ID token audience provided.'})


  4%|▎         | 6/163 [00:01<00:44,  3.51it/s]

An unexpected error occurred: ('invalid_scope: Invalid OAuth scope or ID token audience provided.', {'error': 'invalid_scope', 'error_description': 'Invalid OAuth scope or ID token audience provided.'})


  4%|▍         | 7/163 [00:02<00:43,  3.55it/s]

An unexpected error occurred: ('invalid_scope: Invalid OAuth scope or ID token audience provided.', {'error': 'invalid_scope', 'error_description': 'Invalid OAuth scope or ID token audience provided.'})


  5%|▍         | 8/163 [00:02<00:44,  3.46it/s]

An unexpected error occurred: ('invalid_scope: Invalid OAuth scope or ID token audience provided.', {'error': 'invalid_scope', 'error_description': 'Invalid OAuth scope or ID token audience provided.'})


  6%|▌         | 9/163 [00:02<00:49,  3.12it/s]

An unexpected error occurred: ('invalid_scope: Invalid OAuth scope or ID token audience provided.', {'error': 'invalid_scope', 'error_description': 'Invalid OAuth scope or ID token audience provided.'})


  6%|▌         | 10/163 [00:02<00:46,  3.28it/s]

An unexpected error occurred: ('invalid_scope: Invalid OAuth scope or ID token audience provided.', {'error': 'invalid_scope', 'error_description': 'Invalid OAuth scope or ID token audience provided.'})


  7%|▋         | 11/163 [00:03<00:44,  3.38it/s]

An unexpected error occurred: ('invalid_scope: Invalid OAuth scope or ID token audience provided.', {'error': 'invalid_scope', 'error_description': 'Invalid OAuth scope or ID token audience provided.'})


  7%|▋         | 12/163 [00:03<00:44,  3.43it/s]

An unexpected error occurred: ('invalid_scope: Invalid OAuth scope or ID token audience provided.', {'error': 'invalid_scope', 'error_description': 'Invalid OAuth scope or ID token audience provided.'})


  8%|▊         | 13/163 [00:03<00:41,  3.58it/s]

An unexpected error occurred: ('invalid_scope: Invalid OAuth scope or ID token audience provided.', {'error': 'invalid_scope', 'error_description': 'Invalid OAuth scope or ID token audience provided.'})


  9%|▊         | 14/163 [00:04<00:43,  3.43it/s]

An unexpected error occurred: ('invalid_scope: Invalid OAuth scope or ID token audience provided.', {'error': 'invalid_scope', 'error_description': 'Invalid OAuth scope or ID token audience provided.'})


  9%|▉         | 15/163 [00:04<00:45,  3.22it/s]

An unexpected error occurred: ('invalid_scope: Invalid OAuth scope or ID token audience provided.', {'error': 'invalid_scope', 'error_description': 'Invalid OAuth scope or ID token audience provided.'})


  9%|▉         | 15/163 [00:04<00:47,  3.13it/s]


KeyboardInterrupt: 