In [3]:
import json
import re
import glob
from thucy.configuration import *

In [10]:
def read_verdict(file_path):
    """Return the text enclosed by the first ``<verdict>`` element of a trace file.

    Parameters
    ----------
    file_path : str or path-like
        Path of the trace file; it is read as UTF-8 text.

    Returns
    -------
    str
        The content between ``<verdict>`` and the first following
        ``</verdict>``, with leading/trailing whitespace stripped.

    Raises
    ------
    ValueError
        If the file contains no ``<verdict>...</verdict>`` element.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    # DOTALL lets the verdict span several lines; the non-greedy group stops
    # at the first closing tag.
    match = re.search(r"<verdict>(.*?)</verdict>", text, re.DOTALL)
    if match is None:
        # Fail with a message that names the offending file instead of an
        # opaque AttributeError on `None.group`.
        raise ValueError(f"No <verdict>...</verdict> element found in {file_path!r}")

    return match.group(1).strip()

# Sanity check: extract the verdict from a single gpt-4o-mini trace file.
read_verdict("gpt-4o-mini/TabFact_1-29789-1.html.csv_0-trace_649936bc924440b887e9e243ec5dec19.txt")

'Verified'

# Measuring Accuracy Results

## TabFact Small Test Split

First, you can download all the TabFact pairs from this [json](https://github.com/wenhuchen/Table-Fact-Checking/blob/master/tokenized_data/test_examples.json). Then, you can download the **small** split (i.e., 1998 indices) from this [json (small)](https://github.com/wenhuchen/Table-Fact-Checking/blob/master/data/small_test_id.json).

In the current directory, we already provide both of them: namely, `test_examples.json` and `small_test_id.json`. Let's load them up.

In [12]:
# Load the TabFact test examples (a dict keyed by table file name).
# The encoding is given explicitly so the result does not depend on the
# platform's default locale, matching how `read_verdict` opens its files.
with open("test_examples.json", "r", encoding="utf-8") as file:
    test_examples = json.load(file)

# Load the list of table ids that make up the small test split.
with open("small_test_id.json", "r", encoding="utf-8") as file:
    test_ids = json.load(file)

In [16]:
# Peek at the first (table file name, entry) pair to see the data layout.
next(iter(test_examples.items()))

('2-1570274-4.html.csv',
 [['tony lema be in the top 5 for the master tournament , the us open , and the open championship',
   'tournament that tony lema have participate in include the master tournament , the us open , the pga championship and the open championship',
   'the only tournament that tony lema win in be the open championship',
   'tony lema do not win in the us open',
   'tony lema make it to the top 10 in the pga championship , but do not continue on',
   'tony lema be in the top 5 for the pga championship , the us open , and the open championship',
   'tournament that tony lema have not participate in include the master tournament , the us open , the pga championship and the open championship',
   'tournament that tony lema won in be pga championship',
   'tony lema do not win in the pga championship',
   'tony lema make it to the top 10 in the us open , but do not continue on'],
  [1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
  'tony lema'])

In [18]:
# First entry of the small test split.
test_ids[0]

'1-24560733-1.html.csv'

In [19]:
# Check that this small-split entry is also a key of `test_examples`.
test_ids[0] in test_examples

True

Here, **1** means that the claim is **entailed**, and **0** means that the claim is **refuted**. You have probably noticed the awful grammar of the claims; they have been cleaned up by [Wang et al. (2024)](https://arxiv.org/pdf/2401.04398), and we plan to use that cleaned-up version in future work (we do not use it in the paper).

In [29]:
# Compute, per model, the accuracy of the verdicts stored in `<model>/TabFact*`
# trace files against the TabFact ground-truth labels, restricted to the
# tables listed in the small test split.
models = ['gpt-5-mini', 'gpt-4o-mini']

# Membership is tested once per trace file; a set avoids a linear scan of the
# `test_ids` list each time. `test_ids` itself is left untouched.
small_test_ids = set(test_ids)

for model in models:

    total_tests = 0
    correct_answers = 0

    for filename in glob.glob(f"{model}/TabFact*"):
        # File names look like `TabFact_<table file>_<claim index>-trace...`.
        match = re.search(r'TabFact_(.+)_(\d+)-trace', filename)

        if not match:
            # Raising a plain string is itself a TypeError; raise a real
            # exception that names the offending file.
            raise ValueError(f"Unexpected trace file name: {filename!r}")

        test_file, test_i = match.groups()

        # If the table is not in the small split, skip it.
        if test_file not in small_test_ids:
            continue

        test_i = int(test_i)

        # This is the ground truth (index 1 of an entry holds the labels).
        answer_i = test_examples[test_file][1][test_i]

        # This is Thucy's verdict. `glob` already returns the path including
        # the model directory, so it is passed on unchanged instead of being
        # rebuilt by splitting on '/'.
        verdict_str = read_verdict(filename)

        # Thucy predicts "ENTAILED" when it is Verified or Partly Verified, else "REFUTED"
        verdict = 1 if verdict_str in ('Verified', 'Partly Verified') else 0

        total_tests += 1

        # Count correct answers
        correct_answers += int(verdict == answer_i)

    print()
    print(f"-----------  Model: {model} -----------")

    if total_tests == 0:
        # Without any matching trace the accuracy is undefined; report that
        # instead of failing with ZeroDivisionError.
        print("No TabFact traces from the small test split were found.")
        continue

    accuracy = correct_answers / total_tests
    print(f"Accuracy: {100*accuracy:.2f}% , Total Tests: {total_tests} , Wrong Answers: {total_tests - correct_answers}")


-----------  Model: gpt-5-mini -----------
Accuracy: 94.34% , Total Tests: 1998 , Wrong Answers: 113

-----------  Model: gpt-4o-mini -----------
Accuracy: 93.69% , Total Tests: 1998 , Wrong Answers: 126


# Recreating the Results of the paper

## Download TabFact CSV files

In [30]:
# TODO: