Set-Up

In [None]:
import sys
import os

# Make the parent of the current working directory importable so that the
# `from scripts import ...` lines that follow can resolve. The membership check
# keeps the cell idempotent: re-running it no longer appends a duplicate entry
# to sys.path each time.
_parent_dir = os.path.abspath('..')
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

from scripts import request, download, stats
from scripts.constants import Constants
from scripts.table.constants import TableConstants
from scripts.table import extraction, types

# Absolute project root; environment variables inside the configured path are
# expanded before it is made absolute.
project_path = os.path.abspath(
    os.path.expandvars(Constants.PROJECT_PATH)
)

# Experiments directory under the project root, and the extracted-tables
# directory nested inside it.
experiments_path = os.path.join(project_path, Constants.Directories.EXPERIMENTS)
extracted_tables_path = os.path.join(
    project_path,
    Constants.Directories.EXPERIMENTS,
    Constants.Directories.EXTRACTED_TABLE,
)

# Connection settings are read from private.json one directory above the
# working directory (contents are defined by request.extract_infos; presumably
# endpoint/credential data — verify).
connection_info = request.extract_infos('../private.json')

Download Articles

In [None]:
# Name of the article set; it doubles as the sub-directory holding the
# downloaded articles and as the stem of the JSON file name.
article_dir = "CS_Test"
json_file_name = article_dir + ".json"

articles_path = os.path.join(
    experiments_path,
    Constants.Directories.ARTICLES,
    article_dir,
)

In [None]:
# Fetch the articles matching the quoted search phrase as HTML files into
# articles_path.
search_query = '"Entity Matching"'
download.get_articles(
    search_query,
    articles_path,
)

In [None]:
# Extract the tables from the articles in articles_path and save them as
# json_file_name under extracted_tables_path (argument roles assumed from the
# helper's name and the variable names — confirm in scripts.table.extraction).
extraction.extract_and_save_tables(
    articles_path,
    extracted_tables_path,
    json_file_name,
)

Claim Extraction

In [None]:
# Inputs for the claim-extraction run.
# NOTE: the run's output sub-directory has its own name (claim_output_dir)
# because a later cell binds `output_dir` to a full path. Sharing one name for
# both made the notebook's results depend on the order cells were executed in.
claim_output_dir = "label_output/caption_citation_new"
tables_file_name = 'CS_GroundTruth_Labeling_All_Values.json'
msgs_dir = "CS_Label/Caption_Citation_NEW"

# Where the run writes its results.
output_path = os.path.join(experiments_path, Constants.Directories.OUTPUT, claim_output_dir)
# The extracted-tables JSON used as input.
tables_file_path = os.path.join(extracted_tables_path, tables_file_name)
# Directory of message files for this run (presumably the prompt messages — verify).
msgs_path = os.path.join(project_path, Constants.Directories.MESSAGES, msgs_dir)

In [None]:
# Validate the tables file before it is used for the run; the exact checks are
# defined by extraction.check_extracted_data.
extraction.check_extracted_data(
    tables_file_path,
)

In [None]:
# Prepare the run directory from the tables file and the message files, then
# read back the description of the prepared run (roles assumed from the helper
# names — confirm in scripts.request).
request.set_up_test_dir(
    output_path,
    tables_file_path,
    msgs_path,
)
test_info = request.get_test_info(output_path)

In [None]:
# Execute the prepared run with 50 threads and at most 3 cycles (what a
# "cycle" covers is defined by request.run_test — presumably retry rounds,
# TODO confirm).
request.run_test(
    connection_info,
    test_info,
    num_thread=50,
    max_cycles=3,
)

Ground Truth

In [None]:
# Ground-truth locations: the labelled spreadsheet that serves as input and the
# directory handed to stats.write_ground_truth as its second argument.
gt_path = os.path.join(experiments_path, Constants.Directories.GROUND_TRUTH)
gt_answer_path = os.path.join(gt_path, "labeling_structure")
gt_file = os.path.join(gt_path, "table_labeled_by_claim_structure.ods")

stats.write_ground_truth(
    gt_file,
    gt_answer_path,
)

In [None]:
# Directory for comparison results, and the root directory under which the
# individual result sets ("label_output") are stored.
comparison_path = os.path.join(experiments_path, Constants.Directories.COMPARISONS)
output_dir = os.path.join(
    experiments_path,
    Constants.Directories.OUTPUT,
    "label_output",
)

In [None]:
# Result directories to compare against the ground truth.
# Other result sets that older, disabled code pointed at (sub-directories of
# output_dir): "caption", "html", "citation", "caption_citation". Add them to
# output_dirs below to include them (verify the directories still exist).
caption_dir = os.path.join(output_dir, "caption_new")
caption_citation_dir = os.path.join(output_dir, "caption_citation_new")

# Each entry is (result directory, 1). The meaning of the second element is
# defined by stats.compare_multiple_results — presumably a run number, TODO confirm.
output_dirs = [
    (caption_dir, 1),
    (caption_citation_dir, 1),
]

# Compare every listed result set with the ground truth using
# types.compare_table_types; comparison_path and the attribute mapping are
# passed through to stats.compare_multiple_results unchanged.
stats.compare_multiple_results(
    gt_answer_path,
    output_dirs,
    types.compare_table_types,
    comparison_path,
    {TableConstants.Attributes.TYPES: Constants.Claims.CLAIM_STRUCTURES},
)

In [None]:
# Check the claim types of the "original_prompt" result set (sub-directory "1")
# against the ground truth; the third argument is the "original_1" path under
# comparison_path (presumably where the check's output goes — verify).
original_prompt = os.path.join(output_dir, "original_prompt", "1")
types.check_claims_types(
    gt_answer_path,
    original_prompt,
    os.path.join(comparison_path, "original_1"),
)