In [1]:
import os
import sys

import polars as pl
from openai import OpenAI
from jinja2 import Environment, FileSystemLoader

# Make the parent directory importable so the local `rag` package below resolves.
# Guarded so that re-running this cell does not keep appending duplicate
# entries to sys.path.
PROJECT_ROOT = os.path.abspath("..")
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)
from rag.openai_helpers import (
    generate_prompts_from_text_list, 
    get_num_tokens_from_string, 
    calculate_token_pricing,
    write_requests_to_jsonl,
    create_openai_batch_process,
    check_openai_batch_status,
    read_batch_output_jsonl_to_polars
)
from rag.parse_pdfs import analyze_pdf_table_content

# Raise polars' string display limit to 5000 characters so the long extracted
# table text and generated descriptions are not truncated in cell output.
pl.Config.set_fmt_str_lengths(5000)

polars.config.Config

In [2]:
# PDF whose tables are extracted and described in this notebook.
FILE_PATH = "../data/2023-20-04_JO_7110.65AA.pdf"

# gpt-4o-2024-08-06 prices in dollars per one million tokens (regular API).
GPT_4O_2024_08_06_PRICE_PER_MILLION_INPUT_TOKENS = 2.50
GPT_4O_2024_08_06_PRICE_PER_MILLION_OUTPUT_TOKENS = 10.00

# Batch prices are half of the regular prices (1.25 input / 5.00 output).
BATCH_PRICE_FACTOR = 0.5
GPT_4O_2024_08_06_BATCH_PRICE_PER_MILLION_INPUT_TOKENS = (
    GPT_4O_2024_08_06_PRICE_PER_MILLION_INPUT_TOKENS * BATCH_PRICE_FACTOR
)
GPT_4O_2024_08_06_BATCH_PRICE_PER_MILLION_OUTPUT_TOKENS = (
    GPT_4O_2024_08_06_PRICE_PER_MILLION_OUTPUT_TOKENS * BATCH_PRICE_FACTOR
)

# Obtain Text Descriptions for Extracted Tables

In [3]:
# Run table extraction over the PDF, report how many rows came back, and
# preview the first few. Each row carries the page number, a table number and
# the extracted table text (see the columns in the output below).
df_tables = analyze_pdf_table_content(FILE_PATH)
print(df_tables.height)
df_tables.head()

98


page_number,contains_tables,table_number,extracted_table_text
i64,i64,i64,str
2,1,1,"""{'CHANGE\nTO\nBASIC': [None, None, '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''], 'SUPPLEMENTS': ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''], None: ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''], 'OPTIONAL': [None, None, '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']}"""
25,1,1,"""{'Basic or\nChange': ['JO 7110.65AA', 'Change 1', 'Change 2', 'Change 3', 'JO 7110.65BB', 'Change 1', 'Change 2', 'Change 3'], 'Cutoff Date for\nCompletion': ['11/3/22', '4/20/23', '10/5/23', '3/21/24', '9/5/24', '2/20/25', '8/7/25', '1/22/26'], 'Effective Date\nof Publication': ['4/20/23', '10/5/23', '3/21/24', '9/5/24', '2/20/25', '8/7/25', '1/22/26', '7/9/26']}"""
26,1,1,"""{'Military\nHeadquarters': ['U.S. Army\nUSAASA', 'U.S. Air Force\nHQ AFFSA', 'U.S. Navy\nCNO (N980A)'], 'DSN': ['656−4868', '884-5509', '224−2638'], 'Commercial': ['(703) 806−4868', '(405) 734-5509', '(703) 614−2638']}"""
27,1,1,"""{'Branch': ['U.S. Navy', 'U.S. Air Force', 'U.S. Army'], 'Address': ['Department of the Navy\nChief of Naval Operations\nN980A, NAATSEA\n2000 Navy Pentagon (5D453)\nWashington, D.C. 20350−2000', 'HQ AFFSA\n5316 S. Douglas Blvd\nBldg 8400, Room 232\nOklahoma City, OK 73150', 'Director\nUSAASA (MOAS−AS)\n9325 Gunston Road, Suite N319\nFt. Belvoir, VA 22060−5582']}"""
32,1,1,"""{'Abbreviation': ['AAR .......', 'AAR .......', 'AC ........', 'ACC .......', 'ACE−IDS . . .', 'ACL .......', 'ACLS ......', 'ADAR .....', 'ADC .......', 'ADIZ ......', 'ADR .......', 'ADS .......', 'ADS−B .....', 'ADS−C .....', 'AFP .......', 'AIDC ......', 'AIM .......', 'AIRMET . . .', 'ALDARS . . .', 'ALERFA . . .', 'ALNOT ....', 'ALS .......', 'ALTRV .....', 'AM ........', 'AMASS ....', 'AMB ......', 'AMVER ....', 'ANG .......', 'APR .......'], 'Meaning': ['Adapted arrival route', 'Airport arrival rate', 'Advisory Circular', 'Area Control Center', 'ASOS Controller Equipment− Information\nDisplay System', 'Aircraft list', 'Automatic Carrier Landing System', 'Adapted departure arrival route', 'Aerospace Defense Command', 'Air Defense Identification Zone (to be\npronounced “AY DIZ”)', 'Adapted departure route', 'Automatic Dependent Surveillance', 'Automatic Dependent\nSurveillance−Broadcast', 'Automatic Dependent\nSurveillance−Contract', 'Airspace Flow Program', 'ATS Interfacility Data Communications', 'Aeronautical Information Manual', 'Airmen’s meteorological information', 'Automated Lightning Detection and\nReporting System', 'Alert phase code (Alerting Service)', 'Alert notice', 'Approach Light System', 'Altitude reservation', 'Ambiguity−A disparity greater than a\nlocally adapted distance exists between the\nposition declared for a target by MEARTS\nand another facility’s computer declared\nposition during interfacility handoff', 'Airport Movement Area Safety System', 'Ambiguity−A disparity greater than a\nlocally adapted distance exists between the\nposition declared for a target by STARS\nand another facility’s computer declared\nposition during interfacility handoff', 'Automated Mutual Assistance Vessel\nRescue System', 'Air National Guard', 'ATC preferred route']}"""


## Example Table Text

In [4]:
# Row 44 is the worked example used throughout this section.
df_tables[44]

page_number,contains_tables,table_number,extracted_table_text
i64,i64,i64,str
87,1,4,"""{'As Zero': ['“Field elevation one six zero.”\n“Heading three zero zero.”\n“One zero thousand five\nhundred.”'], 'As Group': ['“Western five thirty.”\n“EMAIR One Ten.”\n“Ten thousand five hundred.”']}"""


In [5]:
# Pull the raw text of the example table (row 44): a stringified dict mapping
# each column name to its list of row values.
table = df_tables["extracted_table_text"][44]
table

"{'As Zero': ['“Field elevation one six zero.”\\n“Heading three zero zero.”\\n“One zero thousand five\\nhundred.”'], 'As Group': ['“Western five thirty.”\\n“EMAIR One Ten.”\\n“Ten thousand five hundred.”']}"

In [6]:
# Load the table-to-text prompt template from the prompts directory and fill
# its `text` slot with the example table, then show the final prompt.
prompt_loader = FileSystemLoader("../prompts/")
environment = Environment(loader=prompt_loader)
template = environment.get_template("table_to_text_prompt.jinja2")
table_to_text_prompt = template.render(text=table)
print(table_to_text_prompt)

Please provide a detailed description in paragraph format consisting of only sentences and no bullet points or lists going over the content of a table. 
This table is formatted as a Python dictionary in which each key represents a column name and the corresponding value is a list of strings representing the row values for that column.
Do not explicitly state that the table is formatted as a Python dictionary.

TABLE
-----
{'As Zero': ['“Field elevation one six zero.”\n“Heading three zero zero.”\n“One zero thousand five\nhundred.”'], 'As Group': ['“Western five thirty.”\n“EMAIR One Ten.”\n“Ten thousand five hundred.”']}


In [None]:
# NOTE(review): client construction is left commented out together with the
# single-request example in the next cell; uncomment both to re-run it.
# client = OpenAI(
#     api_key=os.environ.get("OPENAI_API_KEY"),
# )

In [None]:
# NOTE(review): one-off synchronous request for the example table, left
# commented out -- presumably so that re-running the notebook does not repeat
# a billed API call (TODO confirm). Needs `client` from the previous cell.
# chat_completion = client.chat.completions.create(
#     messages=[
#             {
#                 "role": "user",
#                 "content": f"{table_to_text_prompt}",
#             }
#         ],
#         model="gpt-4o-2024-08-06",
#         temperature=0,
#         seed=42,
#     )
# print(chat_completion.choices[0].message.content)

<img src="images/2_obtain_text_descriptions_images_tables_example_1.png" alt="Original Table" width="1200" />

**Example model response for the table above (output of the commented-out chat completion call):**

The table presents two columns, each containing a list of strings that represent different aviation-related phrases. The first column, titled "As Zero," includes phrases such as "Field elevation one six zero," "Heading three zero zero," and "One zero thousand five hundred." These phrases are typically used in aviation to communicate specific numerical values related to field elevation, heading, and altitude, with a focus on the use of zeros in the numbers. The second column, titled "As Group," features phrases like "Western five thirty," "EMAIR One Ten," and "Ten thousand five hundred." These phrases also convey numerical information but are expressed in a manner that groups numbers together, such as in flight numbers or altitudes, without emphasizing individual zeros. The content of these columns highlights the different ways numerical information can be communicated in aviation contexts, either by emphasizing each digit or by grouping numbers together for clarity and brevity.

## All Extracted Table Text

In [7]:
# Render one prompt per extracted table, using the same template as the
# single-table example above.
table_texts = df_tables["extracted_table_text"].to_list()
prompts = generate_prompts_from_text_list(
    texts=table_texts,
    template_path="../prompts/",
    prompt_file_name="table_to_text_prompt.jinja2",
)

### Input Token Cost

In [8]:
# Token count for every prompt, and the combined total across all prompts.
prompt_tokens = [
    get_num_tokens_from_string(prompt_text, encoding_name="gpt-4o")
    for prompt_text in prompts
]
total_prompt_tokens = sum(prompt_tokens)

# Price the same token total at the regular and at the batch input rate.
total_input_token_cost = calculate_token_pricing(
    total_prompt_tokens, GPT_4O_2024_08_06_PRICE_PER_MILLION_INPUT_TOKENS
)
total_input_token_batch_cost = calculate_token_pricing(
    total_prompt_tokens, GPT_4O_2024_08_06_BATCH_PRICE_PER_MILLION_INPUT_TOKENS
)

print(f"The cost for {total_prompt_tokens} input tokens is about ${total_input_token_cost}.")
print(f"The batch cost for {total_prompt_tokens} input tokens is about ${total_input_token_batch_cost}.")

The cost for 22572 input tokens is about $0.06.
The batch cost for 22572 input tokens is about $0.03.


### Output Token Cost

In [9]:
# Description generated for the example table, tokenized to gauge the size of
# one response.
output = """The table presents two columns, each containing a list of strings that represent different aviation-related phrases. The first column, titled "As Zero," includes phrases such as "Field elevation one six zero," "Heading three zero zero," and "One zero thousand five hundred." These phrases are typically used in aviation to communicate specific numerical values related to field elevation, heading, and altitude, with a focus on the use of zeros in the numbers. The second column, titled "As Group," features phrases like "Western five thirty," "EMAIR One Ten," and "Ten thousand five hundred." These phrases also convey numerical information but are expressed in a manner that groups numbers together, such as in flight numbers or altitudes, without emphasizing individual zeros. The content of these columns highlights the different ways numerical information can be communicated in aviation contexts, either by emphasizing each digit or by grouping numbers together for clarity and brevity."""
output_tokens = get_num_tokens_from_string(output, encoding_name="gpt-4o")

# Upper-bound estimate: budget 1000 output tokens (the max_tokens setting) for
# every prompt.
# NOTE(review): `output_tokens` above is computed but not used in this estimate.
MAX_TOKENS_PER_RESPONSE = 1000
total_output_tokens_estimate = MAX_TOKENS_PER_RESPONSE * len(prompts)

# Price the estimate at the regular and at the batch output rate.
total_output_token_estimate_cost = calculate_token_pricing(
    total_output_tokens_estimate, GPT_4O_2024_08_06_PRICE_PER_MILLION_OUTPUT_TOKENS
)
total_output_token_batch_estimate_cost = calculate_token_pricing(
    total_output_tokens_estimate, GPT_4O_2024_08_06_BATCH_PRICE_PER_MILLION_OUTPUT_TOKENS
)

print(f"The cost for {total_output_tokens_estimate} output tokens is about ${total_output_token_estimate_cost}.")
print(
    f"The batch cost for {total_output_tokens_estimate} output tokens is about ${total_output_token_batch_estimate_cost}."
)

The cost for 98000 output tokens is about $0.98.
The batch cost for 98000 output tokens is about $0.49.


### Estimated Total Cost

In [10]:
# Combine the input cost with the estimated output cost for each pricing tier.
# The sums are formatted to two decimals because adding floats can surface
# binary rounding artifacts (e.g. 0.1 + 0.2 -> 0.30000000000000004) or drop a
# trailing zero, neither of which belongs in a dollar amount.
estimated_total_cost = total_input_token_cost + total_output_token_estimate_cost
estimated_batch_total_cost = total_input_token_batch_cost + total_output_token_batch_estimate_cost
print(f"Total: ${estimated_total_cost:.2f}")
print(f"Batch Total: ${estimated_batch_total_cost:.2f}")

Total: $1.04
Batch Total: $0.52


### OpenAI Batch

In [None]:
# Write the prompts out as JSONL batch request file(s) under the batch data directory.
write_requests_to_jsonl(
    prompts=prompts,
    base_output_path="../data/batch_table_to_text",
    batch_request_file="tables_to_text",
)

In [None]:
# NOTE(review): batch submission is left commented out -- presumably so that
# re-running the notebook does not submit (and pay for) the same batch again
# (TODO confirm). The status check in the next cell reads the
# `batch_response_file` path passed here, so this call must have run at least once.
# create_openai_batch_process(
#     api_key=os.environ.get("OPENAI_API_KEY"), 
#     batch_request_file="../data/batch_table_to_text/tables_to_text_1.jsonl", 
#     batch_response_file="../data/batch_table_to_text/batch_response_tables_to_text_1.json", 
#     description="tables to text 1"
#     )

In [None]:
# Check the submitted batch using its saved response file; `output_file` is
# the JSONL path that the output-loading cell further down reads.
batch_response_path = "../data/batch_table_to_text/batch_response_tables_to_text_1.json"
batch_output_path = "../data/batch_table_to_text/batch_output_tables_to_text_1.jsonl"
check_openai_batch_status(
    api_key=os.environ.get("OPENAI_API_KEY"),
    batch_response_file=batch_response_path,
    output_file=batch_output_path,
)

In [11]:
# Figure entered by hand, not computed in this notebook -- presumably read
# from the OpenAI usage/billing page after the batch completed (TODO confirm).
print("Actual Batch Total: $0.16")

Actual Batch Total: $0.16


### Output

In [12]:
# Load the completed batch output: one generated description per prompt,
# keyed by `custom_id`.
# NOTE(review): this rebinds `df_tables`, which the cells above use for the
# extracted-table frame. Re-running those cells after this one will operate on
# the descriptions frame instead; a distinct name would avoid that.
batch_output_path = "../data/batch_table_to_text/batch_output_tables_to_text_1.jsonl"
df_tables = read_batch_output_jsonl_to_polars(batch_output_path)
df_tables.head()

id,custom_id,content
str,str,str
"""batch_req_GAq5cBQ1HT6AZ1Sd24cZVKjj""","""prompt_0""","""The table consists of four columns, each with a distinct heading, although one of the columns lacks a defined name. The first column is labeled ""CHANGE\nTO\nBASIC"" and contains a series of entries, most of which are either `None` or empty strings, indicating a lack of data or changes in this category. The second column, titled ""SUPPLEMENTS,"" is filled entirely with empty strings, suggesting that no supplementary information is provided for any of the entries. The third column is unnamed, represented by a `None` key, and similarly contains only empty strings, further emphasizing the absence of data or additional details. The final column, ""OPTIONAL,"" mirrors the pattern seen in the ""CHANGE\nTO\nBASIC"" column, with entries predominantly consisting of `None` or empty strings, indicating that optional information is either not applicable or not provided. Overall, the table appears to be largely devoid of substantive content across all columns, with the majority of entries being empty or undefined."""
"""batch_req_v7VjF65zNYLlbJJT5l453Wm1""","""prompt_1""","""The table presents a structured overview of a series of documents and their respective timelines for completion and publication. It begins with a column labeled ""Basic or Change,"" which lists a sequence of documents starting with ""JO 7110.65AA,"" followed by three subsequent changes labeled ""Change 1,"" ""Change 2,"" and ""Change 3."" This pattern repeats for another document, ""JO 7110.65BB,"" which is also followed by three changes. The next column, ""Cutoff Date for Completion,"" provides specific dates by which each document or change must be completed. These dates range from November 3, 2022, for the initial document, to January 22, 2026, for the last change listed. The final column, ""Effective Date of Publication,"" indicates when each document or change is officially published. The publication dates start on April 20, 2023, and extend to July 9, 2026. This structured format allows for a clear understanding of the timeline and progression of each document and its changes, highlighting the relationship between the completion and publication dates."""
"""batch_req_LcWzRXniPYQURjF1o3JRJYkr""","""prompt_2""","""The table presents information about various military headquarters, their DSN numbers, and corresponding commercial contact numbers. The first column lists the military headquarters, starting with the U.S. Army, specifically the USAASA, followed by the U.S. Air Force, identified as HQ AFFSA, and finally the U.S. Navy, noted as CNO (N980A). The second column provides the DSN numbers for each of these headquarters, with the U.S. Army having the DSN number 656−4868, the U.S. Air Force listed with 884-5509, and the U.S. Navy with 224−2638. The third column details the commercial phone numbers associated with each headquarters, where the U.S. Army can be reached at (703) 806−4868, the U.S. Air Force at (405) 734-5509, and the U.S. Navy at (703) 614−2638. This structured information allows for easy reference to contact details for these military entities."""
"""batch_req_DWfWSkuqNq6Ys1jqS8ciUoc5""","""prompt_3""","""The table contains information about three branches of the United States military, specifically the U.S. Navy, U.S. Air Force, and U.S. Army, along with their respective addresses. The U.S. Navy's address is listed as the Department of the Navy, Chief of Naval Operations, N980A, NAATSEA, located at 2000 Navy Pentagon (5D453) in Washington, D.C., with the postal code 20350−2000. The U.S. Air Force's address is given as HQ AFFSA, situated at 5316 S. Douglas Blvd, Bldg 8400, Room 232, in Oklahoma City, OK, with the postal code 73150. Lastly, the U.S. Army's address is noted as the Director, USAASA (MOAS−AS), at 9325 Gunston Road, Suite N319, in Ft. Belvoir, VA, with the postal code 22060−5582. Each address provides a detailed location for the respective branch's headquarters or main office, indicating specific departments or offices within larger complexes."""
"""batch_req_bIvlPpHSFNX1puN5wDUnOx1O""","""prompt_4""","""The table presents a collection of abbreviations alongside their corresponding meanings, offering a glimpse into various aviation and aeronautical terminologies. The abbreviation ""AAR"" is associated with two different meanings: ""Adapted arrival route"" and ""Airport arrival rate,"" indicating its use in different contexts. ""AC"" stands for ""Advisory Circular,"" while ""ACC"" refers to ""Area Control Center,"" both of which are crucial in aviation operations. The abbreviation ""ACE−IDS"" is expanded to ""ASOS Controller Equipment− Information Display System,"" highlighting its role in information management. ""ACL"" is short for ""Aircraft list,"" and ""ACLS"" denotes the ""Automatic Carrier Landing System,"" both essential for aircraft operations. ""ADAR"" is an abbreviation for ""Adapted departure arrival route,"" and ""ADC"" stands for ""Aerospace Defense Command,"" reflecting their specific functions in aviation. The term ""ADIZ"" is pronounced as ""AY DIZ"" and refers to the ""Air Defense Identification Zone,"" a critical area for national security. ""ADR"" is the ""Adapted departure route,"" while ""ADS"" stands for ""Automatic Dependent Surveillance,"" a key component in modern air traffic management. The abbreviations ""ADS−B"" and ""ADS−C"" represent ""Automatic Dependent Surveillance−Broadcast"" and ""Automatic Dependent Surveillance−Contract,"" respectively, both of which are advancements in surveillance technology. ""AFP"" is the ""Airspace Flow Program,"" and ""AIDC"" stands for ""ATS Interfacility Data Communications,"" both of which facilitate efficient airspace management. ""AIM"" refers to the ""Aeronautical Information Manual,"" a vital resource for pilots and aviation professionals. ""AIRMET"" is an abbreviation for ""Airmen’s meteorological information,"" providing essential weather updates. 
""ALDARS"" stands for the ""Automated Lightning Detection and Reporting System,"" crucial for weather monitoring. ""ALERFA"" is the ""Alert phase code (Alerting Service),"" and ""ALNOT"" is an ""Alert notice,"" both important for emergency response. ""ALS"" refers to the ""Approach Light System,"" aiding in aircraft landing. ""ALTRV"" stands for ""Altitude reservation,"" a term used in air traffic control. The abbreviation ""AM"" has a complex meaning related to ambiguity in target positioning, specifically ""Ambiguity−A disparity greater than a locally adapted distance exists between the position declared for a target by MEARTS and another facility’s computer declared position during interfacility handoff."" ""AMASS"" is the ""Airport Movement Area Safety System,"" enhancing airport safety. Another instance of ""AMB"" refers to a similar ambiguity issue, this time involving ""STARS"" instead of ""MEARTS."" ""AMVER"" stands for the ""Automated Mutual Assistance Vessel Rescue System,"" a maritime safety initiative. ""ANG"" is the ""Air National Guard,"" a military reserve force. Lastly, ""APR"" is the ""ATC preferred route,"" indicating a route preferred by air traffic control for efficiency."""
