In [2]:
import os
import sys
import asyncio

import polars as pl
from jinja2 import Environment, FileSystemLoader
from openai import AsyncOpenAI

# Make the parent of the current working directory importable so the local
# `rag` package imported just below can be resolved. The membership check keeps
# re-running this cell from appending the same entry to sys.path repeatedly.
PROJECT_ROOT = os.path.abspath("..")
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)
from rag.openai_helpers import generate_prompts_from_text_list, get_num_tokens_from_string, calculate_token_pricing
from rag.parse_pdfs import analyze_pdf_table_content

# Raise the number of characters polars shows for string values to 5000 so the
# long extracted table text is not cut short when frames are displayed.
pl.Config.set_fmt_str_lengths(5000)

polars.config.Config

In [3]:
# PDF that the table analysis in this notebook is run against.
FILE_PATH = "../data/2023-20-04_JO_7110.65AA.pdf"
# Rates per one million tokens used by the cost cells, which print the results
# as dollar amounts. NOTE(review): confirm these still match current pricing.
GPT_4O_2024_08_06_PRICE_PER_MILLION_INPUT_TOKENS = 2.50
GPT_4O_2024_08_06_PRICE_PER_MILLION_OUTPUT_TOKENS = 10.00

In [4]:
# Async OpenAI client; the key is read from the environment rather than being
# written into the notebook.
client = AsyncOpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
)

# Create Text Descriptions for Extracted Tables

In [5]:
# Run the table analysis over the PDF, report how many rows came back,
# and preview the first few of them.
df_tables = analyze_pdf_table_content(FILE_PATH)
print(len(df_tables))
df_tables.head()

98


page_number,contains_tables,table_number,extracted_table_text
i64,i64,i64,str
2,1,1,"""{'CHANGE\nTO\nBASIC': [None, None, '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''], 'SUPPLEMENTS': ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''], None: ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''], 'OPTIONAL': [None, None, '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']}"""
25,1,1,"""{'Basic or\nChange': ['JO 7110.65AA', 'Change 1', 'Change 2', 'Change 3', 'JO 7110.65BB', 'Change 1', 'Change 2', 'Change 3'], 'Cutoff Date for\nCompletion': ['11/3/22', '4/20/23', '10/5/23', '3/21/24', '9/5/24', '2/20/25', '8/7/25', '1/22/26'], 'Effective Date\nof Publication': ['4/20/23', '10/5/23', '3/21/24', '9/5/24', '2/20/25', '8/7/25', '1/22/26', '7/9/26']}"""
26,1,1,"""{'Military\nHeadquarters': ['U.S. Army\nUSAASA', 'U.S. Air Force\nHQ AFFSA', 'U.S. Navy\nCNO (N980A)'], 'DSN': ['656−4868', '884-5509', '224−2638'], 'Commercial': ['(703) 806−4868', '(405) 734-5509', '(703) 614−2638']}"""
27,1,1,"""{'Branch': ['U.S. Navy', 'U.S. Air Force', 'U.S. Army'], 'Address': ['Department of the Navy\nChief of Naval Operations\nN980A, NAATSEA\n2000 Navy Pentagon (5D453)\nWashington, D.C. 20350−2000', 'HQ AFFSA\n5316 S. Douglas Blvd\nBldg 8400, Room 232\nOklahoma City, OK 73150', 'Director\nUSAASA (MOAS−AS)\n9325 Gunston Road, Suite N319\nFt. Belvoir, VA 22060−5582']}"""
32,1,1,"""{'Abbreviation': ['AAR .......', 'AAR .......', 'AC ........', 'ACC .......', 'ACE−IDS . . .', 'ACL .......', 'ACLS ......', 'ADAR .....', 'ADC .......', 'ADIZ ......', 'ADR .......', 'ADS .......', 'ADS−B .....', 'ADS−C .....', 'AFP .......', 'AIDC ......', 'AIM .......', 'AIRMET . . .', 'ALDARS . . .', 'ALERFA . . .', 'ALNOT ....', 'ALS .......', 'ALTRV .....', 'AM ........', 'AMASS ....', 'AMB ......', 'AMVER ....', 'ANG .......', 'APR .......'], 'Meaning': ['Adapted arrival route', 'Airport arrival rate', 'Advisory Circular', 'Area Control Center', 'ASOS Controller Equipment− Information\nDisplay System', 'Aircraft list', 'Automatic Carrier Landing System', 'Adapted departure arrival route', 'Aerospace Defense Command', 'Air Defense Identification Zone (to be\npronounced “AY DIZ”)', 'Adapted departure route', 'Automatic Dependent Surveillance', 'Automatic Dependent\nSurveillance−Broadcast', 'Automatic Dependent\nSurveillance−Contract', 'Airspace Flow Program', 'ATS Interfacility Data Communications', 'Aeronautical Information Manual', 'Airmen’s meteorological information', 'Automated Lightning Detection and\nReporting System', 'Alert phase code (Alerting Service)', 'Alert notice', 'Approach Light System', 'Altitude reservation', 'Ambiguity−A disparity greater than a\nlocally adapted distance exists between the\nposition declared for a target by MEARTS\nand another facility’s computer declared\nposition during interfacility handoff', 'Airport Movement Area Safety System', 'Ambiguity−A disparity greater than a\nlocally adapted distance exists between the\nposition declared for a target by STARS\nand another facility’s computer declared\nposition during interfacility handoff', 'Automated Mutual Assistance Vessel\nRescue System', 'Air National Guard', 'ATC preferred route']}"""


### Example

In [6]:
# Show the single row at index 44 as a one-row frame; it is the worked example below.
df_tables[44:45]

page_number,contains_tables,table_number,extracted_table_text
i64,i64,i64,str
87,1,4,"""{'As Zero': ['“Field elevation one six zero.”\n“Heading three zero zero.”\n“One zero thousand five\nhundred.”'], 'As Group': ['“Western five thirty.”\n“EMAIR One Ten.”\n“Ten thousand five hundred.”']}"""


In [7]:
# Pull the raw extracted text of the example row (index 44) out as a plain string.
table = df_tables["extracted_table_text"][44]
table

"{'As Zero': ['“Field elevation one six zero.”\\n“Heading three zero zero.”\\n“One zero thousand five\\nhundred.”'], 'As Group': ['“Western five thirty.”\\n“EMAIR One Ten.”\\n“Ten thousand five hundred.”']}"

In [8]:
# Load the table-to-text template from the prompts directory, fill it with the
# example table, and print the rendered prompt so it can be inspected.
environment = Environment(
    loader=FileSystemLoader(searchpath="../prompts/"),
)
template = environment.get_template(name="table_to_text_prompt.jinja2")
table_to_text_prompt = template.render(text=table)
print(table_to_text_prompt)

Please provide a detailed description in paragraph format consisting of only sentences and no bullet points or lists going over the content of a table. 
This table is formatted as a Python dictionary in which each key represents a column name and the corresponding value is a list of strings representing the row values for that column.
Do not explicitly state that the table is formatted as a Python dictionary.

TABLE
-----
{'As Zero': ['“Field elevation one six zero.”\n“Heading three zero zero.”\n“One zero thousand five\nhundred.”'], 'As Group': ['“Western five thirty.”\n“EMAIR One Ten.”\n“Ten thousand five hundred.”']}


In [9]:
# NOTE(review): this request is intentionally left commented out — presumably so
# that running the whole notebook does not trigger a billed API call; confirm.
# When enabled, it sends the rendered example prompt to gpt-4o-2024-08-06 and
# prints the text of the first returned choice.
# chat_completion = await client.chat.completions.create(
#     messages=[
#             {
#                 "role": "user",
#                 "content": f"{table_to_text_prompt}",
#             }
#         ],
#         model="gpt-4o-2024-08-06",
#         temperature=0,
#         seed=42,
#     )
# print(chat_completion.choices[0].message.content)

<img src="images/2_obtain_text_descriptions_images_tables_example_1.png" alt="Original Table" width="1200" />

The table presents two columns, each containing a list of strings that represent different aviation-related phrases. The first column, titled "As Zero," includes phrases such as "Field elevation one six zero," "Heading three zero zero," and "One zero thousand five hundred." These phrases are typically used in aviation to communicate specific numerical values related to field elevation, heading, and altitude, with a focus on the use of zeros in the numbers. The second column, titled "As Group," features phrases like "Western five thirty," "EMAIR One Ten," and "Ten thousand five hundred." These phrases also convey numerical information but are expressed in a manner that groups numbers together, such as in flight numbers or altitudes, without emphasizing individual zeros. The content of these columns highlights the different ways numerical information can be communicated in aviation contexts, either by emphasizing each digit or by grouping numbers together for clarity and brevity.

### All Extracted Table Text

In [10]:
# Build the prompts for every extracted table text, using the same template
# file as the single-table example.
table_texts = df_tables["extracted_table_text"].to_list()
prompts = generate_prompts_from_text_list(
    texts=table_texts,
    template_path="../prompts/",
    prompt_file_name="table_to_text_prompt.jinja2",
)

### Input Token Cost

In [11]:
# Count the tokens of each prompt, add them up, and price the total at the
# input-token rate.
prompt_tokens = [
    get_num_tokens_from_string(prompt_text, encoding_name="gpt-4o")
    for prompt_text in prompts
]
total_prompt_tokens = sum(prompt_tokens)

total_input_token_cost = calculate_token_pricing(
    total_prompt_tokens,
    GPT_4O_2024_08_06_PRICE_PER_MILLION_INPUT_TOKENS,
)
print(f"The total cost for {total_prompt_tokens} input tokens is about ${total_input_token_cost}.")

The total cost for 22572 input tokens is about $0.06.


### Output Token Cost

In [13]:
# The example description shown earlier serves as the sample response whose
# token count is the basis for the output estimate.
output = """The table presents two columns, each containing a list of strings that represent different aviation-related phrases. The first column, titled "As Zero," includes phrases such as "Field elevation one six zero," "Heading three zero zero," and "One zero thousand five hundred." These phrases are typically used in aviation to communicate specific numerical values related to field elevation, heading, and altitude, with a focus on the use of zeros in the numbers. The second column, titled "As Group," features phrases like "Western five thirty," "EMAIR One Ten," and "Ten thousand five hundred." These phrases also convey numerical information but are expressed in a manner that groups numbers together, such as in flight numbers or altitudes, without emphasizing individual zeros. The content of these columns highlights the different ways numerical information can be communicated in aviation contexts, either by emphasizing each digit or by grouping numbers together for clarity and brevity."""
output_tokens = get_num_tokens_from_string(output, encoding_name="gpt-4o")

# Worst-case allowance: assume each response is four times the sample's size,
# then scale by the number of prompts.
worst_case_tokens_per_response = output_tokens * 4
total_output_tokens_estimate = len(prompts) * worst_case_tokens_per_response

total_output_token_estimate_cost = calculate_token_pricing(
    total_output_tokens_estimate,
    GPT_4O_2024_08_06_PRICE_PER_MILLION_OUTPUT_TOKENS,
)
print(f"The total cost for {total_output_tokens_estimate} output tokens is about ${total_output_token_estimate_cost}.")

The total cost for 71344 output tokens is about $0.71.


### Estimated Total Cost

In [14]:
# Add the input cost and the estimated output cost. The sum is rounded to cents
# so binary floating-point noise (e.g. 0.1 + 0.2 -> 0.30000000000000004) cannot
# leak into the printed total.
estimated_total_cost = round(total_input_token_cost + total_output_token_estimate_cost, 2)
print(estimated_total_cost)

0.77


### OpenAI API Calls