In [1]:
import os
import sys

import polars as pl
from langchain_text_splitters import RecursiveCharacterTextSplitter

sys.path.append(os.path.abspath(os.path.join("..")))
from rag.parse_pdfs import analyze_pdf_text_content
from rag.openai_helpers import (
    get_num_tokens_from_string,
    calculate_token_pricing,
    create_embedding_request,
    write_requests_to_jsonl,
    create_openai_batch_process,
    check_openai_batch_status,
    read_batch_chat_completions_output_jsonl_to_polars,
    read_batch_embeddings_output_jsonl_to_polars,
)

pl.Config.set_fmt_str_lengths(5000)

polars.config.Config

In [2]:
TEXT_EMBEDDING_3_SMALL_PRICE_PER_MILLION_INPUT_TOKENS = 0.020
TEXT_EMBEDDING_3_BATCH_PRICE_PER_MILLION_INPUT_TOKENS = 0.010

# Table Text Descriptions

In [3]:
df_tables = read_batch_chat_completions_output_jsonl_to_polars(
    "../data/batch_table_to_text/batch_output_tables_to_text_1.jsonl"
)
df_tables.head()

id,custom_id,content
str,str,str
"""batch_req_GAq5cBQ1HT6AZ1Sd24cZVKjj""","""prompt_0""","""The table consists of four columns, each with a distinct heading, although one of the columns lacks a defined name. The first column is labeled ""CHANGE\nTO\nBASIC"" and contains a series of entries, most of which are either `None` or empty strings, indicating a lack of data or changes in this category. The second column, titled ""SUPPLEMENTS,"" is filled entirely with empty strings, suggesting that no supplementary information is provided for any of the entries. The third column is unnamed, represented by a `None` key, and similarly contains only empty strings, further emphasizing the absence of data or additional details. The final column, ""OPTIONAL,"" mirrors the pattern seen in the ""CHANGE\nTO\nBASIC"" column, with entries predominantly consisting of `None` or empty strings, indicating that optional information is either not applicable or not provided. Overall, the table appears to be largely devoid of substantive content across all columns, with the majority of entries being empty or undefined."""
"""batch_req_v7VjF65zNYLlbJJT5l453Wm1""","""prompt_1""","""The table presents a structured overview of a series of documents and their respective timelines for completion and publication. It begins with a column labeled ""Basic or Change,"" which lists a sequence of documents starting with ""JO 7110.65AA,"" followed by three subsequent changes labeled ""Change 1,"" ""Change 2,"" and ""Change 3."" This pattern repeats for another document, ""JO 7110.65BB,"" which is also followed by three changes. The next column, ""Cutoff Date for Completion,"" provides specific dates by which each document or change must be completed. These dates range from November 3, 2022, for the initial document, to January 22, 2026, for the last change listed. The final column, ""Effective Date of Publication,"" indicates when each document or change is officially published. The publication dates start on April 20, 2023, and extend to July 9, 2026. This structured format allows for a clear understanding of the timeline and progression of each document and its changes, highlighting the relationship between the completion and publication dates."""
"""batch_req_LcWzRXniPYQURjF1o3JRJYkr""","""prompt_2""","""The table presents information about various military headquarters, their DSN numbers, and corresponding commercial contact numbers. The first column lists the military headquarters, starting with the U.S. Army, specifically the USAASA, followed by the U.S. Air Force, identified as HQ AFFSA, and finally the U.S. Navy, noted as CNO (N980A). The second column provides the DSN numbers for each of these headquarters, with the U.S. Army having the DSN number 656−4868, the U.S. Air Force listed with 884-5509, and the U.S. Navy with 224−2638. The third column details the commercial phone numbers associated with each headquarters, where the U.S. Army can be reached at (703) 806−4868, the U.S. Air Force at (405) 734-5509, and the U.S. Navy at (703) 614−2638. This structured information allows for easy reference to contact details for these military entities."""
"""batch_req_DWfWSkuqNq6Ys1jqS8ciUoc5""","""prompt_3""","""The table contains information about three branches of the United States military, specifically the U.S. Navy, U.S. Air Force, and U.S. Army, along with their respective addresses. The U.S. Navy's address is listed as the Department of the Navy, Chief of Naval Operations, N980A, NAATSEA, located at 2000 Navy Pentagon (5D453) in Washington, D.C., with the postal code 20350−2000. The U.S. Air Force's address is given as HQ AFFSA, situated at 5316 S. Douglas Blvd, Bldg 8400, Room 232, in Oklahoma City, OK, with the postal code 73150. Lastly, the U.S. Army's address is noted as the Director, USAASA (MOAS−AS), at 9325 Gunston Road, Suite N319, in Ft. Belvoir, VA, with the postal code 22060−5582. Each address provides a detailed location for the respective branch's headquarters or main office, indicating specific departments or offices within larger complexes."""
"""batch_req_bIvlPpHSFNX1puN5wDUnOx1O""","""prompt_4""","""The table presents a collection of abbreviations alongside their corresponding meanings, offering a glimpse into various aviation and aeronautical terminologies. The abbreviation ""AAR"" is associated with two different meanings: ""Adapted arrival route"" and ""Airport arrival rate,"" indicating its use in different contexts. ""AC"" stands for ""Advisory Circular,"" while ""ACC"" refers to ""Area Control Center,"" both of which are crucial in aviation operations. The abbreviation ""ACE−IDS"" is expanded to ""ASOS Controller Equipment− Information Display System,"" highlighting its role in information management. ""ACL"" is short for ""Aircraft list,"" and ""ACLS"" denotes the ""Automatic Carrier Landing System,"" both essential for aircraft operations. ""ADAR"" is an abbreviation for ""Adapted departure arrival route,"" and ""ADC"" stands for ""Aerospace Defense Command,"" reflecting their specific functions in aviation. The term ""ADIZ"" is pronounced as ""AY DIZ"" and refers to the ""Air Defense Identification Zone,"" a critical area for national security. ""ADR"" is the ""Adapted departure route,"" while ""ADS"" stands for ""Automatic Dependent Surveillance,"" a key component in modern air traffic management. The abbreviations ""ADS−B"" and ""ADS−C"" represent ""Automatic Dependent Surveillance−Broadcast"" and ""Automatic Dependent Surveillance−Contract,"" respectively, both of which are advancements in surveillance technology. ""AFP"" is the ""Airspace Flow Program,"" and ""AIDC"" stands for ""ATS Interfacility Data Communications,"" both of which facilitate efficient airspace management. ""AIM"" refers to the ""Aeronautical Information Manual,"" a vital resource for pilots and aviation professionals. ""AIRMET"" is an abbreviation for ""Airmen’s meteorological information,"" providing essential weather updates. ""ALDARS"" stands for the ""Automated Lightning Detection and Reporting System,"" crucial for weather monitoring. ""ALERFA"" is the ""Alert phase code (Alerting Service),"" and ""ALNOT"" is an ""Alert notice,"" both important for emergency response. ""ALS"" refers to the ""Approach Light System,"" aiding in aircraft landing. ""ALTRV"" stands for ""Altitude reservation,"" a term used in air traffic control. The abbreviation ""AM"" has a complex meaning related to ambiguity in target positioning, specifically ""Ambiguity−A disparity greater than a locally adapted distance exists between the position declared for a target by MEARTS and another facility’s computer declared position during interfacility handoff."" ""AMASS"" is the ""Airport Movement Area Safety System,"" enhancing airport safety. Another instance of ""AMB"" refers to a similar ambiguity issue, this time involving ""STARS"" instead of ""MEARTS."" ""AMVER"" stands for the ""Automated Mutual Assistance Vessel Rescue System,"" a maritime safety initiative. ""ANG"" is the ""Air National Guard,"" a military reserve force. Lastly, ""APR"" is the ""ATC preferred route,"" indicating a route preferred by air traffic control for efficiency."""


### Chunking

In [4]:
table_text_splitter = RecursiveCharacterTextSplitter(chunk_size=450, chunk_overlap=50)

In [5]:
table_text_chunks = table_text_splitter.create_documents(df_tables["content"])
table_text_chunks = [tc.page_content.replace("\n", " ") for tc in table_text_chunks]
print(f"Total table text chunks: {len(table_text_chunks)}")

Total table text chunks: 342


In [6]:
print(table_text_chunks[0:5])

['The table consists of four columns, each with a distinct heading, although one of the columns lacks a defined name. The first column is labeled "CHANGE\\nTO\\nBASIC" and contains a series of entries, most of which are either `None` or empty strings, indicating a lack of data or changes in this category. The second column, titled "SUPPLEMENTS," is filled entirely with empty strings, suggesting that no supplementary information is provided for any of', 'supplementary information is provided for any of the entries. The third column is unnamed, represented by a `None` key, and similarly contains only empty strings, further emphasizing the absence of data or additional details. The final column, "OPTIONAL," mirrors the pattern seen in the "CHANGE\\nTO\\nBASIC" column, with entries predominantly consisting of `None` or empty strings, indicating that optional information is either not applicable or', 'optional information is either not applicable or not provided. Overall, the table appears 

### Input Token Cost

In [7]:
tokens = []
for text in table_text_chunks:
    num_tokens = get_num_tokens_from_string(text, encoding_name="gpt-4o")
    tokens.append(num_tokens)
total_tokens = sum(tokens)

total_input_token_cost = calculate_token_pricing(total_tokens, TEXT_EMBEDDING_3_SMALL_PRICE_PER_MILLION_INPUT_TOKENS)
print(f"The cost for {total_tokens} input tokens is about ${total_input_token_cost}.")

total_input_token_batch_cost = calculate_token_pricing(
    total_tokens, TEXT_EMBEDDING_3_BATCH_PRICE_PER_MILLION_INPUT_TOKENS
)
print(f"The batch cost for {total_tokens} input tokens is about ${total_input_token_batch_cost}.")

The cost for 27903 input tokens is about $0.0006.
The batch cost for 27903 input tokens is about $0.0003.


### OpenAI Batch

In [8]:
write_requests_to_jsonl(
    table_text_chunks,
    base_output_path="../data/batch_table_text_embeddings",
    batch_request_file="table_text_embeddings",
    create_request=create_embedding_request,
)

Written 342 requests to ../data/batch_table_text_embeddings/table_text_embeddings_1.jsonl.
Total requests written across all files: 342


In [None]:
# create_openai_batch_process(
#     api_key=os.environ.get("OPENAI_API_KEY"),
#     batch_request_file="../data/batch_table_text_embeddings/table_text_embeddings_1.jsonl",
#     batch_response_file="../data/batch_table_text_embeddings/batch_response_table_text_embeddings_1.json",
#     endpoint="/v1/embeddings",
#     description="table text embeddings 1"
#     )

In [None]:
# check_openai_batch_status(
#     api_key=os.environ.get("OPENAI_API_KEY"),
#     batch_response_file="../data/batch_table_text_embeddings/batch_response_table_text_embeddings_1.json",
#     output_file="../data/batch_table_text_embeddings/batch_output_table_text_embeddings_1.jsonl"
#     )

### Output

In [9]:
df_table_embeddings = read_batch_embeddings_output_jsonl_to_polars(
    "../data/batch_table_text_embeddings/batch_output_table_text_embeddings_1.jsonl"
)
df_table_embeddings.head()

id,custom_id,embedding
str,str,list[f64]
"""batch_req_ggRKLe5fAIPEpfJXCdkpow9N""","""embedding_0""","[-0.025043, 0.035453, … -0.017601]"
"""batch_req_s5c4REzZvy3I2aZneUwOHE1P""","""embedding_1""","[0.017219, 0.014126, … 0.003443]"
"""batch_req_9OQjWJbWQgQoAQ8mvZ1y6Uck""","""embedding_2""","[-0.034105, 0.015981, … 0.026129]"
"""batch_req_3jJduJLsoDchcgISNbQjYHq6""","""embedding_3""","[-0.008618, 0.037194, … -0.013461]"
"""batch_req_8foQcerAKC5jEXxD4xDZsPaH""","""embedding_4""","[0.0423, 0.020409, … 0.007719]"


# Doc Text

In [10]:
FILE_PATH = "../data/2023-20-04_JO_7110.65AA.pdf"

In [11]:
df_doc_text = analyze_pdf_text_content(FILE_PATH)
print(df_doc_text.height)
df_doc_text.head()

689


page_number,contains_text,extracted_text
i64,i64,str
1,1,"""ORDER JO 7110.65AA Air Traffic Organization Policy Effective Date: April 20, 2023 SUBJ: Air Traffic Control This order prescribes air tra ffic control procedures and phras eology for use by personnel providing air traffic control services. Controll ers are required to be familiar with the provisi ons of this order that pertain to their operational respon sibilities and to exercise their be st judgment if they encounter situations not covered by it. Natasha A. Durkins Vice President, Mission Support Services Air Traffic Organization Distribution: ZAT-710, ZAT-464 Initiated By: AJV-0 Vice President, Mission Support Services NATASHA A. DURKINSDigitally signed by NATASHA A. DURKINS Date: 2023.03.03 12:50:09 -05'00'"""
2,1,"""RECORD OF CHANGES DIRECTIVE NO. JO 7110.65AA CHANGE TO BASIC SUPPLEMENTS OPTIONAL CHANGE TO BASIC SUPPLEMENTS OPTIONAL FAA Form 1320−5 (6−80) USE PREVIOUS EDITION"""
3,1,"""4/20/23 JO 7110.65AA Explanation of Changes Basic Direct questions through appropriate facility/service center office staff to the Office of Primary Interest (OPI) a. 1−2−6. ABBREVIATIONS 5−1−2. ATC SURVEILLANCE SOURCE USE 5−5−4. MINIMA 5−5−7. PASSING OR DIVERGING 5−5−9. SEPARATION FROM OBSTRUCTIONS 5−13−8. CONTROLLER INITIA TED COAST TRACKS This change adds operational guidance associated with the use of the Standard Terminal Automation Replacement System (STARS) Multi −Sensor Mode when the sensor environment does not support the use of FUSION and the use of single sensor does not provide suf ficient surveillance coverage. Enhanced Backup Surveillance (EBUS) has been decommissioned throughout the National Airspace System (NAS) and references have been removed. b. 1−2−6. ABBREVIA TIONS 5−3−4. TERMINAL AUT OMATION SYSTEMS IDENTIFICATION METHODS 5−4−6. RECEIVING CONTROLLER HANDOFF This change reinstates “AM” to paragraphs 5 −3−4 and 5−4−6 and updates 5 −4−6 to include Micro −En Route Automated Radar T racking System (MEARTS). This change also adds the definition of “AM” to 1 −2−6, and updates the current definition of AMB to remove the 2 −mile disparity value as it is locally adaptable and not uniform across all facilities. c. 2−1−4. OPERA TIONAL PRIORITY 2−4−20. AIRCRAFT IDENTIFICATION 9−2−17. SAMP FLIGHTS This change modifies the statement in paragraph 2 −4−20 that the “SAMP” call sign will be followed by a three− digit flight number instead of specifying the last three digits of the aircraft’s tail number. Other general edits and reference changes are made to paragraphs 2− 1−4, 2− 4−20, and 9− 2−17 for clarity. d. 2−1−4. OPERA TIONAL PRIORITY 9−2−22. OPEN SKIES TREATY AIRCRAFT This change removes all documentation and references to Open Skies Treaty flights in paragraph 2 −1−4. This change deletes paragraph 9− 2−22, Open Skies Treaty Aircraft. e. 2−1−27. PILOT DEVIA TION NOTIFICATION This change renames the paragraph title and also adds a note to the paragraph identifying “Brasher Notification or Brasher Warning” as terms sometimes used to reference the phraseology for notifying a pilot of a possible pilot deviation. f. 2−6−4. ISSUING WEA THER AND CHAFF AREAS 5−4−10. EN ROUTE FOUR TH LINE DATA BLOCK USAGE This change harmonizes the language in FAA Order JO 71 10.65 2− 6−4k and 5− 4−10f Note 2 and Note 3 in which it explains the use of /NA V AID, /waypoint, and /F entries in the 4th line of the Full Data Block (FDB) when an aircraft has been cleared to deviate for weather. Additionally, the designated characters used for coordinating deviations between two specified headings in FAA Order JO 7110.65 5 −4−10f were changed to eliminate ambiguity. g. 2−6−6. HAZARDOUS IN FLIGHT WEATHER ADVISORY This change acknowledges that controllers are no longer required to disseminate Airmen’s Meteorological Information (AIRMET) over the contiguous United States (CONUS). It updates the language in FAA Order Explanation of Changes E of C− 1"""
4,1,"""JO 7110.65AA 4/20/23 JO 7110.65, Air Traffic Control, paragraph 2 −6−6, to reflect the change. ATC facilities in the CONUS will no longer receive AIRMET advisories to broadcast; operators have other methods of receiving AIRMET information over the CONUS. h. 5−2−7. VFR CODE ASSIGNMENTS This change adds a Note advising that data blocks displaying beacon code 1203 represent the lead aircraft of a Visual Flight Rules (VFR) standard formation flight not receiving ATC services. i. 5−4−10. EN ROUTE FOUR TH LINE DATA BLOCK USAGE This change modifies FAA Order JO 7110.65, 5 −4−10g and 5− 4−10h, to accommodate the current En Route Automation Modernization (ERAM) display methods of assigned speed data. The use of the designation characters “M”, “S”, “+”, “−” and “.” will be included as acceptable methods to display speed assignment utilizing the four character limit for speed entries in the fourth line of the Full Data Block (FDB). j. 6−4−3. MINIMA ON OPPOSITE COURSES 6−5−4. MINIMA ALONG OTHER THAN ESTABLISHED AIRWAYS OR ROUTES 6−5−5. RNA V MINIMA − DIVERGING/CROSSING COURSES This change replaces the term “expanded route” in paragraphs 6 −4−3(c) and (d) and 6 −5−5(b) with language that will explicitly define when to apply the 18 mile separation standard. Also Figure 6 −5−4 will be corrected. k. 8−7−4. LATERAL SEPARATION 8−8−4. LATERAL SEPARATION 8−9−4. LATERAL SEPARATION 8−10−4. LATERAL SEPARATION This change reduces the lateral separation minima from 30 NM to 23 NM in the oceanic airspaces of Oakland ARTCC, New York ARTCC, and portions of Anchorage ARTCC. This change also clarifies where 50 NM separation is used by removing the references to airspace that has been designated as Offshore Airspace from 8−7−4 subparagraph b, and 8 −8−4 subparagraph b, by defining where it is used in New Y ork’s and San Juan’s airspace. This change cancels and incorporates N JO 7110.788, which was effective December 21, 2022. l. 9−2−12. LAW ENFORCEMENT OPERATIONS This change retitles paragraph 9 −2−12, reformats the paragraph by reorganizing and modifying the existing subparagraphs a. and b., and adds procedures required to ensure the effective use of sensitive government mission beacon codes and call signs. This change also eliminates outdated terminology and aligns the language with updates to other orders. This change cancels and incorporates N JO 7110.787, which was effective December 01, 2022 . m. 13−1−1. DESCRIPTION This change amends “flight plan data” to “current plan data” which is a more accurate depiction of how the En Route Decision Support Tool (EDST) calculates trajectories and predicts conflicts. It also adds a NOTE to define what is meant by “current plan.” n. Editorial Changes Editorial changes include updating references to re −numbered paragraphs, changing “operational supervisor” to “operations supervisor” where applicable, making the acronym DoD consistent throughout the order, changing “runway extended” to “extended runway” in paragraph 7 −4−4 to make consistent with the rest of the order, and deleting the obsolete ARTS to non −ARTS transition info in paragraph 2 −3−4. o. Entire publication Additional editorial/format changes were made where necessary. Revision bars were not used because of the insignificant nature of these changes. E of C− 2"""
5,1,"""4/20/23 JO 7110.65AA Table of Contents Chapter 1. General Section 1. Introduction Paragraph Page 1−1−1. PURPOSE OF THIS ORDER ............................................ 1−1−1 1−1−2. AUDIENCE .......................................................... 1−1−1 1−1−3. WHERE TO FIND THIS ORDER ........................................ 1−1−1 1−1−4. WHAT THIS ORDER CANCELS ........................................ 1−1−1 1−1−5. EXPLANATION OF CHANGES ......................................... 1−1−1 1−1−6. EFFECTIVE DATES AND SUBMISSIONS FOR CHANGES .................. 1−1−1 1−1−7. DELIVERY DATES ................................................... 1−1−2 1−1−8. RECOMMENDATIONS FOR PROCEDURAL CHANGES .................... 1−1−2 1−1−9. REQUESTS FOR INTERPRETATIONS OR CLARIFICATIONS TO THIS ORDER 1 −1−2 1−1−10. PROCEDURAL LETTERS OF AGREEMENT (LOA) ....................... 1−1−3 1−1−11. CONSTRAINTS GOVERNING SUPPLEMENTS AND PROCEDURAL DEVIATIONS ....................................................... 1−1−3 1−1−12. SAFETY MANAGEMENT SYSTEM (SMS) .............................. 1−1−3 1−1−13. REFERENCES TO FAA NON−AIR TRAFFIC ORGANIZATIONS ............ 1−1−4 1−1−14. DISTRIBUTION ..................................................... 1−1−4 Section 2. Terms of Reference 1−2−1. WORD MEANINGS ................................................... 1−2−1 1−2−2. COURSE DEFINITIONS ............................................... 1−2−2 1−2−3. NOTES ............................................................. 1−2−3 1−2−4. REFERENCES ....................................................... 1−2−3 1−2−5. ANNOTATIONS ...................................................... 1−2−3 1−2−6. ABBREVIATIONS .................................................... 1−2−4 Chapter 2. General Control Section 1. General 2−1−1. ATC SERVICE ....................................................... 2−1−1 2−1−2. DUTY PRIORITY ..................................................... 2−1−2 2−1−3. PROCEDURAL PREFERENCE .......................................... 2−1−2 2−1−4. OPERATIONAL PRIORITY ............................................ 2−1−2 2−1−5. EXPEDITIOUS COMPLIANCE ......................................... 2−1−4 2−1−6. SAFETY ALERT ...................................................... 2−1−5 2−1−7. INFLIGHT EQUIPMENT MALFUNCTIONS ............................... 2−1−6 2−1−8. MINIMUM FUEL ..................................................... 2−1−6 2−1−9. REPORTING ESSENTIAL FLIGHT INFORMATION ........................ 2−1−6 2−1−10. NAVAID MALFUNCTIONS ........................................... 2−1−6 2−1−11. USE OF MARSA ..................................................... 2−1−7 2−1−12. MILITARY PROCEDURES ............................................ 2−1−8 2−1−13. FORMATION FLIGHTS ............................................... 2−1−8 2−1−14. COORDINATE USE OF AIRSPACE ..................................... 2−1−9 2−1−15. CONTROL TRANSFER ............................................... 2−1−10 Table of Contents i"""


### Chunking

In [12]:
doc_text_splitter = RecursiveCharacterTextSplitter(chunk_size=450, chunk_overlap=50)

In [13]:
doc_text_chunks = doc_text_splitter.create_documents(df_doc_text["extracted_text"])
doc_text_chunks = [tc.page_content.replace("\n", " ") for tc in doc_text_chunks]
print(f"Total table text chunks: {len(doc_text_chunks)}")

Total table text chunks: 4536


In [14]:
print(doc_text_chunks[0:5])

['ORDER  JO 7110.65AA  Air Traffic Organization Policy  Effective Date:  April 20, 2023  SUBJ: Air Traffic Control  This order prescribes air tra ffic control procedures and phras eology for use by personnel providing  air traffic control services. Controll ers are required to be familiar with the provisi ons of this order that  pertain to their operational respon sibilities and to exercise their be st judgment if they encounter', "situations not covered by it.  Natasha A. Durkins  Vice President, Mission Support Services Air Traffic Organization  Distribution: ZAT-710, ZAT-464 Initiated By: AJV-0  Vice President, Mission Support Services NATASHA A.  DURKINSDigitally signed by NATASHA A. DURKINS Date: 2023.03.03  12:50:09 -05'00'", 'RECORD OF CHANGES DIRECTIVE NO.      JO 7110.65AA  CHANGE  TO  BASIC SUPPLEMENTS  OPTIONAL CHANGE  TO  BASIC SUPPLEMENTS  OPTIONAL  FAA Form 1320−5 (6−80) USE PREVIOUS EDITION', '4/20/23 JO 7110.65AA  Explanation of Changes  Basic  Direct questions through 

### Input Token Cost

In [16]:
tokens = []
for text in doc_text_chunks:
    num_tokens = get_num_tokens_from_string(text, encoding_name="gpt-4o")
    tokens.append(num_tokens)
total_tokens = sum(tokens)

total_input_token_cost = calculate_token_pricing(total_tokens, TEXT_EMBEDDING_3_SMALL_PRICE_PER_MILLION_INPUT_TOKENS)
print(f"The cost for {total_tokens} input tokens is about ${total_input_token_cost}.")

total_input_token_batch_cost = calculate_token_pricing(
    total_tokens, TEXT_EMBEDDING_3_BATCH_PRICE_PER_MILLION_INPUT_TOKENS
)
print(f"The batch cost for {total_tokens} input tokens is about ${total_input_token_batch_cost}.")

The cost for 419656 input tokens is about $0.0084.
The batch cost for 419656 input tokens is about $0.0042.


### OpenAI Batch

In [17]:
write_requests_to_jsonl(
    doc_text_chunks,
    base_output_path="../data/batch_doc_text_embeddings",
    batch_request_file="doc_text_embeddings",
    create_request=create_embedding_request,
)

Written 4536 requests to ../data/batch_doc_text_embeddings/doc_text_embeddings_1.jsonl.
Total requests written across all files: 4536


In [25]:
# create_openai_batch_process(
#     api_key=os.environ.get("OPENAI_API_KEY"),
#     batch_request_file="../data/batch_doc_text_embeddings/doc_text_embeddings_1.jsonl",
#     batch_response_file="../data/batch_doc_text_embeddings/batch_response_doc_text_embeddings_1.json",
#     endpoint="/v1/embeddings",
#     description="doc text embeddings 1"
#     )

In [29]:
check_openai_batch_status(
    api_key=os.environ.get("OPENAI_API_KEY"),
    batch_response_file="../data/batch_doc_text_embeddings/batch_response_doc_text_embeddings_1.json",
    output_file="../data/batch_doc_text_embeddings/batch_output_doc_text_embeddings_1.jsonl",
)

OpenAI batch job batch_66fdec4a0754819080514ec411bab87a has status: finalizing


### Output

In [None]:
df_doc_text_embeddings = read_batch_embeddings_output_jsonl_to_polars(
    "../data/batch_doc_text_embeddings/batch_output_doc_text_embeddings_1.jsonl"
)
df_doc_text_embeddings.head()