In [1]:
import sys
backend_path = '../backend'
if backend_path not in sys.path:
        sys.path.append(backend_path)

In [2]:
from sqlalchemy import create_engine, select, values, update, and_, exists, text
from sqlalchemy.orm import sessionmaker
from dotenv import load_dotenv
from app.models.models import Notice, ResourceLink
from app.models.schema import NoticeBase, ResourceLinkBase
from dotenv import load_dotenv
import os
import requests
import json
import re
import pandas as pd
import pendulum

import tiktoken


In [3]:

load_dotenv()

POSTGRES_PASSWORD = os.environ.get("POSTGRES_PASSWORD")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY")
DATABASE_URL = "postgresql+psycopg2://airflow:airflow@localhost:5432/airflow"

In [4]:
engine = create_engine(DATABASE_URL)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
selected_date = pendulum.now("utc").subtract(days=1).strftime("%Y%m%d")

Get all parsed 

In [5]:
with SessionLocal() as db:
    stmt = text("""select text from resource_links 
                    where 
                    notice_id IN (select id from notices where \"postedDate\" = '2024-03-13') 
                    and 
                    text != 'unparsable' 
                    and
                    text is not null""") 
    results = db.execute(stmt).scalars().all()

In [6]:
len(results)

0

In [7]:
all_texts = " ".join(results)
len(all_texts)

0

In [8]:
def num_tokens_in_corpus(input:str, encoding_name: str = "gpt-3.5-turbo") -> int:
    encoding = tiktoken.encoding_for_model(encoding_name)
    num_tokens = len(encoding.encode(input))
    return num_tokens

In [9]:
def est_costs(price_input_mil: float = 10.0, price_output_mil: float = 30.0, len_input: int = 0) -> float:
    price_per_token_input = price_input_mil / 1000000
    price_per_token_output = price_output_mil / 1000000
    print(f"Cost of input: {len_input * price_per_token_input}; Cost of output: {len_input * price_per_token_output}")

In [10]:
num_tokens = num_tokens_in_corpus(all_texts)

| Model                 | Input Cost | Output Cost |
| --------------------- | ---------- | ----------- |
| gpt-4                 | $30.00     | $60.00      |
| gpt-4-32k             | $60.00     | $120.00     |
| gpt-3.5-turbo-0125    | $0.50      | $1.50       |
| gpt-3.5-turbo-instruct| $1.50      | $2.00       |
| haiku          | $0.25      | $1.25       |
| sonnet | $3.00      | $15.00      |
| opus | $5.00      | $75.00      |

In [11]:
def gpt_4(num_tokens: int) -> str:
    return est_costs(30, 60, num_tokens)

In [12]:
def gpt_3_5(num_tokens: int) -> str:
    return est_costs(.50, 1.5, num_tokens)

In [13]:
def haiku(num_tokens: int) -> str:
    return est_costs(.25, 1.25, num_tokens)

In [14]:
def sonnet(num_tokens: int) -> str:
    return est_costs(3, 15, num_tokens)

In [15]:
gpt_4(num_tokens), gpt_3_5(num_tokens), haiku(num_tokens), sonnet(num_tokens)

Cost of input: 0.0; Cost of output: 0.0
Cost of input: 0.0; Cost of output: 0.0
Cost of input: 0.0; Cost of output: 0.0
Cost of input: 0.0; Cost of output: 0.0


(None, None, None, None)

Cost to parse file heads for relevancy - 

In [16]:
len(results)

0

Get the file heads if the file is less than 100000 chars long

In [17]:
result_heads = [result[:2000] for result in results if len(result) < 100000]

In [18]:
len(result_heads)

0

In [19]:
all_result_heads = " ".join(result_heads)

In [20]:
num_head_tokens = num_tokens_in_corpus(all_result_heads)
num_head_tokens

0

In [21]:
gpt_3_5(num_head_tokens), haiku(num_head_tokens)

Cost of input: 0.0; Cost of output: 0.0
Cost of input: 0.0; Cost of output: 0.0


(None, None)

In [28]:
prior_date = pendulum.now().subtract(days=1).strftime('%Y-%m-%d')
prior_date

'2024-03-14'

In [29]:
with SessionLocal() as db:
    stmt = text("""select text, length(text) as len from resource_links 
                    where 
                    notice_id IN (select id from notices where \"postedDate\" = :prior_date) 
                    and 
                    text != 'unparsable' 
                    and
                    text != 'adobe-error'
                    and
                    text != 'encoding-error'
                    and
                    text is not null
                    order by len desc
                """) 
    results = db.execute(stmt, params={"prior_date": prior_date}).all()

In [32]:
results[0]

('Contract No. W912DS21D0001\nTask Order No. W912DS23F0011\n\nRADIAL FORGE\nINFRASTRUCTURE\nWATERVLIET ARSENAL, NY\nSPECIFICATIONS – VOLUME 2 OF 3\nFIN ... (1377409 characters truncated) ... e to the equipment on\na regular and emergency basis during the warranty period of the contract.\n-- End of Section --\n\nSECTION 23 81 00\n\nPage 20', 1329159)

ID `236220` is Commercial and Institutional Building construction, the category that usually has the most frequent additions

In [38]:
with SessionLocal() as db:
    stmt = text("""select text from resource_links 
                    where notice_id in 
                        (select id from notices
                            where
                            naics_code_id = 
                                (select id from naics_codes where \"naicsCode\" = 236220)
                                and
                                \"postedDate\" = :prior_date)
                    and 
                    text != 'unparsable' 
                    and
                    text != 'adobe-error'
                    and
                    text != 'encoding-error'
                    and
                    text is not null
                """) 
    results = db.execute(stmt, params={"prior_date": prior_date}).scalars().all()

In [39]:
len(results)

193

In [40]:
all_texts = " ".join(results)
len(all_texts)

8849300

In [41]:
num_tokens = num_tokens_in_corpus(all_texts)

In [42]:
num_tokens

2089979

In [43]:
gpt_3_5(num_tokens), haiku(num_tokens)

Cost of input: 1.0449895; Cost of output: 3.1349685000000003
Cost of input: 0.52249475; Cost of output: 2.6124737500000004


(None, None)

In [52]:
with SessionLocal() as db:
    stmt = text("""select text from (select text, length(text) as len from resource_links 
                    where notice_id in 
                        (select id from notices
                            where
                            naics_code_id = 
                                (select id from naics_codes where \"naicsCode\" = 236220)
                                and
                                \"postedDate\" = :prior_date))
                    where len < 150000
                    order by len desc 
                """) 
    results = db.execute(stmt, params={"prior_date": prior_date}).scalars().all()

In [53]:
all_texts = " ".join(results)
len(all_texts)

2201440

In [54]:
num_tokens = num_tokens_in_corpus(all_texts)

In [55]:
gpt_3_5(num_tokens), haiku(num_tokens)

Cost of input: 0.2811065; Cost of output: 0.8433195
Cost of input: 0.14055325; Cost of output: 0.70276625


(None, None)

Selecting just the first chars of a text to see if it's a solicitation notice, which is all that I want to condense for the time being

In [60]:
with SessionLocal() as db:
    stmt = text("""select id, text from resource_links 
                    where 
                    notice_id IN (select id from notices where \"postedDate\" = :prior_date) 
                    and 
                    text != 'unparsable' 
                    and
                    text is not null""") 
    results = db.execute(stmt, params={"prior_date": prior_date}).all()

In [61]:
result_heads= [(idx, result[:2000]) for idx, result in results]

In [62]:
result_heads[0]

(1010,
 'SOLICITATION, OFFER,\nAND AWARD\n\n1. SOLICITATION NO.\nW519TC24B2002\n\n(Construction, Alteration, or Repair)\n\n2. TYPE OF SOLICITATION\nX SEALED BID\n\n(IFB)\n\n3. DATE ISSUED\n\nPAGE OF PAGES\n\n30-Jan-2024\n\n1 OF\n\nNEGOTIATED (RFP)\n\n39\n\nIMPORTANT - The "offer" section on the reverse must be fully com pleted by offeror.\n5. REQUISITION/PURCHASE REQUEST NO.\n\n4. CONTRACT NO.\n\n7. ISSUED BY\n\nCODE\n\n8. ADDRESS OFFER TO\n\nW519TC\n\nARMY CONTRACTING COMMAND - ROCK ISLAND\n3055 RODMAN AVE\nROCK ISLAND IL 61299\n\nTEL:\n\n(If Other Than Item 7) CODE\n\nSee Item 7\n\nFAX:\n\n9. FOR INFORMATION\nCALL:\n\n6. PROJECT NO.\n\nTEL:\n\nFAX:\n\nA. NAME\n\nB. TELEPHONE NO.\n\nGREG BROWN\n\n309-782-4997\n\n(Include area code)\n\n(NO COLLECT CALLS)\n\nSOLICITATION\nNOTE: In sealed bid solicitations "offer" and "offeror" mean "bid" and "bidder".\n10. THE GOVERNMENT REQUIRES PERFORMANCE OF THE WORK DESCRIBED IN THESE DOCUMENTS\n\n(Title, identifying no., date):\n\nAttachment 001 - 

In [63]:
combined_heads = " ".join([result_head for _, result_head in result_heads])

In [64]:
tokens_heads = num_tokens_in_corpus(combined_heads)
tokens_heads

489562

In [65]:
gpt_3_5(tokens_heads), haiku(tokens_heads)

Cost of input: 0.244781; Cost of output: 0.734343
Cost of input: 0.1223905; Cost of output: 0.6119525


(None, None)

In [66]:
cleaned_heads = [(id, resource_text.encode('utf-8', 'ignore').decode('utf-8')) for id, resource_text in result_heads] 

In [67]:
sample_heads = cleaned_heads[:50]

In [68]:
sample_heads[19][1]

'Attachment 4: Page 2\n\n\n\nVendor Experience Sheet\n\nCONTRACT IDENTIFICATION\n\n\n\nContractor Name:\n\n\n\nContract Number:\n\nContractor role: \n\n[i.e., Prime, Subcontractor, or Joint Venture]\n\nCooperative entities:\n\n[i.e., Prime, Subcontractor, or Joint Venture]\n\n\n\nDates of Contract Performance:\n\nAddress, City, and State\n\nof contract performance:\n\nTotal Contract Value: \n\n(including options)\n\nContract Requirements: [Describe all services provided under the contract and any special/unique contract requirements. Briefly describe any problems encountered and how they were resolved]\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nCUSTOMER IDENTIFICATION (Please Include a Point of Contact for the Customer for this reference, if available)\n\n\n\nName\n\n\n\nTitle\n\n\n\nAgency/Company\n\n\n\nAddress\n\n\n\nTelephone\n\n\n\nE-mail\n\n\n\n\n\nVendor Experience Sheet\n\nCONTRACT IDENTIFICATION\n\n\n\nContractor Name\n\n\n\nContract Number\n\nC

In [69]:
import instructor
from openai import OpenAI
import logging
from pydantic import BaseModel, Field
from typing_extensions import Optional, List
from rich import print as rprint



logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')


In [70]:
%load_ext rich

In [71]:
class DocType(BaseModel):
    solicitation: bool = Field(..., description="Indicates if the document is a solicitation (RFP)")
    amendment: bool = Field(..., description="Indicates if the document is an amendment")
    other: Optional[str] = Field(None, description="Type of document if it's neither solicitation nor amendment, up to the discretion of the LLM")

In [72]:
client = instructor.patch(OpenAI())

2024-03-15 08:57:48,538 - DEBUG - load_ssl_context verify=True cert=None trust_env=True http2=False
2024-03-15 08:57:48,543 - DEBUG - load_verify_locations cafile='/home/peter-legion-wsl2/peter-projects/contract-queue/.venv/lib/python3.10/site-packages/certifi/cacert.pem'
2024-03-15 08:57:48,589 - DEBUG - Patching `client.chat.completions.create` with mode=<Mode.TOOLS: 'tool_call'>


In [88]:
def classify_document_type(document_head: str) -> DocType:
    return client.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        messages=[
            {
                "role": "user",
                "content": f"""Given the information at the beginning of a document, classify it as a solicitation 
                (also known as a Request for Proposal (RFP)), an amendment, or identify it as another category.           
                
                Solicitions often contain text like `Solicitation`, `Offer`, and/or `Award`. Amendments often contain the
                term `Amendment of Solicitation`
                
                Please apply a high standard of confidence before categorizing a document as either a solicitation or an amendment. 
                If you do not find compelling evidence within the beginning of the document, you may classify it under a different category. 
                Here is the beginning of the document for your analysis: {document_head}""",
            }
        ],
        response_model=DocType,
    )

In [89]:
small_batch = sample_heads[:10]

In [90]:
small_batch


[1m[[0m
    [1m([0m
        [1;36m1010[0m,
        [32m'SOLICITATION, OFFER,\nAND AWARD\n\n1. SOLICITATION NO.\nW519TC24B2002\n\n[0m[32m([0m[32mConstruction, Alteration, or Repair[0m[32m)[0m[32m\n\n2. TYPE OF SOLICITATION\nX SEALED BID\n\n[0m[32m([0m[32mIFB[0m[32m)[0m[32m\n\n3. DATE ISSUED\n\nPAGE OF PAGES\n\n30-Jan-2024\n\n1 OF\n\nNEGOTIATED [0m[32m([0m[32mRFP[0m[32m)[0m[32m\n\n39\n\nIMPORTANT - The "offer" section on the reverse must be fully com pleted by offeror.\n5. REQUISITION/PURCHASE REQUEST NO.\n\n4. CONTRACT NO.\n\n7. ISSUED BY\n\nCODE\n\n8. ADDRESS OFFER TO\n\nW519TC\n\nARMY CONTRACTING COMMAND - ROCK ISLAND\n3055 RODMAN AVE\nROCK ISLAND IL 61299\n\nTEL:\n\n[0m[32m([0m[32mIf Other Than Item 7[0m[32m)[0m[32m CODE\n\nSee Item 7\n\nFAX:\n\n9. FOR INFORMATION\nCALL:\n\n6. PROJECT NO.\n\nTEL:\n\nFAX:\n\nA. NAME\n\nB. TELEPHONE NO.\n\nGREG BROWN\n\n309-782-4997\n\n[0m[32m([0m[32mInclude area code[0m[32m)[0m[32m\n\n[0m[32m([0m[32m

In [91]:

sample_classifications = [(id, resource_text, classify_document_type(text)) for id, resource_text in small_batch]

2024-03-15 09:03:03,862 - DEBUG - Instructor Request: mode.value='tool_call', response_model=<class '__main__.DocType'>, new_kwargs={'model': 'gpt-3.5-turbo-0125', 'messages': [{'role': 'user', 'content': 'Given the information at the beginning of a document, classify it as a solicitation \n                (also known as a Request for Proposal (RFP)), an amendment, or identify it as another category.           \n                \n                Solicitions often contain text like `Solicitation`, `Offer`, and/or `Award`. Amendments often contain the\n                term `Amendment of Solicitation`\n                \n                Please apply a high standard of confidence before categorizing a document as either a solicitation or an amendment. \n                If you do not find compelling evidence within the beginning of the document, you may classify it under a different category. \n                Here is the beginning of the document for your analysis: <function text at 0x7f792

In [92]:
for id, resource_text, classification in sample_classifications:
    rprint(f"[bright_cyan]{resource_text}")
    rprint(f"[bright_magenta]Sol: {classification.solicitation}")
    rprint(f"[bright_yellow]Amend: {classification.amendment}")

In [93]:
sample_classifications[0]


[1m([0m
    [1;36m1010[0m,
    [32m'SOLICITATION, OFFER,\nAND AWARD\n\n1. SOLICITATION NO.\nW519TC24B2002\n\n[0m[32m([0m[32mConstruction, Alteration, or Repair[0m[32m)[0m[32m\n\n2. TYPE OF SOLICITATION\nX SEALED BID\n\n[0m[32m([0m[32mIFB[0m[32m)[0m[32m\n\n3. DATE ISSUED\n\nPAGE OF PAGES\n\n30-Jan-2024\n\n1 OF\n\nNEGOTIATED [0m[32m([0m[32mRFP[0m[32m)[0m[32m\n\n39\n\nIMPORTANT - The "offer" section on the reverse must be fully com pleted by offeror.\n5. REQUISITION/PURCHASE REQUEST NO.\n\n4. CONTRACT NO.\n\n7. ISSUED BY\n\nCODE\n\n8. ADDRESS OFFER TO\n\nW519TC\n\nARMY CONTRACTING COMMAND - ROCK ISLAND\n3055 RODMAN AVE\nROCK ISLAND IL 61299\n\nTEL:\n\n[0m[32m([0m[32mIf Other Than Item 7[0m[32m)[0m[32m CODE\n\nSee Item 7\n\nFAX:\n\n9. FOR INFORMATION\nCALL:\n\n6. PROJECT NO.\n\nTEL:\n\nFAX:\n\nA. NAME\n\nB. TELEPHONE NO.\n\nGREG BROWN\n\n309-782-4997\n\n[0m[32m([0m[32mInclude area code[0m[32m)[0m[32m\n\n[0m[32m([0m[32mNO COLLECT CALLS[0m[

In [94]:
sample_solicitations = [(id, resource_text, doc) for id, resource_text, doc in sample_classifications if doc.solicitation]

Calc the percentage of how many docs from the original batch are solicitations. This number seems to jump around between `20` and `50` percent.

In [95]:
len(sample_solicitations) / len(sample_classifications)

[1;36m0.6[0m

In [96]:
sample_solicitations[0]


[1m([0m
    [1;36m2[0m,
    [32m'PRE-BID SITE VISIT ROSTER:\nW912HV23R0004 LXEZ1076699 Theater Aircraft Corrosion Control Prep Hangar\nKadena Air Base, Okinawa, Japan\n\nPerson Name:\nLAST First\n\n�\n\nTemp Base\nVehicle Info\n\' Access Reg? I\ni\nNo\n�\n- --,Okinawa 480 ch\'I 74-57\nNo\n\nCompany Name:\n!A to Z .J,\nAmerican Engineering\nrel\\ v�\nHl��._,shi\n-n_\no _,_r_,o\nC o_c�p_,\n--- - ---+_\n___\n___\nAmerican Engineering\nTAMA�mi\ni;:_or��t�!i�_n\nAmerican Engineering\nYOR/MfTISU, Yoshie\nCorporation\nTARLITO , Ai\n�\n\nNo\n\nNo\n\n0DA Corporati on �\n_\n_\n�\n__\n1\nTODA Corporation LJ�\n\nij WADA, Yuujl\n\nSONODA, H iraki\n\nNo\n\nNo\n\nTODA Corporatio\'V[0m[32m([0m[32m.1 \'!-!"\'!\' �\\\n-- 1\nKAMfNAGA, Michiyoshi- - TO�A Corporation r-1,I,\n\n�t\n\nKAWAMURA, Renaldo\n\nNihon Urban Devel. //ff/\nProject C orporation � ;\n\nSAKIYAMA, Souichiro\n\nHEXEL Works\n\nTSUTSUJ, Hirokazu\n\nHEXEL Works\n\nKUDA, Kenji\n\nHEXEL Works\n\n8 OKADA, Koichiro\nJohnson Controls\n-

In [97]:
tuple(sample_solicitations)


[1m([0m
    [1m([0m
        [1;36m2[0m,
        [32m'PRE-BID SITE VISIT ROSTER:\nW912HV23R0004 LXEZ1076699 Theater Aircraft Corrosion Control Prep Hangar\nKadena Air Base, Okinawa, Japan\n\nPerson Name:\nLAST First\n\n�\n\nTemp Base\nVehicle Info\n\' Access Reg? I\ni\nNo\n�\n- --,Okinawa 480 ch\'I 74-57\nNo\n\nCompany Name:\n!A to Z .J,\nAmerican Engineering\nrel\\ v�\nHl��._,shi\n-n_\no _,_r_,o\nC o_c�p_,\n--- - ---+_\n___\n___\nAmerican Engineering\nTAMA�mi\ni;:_or��t�!i�_n\nAmerican Engineering\nYOR/MfTISU, Yoshie\nCorporation\nTARLITO , Ai\n�\n\nNo\n\nNo\n\n0DA Corporati on �\n_\n_\n�\n__\n1\nTODA Corporation LJ�\n\nij WADA, Yuujl\n\nSONODA, H iraki\n\nNo\n\nNo\n\nTODA Corporatio\'V[0m[32m([0m[32m.1 \'!-!"\'!\' �\\\n-- 1\nKAMfNAGA, Michiyoshi- - TO�A Corporation r-1,I,\n\n�t\n\nKAWAMURA, Renaldo\n\nNihon Urban Devel. //ff/\nProject C orporation � ;\n\nSAKIYAMA, Souichiro\n\nHEXEL Works\n\nTSUTSUJ, Hirokazu\n\nHEXEL Works\n\nKUDA, Kenji\n\nHEXEL Works\n\n8 OKADA, Koichir

In [98]:
tuple([id for id, _, _ in sample_solicitations])

[1m([0m[1;36m2[0m, [1;36m3[0m, [1;36m7[0m, [1;36m29[0m, [1;36m26[0m, [1;36m25[0m[1m)[0m

In [99]:
help(text)

Help on function text in module sqlalchemy.sql.expression:

text(text, bind=None)
    Construct a new :class:`_expression.TextClause` clause,
    representing
    a textual SQL string directly.
    
    E.g.::
    
        from sqlalchemy import text
    
        t = text("SELECT * FROM users")
        result = connection.execute(t)
    
    The advantages :func:`_expression.text`
    provides over a plain string are
    backend-neutral support for bind parameters, per-statement
    execution options, as well as
    bind parameter and result-column typing behavior, allowing
    SQLAlchemy type constructs to play a role when executing
    a statement that is specified literally.  The construct can also
    be provided with a ``.c`` collection of column elements, allowing
    it to be embedded in other SQL expression constructs as a subquery.
    
    Bind parameters are specified by name, using the format ``:name``.
    E.g.::
    
        t = text("SELECT * FROM users WHERE id=:user_id

In [100]:
with SessionLocal() as db:
    stmt = text("""select * from resource_links 
                    where
                    id in :ids""") 
    results = db.execute(stmt, {"ids": tuple([id for id, _, _ in sample_solicitations])}).all()

In [101]:
len(results)

[1;36m6[0m

In [102]:
print(results[4][2])

SOLICITATION/CONTRACT/ORDER FOR COMMERCIAL ITEMS
OFFEROR TO COMPLETE BLOCKS 12, 17, 23, 24, & 30
3. AWARD/EFFECTIVE
DATE

2. CONTRACT NO.

Page 1 of 168

1. REQUISITION NUMBER
See Schedule

4. ORDER NUMBER

6. SOLICITATION ISSUE
DATE

5. SOLICITATION NUMBER
SPE602-24-R-0701
b. TELEPHONE NUMBER (No Collect
calls)

a. NAME

7. FOR SOLICITATION
INFORMATION CALL:

James Forde FPH1484

9. ISSUED BY

CODE

SPE602

10. THIS ACQUISITION IS

HUBZONE SMALL
BUSINESS

DLA ENERGY
BULK PETROLEUM PRODUCT
8725 JOHN J. KINGMAN ROAD
FORT BELVOIR VA 22060
USA

SERVICE-DISABLED
VETERAN-OWNED
SMALL BUSINESS

UNRESTRICTED OR

NAICS: 324110

8 (A)

31

% FOR:

SIZE STANDARD:
13b. RATING
14. METHOD OF SOLICITATION
RFQ

CODE

SET ASIDE:

EDWOSB

13a. THIS CONTRACT IS A
RATED ORDER UNDER
DPAS (15 CFR 700)

SEE SCHEDULE

03:00 PM

WOMEN-OWNED SMALL BUSINESS
(WOSB) ELIGIBLE UNDER THE WOMEN-OWNED
SMALL BUSINESS PROGRAM

12. DISCOUNT TERMS

15. DELIVER TO

8. OFFER DUE DATE/
LOCAL TIME
2024 MAR 28

Phone: 571-767-8