In [1]:
import sys
backend_path = '../backend'
if backend_path not in sys.path:
        sys.path.append(backend_path)

In [2]:
from sqlalchemy import create_engine, select, values, update, and_, exists, text
from sqlalchemy.orm import sessionmaker
from dotenv import load_dotenv
from app.models.models import Notice, ResourceLink
from app.models.schema import NoticeBase, ResourceLinkBase
from dotenv import load_dotenv
import os
import requests
import json
import re
import pandas as pd
import pendulum

import tiktoken


In [3]:

load_dotenv()

POSTGRES_PASSWORD = os.environ.get("POSTGRES_PASSWORD")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY")
DATABASE_URL = "postgresql+psycopg2://airflow:airflow@localhost:5432/airflow"

In [4]:
engine = create_engine(DATABASE_URL)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
selected_date = pendulum.now("utc").subtract(days=1).strftime("%Y%m%d")

Get all parsed 

In [5]:
with SessionLocal() as db:
    stmt = text("""select text from resource_links 
                    where 
                    notice_id IN (select id from notices where \"postedDate\" = '2024-03-13') 
                    and 
                    text != 'unparsable' 
                    and
                    text is not null""") 
    results = db.execute(stmt).scalars().all()

In [6]:
len(results)

1316

In [7]:
all_texts = " ".join(results)
len(all_texts)

62793451

In [8]:
def num_tokens_in_corpus(input:str, encoding_name: str = "gpt-3.5-turbo") -> int:
    encoding = tiktoken.encoding_for_model(encoding_name)
    num_tokens = len(encoding.encode(input))
    return num_tokens

In [9]:
def est_costs(price_input_mil: float = 10.0, price_output_mil: float = 30.0, len_input: int = 0) -> float:
    price_per_token_input = price_input_mil / 1000000
    price_per_token_output = price_output_mil / 1000000
    print(f"Cost of input: {len_input * price_per_token_input}; Cost of output: {len_input * price_per_token_output}")

In [10]:
num_tokens = num_tokens_in_corpus(all_texts)

| Model                 | Input Cost | Output Cost |
| --------------------- | ---------- | ----------- |
| gpt-4                 | $30.00     | $60.00      |
| gpt-4-32k             | $60.00     | $120.00     |
| gpt-3.5-turbo-0125    | $0.50      | $1.50       |
| gpt-3.5-turbo-instruct| $1.50      | $2.00       |
| haiku          | $0.25      | $1.25       |
| sonnet | $3.00      | $15.00      |
| opus | $5.00      | $75.00      |

In [11]:
def gpt_4(num_tokens: int) -> str:
    return est_costs(30, 60, num_tokens)

In [12]:
def gpt_3_5(num_tokens: int) -> str:
    return est_costs(.50, 1.5, num_tokens)

In [13]:
def haiku(num_tokens: int) -> str:
    return est_costs(.25, 1.25, num_tokens)

In [14]:
def sonnet(num_tokens: int) -> str:
    return est_costs(.25, 1.25, num_tokens)

In [15]:
gpt_4(num_tokens), gpt_3_5(num_tokens), haiku(num_tokens), sonnet(num_tokens)

Cost of input: 500.45961; Cost of output: 1000.91922
Cost of input: 8.3409935; Cost of output: 25.0229805
Cost of input: 4.17049675; Cost of output: 20.85248375
Cost of input: 4.17049675; Cost of output: 20.85248375


(None, None, None, None)

In [16]:
with SessionLocal() as db:
    stmt = text("""select length(text) as len from resource_links 
                    where 
                    notice_id IN (select id from notices where \"postedDate\" = '2024-03-13') 
                    and 
                    text != 'unparsable' 
                    and
                    text is not null
                    order by len desc
                """) 
    results = db.execute(stmt).scalars().all()

In [17]:
results

[1392976,
 933797,
 813461,
 741860,
 675083,
 640378,
 599168,
 590270,
 586846,
 539654,
 462737,
 457629,
 450371,
 434195,
 420034,
 414778,
 411548,
 408723,
 398130,
 349388,
 324772,
 310945,
 297856,
 295867,
 291969,
 290777,
 290777,
 290643,
 287820,
 281696,
 280545,
 276597,
 273721,
 271729,
 271651,
 271248,
 271248,
 269741,
 265781,
 255209,
 249787,
 244304,
 243934,
 243397,
 238970,
 237282,
 236011,
 230908,
 229524,
 227566,
 226435,
 226212,
 225167,
 224751,
 223168,
 221823,
 221705,
 221400,
 221400,
 218181,
 218144,
 218136,
 217381,
 216345,
 215725,
 215344,
 214686,
 213643,
 210791,
 210048,
 209901,
 207699,
 201950,
 201925,
 201628,
 201168,
 200987,
 200706,
 197331,
 196415,
 194962,
 191154,
 190006,
 189910,
 189162,
 188650,
 188424,
 187896,
 184154,
 184067,
 183604,
 183444,
 181277,
 180200,
 179907,
 178584,
 178582,
 177593,
 177124,
 176236,
 175765,
 172231,
 169465,
 169279,
 168819,
 165046,
 163284,
 163063,
 162933,
 162643,
 162393,


ID `236220` is Commercial and Institutional Building construction, the category that usually has the most frequent additions

In [18]:
with SessionLocal() as db:
    stmt = text("""select text from resource_links 
                    where notice_id in 
                        (select id from notices
                            where
                            naics_code_id = 
                                (select id from naics_codes where \"naicsCode\" = 236220)
                                and
                                \"postedDate\" = '2024-03-13')
                """) 
    results = db.execute(stmt).scalars().all()

In [19]:
len(results)

321

In [20]:
all_texts = " ".join(results)
len(all_texts)

9218229

In [21]:

num_tokens = num_tokens_in_corpus(all_texts)

In [22]:
gpt_3_5(num_tokens), haiku(num_tokens)

Cost of input: 1.0881305; Cost of output: 3.2643915
Cost of input: 0.54406525; Cost of output: 2.7203262500000003


(None, None)

In [23]:
with SessionLocal() as db:
    stmt = text("""select text from (select text, length(text) as len from resource_links 
                    where notice_id in 
                        (select id from notices
                            where
                            naics_code_id = 
                                (select id from naics_codes where \"naicsCode\" = 236220)
                                and
                                \"postedDate\" = '2024-03-13'))
                    where len < 150000
                    order by len desc 
                """) 
    results = db.execute(stmt).scalars().all()

In [24]:
all_texts = " ".join(results)
len(all_texts)

3447175

In [25]:
num_tokens = num_tokens_in_corpus(all_texts)

In [26]:
gpt_3_5(num_tokens), haiku(num_tokens)

Cost of input: 0.4292125; Cost of output: 1.2876375
Cost of input: 0.21460625; Cost of output: 1.07303125


(None, None)

Selecting just the first 5000 chars of a text to see if it's a solicitation notice, which is all that I want to condense for the time being

In [27]:
with SessionLocal() as db:
    stmt = text("""select text from resource_links 
                    where 
                    notice_id IN (select id from notices where \"postedDate\" = '2024-03-13') 
                    and 
                    text != 'unparsable' 
                    and
                    text is not null""") 
    results = db.execute(stmt).scalars().all()

In [28]:
result_heads = [result[:2000] for result in results]

In [38]:
result_heads[0]

'Please wait...\nIf this message is not eventually replaced by the proper contents of the document, your PDF\nviewer may not be able to display this type of document.\nYou can upgrade to the latest version of Adobe Reader for Windows®, Mac, or Linux® by\nvisiting http://www.adobe.com/go/reader_download.\nFor more assistance with Adobe Reader visit http://www.adobe.com/go/acrreader.\nWindows is either a registered trademark or a trademark of Microsoft Corporation in the United States and/or other countries. Mac is a trademark\nof Apple Inc., registered in the United States and other countries. Linux is the registered trademark of Linus Torvalds in the U.S. and other\ncountries.'

In [30]:
combined_heads = " ".join(result_heads)

In [31]:
tokens_heads = num_tokens_in_corpus(combined_heads)
tokens_heads

702405

In [32]:
gpt_3_5(tokens_heads), haiku(tokens_heads)

Cost of input: 0.3512025; Cost of output: 1.0536075
Cost of input: 0.17560125; Cost of output: 0.8780062500000001


(None, None)

In [49]:
sample_heads = result_heads[:50]

In [51]:
sample_heads[0]

'Please wait...\nIf this message is not eventually replaced by the proper contents of the document, your PDF\nviewer may not be able to display this type of document.\nYou can upgrade to the latest version of Adobe Reader for Windows®, Mac, or Linux® by\nvisiting http://www.adobe.com/go/reader_download.\nFor more assistance with Adobe Reader visit http://www.adobe.com/go/acrreader.\nWindows is either a registered trademark or a trademark of Microsoft Corporation in the United States and/or other countries. Mac is a trademark\nof Apple Inc., registered in the United States and other countries. Linux is the registered trademark of Linus Torvalds in the U.S. and other\ncountries.'

In [45]:
import instructor
from openai import OpenAI
import logging
from pydantic import BaseModel, Field
from typing_extensions import Optional, List


logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')


In [46]:
class DocType(BaseModel):
    solicitation: bool = Field(..., description="Indicates if the document is a solicitation (RFP)")
    amendment: bool = Field(..., description="Indicates if the document is an amendment")
    other: Optional[str] = Field(None, description="Type of document if it's neither solicitation nor amendment, up to the discretion of the LLM")

In [47]:
client = instructor.patch(OpenAI())

2024-03-14 11:34:09,466 - DEBUG - load_ssl_context verify=True cert=None trust_env=True http2=False
2024-03-14 11:34:09,469 - DEBUG - load_verify_locations cafile='/home/peter-legion-wsl2/peter-projects/contract-queue/.venv/lib/python3.10/site-packages/certifi/cacert.pem'
2024-03-14 11:34:09,497 - DEBUG - Patching `client.chat.completions.create` with mode=<Mode.TOOLS: 'tool_call'>


In [48]:
def classify_document_type(document_head: str) -> DocType:
    return client.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        messages=[
            {
                "role": "user",
                "content": f"Classify the following document head as solicitation (RFP), amendment, or specify another type: {document_head}. These are just the heads of the documents so you must extrapolate what might be beyond for your classifcation based on this context."
            }
        ],
        response_model=DocType
    )

In [52]:

sample_classifications = [classify_document_type(sample_head) for sample_head in sample_heads] 

2024-03-14 11:36:41,337 - DEBUG - Instructor Request: mode.value='tool_call', response_model=<class '__main__.DocType'>, new_kwargs={'model': 'gpt-3.5-turbo-0125', 'messages': [{'role': 'user', 'content': 'Classify the following document head as solicitation (RFP), amendment, or specify another type: Please wait...\nIf this message is not eventually replaced by the proper contents of the document, your PDF\nviewer may not be able to display this type of document.\nYou can upgrade to the latest version of Adobe Reader for Windows®, Mac, or Linux® by\nvisiting http://www.adobe.com/go/reader_download.\nFor more assistance with Adobe Reader visit http://www.adobe.com/go/acrreader.\nWindows is either a registered trademark or a trademark of Microsoft Corporation in the United States and/or other countries. Mac is a trademark\nof Apple Inc., registered in the United States and other countries. Linux is the registered trademark of Linus Torvalds in the U.S. and other\ncountries.. These are ju

In [54]:
for i in sample_classifications:
    print(i)

solicitation=False amendment=False other='General Document'
solicitation=False amendment=False other='Contractor Certification'
solicitation=False amendment=False other='Certificate of Authorization'
solicitation=True amendment=False other=None
solicitation=False amendment=True other=None
solicitation=False amendment=False other='Contract Data Requirement List (CDRL)'
solicitation=False amendment=False other='General Information'
solicitation=True amendment=True other=None
solicitation=False amendment=False other='Master Specifications'
solicitation=True amendment=False other=None
solicitation=False amendment=False other='Head of Document'
solicitation=False amendment=False other='Management and Administration document'
solicitation=False amendment=False other='Project Description'
solicitation=True amendment=True other=None
solicitation=True amendment=False other=None
solicitation=True amendment=False other=None
solicitation=False amendment=False other='Memorandum for Site Visit'
soli