In [1]:
import sys
backend_path = '../backend'
if backend_path not in sys.path:
        sys.path.append(backend_path)

In [2]:
from sqlalchemy import create_engine, select, values, update, and_, exists, text
from sqlalchemy.orm import sessionmaker
from dotenv import load_dotenv
from app.models.models import Notice, ResourceLink
from app.models.schema import NoticeBase, ResourceLinkBase
from dotenv import load_dotenv
import os
import requests
import json
import re
import pandas as pd
import pendulum

import tiktoken


In [3]:

load_dotenv()

POSTGRES_PASSWORD = os.environ.get("POSTGRES_PASSWORD")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY")
DATABASE_URL = "postgresql+psycopg2://airflow:airflow@localhost:5432/airflow"

In [4]:
engine = create_engine(DATABASE_URL)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
selected_date = pendulum.now("utc").subtract(days=1).strftime("%Y%m%d")

Get all parsed 

In [5]:
with SessionLocal() as db:
    stmt = text("""select text from resource_links 
                    where 
                    notice_id IN (select id from notices where \"postedDate\" = '2024-03-13') 
                    and 
                    text != 'unparsable' 
                    and
                    text is not null""") 
    results = db.execute(stmt).scalars().all()

In [6]:
len(results)

1316

In [7]:
all_texts = " ".join(results)
len(all_texts)

62793451

In [8]:
def num_tokens_in_corpus(input:str, encoding_name: str = "gpt-3.5-turbo") -> int:
    encoding = tiktoken.encoding_for_model(encoding_name)
    num_tokens = len(encoding.encode(input))
    return num_tokens

In [9]:
def est_costs(price_input_mil: float = 10.0, price_output_mil: float = 30.0, len_input: int = 0) -> float:
    price_per_token_input = price_input_mil / 1000000
    price_per_token_output = price_output_mil / 1000000
    print(f"Cost of input: {len_input * price_per_token_input}; Cost of output: {len_input * price_per_token_output}")

In [10]:
num_tokens = num_tokens_in_corpus(all_texts)

| Model                 | Input Cost | Output Cost |
| --------------------- | ---------- | ----------- |
| gpt-4                 | $30.00     | $60.00      |
| gpt-4-32k             | $60.00     | $120.00     |
| gpt-3.5-turbo-0125    | $0.50      | $1.50       |
| gpt-3.5-turbo-instruct| $1.50      | $2.00       |
| haiku          | $0.25      | $1.25       |
| sonnet | $3.00      | $15.00      |
| opus | $5.00      | $75.00      |

In [11]:
def gpt_4(num_tokens: int) -> str:
    return est_costs(30, 60, num_tokens)

In [12]:
def gpt_3_5(num_tokens: int) -> str:
    return est_costs(.50, 1.5, num_tokens)

In [13]:
def haiku(num_tokens: int) -> str:
    return est_costs(.25, 1.25, num_tokens)

In [14]:
def sonnet(num_tokens: int) -> str:
    return est_costs(.25, 1.25, num_tokens)

In [15]:
gpt_4(num_tokens), gpt_3_5(num_tokens), haiku(num_tokens), sonnet(num_tokens)

Cost of input: 500.45961; Cost of output: 1000.91922
Cost of input: 8.3409935; Cost of output: 25.0229805
Cost of input: 4.17049675; Cost of output: 20.85248375
Cost of input: 4.17049675; Cost of output: 20.85248375


(None, None, None, None)

In [None]:
with SessionLocal() as db:
    stmt = text("""select length(text) as len from resource_links 
                    where 
                    notice_id IN (select id from notices where \"postedDate\" = '2024-03-13') 
                    and 
                    text != 'unparsable' 
                    and
                    text is not null
                    order by len desc
                """) 
    results = db.execute(stmt).scalars().all()

In [None]:
results

ID `236220` is Commercial and Institutional Building construction, the category that usually has the most frequent additions

In [34]:
with SessionLocal() as db:
    stmt = text("""select text from resource_links 
                    where notice_id in 
                        (select id from notices
                            where
                            naics_code_id = 
                                (select id from naics_codes where \"naicsCode\" = 236220)
                                and
                                \"postedDate\" = '2024-03-13')
                """) 
    results = db.execute(stmt).scalars().all()

In [36]:
len(results)

321

In [37]:
all_texts = " ".join(results)
len(all_texts)

9218229

In [38]:

num_tokens = num_tokens_in_corpus(all_texts)

In [41]:
gpt_3_5(num_tokens), haiku(num_tokens)

Cost of input: 1.0881305; Cost of output: 3.2643915
Cost of input: 0.54406525; Cost of output: 2.7203262500000003


(None, None)

In [54]:
with SessionLocal() as db:
    stmt = text("""select text from (select text, length(text) as len from resource_links 
                    where notice_id in 
                        (select id from notices
                            where
                            naics_code_id = 
                                (select id from naics_codes where \"naicsCode\" = 236220)
                                and
                                \"postedDate\" = '2024-03-13'))
                    where len < 150000
                    order by len desc 
                """) 
    results = db.execute(stmt).scalars().all()

In [56]:
all_texts = " ".join(results)
len(all_texts)

2447557

In [57]:
num_tokens = num_tokens_in_corpus(all_texts)

In [58]:
gpt_3_5(num_tokens), haiku(num_tokens)

Cost of input: 0.32060649999999996; Cost of output: 0.9618195
Cost of input: 0.16030324999999998; Cost of output: 0.80151625


(None, None)