In [89]:
import sys
backend_path = '../backend'
if backend_path not in sys.path:
        sys.path.append(backend_path)

In [110]:
from sqlalchemy import create_engine, select, values, update, and_
from sqlalchemy.orm import sessionmaker
from dotenv import load_dotenv
from app.models.models import Notice, ResourceLink
from app.models.schema import NoticeBase, ResourceLinkBase
import pendulum
import tempfile
import requests
import os
from tqdm import tqdm
import tiktoken
import json
import datetime
from openai import OpenAI

In [91]:
load_dotenv()
DATABASE_URL = "postgresql+psycopg2://airflow:airflow@localhost:5432/airflow"

In [111]:

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [92]:
engine = create_engine(DATABASE_URL)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
selected_date = pendulum.now("utc").subtract(days=1).strftime("%Y%m%d")

In [93]:
with SessionLocal() as db:
    subquery = (
        select(ResourceLink.notice_id).
        where(and_(ResourceLink.notice_id == Notice.id, ResourceLink.text.isnot(None)))
    )
    stmt = (
        select(Notice).where(
            and_(Notice.postedDate == selected_date,
            (exists(subquery))
                 )
        )
    )
    results = db.execute(stmt).scalars().all()
    results_dict = [NoticeBase.model_validate(result).dict() for result in results]

In [94]:
from sqlalchemy import and_, exists

with SessionLocal() as session:
    subquery = (
        select(ResourceLink.notice_id).
        where(and_(ResourceLink.notice_id == Notice.id, ResourceLink.text.isnot(None)))
    )
    stmt = (
        select(Notice).
        where(exists(subquery))
    )
    results = session.execute(stmt).scalars().all()
    result_dict = [NoticeBase.model_validate(result).dict() for result in results]

In [95]:
def num_tokens_in_corpus(input:str, encoding_name: str = "gpt-3.5-turbo") -> int:
    encoding = tiktoken.encoding_for_model(encoding_name)
    num_tokens = len(encoding.encode(input))
    return num_tokens

In [97]:
for item in result_dict:
    print(f"{item['title']}")
    for resource_link in item['resource_links']:
        print(f">>>{num_tokens_in_corpus(resource_link['text'])}")

Field spectroradiometer
>>>885
>>>23418
Accelerated READER SUBSCRIPTION
>>>17500
23--AO 23925 NP UTV FOR TONOPAH FIELD OFFICE
>>>572
>>>1151
>>>572
>>>31885
Amendment 001 for Energy Savings Performance Contract (ESPC) - Minneapolis-St. Paul Air Reserve Station, MN
>>>4035
>>>1730
>>>1595
>>>1500
>>>2499
Solicitation_M2A2 Aiming Circle
>>>6125
>>>70598
>>>11432
>>>1129
>>>2143
>>>9658
>>>727
306 RQS - Phantom DZ Kit
>>>1162
>>>29461
>>>342
>>>177
WBN DRUM EAST STAND EXAM
>>>949
>>>8981
>>>2307
>>>2179
>>>5349
>>>2791
>>>21281
>>>3278
>>>291
Construction of Health Facilities (HFs) in Nampula and Cabo Delgado provinces.
>>>159120
>>>67269
>>>37348
>>>219170
>>>252072
>>>51406
>>>178800
>>>220809
>>>24134
>>>55354
>>>171006
>>>68222
>>>213
Evaluation, Calibration, and Repair of Air Data pressure test Sets (ADTS-3350ER)
>>>1854
>>>32607
>>>3
>>>190
Tools for Shipboard Repairs
>>>3
>>>658
EXCITER,IGNITION
>>>48751
Student Transportation Services - Kaiserslautern Military Community
>>>1658
>>

TypeError: expected string or buffer

In [105]:
(23000 / 1000000) * .50

0.0115

In [98]:
type(result_dict[0])

dict

In [99]:
result_dict[0]['resource_links'][0]

{'id': 36,
 'url': 'https://sam.gov/api/prod/opps/v3/opportunities/resources/files/aac549c2ce194e6b90e18e088f87cb5f/download?api_key=null&token=',
 'text': 'REQUEST FOR QUOTATION\n(THIS IS NOT AN ORDER)\n\n1. REQUEST NO.\n\n140G0124Q0100\n\n5a. ISSUED BY\n\nUSGS NATIONAL ACQUISITION BRANCH\n205 NATIONAL CENTER\n12201 SUNRISE VALLEY DRIVE\nRESTON VA 20192\n\nTHIS RFQ\n\nX\n\nIS\n\n2. DATE ISSUED\n\n03/11/2024\n\nIS NOT A SMALL BUSINESS SET ASIDE\n\n3. REQUISITION/PURCHASE REQUEST NO.\n\nPAGE      OF\n\nPAGES\n\n1\n\n 29 \n\nRATING\n\nNAME\n\nBrian Baker\n\na. NAME\n\nc. STREET ADDRESS\n\n5b. FOR INFORMATION CALL: (No collect calls)\n\nTELEPHONE NUMBER\n\nAREA CODE\n\n000\n\nNUMBER\n\n000-0000\n\n8. TO:\n\nb. COMPANY\n\n4. CERT. FOR NAT. DEF.\nUNDER BDSA REG. 2\nAND/OR DMS REG.1\n\n6. DELIVERY BY (Date)\n\n60 Days After Award\n\n7. DELIVERY\n\nX\n\nFOB DESTINATION\n\na. NAME OF CONSIGNEE\n\nUSGS LRS\n\nb. STREET ADDRESS\n\n9. DESTINATION\n\nOTHER\n\n(See Schedule)\n\n12201 Sunrise Valley

In [100]:
class DateTimeEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, datetime.datetime):
            return obj.isoformat()
        return super().default(obj)

In [101]:
# with open('./data/example_resource.json', 'w') as f:
#     json.dump(result_dict[0], f, cls=DateTimeEncoder, indent=4)

In [106]:
with open('./data/example_resource.json', 'r') as f:
    notice = json.load(f)

In [112]:
client = OpenAI()

In [114]:
 res = client.chat.completions.create(model="gpt-3.5-turbo-0125", messages=[
    {
        "role": "system",
        "content": "You are a highly skilled AI trained to analyze text and summarize very succinctly.",
    },
    {
        "role": "user",
        "content": "The dog jumped over the big log that was laying in the forest. Yes the dog jumped. It jumped over the log. The log that was big. Big the log was. And the dog did jump over it. This all happened in the forest.",
    },
    # {
    #     "role": "assistant",
    #     "content": "My summary of the text is:"
    # }
    ])

In [115]:
res

ChatCompletion(id='chatcmpl-91tIRCn16mUlrDcAcb69YFEiLjAmr', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='The dog jumped over a big log in the forest.', role='assistant', function_call=None, tool_calls=None))], created=1710237915, model='gpt-3.5-turbo-0125', object='chat.completion', system_fingerprint='fp_4f0b692a78', usage=CompletionUsage(completion_tokens=11, prompt_tokens=78, total_tokens=89))

In [None]:
def text_summarization(text: str):
    return client.chat