In [1]:
import sys

# Put the backend directory (resolved relative to the working directory) on the
# import path exactly once; presumably this is what lets the `app.*` imports in
# the following cell resolve.
backend_path = '../backend'
if sys.path.count(backend_path) == 0:
    sys.path.append(backend_path)

# Refining Prompt for Summaries

In [2]:
from sqlalchemy import create_engine, select, values, update, and_, exists, text, distinct
from sqlalchemy.orm import sessionmaker, join
from dotenv import load_dotenv
from app.models.models import Notice, ResourceLink
from app.models.schema import NoticeBase, ResourceLinkBase, ResourceLinkSimple
from dotenv import load_dotenv
import os
import requests
import json
import re
import pandas as pd
from pyspark.sql import SparkSession
import pendulum
from pyspark.sql.functions import col
from pydantic import BaseModel, ConfigDict
from typing_extensions import Optional, List
import tiktoken

import anthropic

In [3]:
# Env
# Load a local .env file (if one exists) into os.environ first, so the key
# lookups below also work when the variables were not exported in the shell
# that launched Jupyter. Variables that are already set are left untouched.
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY")
# NOTE(review): connection string with inline credentials pointing at localhost;
# looks like a local development database — do not reuse for shared environments.
DATABASE_URL = "postgresql+psycopg2://airflow:airflow@localhost:5432/airflow"
# Database
engine = create_engine(DATABASE_URL)
# Session factory used by the query helpers in this notebook.
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
# Date Offset
# Yesterday in UTC, formatted YYYYMMDD.
selected_date = pendulum.now("utc").subtract(days=1).strftime("%Y%m%d")
# Two days ago, formatted YYYY-MM-DD.
# NOTE(review): no timezone is passed here, unlike selected_date above, so the
# two values can be computed in different timezones — confirm this is intended.
prior_date = pendulum.now().subtract(days=2).strftime('%Y-%m-%d')

## Cost Calculation Functions

In [4]:
def num_tokens_in_corpus(input:str, encoding_name: str = "gpt-3.5-turbo") -> int:
    """Count the tokens that tiktoken produces for ``input``.

    Args:
        input: Text to tokenize.
        encoding_name: A model name known to tiktoken (such as the default
            ``"gpt-3.5-turbo"``) or, as the parameter name suggests, the name
            of an encoding itself (for example ``"cl100k_base"``).

    Returns:
        The number of tokens in ``input``.

    Raises:
        ValueError: If ``encoding_name`` is neither a known model nor a known
            encoding (raised by ``tiktoken.get_encoding``).

    Note:
        tiktoken implements OpenAI tokenizers, so the count is only an
        approximation when it is used to price models from other providers.
    """
    try:
        encoding = tiktoken.encoding_for_model(encoding_name)
    except KeyError:
        # Not a model name: interpret the value as an encoding name instead.
        encoding = tiktoken.get_encoding(encoding_name)
    return len(encoding.encode(input))

def est_costs(price_input_mil: float = 10.0, price_output_mil: float = 30.0, len_input: int = 0, len_output: Optional[int] = None) -> float:
    """Print and return an estimated cost for a number of tokens.

    Args:
        price_input_mil: Price per one million input tokens.
        price_output_mil: Price per one million output tokens.
        len_input: Number of input tokens.
        len_output: Number of output tokens. When omitted, the output cost is
            priced as if the model produced as many tokens as it consumed
            (``len_input``), which keeps the printed figures identical to
            calls that do not pass this argument.

    Returns:
        The sum of the estimated input and output cost, in the same currency
        unit as the prices.
    """
    if len_output is None:
        len_output = len_input
    price_per_token_input = price_input_mil / 1000000
    price_per_token_output = price_output_mil / 1000000
    cost_input = len_input * price_per_token_input
    cost_output = len_output * price_per_token_output
    print(f"Cost of input: {cost_input}; Cost of output: {cost_output}")
    return cost_input + cost_output

def haiku_cost(num_tokens: int) -> float:
    """Estimate the cost of ``num_tokens`` at 0.25 (input) / 1.25 (output) per million tokens.

    Prints the per-direction figures and returns their sum; the output side is
    priced for ``num_tokens`` tokens as well.
    """
    return est_costs(price_input_mil=.25, price_output_mil=1.25, len_input=num_tokens)

## Get Sample Batch

### Get Procurement Notices and Resource Links 


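Get a small sample of unique `notice_ids` and their related `resource_links`. The query below currently samples a single notice and keeps only the resource links whose files are under 5,000 tokens.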

In [5]:
class ResourceLinkTemp(BaseModel):
    """Flat record pairing a notice id with one of its resource links.

    Instances are built in this notebook from the column tuples returned by
    the Notice/ResourceLink join query and are later turned into DataFrame rows.
    """
    # Lets the model be validated from attribute-bearing objects as well as dicts.
    model_config = ConfigDict(from_attributes=True)
    
    # All fields accept None, but no defaults are declared, so under pydantic v2
    # every key still has to be supplied when validating.
    notice_id: Optional[str]  # populated from Notice.id
    resource_link_id: Optional[int]  # populated from ResourceLink.id
    text: Optional[str]  # populated from ResourceLink.text
    file_tokens: Optional[int]  # populated from ResourceLink.file_tokens

In [6]:
def get_resource_links_for_notices(num_notices: int = 1, max_file_tokens: int = 5000) -> "List[ResourceLinkTemp]":
    """Fetch the resource links of a small sample of notices.

    Args:
        num_notices: How many notice ids the sampling subquery selects.
        max_file_tokens: Exclusive upper bound on ``ResourceLink.file_tokens``;
            links at or above it are dropped.

    Returns:
        One ``ResourceLinkTemp`` per matching (notice, resource link) pair.
        Because the notices are sampled before the link filter is applied, the
        result can cover fewer than ``num_notices`` notices or be empty.
    """
    with SessionLocal() as db:
        # No ORDER BY is applied, so which notices are sampled is left to the database.
        sampled_notice_ids = db.query(Notice.id).limit(num_notices).scalar_subquery()

        rows = (
            db.query(Notice.id, ResourceLink.id, ResourceLink.text, ResourceLink.file_tokens)
            .join(ResourceLink, Notice.id == ResourceLink.notice_id)
            .filter(Notice.id.in_(sampled_notice_ids))
            .filter(ResourceLink.file_tokens < max_file_tokens)
            .all()
        )
        # Validate inside the session block, while the rows are still in hand.
        data = [
            ResourceLinkTemp.model_validate(
                {
                    "notice_id": notice_id,
                    "resource_link_id": resource_link_id,
                    "text": link_text,
                    "file_tokens": file_tokens,
                }
            )
            for notice_id, resource_link_id, link_text, file_tokens in rows
        ]
    return data

In [7]:
results = get_resource_links_for_notices()

In [8]:
# Convert each validated record to a plain dict so it becomes one DataFrame row.
df = pd.DataFrame(list(map(dict, results)))

In [9]:
df['notice_id'].unique()

array(['ff3119bdd2f642b7afc3887b3abcc153'], dtype=object)

In [10]:
df.shape

(2, 4)

In [11]:
df

Unnamed: 0,notice_id,resource_link_id,text,file_tokens
0,ff3119bdd2f642b7afc3887b3abcc153,3,Theater Aircraft Corrosion Control\nPrep Hanga...,2493
1,ff3119bdd2f642b7afc3887b3abcc153,2,PRE-BID SITE VISIT ROSTER:\nW912HV23R0004 LXEZ...,1144


In [12]:
df['text'].iloc[0][:1000]

'Theater Aircraft Corrosion Control\nPrep Hangar, Kadena AB, Okinawa\nPre-Proposal Conference\n26 September 2023\n\nJack T. Letscher\nContracting Officer\nOkinawa Area Office, Contracting Division\nDisclaimer：The Japanese translation is provided as supplement reference. All original English contents shall prevail if there’s any inconsistencies.\n\n免責事項：本和訳文中の機関名等はすべて英文の原文に基づく便宜的な仮訳であり、原文との齟齬がある場合には原文の記載を優先します。\n\n\x0c2\n\nINTRODUCTION\n\nWELCOME AND OPENING REMARKS\n\n\x0cPROJECT DESCRIPTION:\nDesign-Bid-Build:\nThis requirement is for an adequately sized and configured Aircraft Corrosion Control Facility\nto provide hangar space for corrosion treating, corrosion repairing, paint stripping and\nrepainting of an entire aircraft and an environmentally controlled area to wash aircraft. The\nfacility shall consist of a single bay paint booth for mixing and applying paint, a single bay\nprep/wash hangar for the corrosion control shop, preparation with abrasive blasting rooms\nand drying are

In [15]:
# Shared Anthropic API client; it also serves as the default `client` argument
# of claude_text_summarization.
# NOTE(review): no api_key is passed, so the SDK is expected to pick the key up
# from the ANTHROPIC_API_KEY environment variable — confirm it is set first.
client = anthropic.Anthropic()

In [16]:
def claude_text_summarization(system: str, prompt: str, text: str, client: anthropic.Anthropic = client , max_tokens: int = 1000, temperature: float = 0.0, model: str = 'claude-3-haiku-20240307') -> str:
    """Summarize ``text`` with a Claude model and log the exchange to disk.

    A single user message of the form ``"{prompt}: {text}"`` is sent together
    with ``system``. The request settings, token usage and response are
    written as JSON to ``./completions/<timestamp>-<model>.json``.

    Args:
        system: System prompt for the request.
        prompt: Instruction placed in front of the document in the user turn.
        text: Document to summarize.
        client: Anthropic client to use. The default is bound when this
            function is defined, to the module-level ``client`` of that moment.
        max_tokens: Upper bound on generated tokens.
        temperature: Sampling temperature.
        model: Model identifier.

    Returns:
        The text of the first content block of the response.
    """
    # Taken before the request, so the file name reflects when the call started.
    # NOTE(review): the ':' in the timestamp is not a legal file-name character
    # on Windows — confirm this notebook is only run on POSIX systems.
    current_time = pendulum.now().strftime("%Y%m%d:%H%M%S")
    messages = [
        {
            "role": "user",
            "content": f"{prompt}: {text}",
        },
    ]
    res = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
        system=system,
        messages=messages,
    )
    response_text = res.content[0].text
    completion_tokens = res.usage.output_tokens
    prompt_tokens = res.usage.input_tokens
    data = {
        "Model": model,
        "Completion Tokens": completion_tokens,
        "Prompt Tokens": prompt_tokens,
        "Total Tokens": completion_tokens + prompt_tokens,
        # Recorded so runs with different system prompts can be told apart.
        "System": system,
        "Prompt": messages,
        "Temperature": temperature,
        "Max_Tokens": max_tokens,
        "Response": response_text,
    }
    # Create the log directory on first use instead of failing with FileNotFoundError.
    log_dir = "./completions"
    os.makedirs(log_dir, exist_ok=True)
    with open(f"{log_dir}/{current_time}-{model}.json", "w") as f:
        json.dump(data, f, indent=4)
    return response_text

In [28]:

# First draft of the system prompt for the summarization call.
# NOTE(review): the next cell assigns `system` again, so on a top-to-bottom run
# this value is replaced before the summarization call uses it.
system = """
You are a specialized AI for dissecting and summarizing government procurement contracts.
Here are instructions from the user outlining your tasks and how you should respond:
Your role is to meticulously parse and condense the most salient aspects from a variety of government procurement 
documents, including bid invitations, Requests for Proposals (RFPs), and finalized contracts. Key points of interest 
for potential contractors and bidders, such as detailed specifications, qualification criteria, submission deadlines, 
financial terms, performance expectations, and eligibility requirements concerning business size and certifications, 
should be prominently outlined. Given that your response will be segmented into discrete chunks, please ensure each 
unit of information is separated by two newlines for clear demarcation. These segments must be semantically rich to 
facilitate their use in a vector database, enabling efficient retrieval and analysis.

Rules:
- Do NOT preface your summary by saying 'Here is a summary of...' or anything of the sort. Jump right in to the summary and do NOT speak to the user directly. 
""" 

In [32]:
# Second draft of the system prompt; assigning it replaces any earlier `system` value.
# NOTE(review): this cell's execution count (In [32]) is higher than that of the
# summarization call further down (In [30]), so the printed summary below was
# presumably generated with the earlier draft — re-run top to bottom to confirm.
system = """You are an AI developed for the precise task of breaking down and summarizing government procurement contracts.
Your mission is to sift through and boil down the essential elements from various government procurement documents, including bid invitations, Requests for Proposals (RFPs), 
and completed contracts. Focus on extracting critical information that potential contractors and bidders need, 
like detailed specifications, qualification criteria, submission deadlines, financial terms, performance standards, 
and conditions related to business size and certifications. Responses must be divided into clear, semantically dense 
chunks, separated by two newlines, ready for inclusion in a vector database for streamlined access and analysis.

Instructions:

Begin your summaries without any introductory phrases. Directly present the distilled information, avoiding direct communication with the user."""

In [29]:
# User-turn instruction; claude_text_summarization sends it as f"{prompt}: {text}".
prompt = "Please summarize the following document. In addition to your normal analysis, please highlight any related skills or suite of services that would be helpful for the contractor to have."
# Document under test: the `text` value of the first row of df.
# NOTE(review): this rebinds the name `text`, which the imports cell also brings
# in from sqlalchemy; after this cell runs, `text` no longer refers to that import.
text = df['text'].iloc[0]

In [30]:
res = claude_text_summarization(system, prompt, text)

In [31]:
print(res)

Here is a summary of the key points from the Theater Aircraft Corrosion Control Prep Hangar procurement document:

The project is for the design and construction of an aircraft corrosion control facility at Kadena Air Base in Okinawa, Japan. The facility will include a hangar space for corrosion treatment, repair, paint stripping, and repainting of entire aircraft, as well as an environmentally controlled aircraft wash area. Supporting facilities like hazardous material storage, utility storage, utilities, HVAC, fire protection, communications, pavement, and demolition of an existing building are also required.

The solicitation number is W912HV23R0004, with an estimated contract value between ¥25,000,000,000 and ¥50,000,000,000. Key dates are:
- RFI (Round #1) Due: 27 October 2023
- Proposals Due: 23 February 2024 
- Bid Acceptance Period: 150 Calendar days

The evaluation factors for the technical proposal include:
1. General Construction Experience - Contractor must have completed a