In [2]:
import sys
# Put the backend directory on the import path exactly once
# (presumably where the `app` package imported below lives).
backend_path = '../backend'
if sys.path.count(backend_path) == 0:
    sys.path.append(backend_path)

In [15]:
from sqlalchemy import create_engine, select, values, update, and_, exists, text
from sqlalchemy.orm import sessionmaker, join
from dotenv import load_dotenv
from app.models.models import Notice, ResourceLink
from app.models.schema import NoticeBase, ResourceLinkBase
from dotenv import load_dotenv
import os
import requests
import json
import re
import pandas as pd
import pendulum

import tiktoken

import anthropic

In [4]:

# Load variables from a local .env file (if there is one) before reading them.
# load_dotenv is imported above but was never called, so on a fresh kernel the
# lookups below only saw variables that were already exported in the shell.
# By default load_dotenv does not override variables that are already set.
load_dotenv()

# Secrets are read from the environment; each one is None when it is unset.
POSTGRES_PASSWORD = os.environ.get("POSTGRES_PASSWORD")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY")
# NOTE(review): this URL embeds its own credentials and does not use
# POSTGRES_PASSWORD — confirm whether it should be built from the environment.
DATABASE_URL = "postgresql+psycopg2://airflow:airflow@localhost:5432/airflow"

In [5]:
# Yesterday (UTC) as a compact YYYYMMDD stamp.
selected_date = pendulum.now("utc").subtract(days=1).strftime("%Y%m%d")

# One engine for the notebook, plus a session factory bound to it.
# Sessions neither autoflush nor autocommit; callers commit explicitly.
engine = create_engine(DATABASE_URL)
SessionLocal = sessionmaker(bind=engine, autoflush=False, autocommit=False)

In [6]:
# Yesterday's date as YYYY-MM-DD; bound to :prior_date in the queries below.
# NOTE(review): no timezone is passed here, while selected_date uses "utc" —
# confirm which one matches how "postedDate" is stored.
yesterday_local = pendulum.now().subtract(days=1)
prior_date = yesterday_local.strftime('%Y-%m-%d')

In [7]:
def num_tokens_in_corpus(input:str, encoding_name: str = "gpt-3.5-turbo") -> int:
    """Count the tokens in ``input`` with the tiktoken encoding of a model.

    Args:
        input: Text to tokenize.
        encoding_name: Value handed to ``tiktoken.encoding_for_model``; despite
            the parameter's name this is a model name, not an encoding name.

    Returns:
        Number of tokens produced by encoding ``input``.
    """
    tokenizer = tiktoken.encoding_for_model(encoding_name)
    return len(tokenizer.encode(input))

def est_costs(price_input_mil: float = 10.0, price_output_mil: float = 30.0, len_input: int = 0) -> float:
    """Print and return a rough cost estimate for ``len_input`` tokens.

    Prices are given per one million tokens. The output cost is derived from
    ``len_input`` as well, i.e. as if the model produced as many tokens as it
    was given.

    Args:
        price_input_mil: Price of one million input tokens.
        price_output_mil: Price of one million output tokens.
        len_input: Number of tokens to price.

    Returns:
        Sum of the two printed estimates (input cost + output cost).
        Previously the function printed the figures but returned None despite
        its ``-> float`` annotation.
    """
    price_per_token_input = price_input_mil / 1000000
    price_per_token_output = price_output_mil / 1000000
    cost_input = len_input * price_per_token_input
    cost_output = len_input * price_per_token_output
    print(f"Cost of input: {cost_input}; Cost of output: {cost_output}")
    return cost_input + cost_output

def haiku_cost(num_tokens: int) -> float:
    """Estimate the cost of ``num_tokens`` tokens at 0.25 / 1.25 per million.

    Delegates to ``est_costs`` (which also prints the breakdown) and returns
    its total.
    """
    return est_costs(.25, 1.25, num_tokens)


In [32]:
# Fetch the extracted text of resource links attached to NAICS 236220 notices
# posted on prior_date, skipping rows whose text is one of the error markers
# or NULL, and files at or above the token ceiling.
text_query_params = {
    "prior_date": prior_date,
    "max_input_tokens": 5000,
    # None is bound as NULL; PostgreSQL treats LIMIT NULL as "no limit".
    "bs": None,
}
stmt = text(
    """select text from resource_links 
                    where notice_id in 
                        (select id from notices
                            where
                            naics_code_id = 
                                (select id from naics_codes where \"naicsCode\" = 236220)
                                and
                                \"postedDate\" = :prior_date)
                    and 
                    text != 'unparsable' 
                    and
                    text != 'adobe-error'
                    and
                    text != 'encoding-error'
                    and
                    text is not null
                    and
                    file_tokens < :max_input_tokens
                    limit :bs
                """
)
with SessionLocal() as db:
    # .scalars() keeps only the first column (text) of every row.
    results = db.execute(stmt, params=text_query_params).scalars().all()

In [33]:
len(results)

135

In [7]:
# Same filter as the text-only query, but keep each link's id and notice_id
# next to its text and accept files below 100000 tokens. Rebinds `results`.
row_query_params = {"prior_date": prior_date, "max_tokens": 100000}
stmt = text("""select id, text, notice_id from resource_links 
                    where notice_id in 
                        (select id from notices
                            where
                            naics_code_id = 
                                (select id from naics_codes where \"naicsCode\" = 236220)
                                and
                                \"postedDate\" = :prior_date)
                    and 
                    text != 'unparsable' 
                    and
                    text != 'adobe-error'
                    and
                    text != 'encoding-error'
                    and
                    text is not null
                    and
                    file_tokens < :max_tokens
                """)
with SessionLocal() as db:
    # Rows come back as (id, text, notice_id).
    results = db.execute(stmt, params=row_query_params).all()

In [8]:
results

[(1778, 'RFP 36C26124R0004\n\nProject No. 612-23-003\n\nDesign-Build to Create Martinez Parking Lot and Access\n\nVA Northern California Health Care System (V ... (4718 characters truncated) ... ct Specialist at e-mail \n\nRosario.\n\nChaidez1@va.go\n\nv \n\nand Daniel Jhun at \n\nDaniel.jhun@va.gov\n\nThanks for attending today’s site visit!', 'e4e13fe07b4f4fdf814db4f04c1e8a29'),
 (1782, "STATEMENT OF WORK\nFOR\nDESIGN BUILD SERVICES\nCREATE MARTINEZ PARKING LOT AND ACCESS\nPROJECT #: 612-23-003\nVA NORTHERN CALIFORNIA HEALTH CARE SYST ... (27551 characters truncated) ... 3-01189-267\n\n14. ATTACHMENTS\n14.1. Site Location\n14.2. As-builts Drawings\n14.3. Geotechnical Report\n(END OF STATEMENT OF WORK)\n\nPage 12 of 12", 'e4e13fe07b4f4fdf814db4f04c1e8a29'),
 (1789, "BPA NO.\n\nAMENDMENT OF SOLICITATION/MODIFICATION OF CONTRACT\n2. AMENDMENT/MODIFICATION NUMBER\n\n3. EFFECTIVE DATE\n\n1. CONTRACT ID CODE\n\nPAGE\n ... (3912 characters truncated) ... ANDARD FORM 30 (REV. 11/2016)\nPresc

In [9]:
# Work on the first five fetched rows only (fewer if the query returned fewer).
sample_group = results[:5]

In [10]:
sample_group

[(1778, 'RFP 36C26124R0004\n\nProject No. 612-23-003\n\nDesign-Build to Create Martinez Parking Lot and Access\n\nVA Northern California Health Care System (V ... (4718 characters truncated) ... ct Specialist at e-mail \n\nRosario.\n\nChaidez1@va.go\n\nv \n\nand Daniel Jhun at \n\nDaniel.jhun@va.gov\n\nThanks for attending today’s site visit!', 'e4e13fe07b4f4fdf814db4f04c1e8a29'),
 (1782, "STATEMENT OF WORK\nFOR\nDESIGN BUILD SERVICES\nCREATE MARTINEZ PARKING LOT AND ACCESS\nPROJECT #: 612-23-003\nVA NORTHERN CALIFORNIA HEALTH CARE SYST ... (27551 characters truncated) ... 3-01189-267\n\n14. ATTACHMENTS\n14.1. Site Location\n14.2. As-builts Drawings\n14.3. Geotechnical Report\n(END OF STATEMENT OF WORK)\n\nPage 12 of 12", 'e4e13fe07b4f4fdf814db4f04c1e8a29'),
 (1789, "BPA NO.\n\nAMENDMENT OF SOLICITATION/MODIFICATION OF CONTRACT\n2. AMENDMENT/MODIFICATION NUMBER\n\n3. EFFECTIVE DATE\n\n1. CONTRACT ID CODE\n\nPAGE\n ... (3912 characters truncated) ... ANDARD FORM 30 (REV. 11/2016)\nPresc

In [11]:
# Create the Anthropic API client. No api_key is passed explicitly, so the
# ANTHROPIC_API_KEY constant read earlier is not used here.
# NOTE(review): presumably the SDK picks the key up from the environment — confirm.
client = anthropic.Anthropic()

In [12]:
# Smoke test: one short request to check that the client and credentials work.
hello_messages = [{"role": "user", "content": "Hello, Claude!"}]  # user prompt
message = client.messages.create(
    messages=hello_messages,
    system="Respond only in Spanish.",  # system prompt
    model="claude-3-haiku-20240307",
    temperature=0.0,
    max_tokens=1000,
)

In [13]:
message

Message(id='msg_01HqwNquUFUSFxdFxty4Jfud', content=[ContentBlock(text='¡Hola! Es un placer conocerte. ¿Cómo puedo ayudarte hoy?', type='text')], model='claude-3-haiku-20240307', role='assistant', stop_reason='end_turn', stop_sequence=None, type='message', usage=Usage(input_tokens=17, output_tokens=32))

In [14]:
# Re-creates the client; identical to the earlier `client = anthropic.Anthropic()` cell.
client = anthropic.Anthropic()

In [15]:
def claude_text_summarization(client: anthropic.Anthropic, text: str, max_tokens: int = 1000, temperature: float = 0.0, model: str = 'claude-3-haiku-20240307'):
    """Summarize a government contracting document with an Anthropic model.

    Embeds ``text`` in a fixed analysis prompt, sends it through
    ``client.messages.create``, writes a JSON record of the exchange (model,
    token usage, prompt, settings, response text) to
    ``./completions/<timestamp>-<model>.json`` and returns the response text.

    Args:
        client: Anthropic client used to send the request.
        text: Document text appended to the end of the prompt.
        max_tokens: Forwarded as ``max_tokens`` to the API call.
        temperature: Forwarded as ``temperature`` to the API call.
        model: Model identifier; also used in the log file name.

    Returns:
        The ``text`` of the first content block of the response.
    """
    # NOTE(review): one-second resolution — two calls with the same model in the
    # same second write to the same log file.
    current_time = pendulum.now().strftime("%Y%m%d:%H%M%S")
    system = "You are a highly skilled AI trained to analyze text and summarize it very succinctly."
    messages = [
        {
            "role": "user",
            "content": f"""Analyze the provided government contracting document to extract key information that will help contractors assess whether the project aligns with their capabilities and is worth pursuing. Focus on the following aspects:

            1. Scope of Work: What specific services or products does the project require?
            2. Special Equipment Needed: Are there unique tools or machinery necessary for project completion?
            3. Domain of Expertise Required: What specialized knowledge or skills are needed?
            4. Contractor Workforce Size: Estimate the workforce size needed to meet project demands.

            Additionally, consider these factors to further refine suitability assessment:
            - Project Duration: How long is the project expected to last?
            - Location and Logistics: Where is the project located, and are there significant logistical considerations?
            - Budget and Payment Terms: What is the budget range, and how are payments structured?
            - Compliance and Regulations: Are there specific industry regulations or standards to comply with?
            - Past Performance Requirements: Is prior experience in similar projects a prerequisite?

            Summarize these elements in no more than 25 sentences to provide a comprehensive overview, enabling contractors to quickly determine project compatibility and feasibility. Highlight any potential challenges or requirements that may necessitate additional considerations.
            Text is below:
            {text}
            """
        },
    ]
    res = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
        system=system,
        messages=messages,
    )
    response_text = res.content[0].text
    completion_tokens = res.usage.output_tokens
    prompt_tokens = res.usage.input_tokens
    data = {
        "Model": model,
        "Completion Tokens": completion_tokens,
        "Prompt Tokens": prompt_tokens,
        "Total Tokens": completion_tokens + prompt_tokens,
        "Prompt": messages,
        "Temperature": temperature,
        "Max_Tokens": max_tokens,
        "Response": response_text,
    }
    # Create the log directory on demand: without it, open() raises
    # FileNotFoundError and the response that was just received is lost.
    os.makedirs("./completions", exist_ok=True)
    with open(f"./completions/{current_time}-{model}.json", "w") as f:
        json.dump(data, f, indent=4)
    return response_text

In [16]:
results

[(1778, 'RFP 36C26124R0004\n\nProject No. 612-23-003\n\nDesign-Build to Create Martinez Parking Lot and Access\n\nVA Northern California Health Care System (V ... (4718 characters truncated) ... ct Specialist at e-mail \n\nRosario.\n\nChaidez1@va.go\n\nv \n\nand Daniel Jhun at \n\nDaniel.jhun@va.gov\n\nThanks for attending today’s site visit!', 'e4e13fe07b4f4fdf814db4f04c1e8a29'),
 (1782, "STATEMENT OF WORK\nFOR\nDESIGN BUILD SERVICES\nCREATE MARTINEZ PARKING LOT AND ACCESS\nPROJECT #: 612-23-003\nVA NORTHERN CALIFORNIA HEALTH CARE SYST ... (27551 characters truncated) ... 3-01189-267\n\n14. ATTACHMENTS\n14.1. Site Location\n14.2. As-builts Drawings\n14.3. Geotechnical Report\n(END OF STATEMENT OF WORK)\n\nPage 12 of 12", 'e4e13fe07b4f4fdf814db4f04c1e8a29'),
 (1789, "BPA NO.\n\nAMENDMENT OF SOLICITATION/MODIFICATION OF CONTRACT\n2. AMENDMENT/MODIFICATION NUMBER\n\n3. EFFECTIVE DATE\n\n1. CONTRACT ID CODE\n\nPAGE\n ... (3912 characters truncated) ... ANDARD FORM 30 (REV. 11/2016)\nPresc

In [17]:
# Try the summarizer on the text column (index 1) of the first fetched row.
summary = claude_text_summarization(client, results[0][1])

In [18]:
print(summary)

Here is a 25-sentence summary of the key information from the government contracting document:

1. The project is a design-build construction requirement to create a new parking lot and access at the VA Martinez OPC campus in California.

2. The scope of work includes all design services, materials, equipment, investigations, and construction supervision to provide sufficient paved parking for the expanding campus.

3. Specialized equipment or machinery is not explicitly mentioned, suggesting standard construction tools and equipment will be required.

4. The project requires expertise in parking lot design, civil engineering, and construction management.

5. The workforce size needed is not specified, but the 307-calendar-day period of performance suggests a medium-sized crew will be required.

6. The project is expected to last approximately 10 months from the notice to proceed.

7. The project location is the VA Martinez OPC campus in Martinez, California, which may have some logist

In [19]:
len(sample_group)

5

We're going to have to sleep between requests: everybody wants to use Anthropic at the moment, so I'm hitting a lot of rate limits that I shouldn't be.

In [20]:
import time

In [21]:
def anthropic_traffic_jam(client, doc: str, delay: float = 20, max_retries: int = 3):
    """Summarize ``doc`` while pacing requests and retrying on rate limits.

    Calls ``claude_text_summarization`` and, on success, sleeps ``delay``
    seconds before returning so that consecutive calls are spaced out. When
    the API answers with a rate-limit error the call is retried after a
    growing pause instead of aborting the whole batch.

    Args:
        client: Anthropic client passed through to ``claude_text_summarization``.
        doc: Document text to summarize.
        delay: Seconds to sleep after a successful call, and the base pause
            between retries.
        max_retries: How many times to retry after a rate-limit error before
            letting it propagate. Negative values are treated as 0.

    Returns:
        The summary text returned by ``claude_text_summarization``.

    Raises:
        anthropic.RateLimitError: If the request is still rate limited after
            ``max_retries`` retries.
    """
    retries = max(max_retries, 0)
    for attempt in range(retries + 1):
        try:
            result = claude_text_summarization(client, doc)
        except anthropic.RateLimitError:
            if attempt == retries:
                raise
            # Wait longer after each consecutive rate-limit response.
            time.sleep(delay * (attempt + 1))
        else:
            time.sleep(delay)
            return result

In [22]:
# Summarize every sampled row in order; rows unpack as (id, text, notice_id)
# and only the text is sent. If any call raises, the comprehension aborts and
# `summaries` is not (re)assigned, so no partial results are kept.
summaries = [anthropic_traffic_jam(client, doc) for _, doc, _ in sample_group]

RateLimitError: Error code: 429 - {'type': 'error', 'error': {'type': 'rate_limit_error', 'message': 'Number of request tokens has exceeded your rate limit (https://docs.anthropic.com/claude/reference/rate-limits). Please reduce the the prompt length or the maximum tokens requested, or try again later. You may also contact sales at https://www.anthropic.com/contact-sales to discuss your options for a rate limit increase.'}}

In [None]:
summaries

["Based on the provided government contracting document, here is a summary of the key information that would help contractors assess the project's alignment with their capabilities and feasibility:\n\n1. Scope of Work: The project involves design-build services for a parking facility at the Martinez (MTZ) location. The specific services or products required are not explicitly stated in the document provided.\n\n2. Special Equipment Needed: The document does not mention any unique tools or machinery necessary for project completion.\n\n3. Domain of Expertise Required: The document does not specify the specialized knowledge or skills needed for this project. However, it is likely that expertise in design-build construction, parking facility development, and compliance with relevant regulations would be required.\n\n4. Contractor Workforce Size: The document does not provide an estimate of the workforce size needed to meet the project demands. This would depend on the scope of work and pr

In [None]:
# Yesterday (UTC) as YYYYMMDD; used in the S3 object key below.
# Same expression as selected_date defined near the top of the notebook.
yesterday_utc = pendulum.now('utc').subtract(days=1)
previous_date = yesterday_utc.strftime('%Y%m%d')

In [None]:
len(sample_group)

5

In [None]:
sample_group[0]

(1775, "BPA NO.\n\nAMENDMENT OF SOLICITATION/MODIFICATION OF CONTRACT\n2. AMENDMENT/MODIFICATION NUMBER\n\n3. EFFECTIVE DATE\n\n1. CONTRACT ID CODE\n\nPAGE\n ... (3893 characters truncated) ... TANDARD FORM 30 (REV. 11/2016)\nPrescribed by GSA - FAR (48 CFR) 53.243\n\n\x0cSee attached document: RFI Responses Design-Build MTZ Parking Part II.", 'e4e13fe07b4f4fdf814db4f04c1e8a29')

In [None]:
# Attach each generated summary to its source row by position:
# (id, text, notice_id, summary).
new_summaries = [
    (row[0], row[1], row[2], summaries[position])
    for position, row in enumerate(sample_group)
]

In [None]:
new_summaries[0][0]

1775

In [None]:
new_summaries[0][2]

'e4e13fe07b4f4fdf814db4f04c1e8a29'

In [None]:
# Persist each generated summary on its resource_links row.
# new_summaries tuples are (id, text, notice_id, summary): index 3 is the model
# output. The previous code stored index 1, i.e. the original document text,
# in the summary column.
with SessionLocal() as db:
    for link_id, _doc_text, _notice_id, summary_text in new_summaries:
        db.execute(
            update(ResourceLink)
            .where(ResourceLink.id == link_id)
            .values(summary=summary_text)
        )
    # Commit once so the batch is written in a single transaction.
    db.commit()

In [None]:

# Model identifier recorded next to each summary in the uploaded JSON.
# It is the same string as the default `model` of claude_text_summarization.
model="claude-3-haiku-20240307"

In [None]:
import boto3

# S3 credentials and region come from the environment; each is None when unset.
S3_AWS_ACCESS_KEY_ID, S3_AWS_SECRET_ACCESS_KEY, S3_REGION_NAME = (
    os.getenv(variable)
    for variable in (
        "S3_AWS_ACCESS_KEY_ID",
        "S3_AWS_SECRET_ACCESS_KEY",
        "S3_REGION_NAME",
    )
)

In [None]:
# S3 client built from the credentials read from the environment.
s3_settings = {
    "region_name": S3_REGION_NAME,
    "aws_access_key_id": S3_AWS_ACCESS_KEY_ID,
    "aws_secret_access_key": S3_AWS_SECRET_ACCESS_KEY,
}
s3_client = boto3.client("s3", **s3_settings)

# Upload target for the sample summaries.
bucket_name = "sam-resource-links-haiku-summaries"
# NOTE(review): the key itself starts with the bucket name, so the object ends
# up under a "<bucket_name>/" prefix inside the bucket — confirm this is intended.
file_name = f"{bucket_name}/{previous_date}/sample.json"

In [None]:
# JSON-ready records: one per summarized document, keyed by its notice id.
data = [
    {"notice_id": notice_id, "model": model, "summary": summary_text}
    for _link_id, _doc_text, notice_id, summary_text in new_summaries
]

In [None]:
json.dumps(data)[:150]

'[{"notice_id": "e4e13fe07b4f4fdf814db4f04c1e8a29", "model": "claude-3-haiku-20240307", "summary": "Based on the provided government contracting docume'

In [None]:
# Serialize the records to UTF-8 JSON and upload them as a single S3 object.
json_data = json.dumps(data)
bytes_data = bytes(json_data, 'utf-8')
s3_client.put_object(Body=bytes_data, Key=file_name, Bucket=bucket_name)

{'ResponseMetadata': {'RequestId': 'X52FK7XCK70XZVBT',
  'HostId': 'lhomKi25fKXfI23AD+s45cYe081qCphO2orwg+iFnYuCwVtCSMDtbJfSEGdrosWOeOBDCTwfkmU=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'lhomKi25fKXfI23AD+s45cYe081qCphO2orwg+iFnYuCwVtCSMDtbJfSEGdrosWOeOBDCTwfkmU=',
   'x-amz-request-id': 'X52FK7XCK70XZVBT',
   'date': 'Sat, 16 Mar 2024 11:53:27 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"ce6ebdc584e5decbdb503b73fb7f7c9a"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"ce6ebdc584e5decbdb503b73fb7f7c9a"',
 'ServerSideEncryption': 'AES256'}