In [1]:
import sys
backend_path = '../backend'
if backend_path not in sys.path:
        sys.path.append(backend_path)

In [2]:
from sqlalchemy import create_engine, select, values, update, and_, exists, text
from sqlalchemy.orm import sessionmaker
from dotenv import load_dotenv
from app.models.models import Notice, ResourceLink
from app.models.schema import NoticeBase, ResourceLinkBase
from dotenv import load_dotenv
import os
import requests
import json
import re
import pandas as pd
import pendulum

import tiktoken

import anthropic

In [3]:

POSTGRES_PASSWORD = os.environ.get("POSTGRES_PASSWORD")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY")
DATABASE_URL = "postgresql+psycopg2://airflow:airflow@localhost:5432/airflow"

In [4]:
engine = create_engine(DATABASE_URL)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
selected_date = pendulum.now("utc").subtract(days=1).strftime("%Y%m%d")

In [5]:
prior_date = pendulum.now().subtract(days=1).strftime('%Y-%m-%d')

In [6]:
def num_tokens_in_corpus(input:str, encoding_name: str = "gpt-3.5-turbo") -> int:
    encoding = tiktoken.encoding_for_model(encoding_name)
    num_tokens = len(encoding.encode(input))
    return num_tokens

def est_costs(price_input_mil: float = 10.0, price_output_mil: float = 30.0, len_input: int = 0) -> float:
    price_per_token_input = price_input_mil / 1000000
    price_per_token_output = price_output_mil / 1000000
    print(f"Cost of input: {len_input * price_per_token_input}; Cost of output: {len_input * price_per_token_output}")

def haiku_cost(num_tokens: int) -> str:
    return est_costs(.25, 1.25, num_tokens)


In [7]:
with SessionLocal() as db:
    stmt = text("""select id, text, notice_id from resource_links 
                    where notice_id in 
                        (select id from notices
                            where
                            naics_code_id = 
                                (select id from naics_codes where \"naicsCode\" = 236220)
                                and
                                \"postedDate\" = :prior_date)
                    and 
                    text != 'unparsable' 
                    and
                    text != 'adobe-error'
                    and
                    text != 'encoding-error'
                    and
                    text is not null
                    and
                    file_tokens < :max_tokens
                """) 
    results = db.execute(stmt, params={"prior_date": prior_date, "max_tokens": int(100000)}).all()

In [8]:
sample_group = results[:5]

In [9]:
client = anthropic.Anthropic()

In [10]:
message = client.messages.create(
    model="claude-3-haiku-20240307",
    max_tokens=1000,
    temperature=0.0,
    system="Respond only in Spanish.",  # <-- system prompt
    messages=[{"role": "user", "content": "Hello, Claude!"}],  # <-- user prompt
)

In [11]:
message

Message(id='msg_01V76DQFhUgdWib4wzw51Zsv', content=[ContentBlock(text='¡Hola! Es un placer conocerte. ¿Cómo puedo ayudarte hoy?', type='text')], model='claude-3-haiku-20240307', role='assistant', stop_reason='end_turn', stop_sequence=None, type='message', usage=Usage(input_tokens=17, output_tokens=32))

In [12]:
client = anthropic.Anthropic()

In [13]:
def claude_text_summarization(client: anthropic.Anthropic, text: str, max_tokens: int = 1000, temperature: float = 0.0, model: str = 'claude-3-haiku-20240307'):
    model = model
    current_time = pendulum.now().strftime("%Y%m%d:%H%M%S")
    max_tokens = max_tokens
    temperature = temperature
    system = "You are a highly skilled AI trained to analyze text and summarize it very succinctly."
    messages=[
    {
        "role": "user",
        "content": f"""Analyze the provided government contracting document to extract key information that will help contractors assess whether the project aligns with their capabilities and is worth pursuing. Focus on the following aspects:

            1. Scope of Work: What specific services or products does the project require?
            2. Special Equipment Needed: Are there unique tools or machinery necessary for project completion?
            3. Domain of Expertise Required: What specialized knowledge or skills are needed?
            4. Contractor Workforce Size: Estimate the workforce size needed to meet project demands.

            Additionally, consider these factors to further refine suitability assessment:
            - Project Duration: How long is the project expected to last?
            - Location and Logistics: Where is the project located, and are there significant logistical considerations?
            - Budget and Payment Terms: What is the budget range, and how are payments structured?
            - Compliance and Regulations: Are there specific industry regulations or standards to comply with?
            - Past Performance Requirements: Is prior experience in similar projects a prerequisite?

            Summarize these elements in no more than 25 sentences to provide a comprehensive overview, enabling contractors to quickly determine project compatibility and feasibility. Highlight any potential challenges or requirements that may necessitate additional considerations.
            Text is below:
            {text}
            """ 
    },
    ]
    res = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
        system=system,
        messages=messages)
    completion_tokens = res.usage.output_tokens
    prompt_tokens = res.usage.input_tokens
    total_tokens = completion_tokens + prompt_tokens
    data = {
        "Model": model,
        "Completion Tokens": completion_tokens,
        "Prompt Tokens": prompt_tokens,
        "Total Tokens": total_tokens,
        "Prompt": messages,
        "Temperature": temperature,
        "Max_Tokens": max_tokens,
        "Response": res.content[0].text,
    }
    with open(f"./completions/{current_time}-{model}.json", "w") as f:
        json.dump(data, f, indent=4)
    return res.content[0].text

In [15]:
results

[]

In [14]:
summary = claude_text_summarization(client, results[0][1])

IndexError: list index out of range

In [None]:
print(summary)

In [None]:
len(sample_group)

We're going to have to sleep, everybody wants to use Anthropic at the moment so I'm hitting a lot of rate limits that I shouldn't be

In [None]:
import time

In [None]:
def anthropic_traffic_jam(client, doc: str):
    result = claude_text_summarization(client, doc)
    time.sleep(20)
    return result

In [None]:
summaries = [claude_text_summarization(client, doc) for _, doc, _ in sample_group]

In [None]:
summaries

In [None]:
previous_date = pendulum.now('utc').subtract(days=1).strftime('%Y%m%d')

In [None]:
len(sample_group)

In [None]:
sample_group[0]

In [None]:
new_summaries = [(sample_group[i][0], summaries[i]) for i in range(len(sample_group))]

In [None]:
new_summaries[0][0]

In [None]:
with SessionLocal() as db:
    for summary in new_summaries:
        stmt = (
            update(ResourceLink)
            .where(ResourceLink.id == summary[0])
            .values(summary=summary[1])
        )
        db.execute(stmt)
        db.commit()

In [None]:
import boto3

S3_AWS_ACCESS_KEY_ID = os.environ.get("S3_AWS_ACCESS_KEY_ID")
S3_AWS_SECRET_ACCESS_KEY = os.environ.get("S3_AWS_SECRET_ACCESS_KEY")
S3_REGION_NAME = os.environ.get("S3_REGION_NAME")

In [None]:
s3_client = boto3.client(
    "s3",
    region_name=S3_REGION_NAME,
    aws_access_key_id=S3_AWS_ACCESS_KEY_ID,
    aws_secret_access_key=S3_AWS_SECRET_ACCESS_KEY,
)
bucket_name = "sam-resource-links-haiku-summaries"
file_name = f"{bucket_name}/{previous_date}/sample.json"