In [1]:
import sys

# Put the sibling backend package on the import path so the `app.*` imports
# used by this notebook resolve; skipped when the entry is already present so
# re-running the cell does not add duplicates.
backend_path = '../backend'
if sys.path.count(backend_path) == 0:
    sys.path.append(backend_path)

In [2]:
from sqlalchemy import create_engine, select, values, update, and_, exists
from sqlalchemy.orm import sessionmaker
from dotenv import load_dotenv
from app.models.models import Notice, ResourceLink
from app.models.schema import NoticeBase, ResourceLinkBase
import pendulum
import tempfile
import requests
import os
from tqdm import tqdm
import tiktoken
import json
import datetime
from openai import OpenAI
import anthropic

In [3]:
# Load variables from a local .env file into the process environment.
load_dotenv()

# Connection string for the notices database. Set NOTEBOOK_DATABASE_URL
# (for example in .env) to point the notebook at another database without
# editing this cell; the fallback is the local development instance.
DATABASE_URL = os.environ.get(
    "NOTEBOOK_DATABASE_URL",
    "postgresql+psycopg2://airflow:airflow@localhost:5432/airflow",
)

In [4]:

# API keys come from the environment; each is None when its variable is unset.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")

In [5]:
# One engine and one session factory shared by every query cell below.
engine = create_engine(DATABASE_URL)
SessionLocal = sessionmaker(bind=engine, autoflush=False, autocommit=False)

# Yesterday (UTC) rendered as YYYYMMDD; used to filter notices by posted date.
yesterday_utc = pendulum.now("utc").subtract(days=1)
selected_date = yesterday_utc.strftime("%Y%m%d")

In [6]:
# Preview the timestamp format that the summarization helpers use when naming
# their saved completion files.
pendulum.now().strftime("%Y%m%d:%H%M%S")

'20240312:080016'

In [7]:
# Fetch the notices posted on `selected_date` that have at least one resource
# link with non-null text, then convert them to plain dicts.
# NOTE(review): `selected_date` is a 'YYYYMMDD' string — confirm that matches
# how `Notice.postedDate` is stored.
with SessionLocal() as db:
    # Correlated subquery: resource links of the current notice that carry text.
    subquery = (
        select(ResourceLink.notice_id)
        .where(and_(ResourceLink.notice_id == Notice.id, ResourceLink.text.isnot(None)))
    )
    stmt = select(Notice).where(
        and_(Notice.postedDate == selected_date, exists(subquery))
    )
    results = db.execute(stmt).scalars().all()
    # Validate inside the session block, as the original cell did.
    # `model_dump()` is the Pydantic v2 replacement for the deprecated `.dict()`.
    results_dict = [NoticeBase.model_validate(result).model_dump() for result in results]

In [8]:

# Fetch every notice (no date filter) that has at least one resource link with
# non-null text, then convert them to plain dicts for the cells below.
with SessionLocal() as session:
    # Correlated subquery: resource links of the current notice that carry text.
    subquery = (
        select(ResourceLink.notice_id)
        .where(and_(ResourceLink.notice_id == Notice.id, ResourceLink.text.isnot(None)))
    )
    stmt = select(Notice).where(exists(subquery))
    results = session.execute(stmt).scalars().all()
    # Validate inside the session block, as the original cell did.
    # `model_dump()` is the Pydantic v2 replacement for the deprecated `.dict()`.
    result_dict = [NoticeBase.model_validate(result).model_dump() for result in results]

In [9]:
def num_tokens_in_corpus(input:str, encoding_name: str = "gpt-3.5-turbo-0125") -> int:
    """Count the tokens tiktoken produces for ``input``.

    Args:
        input: Text to tokenize.
        encoding_name: Either an OpenAI model name (e.g. "gpt-3.5-turbo-0125")
            or a tiktoken encoding name (e.g. "cl100k_base"). Model names are
            tried first.

    Returns:
        The number of tokens in ``input`` under the resolved encoding.

    Raises:
        KeyError: If ``encoding_name`` is neither a model name nor an encoding
            name that tiktoken recognizes.
    """
    try:
        encoding = tiktoken.encoding_for_model(encoding_name)
    except KeyError as model_error:
        # Despite its name, the parameter used to accept only model names.
        # Fall back to treating the value as a literal encoding name.
        try:
            encoding = tiktoken.get_encoding(encoding_name)
        except ValueError:
            # Keep the exception type callers saw before the fallback existed.
            raise model_error from None
    return len(encoding.encode(input))

In [10]:
# for item in result_dict:
#     print(f"{item['title']}")
#     for resource_link in item['resource_links']:
#         print(f">>>{num_tokens_in_corpus(resource_link['text'])}")

In [11]:
# Check the type of the first converted notice.
type(result_dict[0])

dict

In [12]:
# Take the text of the first two resource links on the first notice as a short
# and a long sample document for the summarization experiments.
first_notice_links = result_dict[0]['resource_links']
shorter_rfp = first_notice_links[0]['text']
longer_rfp = first_notice_links[1]['text']

In [13]:
# Token counts for the two sample documents, using the default tokenizer model.
num_tokens_in_corpus(shorter_rfp), num_tokens_in_corpus(longer_rfp)

(885, 23418)

In [14]:

# Back-of-the-envelope cost for the longer document: roughly 23,000 tokens at
# a rate of 3 per million tokens.
# NOTE(review): presumably USD per million input tokens — confirm the rate and
# which model it refers to.
(23000 / 1000000) * 3 

0.069

In [15]:
class DateTimeEncoder(json.JSONEncoder):
    """JSON encoder that serializes date/time objects as ISO 8601 strings.

    Handles ``datetime.datetime``, ``datetime.date`` and ``datetime.time``;
    every other unsupported type is deferred to ``json.JSONEncoder``, which
    raises ``TypeError``.
    """

    def default(self, obj):
        # `datetime.datetime` subclasses `datetime.date`, so it is matched by
        # the first entry of the tuple.
        if isinstance(obj, (datetime.date, datetime.time)):
            return obj.isoformat()
        return super().default(obj)

In [16]:
# with open('./data/example_resource.json', 'w') as f:
#     json.dump(result_dict[0], f, cls=DateTimeEncoder, indent=4)

In [17]:
# Load the notice stored in the example JSON file.
example_path = './data/example_resource.json'
with open(example_path, 'r') as fh:
    notice = json.loads(fh.read())

In [18]:
# OpenAI client built with the SDK defaults (no API key is passed explicitly).
# NOTE(review): `client` is rebound to an Anthropic client further down, so
# re-running the OpenAI request cell after that point uses the wrong client.
client = OpenAI()

In [19]:
 res = client.chat.completions.create(model="gpt-3.5-turbo-0125", messages=[
    {
        "role": "system",
        "content": "You are a highly skilled AI trained to analyze text and summarize very succinctly.",
    },
    {
        "role": "user",
        "content": "The dog jumped over the big log that was laying in the forest. Yes the dog jumped. It jumped over the log. The log that was big. Big the log was. And the dog did jump over it. This all happened in the forest.",
    },
    # {
    #     "role": "assistant",
    #     "content": "My summary of the text is:"
    # }
    ])

In [20]:
# Token usage reported by the API for the test completion.
res.usage

CompletionUsage(completion_tokens=11, prompt_tokens=78, total_tokens=89)

In [21]:
def gpt_text_summarization(text: str, model: str = 'gpt-3.5-turbo-0125'):
    """Summarize ``text`` with an OpenAI chat model and log the exchange to disk.

    A system prompt, a fixed instruction message and ``text`` are sent in one
    chat completion request. The model name, token usage, prompt and response
    are then written to ``./completions/<timestamp>-<model>.json``.

    Args:
        text: Document to summarize; sent verbatim as the final user message.
        model: Name of the OpenAI chat model to call.

    Returns:
        The message content of the first choice in the response.
    """
    client = OpenAI()
    current_time = pendulum.now().strftime("%Y%m%d:%H%M%S")
    # NOTE(review): "it's" in the instruction below should read "its" (the
    # Claude prompt already does); left unchanged so logged prompts stay
    # comparable with earlier runs.
    messages = [
    {
        "role": "system",
        "content": "You are a highly skilled AI trained to analyze text and summarize it very succinctly.",
    },
    {
        "role": "user",
        "content": "Please distill this text into it's most important parts for determining a good fit for a contractor or business that may want to provide a bid for the work. A good fit is one in which the contractor or business specializes in the requested services, and can provide those services at the correct scale. Please take all of these factors into account and return a detailed summary of no more than 7 sentences."
    },
    {
        "role": "user",
        "content": f"{text}",
    },
    ]
    res = client.chat.completions.create(model=model, messages=messages)
    summary = res.choices[0].message.content
    usage = res.usage
    data = {
        "Model": model,
        "Completion Tokens": usage.completion_tokens,
        "Prompt Tokens": usage.prompt_tokens,
        "Total Tokens": usage.total_tokens,
        "Prompt": messages,
        "Response": summary,
    }
    # Create the log directory on demand: without it, a missing folder raises
    # after the API call has completed and the response is lost.
    os.makedirs("./completions", exist_ok=True)
    with open(f"./completions/{current_time}-{model}.json", "w") as f:
        json.dump(data, f, indent=4)
    return summary

In [22]:

# Deliberately repetitive sample passage for sanity-checking summarization.
test_content = (
    "The dog jumped over the big log that was laying in the forest. "
    "Yes the dog jumped. "
    "It jumped over the log. "
    "The log that was big. "
    "Big the log was. "
    "And the dog did jump over it. "
    "This all happened in the forest."
)

In [23]:
# gpt_text_summarization(test_content)

In [24]:
# Summarize the short sample document with the default GPT model. This makes a
# live API request and writes a log file under ./completions/.
gpt_text_summarization(shorter_rfp)

'The RFQ is issued by USGS National Acquisition Branch and requires firm-fixed pricing for a Field spectroradiometer. The delivery is FOB destination to USGS LRS in Reston, VA. Quotations are due by 03/19/2024. The RFQ clarifies that submitted quotations are not offers and do not compel the Government to incur any costs. Potential bidders need to provide discounted pricing, and any attached representations and certifications must be completed. Contact for queries is Contracting Officer Brian Baker.'

Quickly truncate the text so that it fits within the gpt-3.5 context window.

In [25]:
# gpt_text_summarization(longer_rfp[:(round(len(longer_rfp)/2))])

***

## Claude

In [26]:
# Check that an Anthropic client can be constructed with no arguments; the
# instance is only displayed, not kept.
anthropic.Anthropic()

<anthropic.Anthropic at 0x7f162b0cb8e0>

In [27]:
# Anthropic client used by the request cell below.
# NOTE(review): this rebinds `client`, which previously held the OpenAI client.
client = anthropic.Anthropic()


In [28]:
# Minimal Messages API round trip: a system prompt plus one user turn.
message = client.messages.create(
    model="claude-3-sonnet-20240229",
    system="Respond only in Spanish.",  # system prompt
    messages=[
        {"role": "user", "content": "Hello, Claude!"},  # user prompt
    ],
    max_tokens=1000,
    temperature=0.0,
)


In [29]:
# Display the full response object returned by the request.
message

Message(id='msg_015oxnM9WgBrSJAss2iPHnf3', content=[ContentBlock(text='¡Hola! Es un placer saludarte. Responderé en español como me lo has pedido.', type='text')], model='claude-3-sonnet-20240229', role='assistant', stop_reason='end_turn', stop_sequence=None, type='message', usage=Usage(input_tokens=17, output_tokens=31))

In [30]:
# Token usage reported for the request (input and output tokens).
message.usage

Usage(input_tokens=17, output_tokens=31)

In [31]:
def claude_text_summarization(text: str, max_tokens: int = 1000, temperature: float = 0.0, model: str = 'claude-3-sonnet-20240229'):
    """Summarize ``text`` with an Anthropic model and log the exchange to disk.

    The instruction and ``text`` are sent as a single user message alongside a
    fixed system prompt. The model name, token usage, prompt, sampling settings
    and response are then written to
    ``./completions/<timestamp>-<model>.json``.

    Args:
        text: Document to summarize; appended to the instruction prompt.
        max_tokens: Passed to the API as ``max_tokens``.
        temperature: Passed to the API as ``temperature``.
        model: Name of the Anthropic model to call.

    Returns:
        The text of the first content block in the response.
    """
    client = anthropic.Anthropic()
    current_time = pendulum.now().strftime("%Y%m%d:%H%M%S")
    system = "You are a highly skilled AI trained to analyze text and summarize it very succinctly."
    messages=[
    {
        "role": "user",
        "content": f"Please distill this text into its most important parts for determining a good fit for a contractor or business that may want to provide a bid for the work. A good fit is one in which the contractor or business specializes in the requested services, and can provide those services at the correct scale. Please take all of these factors into account and return a detailed summary of no more than 20 sentences: {text}"
    },
    ]
    res = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
        system=system,
        messages=messages)
    summary = res.content[0].text
    completion_tokens = res.usage.output_tokens
    prompt_tokens = res.usage.input_tokens
    data = {
        "Model": model,
        "Completion Tokens": completion_tokens,
        "Prompt Tokens": prompt_tokens,
        # The usage object is only read for input and output counts here, so
        # the total is computed locally.
        "Total Tokens": completion_tokens + prompt_tokens,
        "Prompt": messages,
        "Temperature": temperature,
        "Max_Tokens": max_tokens,
        "Response": summary,
    }
    # Create the log directory on demand: without it, a missing folder raises
    # after the API call has completed and the response is lost.
    os.makedirs("./completions", exist_ok=True)
    with open(f"./completions/{current_time}-{model}.json", "w") as f:
        json.dump(data, f, indent=4)
    return summary

In [32]:
# Summarize the short sample document with the default Claude model. This makes
# a live API request and writes a log file under ./completions/.
claude_text_summarization(shorter_rfp)

'Here is a detailed summary of the most important parts of the text for determining a good fit for a contractor or business to provide a bid, in no more than 20 sentences:\n\n1. This is a Request for Quotation (RFQ) issued by the USGS National Acquisition Branch for a field spectroradiometer. \n\n2. It is not a small business set-aside.\n\n3. Quotes are due by 1:00 PM Eastern Time on March 19, 2024.\n\n4. The required item is one (1) field spectroradiometer.\n\n5. Functional and technical requirements, as well as applicable clauses, are provided in Attachment A.\n\n6. Contractors should provide firm-fixed discounted pricing for the field spectroradiometer.\n\n7. Questions should be directed to the Contracting Officer, Brian Baker, at bfbaker@usgs.gov.\n\n8. Delivery is required within 60 days after award.\n\n9. Delivery is FOB Destination to USGS LRS, 12201 Sunrise Valley Dr. MS 517, Reston, VA 20192-0002.\n\n10. Contractors must complete any attached representations and certifications

In [None]:
# claude_text_summarization(longer_rfp)