In [1]:
from dotenv import load_dotenv
import os
import requests
import json
import re
import fitz
import magic
import pandas as pd
from io import BytesIO
import pendulum
import boto3
import magic


import tiktoken

In [2]:
# Configuration: load_dotenv() (python-dotenv) runs first, then every setting is
# read from the process environment. A variable that is not set yields None.
load_dotenv()

# API key sent as the `api_key` query parameter when downloading resource links.
SAM_PUBLIC_API_KEY = os.getenv("SAM_PUBLIC_API_KEY")

# S3 credentials, region and bucket name.
S3_AWS_ACCESS_KEY_ID = os.getenv("S3_AWS_ACCESS_KEY_ID")
S3_AWS_SECRET_ACCESS_KEY = os.getenv("S3_AWS_SECRET_ACCESS_KEY")
S3_REGION_NAME = os.getenv("S3_REGION_NAME")
S3_BUCKET_OPPORTUNITIES = os.getenv("S3_BUCKET_OPPORTUNITIES")

# Password interpolated into the Postgres connection string further down.
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD")

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

## Load Data

In [3]:
# Read the JSON snapshot from disk. The encoding is pinned to UTF-8 so the file
# decodes the same way on every platform instead of depending on the locale
# default that open() would otherwise use.
with open('./data/20240301.json', encoding='utf-8') as f:
    data = json.load(f)

In [4]:
# Use the first record of the loaded JSON as the working sample.
sample_data = data[0]

In [5]:
# Copy the sample record's resource links into a fresh list.
resource_links = list(sample_data['resourceLinks'])

In [6]:
# Query-string parameters sent with every resource-link request.
params = dict(api_key=SAM_PUBLIC_API_KEY)

In [7]:
# Download every resource link for the sample record.
# requests waits indefinitely unless a timeout is given, so a stalled server
# would hang the kernel; 30 seconds bounds each request.
responses = [requests.get(link, params=params, timeout=30) for link in resource_links]
responses

[<Response [200]>, <Response [200]>]

In [8]:
# Detect the MIME type of each downloaded payload from its bytes.
file_types = [
    magic.from_buffer(resp.content, mime=True)
    for resp in responses
]
file_types

['application/pdf', 'application/pdf']

In [9]:
# Keep only the responses whose payload is detected as a PDF.
pdf_responses = list(
    filter(
        lambda resp: magic.from_buffer(resp.content, mime=True) == 'application/pdf',
        responses,
    )
)


In [10]:
# Display the responses that passed the PDF filter.
pdf_responses

[<Response [200]>, <Response [200]>]

In [11]:
# Take the sample from the PDF-filtered list rather than the raw responses:
# the bytes are later handed to PDF-only tooling (PyMuPDF, PyPDFLoader), so the
# first raw response is only safe when it happens to be a PDF.
sample_pdf = pdf_responses[0].content

In [12]:
# Solicitation number of the sample record (sample_data is data[0]).
solicitation_number = sample_data['solicitationNumber']

## Converting PDF to Raw Text

In [13]:
import fitz  # PyMuPDF

def convert_pdf_to_text(open_pdf):
    """Extract the plain text of every page of an in-memory PDF.

    Args:
        open_pdf: The raw PDF content (for example the ``bytes`` body of an
            HTTP response), passed to ``fitz.open`` as ``stream``.

    Returns:
        str: The text of each page in page order, with a newline appended
        after every page as a separator. A document with no pages yields an
        empty string.

    Raises:
        Any exception raised by PyMuPDF while opening or reading the stream
        propagates to the caller.
    """
    # The context manager closes the document even when text extraction raises.
    # filetype="pdf" states explicitly how the byte stream should be parsed.
    with fitz.open(stream=open_pdf, filetype="pdf") as doc:
        # Join once instead of growing a string page by page.
        return "".join(page.get_text() + "\n" for page in doc)


In [14]:

# Extract the sample PDF's text and print all of it.
# NOTE(review): this prints the whole document; consider a slice such as
# pdf_text[:2000] to keep the saved notebook small.
pdf_text = convert_pdf_to_text(sample_pdf)
print(pdf_text)


SEE ADDENDUM
IS CHECKED
CODE 
18a. PAYMENT WILL BE MADE BY
CODE 
FACILITY
CODE 
17b. CHECK IF REMITTANCE IS DIFFERENT AND PUT SUCH ADDRESS IN OFFER 
OFFEROR
02RZ
BOISE ID 83705-5354
3833 S DEVELOPMENT AVE
USDA-FS AT-INCIDENT MGT SVCS BRANC
02RZ
CODE 
16. ADMINISTERED BY
CODE 
X
X
X
562991
SIZE STANDARD:
 100.00
% FOR:
SET ASIDE:
UNRESTRICTED OR
02RZ
REQUEST FOR 
PROPOSAL 
(RFP)
INVITATION 
FOR BID (IFB)
10. THIS ACQUISITION IS
CODE 
REQUEST FOR 
QUOTE (RFQ)
14. METHOD OF SOLICITATION
13b. RATING
NORTH AMERICAN INDUSTRY 
CLASSIFICATION STANDARD 
(NAICS):
SMALL BUSINESS
03/02/2024 2359 MT
02/01/2024
385-441-2764
KENNETH MILLER
0001
(No collect calls)
INFORMATION CALL:
FOR SOLICITATION
8. OFFER DUE DATE/LOCAL TIME
b. TELEPHONE  NUMBER
a. NAME
4. ORDER NUMBER
3. AWARD/
6. SOLICITATION 
1202RZ22Q0002
5. SOLICITATION NUMBER
SOLICITATION/CONTRACT/ORDER FOR COMMERCIAL ITEMS
1. REQUISITION NUMBER
PAGE     OF
1
 62 
OFFEROR TO COMPLETE BLOCKS 12, 17, 23, 24, & 30
TELEPHONE NO.
17a. CONTRACTOR/
B

In [15]:
# Report, for each PDF, whether the solicitation number appears in its text.
# pdf_text is rebound on every pass, so after the loop it holds the text of the
# last PDF; later cells read that value.
for response in pdf_responses:
    pdf_text = convert_pdf_to_text(response.content)
    if solicitation_number not in pdf_text:
        print(f"Solicitation number {solicitation_number} not found in the PDF.")
        continue
    print(f"Solicitation number {solicitation_number} found in the PDF.")

Solicitation number 1202RZ22Q0002 found in the PDF.
Solicitation number 1202RZ22Q0002 found in the PDF.


***

### Sidebar: Calculating OpenAI costs

In [16]:
def num_tokens_in_corpus(input: str, encoding_name: str) -> int:
    """Count the tokens tiktoken produces for a piece of text.

    Args:
        input: The text to tokenize.
        encoding_name: Either a model name (e.g. ``"gpt-4"``), resolved with
            ``tiktoken.encoding_for_model``, or an encoding name (e.g.
            ``"cl100k_base"``), resolved with ``tiktoken.get_encoding`` when
            the value is not a known model.

    Returns:
        int: Number of tokens in ``input`` under the selected encoding.

    Raises:
        ValueError: If ``encoding_name`` is neither a known model nor a known
            encoding (raised by ``tiktoken.get_encoding``).
    """
    try:
        encoding = tiktoken.encoding_for_model(encoding_name)
    except KeyError:
        # Not a model name; honour the parameter's name and try it as an encoding.
        encoding = tiktoken.get_encoding(encoding_name)
    # disallowed_special=() makes text that happens to contain a special-token
    # literal (e.g. "<|endoftext|>") count as ordinary text instead of raising.
    return len(encoding.encode(input, disallowed_special=()))

# Token count of pdf_text under the "gpt-4" tokenizer. pdf_text was last
# assigned inside the solicitation-number loop, so this is the last PDF checked.
print(num_tokens_in_corpus(pdf_text, "gpt-4"))

44693


In [17]:
def current_openai_costs(price_input_mil: float = 10.0, price_output_mil: float = 30.0, len_input: int = 0) -> None:
    """Print the cost of ``len_input`` tokens at the input and output prices.

    Both figures are computed from the same token count, so the "output" cost
    is what it would cost if the model generated as many tokens as it was given.

    Args:
        price_input_mil: Price per one million input tokens.
        price_output_mil: Price per one million output tokens.
        len_input: Number of tokens to price.

    Returns:
        None. The result is printed rather than returned.
    """
    tokens_per_million = 1000000
    price_per_token_input = price_input_mil / tokens_per_million
    price_per_token_output = price_output_mil / tokens_per_million
    print(f"Cost of input: {len_input * price_per_token_input}; Cost of output: {len_input * price_per_token_output}")


In [18]:
# Price the document at the function's default rates (10.0 input / 30.0 output
# per million tokens), using the "gpt-4" token count.
current_openai_costs(len_input=num_tokens_in_corpus(pdf_text, "gpt-4"))

Cost of input: 0.44693000000000005; Cost of output: 1.34079


In [19]:

# Same estimate at 0.5 input / 1.5 output per million tokens, counting tokens
# with the "gpt-3.5-turbo" tokenizer.
current_openai_costs(price_input_mil=0.5, price_output_mil=1.5, len_input=num_tokens_in_corpus(pdf_text, "gpt-3.5-turbo"))

Cost of input: 0.0223465; Cost of output: 0.0670395


***


## Splitting Text to Chunks

In [20]:

from langchain_community.document_loaders import PyPDFLoader
import tempfile

In [21]:
# Write the sample PDF bytes to a named temporary file so the path-based loader
# can read it. delete=False keeps the file after the handle closes; a later
# cell removes it explicitly.
with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as tmp:
    temp_pdf_path = tmp.name
    tmp.write(sample_pdf)

In [22]:
# Point the LangChain PDF loader at the temporary file written above.
loader = PyPDFLoader(temp_pdf_path)

In [23]:
# Load the PDF and split it into documents. Later cells read .page_content on
# each entry and pass the whole list to the text splitter.
pages = loader.load_and_split()

In [24]:
# Number of entries returned by load_and_split().
len(pages)

82

In [25]:
# Character length of the first entry's text.
len(pages[0].page_content)

3329

## Indexing: Split

In [26]:
from langchain_text_splitters import RecursiveCharacterTextSplitter


In [27]:
# Splitter configuration used for indexing.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,       # size limit passed to the splitter for each chunk
    chunk_overlap=200,     # overlap setting between consecutive chunks
    add_start_index=True,  # chunk metadata gains a 'start_index' entry (see the output below)
)


In [28]:

# Split the loaded pages into chunks with the splitter configured above.
all_splits = text_splitter.split_documents(pages)

In [29]:
# Summary of the split: chunk count, length of the first chunk's text, and the
# metadata attached to the second chunk.
(
    len(all_splits),
    len(all_splits[0].page_content),
    all_splits[1].metadata,
)

(280, 941, {'source': '/tmp/tmp7c0mghe_.pdf', 'page': 0, 'start_index': 889})

In [30]:
# Delete the temporary PDF. The existence check keeps this cell idempotent:
# re-running it after the file is already gone no longer raises
# FileNotFoundError.
if os.path.exists(temp_pdf_path):
    os.remove(temp_pdf_path)

***

## Set Up Database

In [31]:
import psycopg2
from pgvector.psycopg2 import register_vector
from psycopg2.extras import execute_values

In [33]:
from urllib.parse import quote

# Build the Postgres connection URI. The password is percent-encoded so that
# characters such as '@', ':' or '/' cannot corrupt the URI, and an unset
# password becomes an empty one instead of the literal text "None".
# Do not end this cell with `connection_string`: displaying it would save the
# password in the notebook output.
connection_string = f"postgresql://postgres:{quote(POSTGRES_PASSWORD or '', safe='')}@localhost:5432/postgres"

'postgresql://postgres:postgres@localhost:5432/postgres'

In [34]:
# Open the database connection and a cursor; both are reused by the cells below.
conn = psycopg2.connect(connection_string)
cur = conn.cursor()
# Run a trivial query and fetch its single row.
cur.execute("SELECT NOW()")
result = cur.fetchone()

In [35]:
# Show the row returned by SELECT NOW().
result

(datetime.datetime(2024, 3, 3, 10, 53, 51, 402591, tzinfo=datetime.timezone.utc),)

In [36]:

# Create the `vector` extension if the database does not have it yet, then
# register pgvector's psycopg2 adapter on this connection.
# This cell does not commit; the statement stays in the open transaction until
# a later conn.commit().
cur.execute('CREATE EXTENSION IF NOT EXISTS vector')
register_vector(conn)

In [38]:
# Create the `document` table (primary key: noticeID) if it does not exist,
# then commit. The two tables created below reference its noticeId column.
cur.execute("""
    CREATE TABLE IF NOT EXISTS document (
        noticeID TEXT PRIMARY KEY,
        title TEXT,
        solicitationNumber TEXT,
        naicsCode INT,
        content TEXT
    );
""")
conn.commit()

In [41]:
# Roll back the connection's current transaction.
# NOTE(review): execution counts jump from 38 to 41, so this looks like recovery
# from a failed cell that is no longer in the notebook. Confirm it is still needed.
conn.rollback()

In [42]:
# Create the `resource_links` table if it does not exist, then commit.
# Each row has an auto-generated linkId, a url, and a noticeId that references
# document(noticeId).
cur.execute("""
    CREATE TABLE IF NOT EXISTS resource_links(
        linkId SERIAL PRIMARY KEY,
        noticeId TEXT REFERENCES document(noticeId),
        url TEXT
    );
""")
conn.commit()

In [None]:
# Create the `document_chunk` table if it does not exist. Each row has an
# auto-generated id, a noticeId that references document(noticeId), a token
# count, the chunk text, and a 1536-dimension pgvector `embedding` column.
# NOTE(review): unlike the two table-creation cells before it, no conn.commit()
# is visible after this statement, and the cell is unexecuted (In [None]).
# Confirm a commit follows.
cur.execute("""
    CREATE TABLE IF NOT EXISTS document_chunk (
        id BIGSERIAL PRIMARY KEY,
        noticeId TEXT REFERENCES document(noticeId),
        token_count INT,
        chunk_text TEXT,
        embedding vector(1536)
    );
""")