In [28]:
from dotenv import load_dotenv
import os
import requests
import json
import re
import fitz
import magic
import pandas as pd
from io import BytesIO
import pendulum
import boto3
import botocore
import magic
import mimetypes
import aiohttp
import asyncio
import time
import uuid
import psycopg2
from pgvector.psycopg2 import register_vector
from psycopg2.extras import execute_values

from langchain_community.document_loaders import PyPDFLoader
import tempfile

from langchain_text_splitters import RecursiveCharacterTextSplitter
from typing import List, Dict, Any

import tiktoken
from openai import OpenAI

In [2]:
# Load variables from a local .env file into the process environment
# before any of the lookups below run.
load_dotenv()

# SAM.gov API key; passed to Solicitation.get_attachments further down.
SAM_PUBLIC_API_KEY = os.getenv("SAM_PUBLIC_API_KEY")

# S3 settings read from the environment.
S3_AWS_ACCESS_KEY_ID = os.getenv("S3_AWS_ACCESS_KEY_ID")
S3_AWS_SECRET_ACCESS_KEY = os.getenv("S3_AWS_SECRET_ACCESS_KEY")
S3_REGION_NAME = os.getenv("S3_REGION_NAME")
S3_BUCKET_OPPORTUNITIES = os.getenv("S3_BUCKET_OPPORTUNITIES")

# Password for the local Postgres instance.
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD")

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Bucket that receives the downloaded solicitation attachments.
bucket_name = 'sam-resource-links'

In [3]:

# Connection URI for the local Postgres instance; later cells reconnect with it.
# NOTE(review): the password is interpolated unescaped — presumably it contains
# no URI-reserved characters (e.g. '@', '/'); confirm.
connection_string = f"postgresql://postgres:{POSTGRES_PASSWORD}@localhost:5432/postgres"
conn = psycopg2.connect(connection_string)
cur = conn.cursor()
# Make sure the pgvector extension exists before any vector column is created.
cur.execute('CREATE EXTENSION IF NOT EXISTS vector')
# Register the vector type adapter on this connection.
register_vector(conn)


In [4]:
# Create the three tables used by this notebook (no-ops when they already exist):
#   solicitations          – one row per SAM notice
#   resource_links         – one row per attachment URL, with its S3 object key
#   resource_links_chunks  – text chunks of an attachment plus a 1536-dim embedding
cur.execute("""
    CREATE TABLE IF NOT EXISTS solicitations (
        noticeId TEXT PRIMARY KEY,
        title TEXT,
        solicitationNumber TEXT,
        naicsCode INT
    );
""")

cur.execute("""
    CREATE TABLE IF NOT EXISTS resource_links(
        linkId SERIAL PRIMARY KEY,
        noticeId TEXT REFERENCES solicitations(noticeId),
        url TEXT,
        blob_url TEXT
    );
""")

cur.execute("""
    CREATE TABLE IF NOT EXISTS resource_links_chunks (
        id BIGSERIAL PRIMARY KEY,
        linkId INT REFERENCES resource_links(linkId),
        noticeId TEXT REFERENCES solicitations(noticeId),
        token_count INT,
        chunk_text TEXT,
        embedding vector(1536)
    );
""")

conn.commit()
# Release the cursor first, then the connection that owns it.
cur.close()
conn.close()

In [5]:
# Load the list of solicitation records for this posting date.
# The encoding is pinned so non-ASCII characters in the records decode the
# same way on every platform instead of depending on the locale default.
with open('./data/20240301.json', encoding='utf-8') as f:
    data = json.load(f)

In [6]:
# Use the first record as the worked example for the rest of the notebook.
example_sol = data[0]
example_sol

{'noticeId': 'ff75c5fa02564937950a05713afcd835',
 'title': '1202RZ22Q0002 - I-BPA (Incident-Blanket Purchase Agreement) – Portable Toilets and Handwashing Stations',
 'solicitationNumber': '1202RZ22Q0002',
 'fullParentPathName': 'AGRICULTURE, DEPARTMENT OF.FOREST SERVICE.USDA-FS, AT-INCIDENT MGT SVCS BRANCH',
 'fullParentPathCode': '012.12C2.1202RZ',
 'postedDate': '2024-03-01',
 'type': 'Solicitation',
 'baseType': 'Solicitation',
 'archiveType': 'auto15',
 'archiveDate': '2024-03-19',
 'typeOfSetAsideDescription': 'Total Small Business Set-Aside (FAR 19.5)',
 'typeOfSetAside': 'SBA',
 'responseDeadLine': '2024-03-04T13:00:00-07:00',
 'naicsCode': '562991',
 'naicsCodes': ['562991'],
 'classificationCode': 'W045',
 'active': 'Yes',
 'award': None,
 'pointOfContact': [{'fax': None,
   'type': 'primary',
   'email': 'Kenneth.C.Miller@USDA.gov',
   'phone': '385-441-2764',
   'title': None,
   'fullName': 'Kenneth Miller'},
  {'fax': None,
   'type': 'secondary',
   'email': 'Donald.Keev

In [7]:
class Solicitation:
    """A single solicitation record with helpers to download and archive its attachments.

    ``__init__`` copies selected keys of the raw record onto attributes.
    ``get_attachments`` downloads every URL in ``resource_links`` and fills
    ``attachments``, ``headers`` and ``file_names``; ``attachments_to_s3``
    uploads them and fills ``object_names``.
    """

    # Plain ``filename=`` parameter of a Content-Disposition header, quoted or not.
    # It does not match the extended ``filename*=`` form.
    _FILENAME_RE = re.compile(r'filename=\s*"?([^";]+)"?', re.IGNORECASE)

    def __init__(self, data):
        """Store the raw record and expose the fields used by this notebook.

        Args:
            data: dict for one solicitation; every key read below must be present
                (a missing key raises ``KeyError``).
        """
        self.data = data
        self.notice_id = data["noticeId"]
        self.title = data["title"]
        self.solicitation_number = data["solicitationNumber"]
        self.naics_code = data["naicsCode"]
        self.naics_codes = data["naicsCodes"]
        self.classification_code = data["classificationCode"]
        self.ui_link = data["uiLink"]
        self.links = data["links"]
        self.resource_links = data["resourceLinks"]
        self.posted_date = data["postedDate"]
        # "2024-03-01" -> "20240301"; used as the top-level S3 key prefix.
        self.formatted_date = "".join(data["postedDate"].split("-"))

        # Download/upload state starts empty so the methods below can be
        # called in any order without raising AttributeError.
        self.attachments: List[bytes] = []
        self.headers: List[Dict[str, str]] = []
        self.file_names: List[str] = []
        self.object_names: List[str] = []

    @staticmethod
    def _filename_from_headers(headers, fallback):
        """Return the filename from a Content-Disposition header, else ``fallback``."""
        for key, value in headers.items():
            # Header names are case-insensitive, so do not rely on exact casing.
            if key.lower() == "content-disposition":
                match = Solicitation._FILENAME_RE.search(value)
                if match:
                    return match.group(1).strip()
                break
        return fallback

    async def fetch(self, url, session, params=None):
        """GET ``url`` on ``session`` and return ``(body, headers_as_dict)``.

        Raises the session library's HTTP error for 4xx/5xx responses so an
        error page is never treated as an attachment.
        """
        async with session.get(url, params=params) as response:
            response.raise_for_status()
            data = await response.read()
            return data, dict(response.headers)

    async def fetch_all(self, params=None):
        """Download every resource link concurrently.

        Returns:
            ``(data_list, headers_list)`` in the same order as ``resource_links``.
            Both are empty when the record has no resource links.
        """
        data_list = []
        headers_list = []
        # Treat a missing (None) link list the same as an empty one.
        urls = self.resource_links or []
        async with aiohttp.ClientSession() as session:
            results = await asyncio.gather(
                *(self.fetch(url, session, params) for url in urls)
            )
            for data, headers in results:
                data_list.append(data)
                headers_list.append(headers)

        return data_list, headers_list

    async def get_attachments(self, sam_api_key: str):
        """Download all attachments and derive a file name for each.

        Args:
            sam_api_key: sent as the ``api_key`` query parameter on every request.
        """
        params = {
            "api_key": sam_api_key,
        }
        self.attachments, self.headers = await self.fetch_all(params=params)
        # Fall back to a positional name when the server sends no usable
        # Content-Disposition filename; the index keeps names unique.
        self.file_names = [
            self._filename_from_headers(header, fallback=f"attachment_{index}")
            for index, header in enumerate(self.headers)
        ]

    def attachments_to_s3(self, bucket_name: str):
        """Upload the downloaded attachments to ``bucket_name``.

        Object keys are ``<formatted_date>/<notice_id>/<file_name>`` and are
        recorded in ``object_names``. Each object's response headers are stored
        as its S3 metadata.

        Returns:
            A summary string on success, or ``False`` as soon as one upload
            fails with a ``ClientError`` (earlier uploads are not undone).
        """
        self.object_names = []
        s3_client = boto3.client('s3')
        uploads = zip(self.attachments, self.file_names, self.headers)
        for body, file_name, metadata in uploads:
            object_key = f'{self.formatted_date}/{self.notice_id}/{file_name}'
            self.object_names.append(object_key)
            try:
                s3_client.put_object(
                    Bucket=bucket_name,
                    Key=object_key,
                    Body=body,
                    Metadata=metadata
                )
            except botocore.exceptions.ClientError as e:
                print(f"Error: {e}")
                return False

        return f"Wrote {len(self.attachments)} to {bucket_name} S3 bucket."
    


In [8]:
# Wrap the example record and show the date-derived S3 prefix.
sol_instance = Solicitation(example_sol)
sol_instance.formatted_date

'20240301'

In [9]:
# Download every attachment for the example solicitation.
# Top-level `await` relies on the notebook kernel's running event loop.
await sol_instance.get_attachments(SAM_PUBLIC_API_KEY)

In [10]:
# Upload the downloaded attachments; yields a summary string, or False after a ClientError.
sol_instance.attachments_to_s3(bucket_name)

'Wrote 2 to sam-resource-links S3 bucket.'

In [11]:
# S3 object keys recorded by the upload, in attachment order.
sol_instance.object_names

['20240301/ff75c5fa02564937950a05713afcd835/2024+Onboarding+Package+for+1202RZ22Q0002-005+-+New+Vendor+1449.pdf',
 '20240301/ff75c5fa02564937950a05713afcd835/2024+Annual+Review+Package+for+1202RZ22Q0002-005+-+Current+Vendor+SF30.pdf']

In [12]:
# Insert one solicitation row; a row whose noticeId already exists is left untouched.
sql = """
INSERT INTO solicitations (noticeId, title, solicitationNumber, naicsCode)
VALUES (%s, %s, %s, %s)
ON CONFLICT (noticeID) DO NOTHING
"""

In [13]:

# Insert every loaded record into `solicitations`, one transaction per row,
# so that a rejected row is rolled back without affecting the others.
conn = psycopg2.connect(connection_string)
cur = conn.cursor()

try:
    for entry in data:
        params = (
            entry['noticeId'],
            entry['title'],
            entry['solicitationNumber'],
            entry['naicsCode']
        )

        try:
            cur.execute(sql, params)
            conn.commit()
        except psycopg2.Error as e:
            print(f"Database error {e}")
            conn.rollback()
finally:
    # Always release the cursor and connection, even when something other
    # than a database error (e.g. a record missing a key) stops the loop.
    cur.close()
    conn.close()

In [14]:
# Insert one attachment row (source URL plus its S3 object key).
# NOTE(review): linkId is a SERIAL that this statement never supplies, so the
# ON CONFLICT (linkId) clause presumably never fires and re-running the insert
# cell adds duplicate rows — confirm whether a unique key on url/blob_url is wanted.
sql = """
INSERT INTO resource_links(noticeId, url, blob_url) 
VALUES (%s, %s, %s)
ON CONFLICT (linkId) DO NOTHING
"""

In [15]:

# Record each attachment of the example solicitation in `resource_links`,
# pairing the source URL with the S3 object key at the same position.
conn = psycopg2.connect(connection_string)
cur = conn.cursor()

try:
    for i in range(len(sol_instance.attachments)):
        params = (
            sol_instance.notice_id,
            sol_instance.resource_links[i],
            sol_instance.object_names[i],
        )

        try:
            cur.execute(sql, params)
            conn.commit()
        except psycopg2.Error as e:
            print(f"Database error {e}")
            conn.rollback()
finally:
    # Always release the cursor and connection, even when something other
    # than a database error (e.g. an IndexError) stops the loop.
    cur.close()
    conn.close()

In [16]:
# Raw bytes of the first downloaded attachment.
example_pdf = sol_instance.attachments[0]

In [17]:
# PyPDFLoader is given a file path here, so spill the in-memory PDF to a temporary file.
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
    tmp.write(example_pdf)
    temp_pdf_path = tmp.name

try:
    loader = PyPDFLoader(temp_pdf_path)
    pages = loader.load_and_split()
finally:
    # delete=False means nothing else removes the file; clean up even if parsing fails.
    os.remove(temp_pdf_path)
len(pages)

82

In [18]:
# Splitter configured with chunk_size=1000 and chunk_overlap=200;
# add_start_index=True is passed as well.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, add_start_index=True
)

In [19]:
# Break the loaded pages into overlapping chunks.
all_splits = text_splitter.split_documents(pages)

In [20]:
# Number of chunks produced from the example PDF.
len(all_splits)

280

In [21]:
# Look up the resource_links row for the first uploaded object.
# NOTE(review): the object key is interpolated straight into the SQL text; a key
# containing a single quote would break (or alter) the statement — prefer binding
# it as a query parameter.
sql = f"""select * from resource_links where blob_url = '{sol_instance.object_names[0]}'"""

In [22]:
# Display the rendered statement.
sql

"select * from resource_links where blob_url = '20240301/ff75c5fa02564937950a05713afcd835/2024+Onboarding+Package+for+1202RZ22Q0002-005+-+New+Vendor+1449.pdf'"

In [23]:
# Fetch the resource_links row(s) for the first uploaded object.
# The object key comes from a remote server's filename header, so it is bound
# as a query parameter rather than spliced into the SQL text.
with psycopg2.connect(connection_string) as conn:
    with conn.cursor() as cur:
        cur.execute(
            "select * from resource_links where blob_url = %s",
            (sol_instance.object_names[0],),
        )
        rows = cur.fetchall()

In [24]:
# First column of the first matching row.
rows[0][0]

1

In [25]:
# Insert one chunk row, resolving linkId by matching the S3 object key in resource_links.
# The three %s placeholders are noticeId, token_count and chunk_text.
# NOTE(review): the object key is interpolated into the text while the rest is bound
# by the driver; a key containing "'" or "%" would presumably break the statement or
# the placeholder substitution — confirm, and prefer binding it as a fourth parameter.
# NOTE(review): id is a BIGSERIAL not supplied here, so ON CONFLICT (id) looks like a no-op.
sql = f"""
INSERT INTO resource_links_chunks(linkId, noticeId, token_count, chunk_text)
SELECT linkId, %s, %s, %s
FROM resource_links
WHERE blob_url = '{sol_instance.object_names[0]}'
ON CONFLICT (id) DO NOTHING
"""

In [26]:
# Inspect the first chunk.
all_splits[0]

Document(page_content='SEE ADDENDUM IS CHECKEDCODE 18a. PAYMENT WILL BE MADE BY\nCODE FACILITY CODE \n17b. \nCHECK IF REMITTANCE IS DIFFERENT AND PUT SUCH ADDRESS IN OFFER OFFEROR02RZ\nBOISE ID 83705-53543833 S DEVELOPMENT AVEUSDA-FS AT-INCIDENT MGT SVCS BRANC02RZCODE 16. ADMINISTERED BY CODE XXX\n562991\nSIZE STANDARD: 100.00 % FOR: SET  ASIDE: UNRESTRICTED OR 02RZ\nREQUEST FOR \nPROPOSAL \n(RFP)INVITATION \nFOR BID (IFB)10. THIS ACQUISITION IS CODE \nREQUEST FOR \nQUOTE (RFQ)14. METHOD OF SOLICITATION13b. RATINGNORTH AMERICAN INDUSTRY \nCLASSIFICATION STANDARD \n(NAICS):SMALL BUSINESS\n03/02/2024 2359 MT02/01/2024\n385-441-2764 KENNETH MILLER0001\n(No collect calls)\nINFORMATION CALL:FOR SOLICIT\nATION8. OFFER DUE DA TE/LOCAL TIME b. TELEPHONE  NUMBER a. NAME4. ORDER NUMBER 3. AWARD/ 6. SOLICITATION 1202RZ22Q00025. SOLICITATION NUMBERSOLICITATION/CONTRACT/ORDER FOR COMMERCIAL  ITEMS1. REQUISITION NUMBER PAGE     OF\n1 62 OFFEROR TO COMPLETE BLOCKS 12, 17, 23, 24, & 30', metadata={'so

In [27]:
# Store every chunk of the example PDF together with its token count.
# The statement is defined here with the object key as a bound parameter, so
# keys containing quotes or percent signs cannot corrupt the SQL.
insert_chunk_sql = """
INSERT INTO resource_links_chunks(linkId, noticeId, token_count, chunk_text)
SELECT linkId, %s, %s, %s
FROM resource_links
WHERE blob_url = %s
ON CONFLICT (id) DO NOTHING
"""

# The encoder does not depend on the chunk, so build it once.
enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
blob_key = sol_instance.object_names[0]

# Leaving the `with conn` block commits all inserts together (or rolls them
# all back on error), so a failed run leaves no partial set of chunks behind.
with psycopg2.connect(connection_string) as conn:
    with conn.cursor() as cur:
        for chunk in all_splits:
            token_count = len(enc.encode(chunk.page_content))

            params = (
                sol_instance.notice_id,
                token_count,
                chunk.page_content,
                blob_key,
            )
            cur.execute(insert_chunk_sql, params)



In [29]:
# Client used for the embedding calls below.
# NOTE(review): no key is passed explicitly — presumably the SDK picks up
# OPENAI_API_KEY from the environment loaded earlier; confirm.
client = OpenAI()

In [61]:
# Embed a sample of 10 stored chunks and write each vector back to its row.
# Rows are addressed by primary key, not by comparing the full chunk text.
# Leaving the `with conn` block commits the updates.
with psycopg2.connect(connection_string) as conn:
    with conn.cursor() as cur:
        # chunk_text stays in column 0 so code indexing rows[i][0] still gets the text.
        cur.execute("SELECT chunk_text, id FROM resource_links_chunks LIMIT 10")
        rows = cur.fetchall()
        for chunk_text, chunk_id in rows:
            response = client.embeddings.create(input=chunk_text, model="text-embedding-3-small")
            cur.execute(
                "UPDATE resource_links_chunks SET embedding = %s WHERE id = %s",
                (response.data[0].embedding, chunk_id),
            )

In [52]:
# Read back the text of up to 10 stored chunks (no ORDER BY, so which rows is unspecified).
with psycopg2.connect(connection_string) as conn:
    with conn.cursor() as cur:
        cur.execute("SELECT chunk_text FROM resource_links_chunks LIMIT 10")
        rows = cur.fetchall()

In [59]:
# Number of rows returned by the sample query.
len(rows)

10

In [41]:
def get_nearest(query_str, limit=5):
    """Return the stored chunks whose embeddings are closest to ``query_str``.

    Uses the notebook-level ``client`` (for the embedding call) and ``conn``
    (for the query).

    Args:
        query_str: text to embed and search for.
        limit: maximum number of rows to return (default 5).

    Returns:
        A list of full ``resource_links_chunks`` rows ordered by ascending
        ``<=>`` distance between the stored and query embeddings.
    """
    # Embed the caller's query text with the same model used for the stored chunks.
    response = client.embeddings.create(input=query_str, model="text-embedding-3-small")
    # Send the vector as a "[v1,v2,...]" text literal and cast it in SQL, so the
    # comparison works whether or not the vector adapter is registered on `conn`.
    embedding_literal = "[" + ",".join(str(value) for value in response.data[0].embedding) + "]"
    with conn.cursor() as cur:
        cur.execute(
            """
        SELECT * FROM resource_links_chunks ORDER BY embedding <=> %s::vector LIMIT %s;
        """,
            (embedding_literal, limit),
        )
        return cur.fetchall()

[0.002621516352519393,
 -0.024668468162417412,
 0.015348977409303188,
 0.02692297101020813,
 -0.04671541973948479,
 -0.015925711020827293,
 -0.05237789452075958,
 0.003968320321291685,
 0.003022935939952731,
 -0.03355540707707405,
 0.05208952724933624,
 -0.028024008497595787,
 -0.018888024613261223,
 0.0018088462529703975,
 0.06265423446893692,
 0.024222809821367264,
 0.03190385177731514,
 -0.02692297101020813,
 -0.04417254775762558,
 0.020657548680901527,
 0.016830135136842728,
 0.03560018911957741,
 -0.02705404721200466,
 -0.016109216958284378,
 -0.008867278695106506,
 -0.04393661394715309,
 -0.03536425530910492,
 -0.01572909764945507,
 0.03038337267935276,
 -0.03423700109124184,
 0.019661372527480125,
 -0.03463022783398628,
 0.009273613803088665,
 0.02481265179812908,
 -0.04566681385040283,
 0.041393741965293884,
 -0.012557062320411205,
 0.01962204836308956,
 -0.02171926200389862,
 -0.03334568813443184,
 -0.009935546666383743,
 -0.045797888189554214,
 0.028181299567222595,
 0.010361