In [37]:
from dotenv import load_dotenv
import os
import requests
import json
import re
import fitz
import magic
import pandas as pd
from io import BytesIO
import pendulum
import boto3
import botocore
import magic
import mimetypes
import aiohttp
import asyncio
import time
import uuid
import psycopg2
from pgvector.psycopg2 import register_vector
from psycopg2.extras import execute_values

from langchain_community.document_loaders import PyPDFLoader
import tempfile

from typing import List, Dict, Any

import tiktoken

In [2]:
# Load variables from a local .env file into the process environment so the
# lookups below can see them.
load_dotenv()

# Key passed to Solicitation.get_attachments when downloading resource links.
SAM_PUBLIC_API_KEY = os.getenv("SAM_PUBLIC_API_KEY")

# S3 settings. NOTE(review): none of these four are referenced by the visible
# cells (boto3.client('s3') is created without explicit credentials) — confirm
# whether they are still needed.
S3_AWS_ACCESS_KEY_ID = os.getenv("S3_AWS_ACCESS_KEY_ID")
S3_AWS_SECRET_ACCESS_KEY = os.getenv("S3_AWS_SECRET_ACCESS_KEY")
S3_REGION_NAME = os.getenv("S3_REGION_NAME")
S3_BUCKET_OPPORTUNITIES = os.getenv("S3_BUCKET_OPPORTUNITIES")

# Password for the local Postgres instance used by the database cells.
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD")

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Destination bucket handed to Solicitation.attachments_to_s3.
bucket_name = "sam-resource-links"

In [3]:

# Build the DSN with psycopg2's own helper so every value is escaped by the
# driver. Interpolating the password straight into a postgresql:// URL breaks
# as soon as it contains URL-reserved characters such as "@", "/" or "#", and
# turns a missing password into the literal text "None".
# `connection_string` stays a plain str, so later cells can keep passing it to
# psycopg2.connect().
connection_string = psycopg2.extensions.make_dsn(
    host="localhost",
    port=5432,
    dbname="postgres",
    user="postgres",
    password=POSTGRES_PASSWORD,
)
conn = psycopg2.connect(connection_string)
cur = conn.cursor()

# pgvector setup: make sure the extension exists, then register the `vector`
# type adapter on this connection.
cur.execute('CREATE EXTENSION IF NOT EXISTS vector')
register_vector(conn)


In [4]:
# Schema for the pipeline, created only when missing:
#   solicitations          one row per notice
#   resource_links         one row per attachment URL of a notice
#   resource_links_chunks  text chunks of an attachment with a 1536-dim embedding
CREATE_TABLE_STATEMENTS = (
    """
    CREATE TABLE IF NOT EXISTS solicitations (
        noticeId TEXT PRIMARY KEY,
        title TEXT,
        solicitationNumber TEXT,
        naicsCode INT
    );
""",
    """
    CREATE TABLE IF NOT EXISTS resource_links(
        linkId SERIAL PRIMARY KEY,
        noticeId TEXT REFERENCES solicitations(noticeId),
        url TEXT,
        blob_url TEXT
    );
""",
    """
    CREATE TABLE IF NOT EXISTS resource_links_chunks (
        id BIGSERIAL PRIMARY KEY,
        linkId INT REFERENCES resource_links(linkId),
        noticeId TEXT REFERENCES solicitations(noticeId),
        token_count INT,
        chunk_text TEXT,
        embedding vector(1536)
    );
""",
)

try:
    for statement in CREATE_TABLE_STATEMENTS:
        cur.execute(statement)
    # One commit for all DDL (and for the CREATE EXTENSION issued on the same
    # connection before this cell).
    conn.commit()
finally:
    # Release the cursor before its connection, and do so even when a
    # statement fails so the session is not left dangling.
    cur.close()
    conn.close()

In [5]:
# Load the opportunities dump for 2024-03-01. The encoding is pinned to UTF-8
# because the records contain non-ASCII characters (e.g. an en dash in titles)
# and the platform default encoding is not UTF-8 everywhere.
with open('./data/20240301.json', encoding='utf-8') as f:
    data = json.load(f)

In [6]:
# Use the first record of the loaded file as the worked example and display it.
example_sol = data[0]
example_sol

{'noticeId': 'ff75c5fa02564937950a05713afcd835',
 'title': '1202RZ22Q0002 - I-BPA (Incident-Blanket Purchase Agreement) – Portable Toilets and Handwashing Stations',
 'solicitationNumber': '1202RZ22Q0002',
 'fullParentPathName': 'AGRICULTURE, DEPARTMENT OF.FOREST SERVICE.USDA-FS, AT-INCIDENT MGT SVCS BRANCH',
 'fullParentPathCode': '012.12C2.1202RZ',
 'postedDate': '2024-03-01',
 'type': 'Solicitation',
 'baseType': 'Solicitation',
 'archiveType': 'auto15',
 'archiveDate': '2024-03-19',
 'typeOfSetAsideDescription': 'Total Small Business Set-Aside (FAR 19.5)',
 'typeOfSetAside': 'SBA',
 'responseDeadLine': '2024-03-04T13:00:00-07:00',
 'naicsCode': '562991',
 'naicsCodes': ['562991'],
 'classificationCode': 'W045',
 'active': 'Yes',
 'award': None,
 'pointOfContact': [{'fax': None,
   'type': 'primary',
   'email': 'Kenneth.C.Miller@USDA.gov',
   'phone': '385-441-2764',
   'title': None,
   'fullName': 'Kenneth Miller'},
  {'fax': None,
   'type': 'secondary',
   'email': 'Donald.Keev

In [22]:
class Solicitation:
    """One opportunity record plus helpers to download and archive its attachments.

    The constructor copies selected fields of the raw record onto attributes.
    ``get_attachments`` downloads every URL in ``resource_links`` and derives a
    file name for each; ``attachments_to_s3`` uploads the downloaded bodies.

    Attributes populated by ``get_attachments`` (empty lists until then):
        attachments: response bodies, in ``resource_links`` order.
        headers: response headers as plain dicts, same order.
        file_names: file name per attachment, same order.
    Attribute populated by ``attachments_to_s3``:
        object_names: S3 keys of the form ``<formatted_date>/<notice_id>/<file name>``.
    """

    # Matches the `filename=` parameter of a Content-Disposition header, either
    # quoted (group 1) or bare up to the next ";" (group 2). The extended
    # `filename*=` form is deliberately not matched.
    _FILENAME_RE = re.compile(r'\bfilename\s*=\s*(?:"([^"]*)"|([^;]*))', re.IGNORECASE)

    def __init__(self, data: Dict[str, Any]):
        """Copy the fields of a raw record onto the instance.

        Raises:
            KeyError: if a required key (everything except ``resourceLinks``)
                is absent from ``data``.
        """
        self.data = data
        self.notice_id = data["noticeId"]
        self.title = data["title"]
        self.solicitation_number = data["solicitationNumber"]
        self.naics_code = data["naicsCode"]
        self.naics_codes = data["naicsCodes"]
        self.classification_code = data["classificationCode"]
        self.ui_link = data["uiLink"]
        self.links = data["links"]
        # A record without attachments may carry None here (or omit the key);
        # normalise to a list so iteration and indexing never hit None.
        self.resource_links = data.get("resourceLinks") or []
        self.posted_date = data["postedDate"]
        # "2024-03-01" -> "20240301"; used as the leading S3 key segment.
        self.formatted_date = "".join(data["postedDate"].split("-"))

        # Defined up front so attachments_to_s3() and callers reading these
        # attributes do not fail with AttributeError before a download ran.
        self.attachments: List[bytes] = []
        self.headers: List[Dict[str, str]] = []
        self.file_names: List[str] = []
        self.object_names: List[str] = []

    async def fetch(self, url, session, params=None):
        """GET ``url`` on ``session`` and return ``(body, headers_dict)``.

        Raises:
            aiohttp.ClientResponseError: for a 4xx/5xx response, so an error
                page is never mistaken for an attachment.
        """
        async with session.get(url, params=params) as response:
            response.raise_for_status()
            body = await response.read()
            return body, dict(response.headers)

    async def fetch_all(self, params=None):
        """Download every resource link concurrently.

        Returns:
            ``(bodies, headers)`` — two lists in ``resource_links`` order;
            both empty when there are no links (no session is opened then).
        """
        if not self.resource_links:
            return [], []

        async with aiohttp.ClientSession() as session:
            results = await asyncio.gather(
                *(self.fetch(url, session, params) for url in self.resource_links)
            )

        data_list = [body for body, _ in results]
        headers_list = [headers for _, headers in results]
        return data_list, headers_list

    @classmethod
    def _filename_from_headers(cls, headers, fallback):
        """Return the Content-Disposition ``filename`` in ``headers``, else ``fallback``."""
        # `headers` is a plain dict, so look the header up case-insensitively.
        disposition = next(
            (value for name, value in headers.items()
             if name.lower() == "content-disposition"),
            "",
        )
        match = cls._FILENAME_RE.search(disposition)
        if match:
            quoted, bare = match.groups()
            name = quoted if quoted is not None else bare.strip()
            if name:
                return name
        return fallback

    async def get_attachments(self, sam_api_key: str):
        """Download all attachments and derive a file name for each.

        Populates ``attachments``, ``headers`` and ``file_names``. A response
        without a usable ``filename`` gets the placeholder ``attachment_<n>``
        (1-based position) instead of raising.
        """
        params = {
            "api_key": sam_api_key,
        }
        self.attachments, self.headers = await self.fetch_all(params=params)
        self.file_names = [
            self._filename_from_headers(headers, fallback=f"attachment_{position}")
            for position, headers in enumerate(self.headers, start=1)
        ]

    def attachments_to_s3(self, bucket_name: str):
        """Upload the downloaded attachments to ``bucket_name``.

        Rebuilds ``object_names`` on every call.

        Returns:
            A summary string on success, or ``False`` as soon as one upload
            fails with a botocore ``ClientError`` (the error is printed).
        """
        self.object_names = []
        s3_client = boto3.client('s3')
        uploads = zip(self.file_names, self.attachments, self.headers)
        for file_name, body, headers in uploads:
            object_name = f'{self.formatted_date}/{self.notice_id}/{file_name}'
            self.object_names.append(object_name)
            try:
                # NOTE(review): the full set of HTTP response headers is stored
                # as S3 user metadata — confirm that is intended.
                s3_client.put_object(
                    Bucket=bucket_name,
                    Key=object_name,
                    Body=body,
                    Metadata=headers
                )
            except botocore.exceptions.ClientError as e:
                print(f"Error: {e}")
                return False

        return f"Wrote {len(self.attachments)} to {bucket_name} S3 bucket."
    


In [23]:
# Wrap the example record; formatted_date is postedDate with the dashes removed.
sol_instance = Solicitation(example_sol)
sol_instance.formatted_date

'20240301'

In [24]:
# Download every resource link of the example solicitation. Top-level `await`
# needs an environment that supports it (e.g. an IPython/Jupyter cell).
await sol_instance.get_attachments(SAM_PUBLIC_API_KEY)

In [25]:
# Upload the downloaded attachments; the call returns a summary string, or
# False if a put fails with a botocore ClientError.
sol_instance.attachments_to_s3(bucket_name)

'Wrote 2 to sam-resource-links S3 bucket.'

In [26]:
# S3 keys built by attachments_to_s3: <formatted_date>/<notice_id>/<file name>.
sol_instance.object_names

['20240301/ff75c5fa02564937950a05713afcd835/2024+Onboarding+Package+for+1202RZ22Q0002-005+-+New+Vendor+1449.pdf',
 '20240301/ff75c5fa02564937950a05713afcd835/2024+Annual+Review+Package+for+1202RZ22Q0002-005+-+Current+Vendor+SF30.pdf']

In [12]:
# Parameterised insert for one solicitation row (4 placeholders); a row whose
# noticeId already exists is skipped rather than raising.
# NOTE(review): the conflict target is spelled `noticeID` while the column is
# declared `noticeId`. Unquoted identifiers are case-insensitive in PostgreSQL,
# so this should resolve to the same column — confirm.
sql = """
INSERT INTO solicitations (noticeId, title, solicitationNumber, naicsCode)
VALUES (%s, %s, %s, %s)
ON CONFLICT (noticeID) DO NOTHING
"""

In [13]:

# Insert every record of `data` into `solicitations`, one transaction per row
# so a bad row is reported and rolled back without discarding the others.
conn = psycopg2.connect(connection_string)
cur = conn.cursor()

try:
    for entry in data:
        params = (
            entry['noticeId'],
            entry['title'],
            entry['solicitationNumber'],
            entry['naicsCode']
        )

        try:
            cur.execute(sql, params)
            conn.commit()
        except psycopg2.Error as e:
            print(f"Database error {e}")
            conn.rollback()
finally:
    # Always release the cursor and connection — including when something
    # other than a database error (e.g. a record missing a key) aborts the loop.
    cur.close()
    conn.close()

In [33]:
# Parameterised insert for one resource link (3 placeholders: noticeId, url,
# blob_url). linkId is a SERIAL that the insert never supplies, so an
# `ON CONFLICT (linkId)` clause can never fire and re-running the cell would
# duplicate every link. Instead, skip the row when the same (noticeId, url)
# pair is already stored. The explicit ::text casts give the VALUES columns a
# concrete type for the comparison.
sql = """
INSERT INTO resource_links (noticeId, url, blob_url)
SELECT new_link.noticeId, new_link.url, new_link.blob_url
FROM (VALUES (%s::text, %s::text, %s::text)) AS new_link (noticeId, url, blob_url)
WHERE NOT EXISTS (
    SELECT 1
    FROM resource_links AS existing
    WHERE existing.noticeId = new_link.noticeId
      AND existing.url = new_link.url
)
"""

In [34]:

# Record, for each attachment of the example solicitation, its source URL and
# the S3 key it was uploaded under.
conn = psycopg2.connect(connection_string)
cur = conn.cursor()

try:
    # Pair URLs with S3 keys positionally. zip() stops at the shorter list, so
    # a partially completed upload (fewer object names than links) no longer
    # raises IndexError.
    for url, object_name in zip(sol_instance.resource_links, sol_instance.object_names):
        params = (
            sol_instance.notice_id,
            url,
            object_name,
        )

        try:
            cur.execute(sql, params)
            conn.commit()
        except psycopg2.Error as e:
            print(f"Database error {e}")
            conn.rollback()
finally:
    # Always release the cursor and connection, even if the loop aborts.
    cur.close()
    conn.close()

In [35]:
# Body of the first downloaded attachment (as returned by response.read()),
# used as the sample input for the PDF-loading cell.
example_pdf = sol_instance.attachments[0]

In [40]:
# PyPDFLoader is given a file path here, so spill the in-memory PDF to a
# temporary file, load and split it into pages, and always delete the file
# afterwards — also when writing or parsing fails.
tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
temp_pdf_path = tmp.name
try:
    with tmp:
        tmp.write(example_pdf)

    loader = PyPDFLoader(temp_pdf_path)
    pages = loader.load_and_split()
finally:
    os.remove(temp_pdf_path)
len(pages)

82

'/tmp/tmp4sdq8f81.pdf'