In [1]:
import sys

# Put the backend directory on the import path (presumably where the `app`
# package imported further down lives). The entry is relative, so it only
# resolves when the kernel's working directory is the notebook's own folder.
backend_path = '../backend'
if backend_path not in sys.path:  # re-running this cell must not add duplicates
    sys.path.append(backend_path)

In [2]:
# Display the import path to confirm that '../backend' has been appended.
sys.path

['/home/peter-legion-wsl2/peter-projects/contract-queue/nbs',
 '/usr/lib/python310.zip',
 '/usr/lib/python3.10',
 '/usr/lib/python3.10/lib-dynload',
 '',
 '/home/peter-legion-wsl2/peter-projects/contract-queue/.venv/lib/python3.10/site-packages',
 '../backend']

In [3]:
import os
import json
import boto3
from dotenv import load_dotenv
from datetime import datetime
import pendulum

from app.core.config import get_app_settings
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from app.models.models import Base, Notice, PointOfContact, OfficeAddress, PlaceOfPerformance, Link, ResourceLink



In [4]:
# Read the S3 credentials and region from the environment; load_dotenv() first
# loads any variables declared in a local .env file. Each value is None when
# the corresponding variable is not set.
load_dotenv()
S3_AWS_ACCESS_KEY_ID = os.getenv("S3_AWS_ACCESS_KEY_ID")
S3_AWS_SECRET_ACCESS_KEY = os.getenv("S3_AWS_SECRET_ACCESS_KEY")
S3_REGION_NAME = os.getenv("S3_REGION_NAME")

# NOTE(review): the database user and password are embedded in this literal.
# Consider sourcing the URL from the environment like the S3 settings above.
DATABASE_URL = "postgresql+psycopg2://airflow:airflow@localhost:5432/airflow"

# One engine and one long-lived session shared by the rest of the notebook.
# Autoflush and autocommit are both off, so nothing reaches the database until
# an explicit flush or commit.
engine = create_engine(DATABASE_URL)
SessionLocal = sessionmaker(bind=engine, autoflush=False, autocommit=False)
session = SessionLocal()

In [5]:
# Echo the connection string in use.
# NOTE(review): this writes the embedded password into the saved cell output;
# consider clearing the output before sharing the notebook.
DATABASE_URL

'postgresql+psycopg2://airflow:airflow@localhost:5432/airflow'

In [6]:

# S3 location of the export to load: the object is keyed by yesterday's date
# (UTC) formatted as YYYYMMDD.
bucket_name = "sam-gov-opportunities"
yesterday_utc = pendulum.now("utc").subtract(days=1)
previous_date = yesterday_utc.strftime("%Y%m%d")
file_name = "daily-opportunity-posts/" + previous_date + ".json"


In [7]:
# Build the S3 client with the explicit key pair and region held in the
# S3_* settings rather than boto3's default credential lookup.
s3_client_options = {
    "region_name": S3_REGION_NAME,
    "aws_access_key_id": S3_AWS_ACCESS_KEY_ID,
    "aws_secret_access_key": S3_AWS_SECRET_ACCESS_KEY,
}
s3_client = boto3.client("s3", **s3_client_options)

In [8]:
# Download the export object, decode its body as UTF-8 text and parse the JSON.
# `s3_response` ends up holding the parsed payload that later cells index and
# iterate over.
s3_object = s3_client.get_object(Bucket=bucket_name, Key=file_name)
s3_body_text = s3_object['Body'].read().decode('utf-8')
s3_response = json.loads(s3_body_text)

In [11]:
# Peek at the first two entries to check the shape of the parsed payload.
s3_response[0:2]

[{'noticeId': 'ffe42d665ee64282b2c07f50fffc139a',
  'title': 'Rough River Mowing and Maintenance IDIQ',
  'solicitationNumber': 'W912QR24R0033',
  'fullParentPathName': 'DEPT OF DEFENSE.DEPT OF THE ARMY.USACE.LRD.W072 ENDIST LOUISVILLE',
  'fullParentPathCode': '021.2100.USACE.LRD.W912QR',
  'postedDate': '2024-03-07',
  'type': 'Solicitation',
  'baseType': 'Solicitation',
  'archiveType': 'autocustom',
  'archiveDate': '2024-03-22',
  'typeOfSetAsideDescription': None,
  'typeOfSetAside': None,
  'responseDeadLine': '2024-03-22T14:00:00-04:00',
  'naicsCode': '561730',
  'naicsCodes': ['561730'],
  'classificationCode': 'S208',
  'active': 'Yes',
  'award': None,
  'pointOfContact': [{'fax': '',
    'type': 'primary',
    'email': 'thomas.s.nauert@usace.army.mil',
    'phone': '',
    'title': None,
    'fullName': 'THOMAS NAUERT'}],
  'description': 'https://api.sam.gov/prod/opportunities/v1/noticedesc?noticeid=ffe42d665ee64282b2c07f50fffc139a',
  'organizationType': 'OFFICE',
  'of

In [12]:
def parse_date(iso_str):
    """Parse an ISO-8601 date or datetime string into a ``datetime``.

    A ``Z`` UTC designator is rewritten to ``+00:00`` before parsing, because
    ``datetime.fromisoformat`` only accepts ``Z`` from Python 3.11 onwards.

    Args:
        iso_str: Text such as ``'2024-03-07'`` or
            ``'2024-03-22T14:00:00-04:00'``. May be ``None`` (or any other
            non-string) when the field is absent from the payload.

    Returns:
        The parsed ``datetime`` (timezone-aware only when the input carries an
        offset), or ``None`` when the input is missing, is not a string, or is
        not valid ISO-8601.
    """
    if not isinstance(iso_str, str):
        # A missing field arrives as None; calling .replace on it would raise
        # AttributeError, which the handler below does not cover.
        return None
    try:
        return datetime.fromisoformat(iso_str.replace('Z', '+00:00'))
    except ValueError:
        return None

# Stage every notice from the parsed S3 payload that is not already stored,
# together with its related rows (office address, optional place of
# performance, points of contact, links and resource links), then commit once.
seen_notice_ids = set()
for notice_data in s3_response:
    notice_id = notice_data.get('noticeId')

    # The session is created with autoflush=False, so the existence query below
    # cannot see notices staged earlier in this loop. Track them here so a
    # noticeId repeated within the payload is not staged twice.
    if notice_id in seen_notice_ids:
        continue
    already_stored = session.query(Notice.id).filter_by(id=notice_id).scalar() is not None
    if already_stored:
        continue  # skip notices whose noticeId is already in the db
    seen_notice_ids.add(notice_id)

    # `or {}` covers both a missing key and a key that is present but null.
    office_address_data = notice_data.get('officeAddress') or {}
    office_address = OfficeAddress(
        zipcode=office_address_data.get('zipcode'),
        city=office_address_data.get('city'),
        countryCode=office_address_data.get('countryCode'),
        state=office_address_data.get('state'),
    )

    place_of_performance = None
    place_of_performance_data = notice_data.get('placeOfPerformance')
    if place_of_performance_data:
        # Each sub-section may be absent or null; fall back to an empty dict so
        # the code/name lookups simply yield None.
        pop_city = place_of_performance_data.get('city') or {}
        pop_state = place_of_performance_data.get('state') or {}
        pop_country = place_of_performance_data.get('country') or {}
        place_of_performance = PlaceOfPerformance(
            city_code=pop_city.get('code'),
            city_name=pop_city.get('name'),
            state_code=pop_state.get('code'),
            state_name=pop_state.get('name'),
            country_code=pop_country.get('code'),
            country_name=pop_country.get('name'),
        )

    notice = Notice(
        id=notice_id,
        title=notice_data.get('title'),
        solicitationNumber=notice_data.get('solicitationNumber'),
        fullParentPathName=notice_data.get('fullParentPathName'),
        fullParentPathCode=notice_data.get('fullParentPathCode'),
        postedDate=parse_date(notice_data.get('postedDate')),
        type=notice_data.get('type'),
        baseType=notice_data.get('baseType'),
        archiveType=notice_data.get('archiveType'),
        archiveDate=parse_date(notice_data.get('archiveDate')),
        typeOfSetAsideDescription=notice_data.get('typeOfSetAsideDescription'),
        typeOfSetAside=notice_data.get('typeOfSetAside'),
        responseDeadLine=parse_date(notice_data.get('responseDeadLine')),
        naicsCode=notice_data.get('naicsCode'),
        naicsCodes=notice_data.get('naicsCodes'),
        classificationCode=notice_data.get('classificationCode'),
        # The payload encodes the flag as the string 'Yes'; anything else is False.
        active=notice_data.get('active') == 'Yes',
        description=notice_data.get('description'),
        organizationType=notice_data.get('organizationType'),
        additionalInfoLink=notice_data.get('additionalInfoLink'),
        uiLink=notice_data.get('uiLink'),
        office_address=office_address,
        place_of_performance=place_of_performance,
    )

    # Iterate the list itself; an absent or null list means no child rows.
    for poc_data in notice_data.get('pointOfContact') or []:
        poc = PointOfContact(
            fax=poc_data.get('fax'),
            type=poc_data.get('type'),
            email=poc_data.get('email'),
            phone=poc_data.get('phone'),
            title=poc_data.get('title'),
            fullName=poc_data.get('fullName'),
            notice=notice,
        )
        session.add(poc)

    for link_data in notice_data.get('links') or []:
        link = Link(rel=link_data.get('rel'), href=link_data.get('href'), notice=notice)
        session.add(link)

    # Resource links are passed straight through as the `url` value.
    for resource_link in notice_data.get('resourceLinks') or []:
        res_link = ResourceLink(url=resource_link, notice=notice)
        session.add(res_link)

    session.add(notice)

try:
    session.commit()
except Exception:
    # Roll back so the shared session is usable again when the cell is re-run,
    # then surface the original error.
    session.rollback()
    raise