In [1]:
import sys
backend_path = '../backend'
if backend_path not in sys.path:
        sys.path.append(backend_path)


**- [Download all attachments as zip](https://open.gsa.gov/api/opportunities-api/#download-all-attachments-as-zip-for-an-opportunity)**

In [35]:

from sqlalchemy import create_engine, select, values, update, and_
from sqlalchemy.orm import sessionmaker
from dotenv import load_dotenv
from app.models.models import Notice, ResourceLink
from app.models.schema import NoticeBase, ResourceLinkBase
import pendulum
import tempfile
import requests
import os

In [3]:
load_dotenv()
DATABASE_URL = "postgresql+psycopg2://airflow:airflow@localhost:5432/airflow"

In [4]:
engine = create_engine(DATABASE_URL)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
selected_date = pendulum.now("utc").subtract(days=2).strftime("%Y%m%d")

Get dicts of opportunity listings

In [5]:
with SessionLocal() as session:
    stmt = (
        select(Notice).where(Notice.postedDate == selected_date) 
        
    )
    results = session.execute(stmt).scalars().all()
    opportunities = [NoticeBase.model_validate(result).dict() for result in results]

In [6]:
opportunities

[{'id': 'e7b71f84b8e1485dbc3c96913bd80050',
  'title': 'BOBBIN / ARRAY PROBE',
  'solicitationNumber': 'SPMYM2244Q0752',
  'fullParentPathName': 'DEPT OF DEFENSE.DEFENSE LOGISTICS AGENCY.DLA MARITIME.DLA MARITIME SHIPYARDS.DLA MARITIME - PUGET SOUND',
  'fullParentPathCode': '097.97AS.DLA MARITIME.DLA MARITIME SHIPYDS.SPMYM2',
  'postedDate': datetime.datetime(2024, 3, 9, 0, 0),
  'type': 'Solicitation',
  'baseType': 'Solicitation',
  'archiveType': 'auto15',
  'archiveDate': datetime.datetime(2024, 3, 23, 0, 0),
  'typeOfSetAsideDescription': 'Total Small Business Set-Aside (FAR 19.5)',
  'typeOfSetAside': 'SBA',
  'responseDeadLine': datetime.datetime(2024, 3, 8, 18, 0),
  'naicsCode': {'id': 402,
   'naicsCode': 334513,
   'title': 'Instruments and Related Products Manufacturing for Measuring, Displaying, and Controlling Industrial Process Variables',
   'description': 'This U.S. industry comprises establishments primarily engaged in manufacturing instruments and related devices fo

In [7]:
ex_link = opportunities[2].get("resource_links")[0].get("url")

In [8]:
ex_link

'https://sam.gov/api/prod/opps/v3/opportunities/resources/files/b19711ed9e314378a7620a789226c9d6/download?api_key=null&token='

In [9]:
res = requests.get(ex_link)
res.status_code

200

In [10]:
file_name = res.headers['Content-Disposition'].split('filename=')[1].strip('"')

In [11]:
file_name

'SPMYM224Q1010.pdf'

In [12]:
with tempfile.NamedTemporaryFile(delete=False) as tmp:
    tmp.write(res.content)
    temp_path = tmp.name
    

In [13]:
tmp.name

'/tmp/tmpyna1a0sj'

In [14]:
tmp.name

'/tmp/tmpyna1a0sj'

In [15]:
temp_file_size = os.path.getsize(temp_path)
temp_file_size / 1000000


1.173888

Code below from **[srt-fbo-scraper](https://github.com/GSA/srt-fbo-scraper/blob/4e61da5bb518c572f5e45bbe849df60520c4abad/src/fbo_scraper/get_doc_text.py)**

In [16]:
import logging
import os
from zipfile import BadZipfile
import re

import textract

logger = logging.getLogger(__name__)


def get_doc_text(file_name, rm=True):
    """Textract a doc given its path

    Arguments:
        file_name {str} -- path to a doc
    """
    try:
        b_text = None
        try:
            b_text = textract.process(file_name, encoding="utf-8", errors="ignore")
        # ShellError with antiword occurs when an rtf is saved with a doc extension
        except textract.exceptions.ShellError as e:
            err_message = str(e)
            try:
                if "antiword" in err_message and file_name.endswith(".doc"):
                    new_name = file_name.replace(".doc", ".rtf")
                    os.rename(file_name, new_name)
                    b_text = textract.process(
                        new_name, encoding="utf-8", errors="ignore"
                    )
            except textract.exceptions.ShellError as ex:
                logger.error(
                    "Error extracting text from a DOC file. Check that all dependencies of textract are installed.\n{}".format(
                        ex
                    )
                )
        except textract.exceptions.MissingFileError as e:
            b_text = None
            logger.error(
                f"Couldn't textract {file_name} since the file couldn't be found: {e}",
                exc_info=True,
            )
        # This can be raised when a pdf is incorrectly saved as a .docx (GH183)
        except BadZipfile as e:
            if file_name.endswith(".docx"):
                new_name = file_name.replace(".docx", ".pdf")
                os.rename(file_name, new_name)
                b_text = textract.process(
                    new_name, encoding="utf-8", method="pdftotext", errors="ignore"
                )
            else:
                b_text = None
                logger.warning(
                    f"Exception occurred textracting {file_name}: {e}", exc_info=True
                )
        # TypeError is raised when None is passed to str.decode()
        # This happens when textract can't extract text from scanned documents
        except TypeError:
            b_text = None
        except Exception as e:
            if re.match("^(.*) file; not supported", str(e)):
                logger.warning(f"'{file_name}' is type {str(e)}")
            elif re.match("^The filename extension .zip is not yet supported", str(e)):
                logger.warning(
                    f"'{file_name}' is type zip and not supported by textract"
                )
            else:
                logger.warning(
                    f"Exception occurred textracting {file_name}: {e}", exc_info=True
                )
            b_text = None
        text = b_text.decode("utf8", errors="ignore").strip() if b_text else ""
        if rm:
            try:
                os.remove(file_name)
            except Exception as e:
                logger.error(f"{e}Unable to remove {file_name}", exc_info=True)
            finally:
                return text

    except Exception as e:
        logger.error(
            f"Error uncaught when trying to parse file {file_name}. Giving up and returning an empty string. {e}",
            exc_info=True,
        )
        text = ""

    return text

In [17]:
temp_path

'/tmp/tmpyna1a0sj'

In [18]:
text = get_doc_text(temp_path, rm=False)

Exception occurred textracting /tmp/tmpyna1a0sj: 'utf-8' codec can't decode byte 0x9c in position 147: invalid start byte
Traceback (most recent call last):
  File "/tmp/ipykernel_93141/1117094731.py", line 20, in get_doc_text
    b_text = textract.process(file_name, encoding="utf-8", errors="ignore")
  File "/home/peter-legion-wsl2/peter-projects/contract-queue/.venv/lib/python3.10/site-packages/textract/parsers/__init__.py", line 79, in process
    return parser.process(filename, input_encoding, output_encoding, **kwargs)
  File "/home/peter-legion-wsl2/peter-projects/contract-queue/.venv/lib/python3.10/site-packages/textract/parsers/utils.py", line 46, in process
    byte_string = self.extract(filename, **kwargs)
  File "/home/peter-legion-wsl2/peter-projects/contract-queue/.venv/lib/python3.10/site-packages/textract/parsers/txt_parser.py", line 9, in extract
    return stream.read()
  File "/usr/lib/python3.10/codecs.py", line 322, in decode
    (result, consumed) = self._buffer_de

In [19]:
text_bytes = text.encode('utf-8')

In [20]:
len(text_bytes) / 1000000

0.0

In [21]:
.17 * 3000

510.00000000000006

***

In [22]:
def get_file_name(res):
    file_name = res.headers['Content-Disposition'].split('filename=')[1].strip('"')
    return file_name

In [23]:
with SessionLocal() as session:
    stmt = select(ResourceLink)
    results = session.execute(stmt).scalars().all()

In [24]:
with SessionLocal() as session:
    stmt = (
        select(ResourceLink).where(ResourceLink.text.is_(None))    
    )
    results = session.execute(stmt).scalars().all()
    resource_links = [ResourceLinkBase.model_validate(result).dict() for result in results]
    

In [25]:
prefix, suffix = get_file_name(res).split('.')
suffix = '.' + suffix

In [26]:
suffix

'.pdf'

In [27]:
resource_links

[{'id': 8,
  'url': 'https://sam.gov/api/prod/opps/v3/opportunities/resources/files/08010cdd1a4c4e20a2a86d5849964453/download?api_key=null&token=',
  'text': None},
 {'id': 9,
  'url': 'https://sam.gov/api/prod/opps/v3/opportunities/resources/files/2ce50a6d9be54ea38f50f2e9a250c715/download?api_key=null&token=',
  'text': None},
 {'id': 11,
  'url': 'https://sam.gov/api/prod/opps/v3/opportunities/resources/files/656f9fd2269d44b88d23c519a3804569/download?api_key=null&token=',
  'text': None},
 {'id': 13,
  'url': 'https://sam.gov/api/prod/opps/v3/opportunities/resources/files/c4caabf8711541bb8e380f8a3005c6c7/download?api_key=null&token=',
  'text': None}]

In [28]:
res = requests.get(resource_links[1].get("url"))

In [31]:
res

<Response [200]>

In [39]:
with SessionLocal() as session:
    stmt = (
        select(Notice).where(
            and_(
                Notice.id == ResourceLink.notice_id,
                ResourceLink.text.is_(None)
                )
        )
    )
    results = session.execute(stmt).scalars().all()
    result_dict = [NoticeBase.model_validate(result).dict() for result in results]

In [46]:
result_dict[3]

{'id': '50c67c1a7cf04442b9538e49aad9ed67',
 'title': 'ELKO DISTRICT 2024 CATTLE GUARD PURCHASE',
 'solicitationNumber': '140L3924Q0020',
 'fullParentPathName': 'INTERIOR, DEPARTMENT OF THE.BUREAU OF LAND MANAGEMENT.NEVADA STATE OFFICE',
 'fullParentPathCode': '014.1422.140L39',
 'postedDate': datetime.datetime(2024, 3, 9, 0, 0),
 'type': 'Solicitation',
 'baseType': 'Solicitation',
 'archiveType': 'auto15',
 'archiveDate': datetime.datetime(2024, 4, 4, 0, 0),
 'typeOfSetAsideDescription': 'Total Small Business Set-Aside (FAR 19.5)',
 'typeOfSetAside': 'SBA',
 'responseDeadLine': datetime.datetime(2024, 3, 20, 19, 0),
 'naicsCode': {'id': 322,
  'naicsCode': 332312,
  'title': 'Fabricated Structural Metal Manufacturing',
  'description': 'This U.S. industry comprises establishments primarily engaged in fabricating structural metal products, such as assemblies of concrete reinforcing bars and fabricated bar joists.\n'},
 'naicsCodes': ['332312'],
 'classificationCode': '5680',
 'active':

In [None]:
get_file_name(res)

In [None]:
get_file_name(res)

In [None]:
with tempfile.NamedTemporaryFile(delete=False) as tmp:
    tmp.write(res.content)
    temp_path = tmp.name
    

In [None]:
tmp.name

In [None]:
text = get_doc_text(temp_path)

In [None]:
for resource_link in resource_links:
    res = requests.get(resource_link.get("url"))
    file_name = get_file_name(res)
    prefix, suffix = get_file_name(res).split('.')
    suffix = '.' + suffix
    with tempfile.NamedTemporaryFile(prefix=prefix, suffix=suffix, delete=False) as tmp:
        tmp.write(res.content)
        tmp.flush()
        temp_path = tmp.name
        text = get_doc_text(temp_path, rm=True)
        if text:
            # clean null characters before committing
            text = text.replace("\x00", "\uFFFD")
            with SessionLocal() as session:
                stmt = (
                    update(ResourceLink).
                    where(ResourceLink.id == resource_link['id']).
                    values(text=text)
                )
                session.execute(stmt)
                session.commit()

In [None]:
opportunities

**Todo:** Add error handling for filename too long 