In [37]:
import sys

# Make the sibling backend directory importable from this notebook, adding it
# to the module search path only when it is not already listed.
backend_path = '../backend'
if sys.path.count(backend_path) == 0:
    sys.path.append(backend_path)


- **[Download all attachments as zip](https://open.gsa.gov/api/opportunities-api/#download-all-attachments-as-zip-for-an-opportunity)**

In [38]:

from sqlalchemy import create_engine, select
from sqlalchemy.orm import sessionmaker
from dotenv import load_dotenv
from app.models.models import Notice
from app.models.schema import NoticeBase
import pendulum
import tempfile
import requests
import os

In [39]:
# Load settings from a local .env file (if one exists) before reading them.
load_dotenv()
# Prefer a DATABASE_URL supplied through the environment/.env so credentials do
# not have to live in the notebook; the literal is the local-development
# fallback that was previously hard-coded here.
DATABASE_URL = os.getenv(
    "DATABASE_URL",
    "postgresql+psycopg2://airflow:airflow@localhost:5432/airflow",
)

In [40]:
# Engine and session factory bound to the configured database.
engine = create_engine(DATABASE_URL)
SessionLocal = sessionmaker(bind=engine, autoflush=False, autocommit=False)

# Posting date to query, as a YYYYMMDD string: two days before the current UTC time.
two_days_ago = pendulum.now("utc").subtract(days=2)
selected_date = two_days_ago.strftime("%Y%m%d")

Get dicts of opportunity listings

In [41]:
# Fetch every Notice whose postedDate matches `selected_date` and turn each ORM
# row into a plain dict through the NoticeBase schema. Validation happens while
# the session is still open.
with SessionLocal() as session:
    stmt = select(Notice).where(Notice.postedDate == selected_date)
    results = session.execute(stmt).scalars().all()
    # NoticeBase exposes model_validate (pydantic v2 API), so use the matching
    # v2 serializer model_dump() rather than the deprecated .dict().
    opportunities = [
        NoticeBase.model_validate(result).model_dump() for result in results
    ]

In [42]:
# Display the validated listings (the whole list is rendered as cell output).
opportunities

[{'id': 'e7b71f84b8e1485dbc3c96913bd80050',
  'title': 'BOBBIN / ARRAY PROBE',
  'solicitationNumber': 'SPMYM2244Q0752',
  'fullParentPathName': 'DEPT OF DEFENSE.DEFENSE LOGISTICS AGENCY.DLA MARITIME.DLA MARITIME SHIPYARDS.DLA MARITIME - PUGET SOUND',
  'fullParentPathCode': '097.97AS.DLA MARITIME.DLA MARITIME SHIPYDS.SPMYM2',
  'postedDate': datetime.datetime(2024, 3, 9, 0, 0),
  'type': 'Solicitation',
  'baseType': 'Solicitation',
  'archiveType': 'auto15',
  'archiveDate': datetime.datetime(2024, 3, 23, 0, 0),
  'typeOfSetAsideDescription': 'Total Small Business Set-Aside (FAR 19.5)',
  'typeOfSetAside': 'SBA',
  'responseDeadLine': datetime.datetime(2024, 3, 8, 18, 0),
  'naicsCode': {'id': 402,
   'naicsCode': 334513,
   'title': 'Instruments and Related Products Manufacturing for Measuring, Displaying, and Controlling Industrial Process Variables',
   'description': 'This U.S. industry comprises establishments primarily engaged in manufacturing instruments and related devices fo

In [43]:
# Use the URL of the first resource link on the listing at index 2 as a sample.
sample_listing = opportunities[2]
sample_resource = sample_listing.get("resource_links")[0]
ex_link = sample_resource.get("url")

In [44]:
# Show the sample download URL.
ex_link

'https://sam.gov/api/prod/opps/v3/opportunities/resources/files/b19711ed9e314378a7620a789226c9d6/download?api_key=null&token='

In [45]:
# Download the sample attachment. An explicit timeout is passed because
# requests otherwise waits indefinitely on an unresponsive server, which would
# hang the kernel.
res = requests.get(ex_link, timeout=30)
res.status_code

200

In [46]:
# Take whatever follows "filename=" in the Content-Disposition header and drop
# the surrounding double quotes to get the attachment's file name.
content_disposition = res.headers['Content-Disposition']
after_marker = content_disposition.split('filename=')[1]
file_name = after_marker.strip('"')

In [47]:
# Show the file name parsed from the response headers.
file_name

'SPMYM224Q1010.pdf'

In [48]:
# Write the download to a temp file that outlives the `with` block
# (delete=False) so it can be opened by path afterwards. The suffix comes from
# the server-supplied file name instead of assuming ".pdf", because the
# extension decides how the document is parsed downstream.
_, attachment_ext = os.path.splitext(file_name)
with tempfile.NamedTemporaryFile(delete=False, suffix=attachment_ext) as tmp:
    tmp.write(res.content)
    temp_path = tmp.name
    

In [49]:
# Size of the downloaded file on disk, displayed in megabytes (10**6 bytes).
temp_file_size = os.path.getsize(temp_path)
temp_file_size / 1_000_000


1.173888

The code below comes from GSA's **[srt-fbo-scraper](https://github.com/GSA/srt-fbo-scraper/blob/4e61da5bb518c572f5e45bbe849df60520c4abad/src/fbo_scraper/get_doc_text.py)** (`get_doc_text.py`):

In [50]:
import logging
import os
from zipfile import BadZipfile
import re

import textract

logger = logging.getLogger(__name__)


def get_doc_text(file_name, rm=True):
    """Textract a doc given its path

    Two mislabelled-extension cases are retried after renaming the file on
    disk: an rtf saved as ``.doc`` (antiword ShellError) and a pdf saved as
    ``.docx`` (BadZipfile). Extraction failures are logged, not raised.

    Arguments:
        file_name {str} -- path to a doc

    Keyword Arguments:
        rm {bool} -- remove the file (under its new name if it was renamed)
            once extraction has finished (default: {True})

    Returns:
        str -- the extracted text with surrounding whitespace stripped, or an
            empty string when no text could be extracted
    """
    # Where the document currently lives. Updated whenever a rename corrects
    # the extension, so cleanup removes the file that actually exists.
    current_path = file_name
    try:
        b_text = None
        try:
            b_text = textract.process(file_name, encoding="utf-8", errors="ignore")
        # ShellError with antiword occurs when an rtf is saved with a doc extension
        except textract.exceptions.ShellError as e:
            if "antiword" in str(e) and file_name.endswith(".doc"):
                # Swap only the trailing extension: str.replace would also
                # rewrite ".doc" inside a directory name earlier in the path.
                new_name = file_name[: -len(".doc")] + ".rtf"
                os.rename(file_name, new_name)
                current_path = new_name
                try:
                    b_text = textract.process(
                        new_name, encoding="utf-8", errors="ignore"
                    )
                except textract.exceptions.ShellError as ex:
                    logger.error(
                        "Error extracting text from a DOC file. Check that all dependencies of textract are installed.\n{}".format(
                            ex
                        )
                    )
            else:
                # Any other ShellError used to be swallowed silently; record
                # why no text came back.
                logger.warning(
                    f"Exception occurred textracting {file_name}: {e}", exc_info=True
                )
        except textract.exceptions.MissingFileError as e:
            b_text = None
            logger.error(
                f"Couldn't textract {file_name} since the file couldn't be found: {e}",
                exc_info=True,
            )
        # This can be raised when a pdf is incorrectly saved as a .docx (GH183)
        except BadZipfile as e:
            if file_name.endswith(".docx"):
                # Same suffix-only swap as for the .doc case above.
                new_name = file_name[: -len(".docx")] + ".pdf"
                os.rename(file_name, new_name)
                current_path = new_name
                b_text = textract.process(
                    new_name, encoding="utf-8", method="pdftotext", errors="ignore"
                )
            else:
                b_text = None
                logger.warning(
                    f"Exception occurred textracting {file_name}: {e}", exc_info=True
                )
        # TypeError is raised when None is passed to str.decode()
        # This happens when textract can't extract text from scanned documents
        except TypeError:
            b_text = None
        except Exception as e:
            if re.match("^(.*) file; not supported", str(e)):
                logger.warning(f"'{file_name}' is type {str(e)}")
            elif re.match("^The filename extension .zip is not yet supported", str(e)):
                logger.warning(
                    f"'{file_name}' is type zip and not supported by textract"
                )
            else:
                logger.warning(
                    f"Exception occurred textracting {file_name}: {e}", exc_info=True
                )
            b_text = None
        text = b_text.decode("utf8", errors="ignore").strip() if b_text else ""
    except Exception as e:
        logger.error(
            f"Error uncaught when trying to parse file {file_name}. Giving up and returning an empty string. {e}",
            exc_info=True,
        )
        # As before, an unexpected failure returns without removing the file.
        return ""

    if rm:
        # Cleanup is best-effort. It no longer uses `return` inside `finally`,
        # which silently discarded every in-flight exception (even
        # KeyboardInterrupt).
        try:
            os.remove(current_path)
        except Exception as e:
            logger.error(f"Unable to remove {current_path}: {e}", exc_info=True)

    return text

In [51]:
# Show where the downloaded attachment was written.
temp_path

'/tmp/tmpav_52o3d.pdf'

In [52]:
# Extract the document's text; rm=False keeps the temp file on disk afterwards.
text = get_doc_text(temp_path, rm=False)

In [54]:
# Encode as UTF-8 so the extracted text's size can be measured in bytes.
text_bytes = text.encode('utf-8')

In [56]:
# Size of the extracted text in megabytes (10**6 bytes).
len(text_bytes) / 1000000

0.174696

In [57]:
# NOTE(review): unexplained back-of-envelope estimate. Presumably about 0.17 MB
# of extracted text per document times 3000 documents (about 510 MB) -- confirm
# what 3000 stands for.
.17 * 3000

510.00000000000006

**TODO:** Add error handling for file names that are too long.