In [1]:
import sys
backend_path = '../backend'
if backend_path not in sys.path:
        sys.path.append(backend_path)


**- [Download all attachments as zip](https://open.gsa.gov/api/opportunities-api/#download-all-attachments-as-zip-for-an-opportunity)**

In [2]:

from sqlalchemy import create_engine, select, values, update, and_, or_
from sqlalchemy.orm import sessionmaker
from dotenv import load_dotenv
from app.models.models import Notice, ResourceLink
from app.models.schema import NoticeBase, ResourceLinkBase
import pendulum
import tempfile
import requests
import os
from tqdm import tqdm

In [3]:
load_dotenv()
DATABASE_URL = "postgresql+psycopg2://airflow:airflow@localhost:5432/airflow"

In [4]:
engine = create_engine(DATABASE_URL)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
selected_date = pendulum.now("utc").subtract(days=1).strftime("%Y%m%d")

Get dicts of opportunity listings

In [5]:
with SessionLocal() as session:
    stmt = (
        select(Notice).where(Notice.postedDate == selected_date) 
        
    )
    results = session.execute(stmt).scalars().all()
    opportunities = [NoticeBase.model_validate(result).dict() for result in results]

In [6]:
opportunities = opportunities[:]

In [7]:
ex_link = opportunities[2].get("resource_links")[0].get("url")

In [8]:
ex_link

'https://sam.gov/api/prod/opps/v3/opportunities/resources/files/6d8ef216b5194dbcb634ff8176133d95/download?api_key=null&token='

In [9]:
res = requests.get(ex_link)
res.status_code

200

In [10]:
file_name = res.headers['Content-Disposition'].split('filename=')[1].strip('"')

In [11]:
file_name

'SP060424R0411+MSP-ARS+AMD+0001.docx'

In [12]:
with tempfile.NamedTemporaryFile(delete=False) as tmp:
    tmp.write(res.content)
    temp_path = tmp.name
    

In [13]:
tmp.name

'/tmp/tmpziqby2gd'

In [14]:
tmp.name

'/tmp/tmpziqby2gd'

In [15]:
temp_file_size = os.path.getsize(temp_path)
temp_file_size / 1000000


0.052252

Code below from **[srt-fbo-scraper](https://github.com/GSA/srt-fbo-scraper/blob/4e61da5bb518c572f5e45bbe849df60520c4abad/src/fbo_scraper/get_doc_text.py)**

In [16]:
import logging
import os
from zipfile import BadZipfile
import re

import textract

logger = logging.getLogger(__name__)


def get_doc_text(file_name, rm=True):
    """Textract a doc given its path

    Arguments:
        file_name {str} -- path to a doc
    """
    try:
        b_text = None
        try:
            b_text = textract.process(file_name, encoding="utf-8", errors="ignore")
        # ShellError with antiword occurs when an rtf is saved with a doc extension
        except textract.exceptions.ShellError as e:
            err_message = str(e)
            try:
                if "antiword" in err_message and file_name.endswith(".doc"):
                    new_name = file_name.replace(".doc", ".rtf")
                    os.rename(file_name, new_name)
                    b_text = textract.process(
                        new_name, encoding="utf-8", errors="ignore"
                    )
            except textract.exceptions.ShellError as ex:
                logger.error(
                    "Error extracting text from a DOC file. Check that all dependencies of textract are installed.\n{}".format(
                        ex
                    )
                )
        except textract.exceptions.MissingFileError as e:
            b_text = None
            logger.error(
                f"Couldn't textract {file_name} since the file couldn't be found: {e}",
                exc_info=True,
            )
        # This can be raised when a pdf is incorrectly saved as a .docx (GH183)
        except BadZipfile as e:
            if file_name.endswith(".docx"):
                new_name = file_name.replace(".docx", ".pdf")
                os.rename(file_name, new_name)
                b_text = textract.process(
                    new_name, encoding="utf-8", method="pdftotext", errors="ignore"
                )
            else:
                b_text = None
                logger.warning(
                    f"Exception occurred textracting {file_name}: {e}", exc_info=True
                )
        # TypeError is raised when None is passed to str.decode()
        # This happens when textract can't extract text from scanned documents
        except TypeError:
            b_text = None
        except Exception as e:
            if re.match("^(.*) file; not supported", str(e)):
                logger.warning(f"'{file_name}' is type {str(e)}")
            elif re.match("^The filename extension .zip is not yet supported", str(e)):
                logger.warning(
                    f"'{file_name}' is type zip and not supported by textract"
                )
            else:
                logger.warning(
                    f"Exception occurred textracting {file_name}: {e}", exc_info=True
                )
            b_text = None
        text = b_text.decode("utf8", errors="ignore").strip() if b_text else ""
        if rm:
            try:
                os.remove(file_name)
            except Exception as e:
                logger.error(f"{e}Unable to remove {file_name}", exc_info=True)
            finally:
                return text

    except Exception as e:
        logger.error(
            f"Error uncaught when trying to parse file {file_name}. Giving up and returning an empty string. {e}",
            exc_info=True,
        )
        text = ""

    return text

In [17]:
temp_path

'/tmp/tmpziqby2gd'

In [18]:
text = get_doc_text(temp_path, rm=False)

Exception occurred textracting /tmp/tmpziqby2gd: 'utf-8' codec can't decode byte 0xdf in position 15: invalid continuation byte
Traceback (most recent call last):
  File "/tmp/ipykernel_60462/1117094731.py", line 20, in get_doc_text
    b_text = textract.process(file_name, encoding="utf-8", errors="ignore")
  File "/home/peter-legion-wsl2/peter-projects/contract-queue/.venv/lib/python3.10/site-packages/textract/parsers/__init__.py", line 79, in process
    return parser.process(filename, input_encoding, output_encoding, **kwargs)
  File "/home/peter-legion-wsl2/peter-projects/contract-queue/.venv/lib/python3.10/site-packages/textract/parsers/utils.py", line 46, in process
    byte_string = self.extract(filename, **kwargs)
  File "/home/peter-legion-wsl2/peter-projects/contract-queue/.venv/lib/python3.10/site-packages/textract/parsers/txt_parser.py", line 9, in extract
    return stream.read()
  File "/usr/lib/python3.10/codecs.py", line 322, in decode
    (result, consumed) = self._buf

In [19]:
text_bytes = text.encode('utf-8')

In [20]:
len(text_bytes) / 1000000

0.0

In [21]:
.17 * 3000

510.00000000000006

***

In [22]:
def get_file_name(res):
    file_name = res.headers['Content-Disposition'].split('filename=')[1].strip('"')
    return file_name

In [23]:
with SessionLocal() as session:
    stmt = select(ResourceLink)
    results = session.execute(stmt).scalars().all()

In [24]:
with SessionLocal() as session:
    stmt = (
        select(ResourceLink).where(ResourceLink.text.is_(None))    
    )
    results = session.execute(stmt).scalars().all()
    resource_links = [ResourceLinkBase.model_validate(result).dict() for result in results]
    

In [37]:
with SessionLocal() as session:
    subquery = (
        select(ResourceLink.notice_id).
        distinct()
    )
    stmt = (
        select(ResourceLink).
        where(and_(ResourceLink.notice_id.in_(subquery),
        ResourceLink.text.is_(None) 
        )).limit(20)
    )
    results = session.execute(stmt).scalars().all()
    resource_links = [ResourceLinkBase.model_validate(result).dict() for result in results]

In [38]:
len(resource_links)

20

In [39]:
res = requests.get(resource_links[1].get("url"))

In [40]:
res

<Response [200]>

In [41]:
with SessionLocal() as session:
    stmt = (
        select(Notice).where(
            and_(
                Notice.id == ResourceLink.notice_id,
                ResourceLink.text.is_(None)
                )
        )
    )
    results = session.execute(stmt).scalars().all()
    result_dict = [NoticeBase.model_validate(result).dict() for result in results]

In [42]:
get_file_name(res)

'A22.W912EK24B0008+-+CLINTON+PL84-99+-+FINAL+PLAN+SET.pdf'

In [43]:
get_file_name(res)

'A22.W912EK24B0008+-+CLINTON+PL84-99+-+FINAL+PLAN+SET.pdf'

In [44]:
with tempfile.NamedTemporaryFile(delete=False) as tmp:
    tmp.write(res.content)
    temp_path = tmp.name
    

In [45]:
tmp.name

'/tmp/tmpoeunh07a'

In [46]:
text = get_doc_text(temp_path)

Exception occurred textracting /tmp/tmpoeunh07a: 'utf-8' codec can't decode byte 0xe2 in position 10: invalid continuation byte
Traceback (most recent call last):
  File "/tmp/ipykernel_60462/1117094731.py", line 20, in get_doc_text
    b_text = textract.process(file_name, encoding="utf-8", errors="ignore")
  File "/home/peter-legion-wsl2/peter-projects/contract-queue/.venv/lib/python3.10/site-packages/textract/parsers/__init__.py", line 79, in process
    return parser.process(filename, input_encoding, output_encoding, **kwargs)
  File "/home/peter-legion-wsl2/peter-projects/contract-queue/.venv/lib/python3.10/site-packages/textract/parsers/utils.py", line 46, in process
    byte_string = self.extract(filename, **kwargs)
  File "/home/peter-legion-wsl2/peter-projects/contract-queue/.venv/lib/python3.10/site-packages/textract/parsers/txt_parser.py", line 9, in extract
    return stream.read()
  File "/usr/lib/python3.10/codecs.py", line 322, in decode
    (result, consumed) = self._buf

Create a batch of 20 to process

In [47]:
for resource_link in tqdm(resource_links):
    res = requests.get(resource_link.get("url"))
    file_name = get_file_name(res)
    print(file_name)
    prefix, suffix = os.path.splitext(get_file_name(res))
    suffix = '.' + suffix
    with tempfile.NamedTemporaryFile(prefix=prefix, suffix=suffix, delete=False) as tmp:
        tmp.write(res.content)
        tmp.flush()
        temp_path = tmp.name
        text = get_doc_text(temp_path, rm=True)
        with SessionLocal() as session:
            if text:
                # clean null characters before committing
                text = text.replace("\x00", "\uFFFD")
                stmt = (
                    update(ResourceLink).
                    where(ResourceLink.id == resource_link['id']).
                    values(text=text)
                )
                session.execute(stmt)
                session.commit()
            else:
                stmt = (
                    update(ResourceLink).
                    where(ResourceLink.id == resource_link['id']).
                    values(text="unparsable")
                )
                session.execute(stmt)
                session.commit()

  0%|          | 0/20 [00:00<?, ?it/s]

A22.W912EK24B0008+-+CLINTON+PL84-99+-+FINAL+PLAN+SET_AMND+0001_REVISED+C-301_11MAR2024.pdf


  5%|▌         | 1/20 [00:02<00:46,  2.44s/it]

A22.W912EK24B0008+-+CLINTON+PL84-99+-+FINAL+PLAN+SET.pdf


 10%|█         | 2/20 [00:16<02:50,  9.50s/it]

A22.W912EK24B0008+-+CLINTON+PL84-99+-+FINAL+SOLICITATION+W+SPECS_REVISED+11MAR2024.pdf


 15%|█▌        | 3/20 [00:29<03:09, 11.15s/it]

A22.W912EK24B0008+-+CLINTON+PL84-99+-+AMENDMENT+0001_11MAR2024.pdf


 20%|██        | 4/20 [00:30<01:52,  7.05s/it]

WD+2015-5631+Rev+20+dated+12-26-2023.pdf


 30%|███       | 6/20 [00:32<00:48,  3.46s/it]

36C26124Q0411_1.docx


 35%|███▌      | 7/20 [00:33<00:33,  2.55s/it]

36C25224Q0240_1.docx


 40%|████      | 8/20 [00:33<00:22,  1.88s/it]

36C25224Q0240+0002.docx


 45%|████▌     | 9/20 [00:34<00:15,  1.45s/it]

36C25224Q0240+0001.docx
Solicitation-+Global+Fund+Liaison_Resolicit_Final_.pdf


 50%|█████     | 10/20 [00:36<00:16,  1.62s/it]

AMENDMENT+1+12444624Q0034.pdf


 60%|██████    | 12/20 [00:37<00:09,  1.15s/it]

Attachment+B+CSE+SOW+and+Specifications+WRD+2024.docx


 65%|██████▌   | 13/20 [00:38<00:06,  1.06it/s]

Attachment+C+Exhibits+1+thru+6+WRD+2024.docx


 70%|███████   | 14/20 [00:38<00:04,  1.29it/s]

Attachment+D+Submission+Package.docx
Attachment+E+List+of+stands+by+bid+Item.pdf


 80%|████████  | 16/20 [00:40<00:03,  1.13it/s]

Attachment+G+81-1253.txt


 85%|████████▌ | 17/20 [00:41<00:02,  1.35it/s]

Attachment+A+schedule+of+items+Drum+East+FY24+CSE.docx
12444624Q0034+combined+doc.pdf


 90%|█████████ | 18/20 [00:43<00:02,  1.37s/it]

Attachment+F+Drum+East+FY24+all+maps.pdf


 95%|█████████▌| 19/20 [00:49<00:02,  2.79s/it]

WD+2015-5405+Rev+21+Date+12-26-2023.pdf


100%|██████████| 20/20 [00:53<00:00,  2.67s/it]


In [None]:
opportunities

**Todo:** Add error handling for filename too long 