In [1]:
import sys
# Make the backend package importable from this notebook by registering its
# directory on the module search path (only once, so re-running is harmless).
backend_path = '../backend'
if sys.path.count(backend_path) == 0:
    sys.path.append(backend_path)

In [2]:
from sqlalchemy import create_engine, select, values, update, and_, or_
from sqlalchemy.orm import sessionmaker
from dotenv import load_dotenv
from app.models.models import Notice, ResourceLink
from app.models.schema import NoticeBase, ResourceLinkBase
import pendulum
import tempfile
import requests
import os
import logging
import os
from zipfile import BadZipfile
import re

import textract


# Logger named after the current module (`__name__`).
logger = logging.getLogger(__name__)

In [3]:
# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()
# The connection string can be overridden through NOTEBOOK_DATABASE_URL (set in
# the shell or in the .env file loaded above) so credentials do not have to be
# edited into the notebook. When the variable is unset the local default below
# is used.
DATABASE_URL = os.environ.get(
    "NOTEBOOK_DATABASE_URL",
    "postgresql+psycopg2://airflow:airflow@localhost:5432/airflow",
)

In [4]:
# Engine for DATABASE_URL, plus a factory producing sessions bound to it with
# autoflush and autocommit switched off.
engine = create_engine(DATABASE_URL)
SessionLocal = sessionmaker(bind=engine, autoflush=False, autocommit=False)
# Yesterday's date in UTC, rendered as YYYYMMDD.
selected_date = (
    pendulum.now("utc")
    .subtract(days=1)
    .strftime("%Y%m%d")
)

In [5]:
def _swap_suffix(path, old_suffix, new_suffix):
    """Return `path` with its trailing `old_suffix` replaced by `new_suffix`.

    Only the end of the string is rewritten, so a matching fragment earlier in
    the path (for example a directory called "archive.docs") is left untouched.
    Callers must have checked that `path` ends with `old_suffix`.
    """
    return path[: -len(old_suffix)] + new_suffix


def get_doc_text(file_name, rm=True):
    """Textract a doc given its path

    Arguments:
        file_name {str} -- path to a doc

    Keyword Arguments:
        rm {bool} -- delete the file once extraction has been attempted,
            whether or not it succeeded (default: {True})

    Returns:
        str -- the extracted text with surrounding whitespace stripped, or an
            empty string when no text could be extracted
    """
    # The file may be renamed below to give textract the right extension.
    # Track where it currently lives so cleanup removes the file that exists.
    current_path = file_name
    text = ""
    try:
        b_text = None
        try:
            b_text = textract.process(file_name, encoding="utf-8", errors="ignore")
        # ShellError with antiword occurs when an rtf is saved with a doc extension
        except textract.exceptions.ShellError as e:
            if "antiword" in str(e) and file_name.endswith(".doc"):
                new_name = _swap_suffix(file_name, ".doc", ".rtf")
                os.rename(file_name, new_name)
                current_path = new_name
                try:
                    b_text = textract.process(
                        new_name, encoding="utf-8", errors="ignore"
                    )
                except textract.exceptions.ShellError as ex:
                    logger.error(
                        "Error extracting text from a DOC file. Check that all dependencies of textract are installed.\n{}".format(
                            ex
                        )
                    )
            else:
                # Any other shell failure used to be dropped silently.
                logger.warning(
                    f"Exception occurred textracting {file_name}: {e}", exc_info=True
                )
        except textract.exceptions.MissingFileError as e:
            logger.error(
                f"Couldn't textract {file_name} since the file couldn't be found: {e}",
                exc_info=True,
            )
        # This can be raised when a pdf is incorrectly saved as a .docx (GH183)
        except BadZipfile as e:
            if file_name.endswith(".docx"):
                new_name = _swap_suffix(file_name, ".docx", ".pdf")
                os.rename(file_name, new_name)
                current_path = new_name
                b_text = textract.process(
                    new_name, encoding="utf-8", method="pdftotext", errors="ignore"
                )
            else:
                logger.warning(
                    f"Exception occurred textracting {file_name}: {e}", exc_info=True
                )
        # TypeError is raised when None is passed to str.decode()
        # This happens when textract can't extract text from scanned documents
        except TypeError:
            b_text = None
        except Exception as e:
            if re.match("^(.*) file; not supported", str(e)):
                logger.warning(f"'{file_name}' is type {str(e)}")
            elif re.match("^The filename extension .zip is not yet supported", str(e)):
                logger.warning(
                    f"'{file_name}' is type zip and not supported by textract"
                )
            else:
                logger.warning(
                    f"Exception occurred textracting {file_name}: {e}", exc_info=True
                )
            b_text = None
        text = b_text.decode("utf8", errors="ignore").strip() if b_text else ""
    except Exception as e:
        # Reached when a fallback above fails (e.g. the rename or the retry).
        logger.error(
            f"Error uncaught when trying to parse file {file_name}. Giving up and returning an empty string. {e}",
            exc_info=True,
        )
        text = ""
    finally:
        # Clean up here rather than with `return` inside `finally`, so that
        # non-Exception errors such as KeyboardInterrupt are not swallowed.
        if rm:
            try:
                os.remove(current_path)
            except Exception as e:
                logger.error(f"Unable to remove {current_path}: {e}", exc_info=True)

    return text

In [6]:
with SessionLocal() as session:
    # Distinct notice_id values present on ResourceLink rows.
    subquery = select(ResourceLink.notice_id).distinct()
    # At most 20 ResourceLink rows whose notice_id appears in the subquery and
    # whose text column is NULL.
    stmt = (
        select(ResourceLink)
        .where(
            and_(
                ResourceLink.notice_id.in_(subquery),
                ResourceLink.text.is_(None),
            )
        )
        .limit(20)
    )
    results = session.execute(stmt).scalars().all()
    # model_dump() is the pydantic v2 counterpart of model_validate(); the
    # older .dict() spelling is deprecated there.
    resource_links = [
        ResourceLinkBase.model_validate(result).model_dump() for result in results
    ]

In [7]:
# Preview the first two entries of resource_links.
resource_links[:2]

[{'id': 53,
  'url': 'https://sam.gov/api/prod/opps/v3/opportunities/resources/files/979dabe2bcaa4019b902e4397f5c9a59/download?api_key=null&token=',
  'text': None},
 {'id': 54,
  'url': 'https://sam.gov/api/prod/opps/v3/opportunities/resources/files/cefae3bd5418491f9a2a789a4703ca9c/download?api_key=null&token=',
  'text': None}]

In [8]:
# Request the URL of the first resource link. requests applies no timeout by
# default, so an explicit one keeps this cell from hanging indefinitely if the
# server never answers.
res = requests.get(resource_links[0].get("url"), timeout=30)

In [9]:
# Display the headers of the response currently stored in `res`.
res.headers

{'x-amz-id-2': 'bmx6QDaZkt6NoKSR+EYS5z5+4z5ZFqqTjph1ukYai7UVQMaDsMhD2VTStFe1kAxBHKmaVvBafc+Uu1QXFPq43IXEsLG7qY6c00efLuouO7U=', 'x-amz-request-id': 'SBY23N6N6CXK05Q7', 'Date': 'Wed, 13 Mar 2024 12:47:18 GMT', 'Last-Modified': 'Mon, 11 Mar 2024 15:20:21 GMT', 'ETag': '"1aa2b353a8d198277724548e970c907f"', 'x-amz-tagging-count': '2', 'x-amz-server-side-encryption': 'AES256', 'x-amz-version-id': 'Veua3.BTQcOnKkRcnL4keUaJ6Kkxq2sv', 'Content-Disposition': 'attachment; filename=1240LP24Q0021-+Laramie+RD+Trash+Collection.pdf', 'Accept-Ranges': 'bytes', 'Content-Type': 'application/octet-stream', 'Server': 'AmazonS3', 'Content-Length': '901388'}

The SAM.gov download URL responds with a redirect to S3.

In [22]:
# Display the response object; its repr shows the HTTP status code.
res

<Response [403]>

In [23]:
# Display the headers of the response currently stored in `res`.
res.headers

{'x-amz-request-id': '3ZRGTB4KG6TTT7HZ', 'x-amz-id-2': '26jWc1VZjw6/lFc+8OpSYdM/qWJwTWhsGE/Mtet/icDMQD9owi7/qYntUBkQzTb9xad6TKlhurU=', 'Content-Type': 'application/xml', 'Date': 'Wed, 13 Mar 2024 12:55:54 GMT', 'Server': 'AmazonS3'}

In [13]:
# NOTE(review): none of the response headers displayed elsewhere in this
# notebook contain a 'Location' key, and the execution counts are out of order,
# so `res` here presumably came from a cell that is no longer present (e.g. a
# request made without following redirects) -- confirm. As written, this lookup
# raises KeyError when the header is absent.
location = res.headers['Location']

In [14]:
# HEAD the redirect target to read its headers without downloading the body.
# requests applies no timeout by default, so set one to avoid hanging.
res = requests.head(location, timeout=30)
res

<Response [403]>

In [15]:
# Display the numeric HTTP status code of the response stored in `res`.
res.status_code

403

In [18]:
# HEAD the original URL and follow redirects (requests.head does not follow
# them unless asked). requests applies no timeout by default, so set one to
# avoid hanging.
res = requests.head(resource_links[0].get("url"), allow_redirects=True, timeout=30)
res

<Response [403]>

Because the HEAD requests above return 403, it seems the entire file will have to be streamed in order to check its Content-Length. I will look into alternative optimizations once a baseline of the application logic is up and running.