<a href="https://colab.research.google.com/github/mitchelllierman/kurt/blob/master/kurt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install tabula-py

Collecting tabula-py
  Downloading tabula_py-2.9.0-py3-none-any.whl (12.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tabula-py
Successfully installed tabula-py-2.9.0


In [2]:
import tabula
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup
import requests
import re
import os
import zipfile
from google.colab import files
import shutil

In [5]:
pdf_filename = "IDEM pdf.pdf"

# The conversion step reads the PDF from the "data" directory, so that is the
# location that has to be checked and populated here.
data_dir = "data"
pdf_path = os.path.join(data_dir, pdf_filename)

# Define the URL of the PDF file in the GitHub repo
pdf_url = f"https://raw.githubusercontent.com/mitchelllierman/kurt/master/{pdf_filename}"

if os.path.exists(pdf_path):
    print("PDF file already exists in the data directory.")
elif os.path.exists(pdf_filename):
    # A copy supplied next to the notebook is reused instead of downloading again.
    os.makedirs(data_dir, exist_ok=True)
    shutil.copyfile(pdf_filename, pdf_path)
    print("PDF file found in the current directory and copied to the data directory.")
else:
    print("PDF file not found in the data directory.")

    # Create the directory if it doesn't exist
    os.makedirs(data_dir, exist_ok=True)

    # Download the PDF file from the GitHub repo
    response = requests.get(pdf_url, timeout=60)
    # Fail loudly on 404/5xx instead of saving an error page as "the PDF".
    response.raise_for_status()

    # Save the PDF file to the data directory
    with open(pdf_path, "wb") as output_file:
        output_file.write(response.content)

    print("PDF file downloaded and saved to the data directory.")

PDF file not found in the current directory.
PDF file downloaded and saved to the data directory.


In [7]:
# Extract the tables from every page of the PDF and write them to output.csv
tabula.convert_into(
    "data/IDEM pdf.pdf",
    "output.csv",
    output_format="csv",
    pages='all',
)




In [8]:
# Read in the spreadsheet listing sites
bf_data = pd.read_csv('output.csv').rename(columns={"BFD#": "BFD"})

# Normalised copy of the City column used only for matching: missing values become
# empty strings, surrounding whitespace is dropped and case is ignored, so
# multi-word names such as "Fort Wayne" match however the user types them.
city_keys = bf_data['City'].fillna("").astype(str).str.strip().str.casefold()

bf_city = []
no_result = ""
while len(bf_city) == 0:
  city_query = input(f'{no_result}What city would you like to find sites for? ').strip()

  # Select the Brownfield sites located in the requested city.
  # An empty answer is never looked up; it simply re-prompts.
  if city_query:
    bf_city = bf_data[city_keys == city_query.casefold()]

  if len(bf_city) == 0:
    # Trailing space keeps the message from running into the repeated prompt.
    no_result = f"{city_query} returned no results. "
  else:
    # Use the spelling from the data for messages and output folder names.
    city_query = str(bf_city['City'].iloc[0]).strip()
    print(f"{city_query} returned {len(bf_city)} results.")



What city would you like to find sites for? Kouts
Kouts returned no results.What city would you like to find sites for? Evansville
Evansville returned 58 results.


In [9]:

def search_Id(id, count=100):
    """Query the IDEM digital file cabinet for documents matching a site ID.

    Args:
        id: Identifier to search for; it is placed between URL-encoded
            ``<ftx>`` / ``</ftx>`` tags in the ``QueryText`` parameter.
        count: Value sent as ``ResultCount`` (defaults to 100).

    Returns:
        The ``requests`` response for the search-results page, sorted by
        ``xProgram`` in descending order.
    """
    cabinet_base = "https://ecm.idem.in.gov/cs/"
    oracle_pls = "idcplg?IdcService=GET_SEARCH_RESULTS&QueryText="
    query = f"%3cftx%3e{id}%3c%2fftx%3e"
    sort = "&SortField=xProgram"
    order = "&SortOrder=Desc"
    # Honour the caller's requested page size instead of a hard-coded 100.
    results = f"&ResultCount={count}"
    id_search = cabinet_base + oracle_pls + query + sort + order + results

    # Without a timeout a stalled server would hang the notebook indefinitely.
    return requests.get(id_search, timeout=60)



In [10]:

def extract_Data(search_result):
    """Pull document metadata out of a search-results page.

    Args:
        search_result: Response object whose ``.text`` is the HTML of the
            results page.

    Returns:
        ``1`` when the page has no ``table`` with class ``xuiListTable``.
        Otherwise a dict of four parallel lists:

        * ``'link'``    - ``href`` of the anchor in cell 0
        * ``'date'``    - cell 2, re-formatted from ``MM/DD/YYYY`` to ``YYYY_MM_DD``
        * ``'program'`` - stripped text of cell 4 (e.g. Brownfield)
        * ``'doctype'`` - stripped text of cell 6 (e.g. Brownfield Completion Document)

        Rows that lack any of these pieces are skipped so the lists always
        stay the same length.
    """
    extracted = {'link': [],
                 'date': [],
                 'program': [],
                 'doctype': []}

    # Load the HTML content
    soup = BeautifulSoup(search_result.text, 'html.parser')

    # Find the table
    table = soup.find('table', {'class': 'xuiListTable'})

    if table is None:
        return 1

    # The first row is the header; every following row is a candidate document.
    for row in table.find_all('tr')[1:]:
        cells = row.find_all('td')

        # Cells 0, 2, 4 and 6 are read below, so a usable row needs at least 7.
        if len(cells) < 7:
            continue

        anchor = cells[0].find('a')
        if anchor is None or not anchor.get('href'):
            continue

        date_text = cells[2].text.strip()
        try:
            date_object = datetime.strptime(date_text, "%m/%d/%Y")
        except ValueError:
            print(f"Skipping row with unrecognised date {date_text!r}")
            continue

        # Append to all four lists together, only once the whole row has parsed.
        extracted['link'].append(anchor['href'])
        extracted['date'].append(date_object.strftime("%Y_%m_%d"))
        # Stripped because these values end up in file names.
        extracted['program'].append(cells[4].text.strip())
        extracted['doctype'].append(cells[6].text.strip())
    # TODO: page handling (only the first page of results is read)
    return extracted



In [11]:

def save_PDFs(site_name, extracted):
    """Download every linked document for one site and save it as a PDF.

    Files are written to ``{city_query}_sites/{site_name}/`` (``city_query`` is
    the module-level city chosen earlier; the directory must already exist) and
    are named ``{date}_{site_name}_{program}_{doctype}.pdf``.

    Args:
        site_name: File-system-safe site name, also used as the folder name.
        extracted: Dict of parallel lists with keys ``'link'``, ``'date'``,
            ``'program'`` and ``'doctype'``.

    Notes:
        * Characters that cannot appear in file names are replaced by ``_`` in
          the program and document-type parts.
        * When several documents of one call would get the same name, the
          second and later ones get a ``_2``, ``_3``, ... suffix instead of
          overwriting the first.
        * A failed download is reported and skipped; the rest still run.
    """
    unsafe_chars = r'[<>:"/\\|?*\x00-\x1f]'
    name_counts = {}

    for i, link in enumerate(extracted['link']):
        print(link)
        try:
            pdf = requests.get(link, timeout=60)
        except requests.RequestException as exc:
            print(f"Failed to download PDF from {link}: {exc}")
            continue

        if pdf.status_code != 200:
            print(f"Failed to download PDF from {link}")
            continue

        date = extracted['date'][i]
        program = re.sub(unsafe_chars, "_", str(extracted['program'][i]).strip())
        doc = re.sub(unsafe_chars, "_", str(extracted['doctype'][i]).strip())

        # Disambiguate documents that share date, program and document type.
        stem = f'{date}_{site_name}_{program}_{doc}'
        already_used = name_counts.get(stem, 0)
        name_counts[stem] = already_used + 1
        if already_used:
            stem = f'{stem}_{already_used + 1}'

        # Save the PDF
        fname = f'{city_query}_sites/{site_name}/{stem}.pdf'

        with open(fname, 'wb') as file:
            file.write(pdf.content)

    # TODO: alternative download via curl -o downloaded_file.pdf "<link>"
    # TODO: while making the PDF, make a txt too? (PDF miner)



In [12]:

def kurt_Loop(bf_ids):
    """Search, scrape and download the documents of every site in ``bf_ids``.

    For each row the ``BFD`` id is searched with ``search_Id``, the result page
    is parsed with ``extract_Data`` and the documents are stored by
    ``save_PDFs`` in ``{city_query}_sites/<sanitised site name>/``
    (``city_query`` is the module-level city chosen earlier).

    Args:
        bf_ids: DataFrame with a ``"BFD"`` column (site ids) and a ``"Name"``
            column (site names).

    Returns:
        The name of the top-level output directory, ``f'{city_query}_sites'``.
        It is returned even when individual sites fail or the run is
        interrupted, so partial results can still be zipped.
    """
    output_dir = f'{city_query}_sites'
    # exist_ok lets the cell be re-run without deleting the folder first.
    os.makedirs(output_dir, exist_ok=True)

    extraction_errors = []
    failed_sites = []
    used_names = set()
    try:
        for id, raw_name in zip(bf_ids["BFD"], bf_ids["Name"]):
            try:
                # Requests the data from IDEM digital file cabinet using requests
                search_result = search_Id(id)

                # Scrape the results pages.
                extracted = extract_Data(search_result)
                if extracted == 1:
                    extraction_errors.append(id)
                    continue

                # Make the site name safe to use as a folder name.
                # str() guards against a missing (NaN) name.
                site_name = re.sub(r"[()<>:\"/\\|?&#*. ]", "_", str(raw_name))
                site_name = re.sub(r"[\r\n]+", "", site_name)

                # Two sites may sanitise to the same name; keep them apart by id.
                if site_name in used_names:
                    site_name = f"{site_name}_{id}"
                used_names.add(site_name)

                os.makedirs(os.path.join(output_dir, site_name), exist_ok=True)

                save_PDFs(site_name, extracted)
            except Exception as exc:
                # One broken site must not abort the remaining ones, but the
                # failure is reported instead of being silently discarded.
                failed_sites.append(id)
                print(f"Site {id} failed: {exc!r}")
    except KeyboardInterrupt:
        print("Interrupted; keeping the documents downloaded so far.")
    finally:
        print("The following IDs returned no results:")
        print(extraction_errors)
        if failed_sites:
            print("The following IDs failed with an error:")
            print(failed_sites)

    return output_dir


In [13]:

# Run the scrape for the selected city; `docs` receives the output folder name.
docs = kurt_Loop(bf_city)


https://ecm.idem.in.gov/cs/idcplg?IdcService=GET_FILE&dID=80151931&dDocName=80151966&Rendition=web&allowInterrupt=1&noSaveAs=1
https://ecm.idem.in.gov/cs/idcplg?IdcService=GET_FILE&dID=4673592&dDocName=67406543&Rendition=web&allowInterrupt=1&noSaveAs=1
https://ecm.idem.in.gov/cs/idcplg?IdcService=GET_FILE&dID=4040004&dDocName=63940677&Rendition=web&allowInterrupt=1&noSaveAs=1
https://ecm.idem.in.gov/cs/idcplg?IdcService=GET_FILE&dID=3738684&dDocName=56061089&Rendition=web&allowInterrupt=1&noSaveAs=1
https://ecm.idem.in.gov/cs/idcplg?IdcService=GET_FILE&dID=3738683&dDocName=56061084&Rendition=web&allowInterrupt=1&noSaveAs=1
https://ecm.idem.in.gov/cs/idcplg?IdcService=GET_FILE&dID=3795601&dDocName=56061209&Rendition=web&allowInterrupt=1&noSaveAs=1
https://ecm.idem.in.gov/cs/idcplg?IdcService=GET_FILE&dID=2424387&dDocName=14823237&Rendition=web&allowInterrupt=1&noSaveAs=1
https://ecm.idem.in.gov/cs/idcplg?IdcService=GET_FILE&dID=4091006&dDocName=64463489&Rendition=web&allowInterrupt=1&no

In [14]:
def zip_directory(directory_path, zip_file_path):
    """Create a deflate-compressed zip holding every ``.pdf`` under a directory.

    Args:
        directory_path: Directory that is walked recursively.
        zip_file_path: Path of the zip file to (over)write.

    Entries are stored relative to the parent of ``directory_path``, so the
    archive keeps the top-level folder name and the folder structure below it.
    Files whose names do not end in ``.pdf`` are left out.
    """
    # Archive names are computed against the parent of the zipped directory.
    archive_root = os.path.join(directory_path, '..')

    with zipfile.ZipFile(zip_file_path, 'w', zipfile.ZIP_DEFLATED) as archive:
        for folder, _subfolders, filenames in os.walk(directory_path):
            for filename in filenames:
                if not filename.endswith('.pdf'):
                    continue
                source_path = os.path.join(folder, filename)
                archive.write(source_path, arcname=os.path.relpath(source_path, archive_root))

# Usage: zip the folder named by `docs` into "<docs>.zip"
directory_to_zip = docs
output_zip_file = f'{docs}.zip'
zip_directory(directory_to_zip, output_zip_file)

In [None]:
# Path of the archive to download: "<docs>.zip" (same name the zip step writes)
zip_file = f'{docs}.zip'

# Download the file to the user's local machine
files.download(zip_file)