In [29]:
import os
from bs4 import BeautifulSoup

In [30]:
class PinDirectory:
    """Folder of saved parcel pages (one HTML file per tax year) for one PIN.

    The folder is listed once, at construction time. Pages written later by
    ``download_tax_page`` do not show up in ``files``/``years`` until a new
    ``PinDirectory`` is created for the same path.

    The download methods call ``requests.get`` and ``time.sleep``; both
    ``requests`` and ``time`` must already be imported in the notebook
    namespace before those methods are used.

    Attributes:
        directory_path: Path of the folder, exactly as passed in.
        files: Entry names returned by ``os.listdir(directory_path)``.
        years: Each entry name truncated at its first ``"."``.
        latest_year_filename: Greatest entry name in string order, or
            ``None`` when the folder is empty.
        latest_year: Greatest value in ``years`` in string order, or ``None``
            when the folder is empty.
        latest_parcel_page: Cached ``ParcelPage`` for ``latest_year_filename``;
            ``None`` until ``parse_latest_parcel_page`` is first called.
    """

    def __init__(self, directory_path):
        self.directory_path = directory_path
        self.files = os.listdir(directory_path)
        self.years = [name.split(".")[0] for name in self.files]
        # max(..., default=None) rather than sorted(...)[-1]: an empty folder
        # must not crash the constructor, otherwise has_files() could never
        # report False.
        self.latest_year_filename = max(self.files, default=None)
        self.latest_year = max(self.years, default=None)
        self.latest_parcel_page = None

    def has_files(self):
        """Return True when the folder contained at least one entry."""
        return len(self.files) > 0

    def has_all_files(self):
        """Return True when the saved years equal the years linked from the page.

        The expected years are the last path segment of every link returned by
        ``tax_year_links()``. An empty folder has no page to read the links
        from, so it is reported as incomplete.
        """
        if not self.has_files():
            return False

        tax_years = {link.split("/")[-1] for link in self.tax_year_links()}
        return set(self.years) == tax_years

    def parse_latest_parcel_page(self):
        """Return the ``ParcelPage`` for the latest saved file, creating it once.

        Raises:
            FileNotFoundError: If the folder held no files when it was listed.
        """
        if self.latest_parcel_page is None:
            if self.latest_year_filename is None:
                raise FileNotFoundError(
                    f"No saved parcel pages in {self.directory_path}"
                )
            self.latest_parcel_page = ParcelPage(
                f"{self.directory_path}/{self.latest_year_filename}"
            )

        return self.latest_parcel_page

    def tax_year_links(self):
        """Return the tax year links found on the latest saved parcel page."""
        return self.parse_latest_parcel_page().tax_year_links()

    def download_tax_page(self, link, max_retries=None, retry_delay=3):
        """Download one parcel page and save it as ``<year>.html`` in the folder.

        Args:
            link: Site-relative link; it is appended to the base URL and its
                last ``/``-separated segment is used as the year in the
                file name.
            max_retries: Number of additional attempts allowed after a
                non-200 response. ``None`` (the default) retries until a 200
                response arrives.
            retry_delay: Seconds to sleep between attempts.

        Raises:
            RuntimeError: If ``max_retries`` is set and every attempt returned
                a status code other than 200.
        """
        base_url = "https://kaneil.devnetwedge.com"
        url = f"{base_url}{link}"
        failed_attempts = 0

        # Request loop for tax parcel page
        while True:
            print(f'Making request at: {url}')
            res = requests.get(url)
            print(f'Request response code: {res.status_code}')
            if res.status_code == 200:
                print(f"Request successful for url: {res.url}")
                break

            failed_attempts += 1
            if max_retries is not None and failed_attempts > max_retries:
                raise RuntimeError(
                    f"Giving up on {url} after {failed_attempts} attempts "
                    f"(last status code: {res.status_code})"
                )
            print('Trying again')
            time.sleep(retry_delay)

        # The year is the last segment of the URL that was requested
        year = url.split("/")[-1]
        print(f"Year pulled from url: {year}")

        with open(f"{self.directory_path}/{year}.html", "w") as f:
            f.write(res.text)
            print(f"Saved HTML to {f.name}")

    def download_all_tax_pages(self):
        """Download every tax year page linked from the latest saved page.

        Sleeps one second before each download.
        """
        for link in self.tax_year_links():
            time.sleep(1)
            print("\n")
            print(f"***** Scraping {link} *****")
            self.download_tax_page(link)
            
class ParcelPage:
    """Lazily parsed view of one saved parcel page (an HTML file on disk).

    Nothing is read in the constructor. The file is parsed on first use and
    the parsed document and its info panels are cached on the instance.

    Attributes:
        filepath: Path of the HTML file.
        parsed_html: ``BeautifulSoup`` document, or ``None`` until parsed.
        info_panels: Result of the ``div.panel-info`` search, or ``None``
            until ``parse_info_panels`` has run.
    """

    def __init__(self, filepath):
        self.filepath = filepath
        self.parsed_html = None
        self.info_panels = None

    def parse_html(self):
        """Parse the HTML file with Beautiful Soup unless already parsed."""
        if self.parsed_html is None:
            # Context manager so the file handle is closed once it is parsed
            with open(self.filepath) as html_file:
                self.parsed_html = BeautifulSoup(html_file, "html.parser")

    def parcel_year_label(self):
        """Return the text of the first element matching ``#parcel-year-label``.

        Raises:
            IndexError: If the page has no element with that id.
        """
        self.parse_html()
        return self.parsed_html.select("#parcel-year-label")[0].text

    def parse_info_panels(self):
        """Collect every ``div`` with class ``panel-info`` unless already done."""
        self.parse_html()

        # Compare against None so a page with zero panels is also cached
        # instead of being searched again on every call.
        if self.info_panels is None:
            self.info_panels = self.parsed_html.find_all("div", "panel-info")

    def property_information_panel(self):
        """Return the first info panel whose title contains "property information".

        The title is the panel's ``h3.panel-title`` text, compared
        case-insensitively. Panels without such a title are skipped.

        Raises:
            IndexError: If no panel has a matching title.
        """
        self.parse_info_panels()

        for panel in self.info_panels:
            title = panel.find("h3", "panel-title")
            if title is not None and "property information" in title.text.lower():
                return panel

        raise IndexError(
            f"No 'Property Information' panel found in {self.filepath}"
        )

    def tax_year_links(self):
        """Return the ``href`` of every link in the property table's dropdown.

        The dropdown is the ``ul.dropdown-menu`` inside the first ``table`` of
        the "Property Information" panel. Anchors without an ``href`` are
        skipped.

        Raises:
            IndexError: If the page has no "Property Information" panel.
            AttributeError: If that panel has no ``table``, or the table has
                no ``ul.dropdown-menu`` (``find`` returns ``None``).
        """
        prop_info_panel = self.property_information_panel()
        prop_info_table = prop_info_panel.find("table")
        dropdown_menu = prop_info_table.find("ul", "dropdown-menu")

        return [link["href"] for link in dropdown_menu.find_all("a", href=True)]
        

In [31]:
# Root folder that holds one sub-folder per PIN
TAX_FILES_DIR = "./kane_county_tax_files"

# Names of the PIN sub-folders. Jupyter's checkpoint folder is skipped, and so
# is anything that is not a directory: PinDirectory lists each path with
# os.listdir, which fails on a regular file.
joe = [
    filename
    for filename in os.listdir(TAX_FILES_DIR)
    if filename != ".ipynb_checkpoints"
    and os.path.isdir(f"{TAX_FILES_DIR}/{filename}")
]

# One PinDirectory per PIN sub-folder
directories = [PinDirectory(f"{TAX_FILES_DIR}/{directory}") for directory in joe]

In [32]:
# Number of PinDirectory objects built from the tax-files folder
len(directories)

2831

In [33]:
# Spot-check one PIN folder: do its saved years equal the years linked from its latest page?
directories[3].has_all_files()

True

In [46]:
# Keep only the PIN folders for which has_all_files() is true
complete = list(filter(lambda pin_directory: pin_directory.has_all_files(), directories))

In [47]:
# Number of PIN folders that passed has_all_files()
len(complete)

1065

In [45]:
# Scratch arithmetic. 2831 matches the len(directories) output in this notebook;
# where 1766 comes from is not shown here.
# NOTE(review): presumably the count of incomplete folders -- confirm, or compute
# len(directories) - len(complete) instead of hard-coding the numbers.
2831 - 1766

1065

In [42]:
# Total PIN folder count again (same expression as an earlier cell)
len(directories)

2831

In [28]:
# Preview the folder paths of the first 100 PinDirectory objects
[d.directory_path for d in directories[0:100]]

['./kane_county_tax_files/15-27-101-001',
 './kane_county_tax_files/15-27-102-001',
 './kane_county_tax_files/15-27-102-003',
 './kane_county_tax_files/15-27-102-004',
 './kane_county_tax_files/15-27-102-006',
 './kane_county_tax_files/15-27-102-007',
 './kane_county_tax_files/15-27-103-001',
 './kane_county_tax_files/15-27-103-002',
 './kane_county_tax_files/15-27-103-003',
 './kane_county_tax_files/15-27-103-004',
 './kane_county_tax_files/15-27-103-005',
 './kane_county_tax_files/15-27-103-006',
 './kane_county_tax_files/15-27-103-007',
 './kane_county_tax_files/15-27-103-008',
 './kane_county_tax_files/15-27-103-009',
 './kane_county_tax_files/15-27-103-010',
 './kane_county_tax_files/15-27-103-011',
 './kane_county_tax_files/15-27-103-012',
 './kane_county_tax_files/15-27-103-013',
 './kane_county_tax_files/15-27-103-014',
 './kane_county_tax_files/15-27-103-015',
 './kane_county_tax_files/15-27-103-016',
 './kane_county_tax_files/15-27-103-017',
 './kane_county_tax_files/15-27-10