# Kane County Tax Assessor Webscraper

## Imports

### Libraries

In [1]:
import requests
import pandas as pd
import os
import time
from bs4 import BeautifulSoup

pd.set_option("display.max_rows", None)

### PIN Data

In [2]:
# Load the county parcel table; the cells below read its "PIN" column
tax_parcels = pd.read_csv("./kane_county_tax_parcels.csv")

  tax_parcels = pd.read_csv("./kane_county_tax_parcels.csv")


In [3]:
# Keep only the rows whose PIN begins with the "15-27" prefix
tax_parcels_15_27 = tax_parcels.loc[tax_parcels["PIN"].str.startswith("15-27")]

In [4]:
# Plain Python list of the "15-27" PIN strings, in table order
pins_15_27 = tax_parcels_15_27["PIN"].tolist()

In [5]:
# Keep only the rows whose PIN begins with the "15-28" prefix
tax_parcels_15_28 = tax_parcels.loc[tax_parcels["PIN"].str.startswith("15-28")]

In [6]:
# Plain Python list of the "15-28" PIN strings, in table order
pins_15_28 = tax_parcels_15_28["PIN"].tolist()

In [8]:
len(pins_15_28)

1300

## Duplicative PINs Analysis

In [6]:
# Number of rows in the parcel table for each distinct PIN
tax_parcel_value_counts = tax_parcels["PIN"].value_counts()

In [11]:
# PINs that appear on more than one row of the parcel table
duplicative_pins = tax_parcel_value_counts[tax_parcel_value_counts > 1].index.tolist()

In [15]:
# Export every row that shares its PIN with another row, grouped by PIN
(
    tax_parcels.loc[tax_parcels["PIN"].isin(duplicative_pins)]
    .sort_values(by="PIN")
    .to_csv("./duplicative_pins.csv", index=False)
)

In [4]:
# Show what has been saved so far (os.listdir already returns a list)
os.listdir("./kane_county_tax_files")

[]

## Save Kane Tax Assessor Parcel Page to File

In [9]:
# URL prefix for a parcel's front page; get_front_parcel_page appends the PIN with its dashes removed
base_url = "https://kaneil.devnetwedge.com/parcel/view/"

def get_front_parcel_page(pin, timeout=30, max_retries=None):
    """Download the front parcel page for one PIN and save it as HTML.

    The request URL is ``base_url`` followed by the PIN with its dashes
    removed. A failed attempt (non-200 status, or a network error raised by
    ``requests``) is retried after a 3 second pause. The page is written to
    ``./kane_county_tax_files/<pin>/<year>.html``, where ``<year>`` is the
    last ``/``-separated segment of the final response URL.

    Args:
        pin: Parcel PIN string; dashes are stripped for the URL but kept in
            the folder name.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection cannot hang the scrape indefinitely.
        max_retries: Maximum number of retries after the first attempt.
            ``None`` (the default) retries without limit.

    Raises:
        RuntimeError: If ``max_retries`` is set and every attempt failed.
    """
    # Create string for pin to be used in URL
    url_pin = pin.replace("-", "")
    url = f"{base_url}{url_pin}"

    # Request loop for front tax parcel page
    attempts = 0
    while True:
        attempts += 1
        print(f'Making request at: {url}')
        try:
            res = requests.get(url, timeout=timeout)
        except requests.RequestException as err:
            # Timeouts and connection errors are treated like a bad status:
            # they go through the same retry path instead of aborting the run
            print(f'Request failed: {err}')
        else:
            print(f'Request response code: {res.status_code}')
            if res.status_code == 200:
                print(f"Request successful for url: {res.url}")
                break

        if max_retries is not None and attempts > max_retries:
            raise RuntimeError(
                f"Giving up on {url} after {attempts} failed attempt(s)"
            )
        print('Trying again')
        time.sleep(3)

    # Directory path for saving parcel page HTML
    directory = f"./kane_county_tax_files/{pin}"

    # Create the directory on first use; exist_ok covers the case where it
    # appears between the existence check and the creation call
    if not os.path.exists(directory):
        os.makedirs(directory, exist_ok=True)
        print(f"created folder : {directory}")

    # Pull year from response url
    year = res.url.split("/")[-1]
    print(f"Year pulled from response url: {year}")

    with open(f"{directory}/{year}.html", "w") as f:
        f.write(res.text)
        print(f"Saved HTML to {f.name}")

def get_front_page_for_multiple_pins(pins_list):
    """Save the front parcel page for each PIN in ``pins_list``, in order.

    Pauses one second before every PIN and prints a banner naming the PIN
    being scraped before handing it to ``get_front_parcel_page``.
    """
    for current_pin in pins_list:
        # Pause between parcels so requests are spaced out
        time.sleep(1)

        # Two blank-line prints visually separate each parcel's log output
        for _ in range(2):
            print("\n")

        print(f"***** Scraping for PIN: {current_pin} *****")
        get_front_parcel_page(current_pin)

In [None]:
# Download and save the front parcel page for every PIN in pins_15_27
get_front_page_for_multiple_pins(pins_15_27)

In [88]:
# Names of the sub-folders that already exist under the download root
rootdir = "./kane_county_tax_files"
with os.scandir(rootdir) as entries:
    complete = [entry.name.split("/")[-1] for entry in entries if entry.is_dir()]

In [89]:
len(complete)

2832

In [90]:
len(pins_15_27)

2831

## Parse HTML

In [16]:
# Names of the sub-folders under the download root
rootdir = "./kane_county_tax_files"
with os.scandir(rootdir) as entries:
    complete = [entry.name for entry in entries if entry.is_dir()]

In [21]:
# Every entry under the download root except Jupyter's checkpoint folder
joe = [
    entry_name
    for entry_name in os.listdir("./kane_county_tax_files")
    if entry_name != ".ipynb_checkpoints"
]

In [23]:
len(joe)

2831

In [174]:
class PinDirectory:
    """A folder of saved parcel-page HTML files for a single PIN.

    The folder listing is read once, at construction time; files downloaded
    later through this object are not added to ``files`` or ``years``.

    Attributes:
        directory_path: Path of the folder this object describes.
        files: Sorted names of the entries found in the folder.
        years: Each entry name up to its first ``"."``, in the same order as
            ``files``.
        latest_year_filename: Greatest entry name by string order, or ``None``
            when the folder is empty.
        latest_year: Greatest value in ``years`` by string order, or ``None``
            when the folder is empty.
        latest_parcel_page: Cached ``ParcelPage`` for the latest file, built
            on first use.
    """

    def __init__(self, directory_path):
        self.directory_path = directory_path
        # Sorted so the listing (and everything derived from it) is deterministic
        self.files = sorted(os.listdir(directory_path))
        self.years = [filename.split(".")[0] for filename in self.files]
        # An empty folder has no latest entry; default=None keeps construction
        # from raising so that has_files() can actually report False
        self.latest_year_filename = max(self.files, default=None)
        self.latest_year = max(self.years, default=None)
        self.latest_parcel_page = None

    def has_files(self):
        """Return True when the folder held at least one entry at construction."""
        return bool(self.files)

    def parse_latest_parcel_page(self):
        """Return the ``ParcelPage`` for the latest file, creating it on first call.

        Raises:
            FileNotFoundError: If the folder was empty at construction.
        """
        if self.latest_year_filename is None:
            raise FileNotFoundError(
                f"No parcel pages saved in {self.directory_path}"
            )

        # Build the ParcelPage once and reuse it on later calls
        if self.latest_parcel_page is None:
            self.latest_parcel_page = ParcelPage(
                f"{self.directory_path}/{self.latest_year_filename}"
            )

        return self.latest_parcel_page

    def tax_year_links(self):
        """Return the tax-year links found on the latest parcel page."""
        parcel_page = self.parse_latest_parcel_page()
        return parcel_page.tax_year_links()

    def _fetch(self, url, timeout, max_retries):
        """GET ``url``, retrying every 3 seconds until a 200 response arrives.

        Raises:
            RuntimeError: If ``max_retries`` is set and every attempt failed.
        """
        attempts = 0
        while True:
            attempts += 1
            print(f'Making request at: {url}')
            try:
                res = requests.get(url, timeout=timeout)
            except requests.RequestException as err:
                # Timeouts and connection errors go through the same retry
                # path as a bad status instead of aborting the download
                print(f'Request failed: {err}')
            else:
                print(f'Request response code: {res.status_code}')
                if res.status_code == 200:
                    print(f"Request successful for url: {res.url}")
                    return res

            if max_retries is not None and attempts > max_retries:
                raise RuntimeError(
                    f"Giving up on {url} after {attempts} failed attempt(s)"
                )
            print('Trying again')
            time.sleep(3)

    def download_tax_page(self, link, timeout=30, max_retries=None):
        """Download one tax-year page and save it into this PIN's folder.

        Args:
            link: Path appended directly to ``https://kaneil.devnetwedge.com``;
                its last ``/``-separated segment becomes the saved file's
                name (``<segment>.html``).
            timeout: Seconds passed to ``requests.get`` so a stalled
                connection cannot hang indefinitely.
            max_retries: Maximum number of retries after the first attempt.
                ``None`` (the default) retries without limit.

        Raises:
            RuntimeError: If ``max_retries`` is set and every attempt failed.
        """
        base_url = "https://kaneil.devnetwedge.com"
        url = f"{base_url}{link}"

        res = self._fetch(url, timeout, max_retries)

        # Pull year from the requested url (not the response url)
        year = url.split("/")[-1]
        print(f"Year pulled from url: {year}")

        with open(f"{self.directory_path}/{year}.html", "w") as f:
            f.write(res.text)
            print(f"Saved HTML to {f.name}")

    def download_all_tax_pages(self):
        """Download every tax-year page linked from the latest parcel page.

        Sleeps one second before each download.
        """
        for link in self.tax_year_links():
            time.sleep(1)
            print("\n")
            print(f"***** Scraping {link} *****")
            self.download_tax_page(link)
            
class ParcelPage:
    """Lazily parsed view of one saved parcel-page HTML file.

    Nothing is read from disk until a method needs the parsed document; the
    parsed document and the list of info panels are cached on the instance.

    Attributes:
        filepath: Path of the HTML file.
        parsed_html: ``BeautifulSoup`` document, or ``None`` before parsing.
        info_panels: Result of searching the document for ``div.panel-info``
            elements, or ``None`` before the search has run.
    """

    def __init__(self, filepath):
        self.filepath = filepath
        self.parsed_html = None
        self.info_panels = None

    def parse_html(self):
        """Parse the HTML file with Beautiful Soup unless already parsed."""
        if self.parsed_html is None:
            # The context manager closes the file handle once parsing is done
            with open(self.filepath) as html_file:
                self.parsed_html = BeautifulSoup(html_file, "html.parser")

    def parcel_year_label(self):
        """Return the text of the first element with id ``parcel-year-label``.

        Raises:
            IndexError: If the page has no such element.
        """
        self.parse_html()
        return self.parsed_html.select("#parcel-year-label")[0].text

    def parse_info_panels(self):
        """Collect every ``div`` with class ``panel-info`` unless already collected."""
        self.parse_html()

        # Compare against None so that a page with zero panels is searched once
        if self.info_panels is None:
            self.info_panels = self.parsed_html.find_all("div", "panel-info")

    def property_information_panel(self):
        """Return the first info panel whose title contains "property information".

        The title is the panel's ``h3.panel-title`` element, matched
        case-insensitively. Panels without such a title are skipped.

        Raises:
            IndexError: If no panel matches.
        """
        self.parse_info_panels()

        for panel in self.info_panels:
            title = panel.find("h3", "panel-title")
            if title is not None and "property information" in title.text.lower():
                return panel

        # Same exception type the list-indexing version raised, with context added
        raise IndexError(f'No "Property Information" panel found in {self.filepath}')

    def tax_year_links(self):
        """Return the ``href`` of every link in the property panel's dropdown menu.

        The dropdown is the ``ul.dropdown-menu`` inside the first ``table`` of
        the "Property Information" panel. Anchors without an ``href``
        attribute are ignored.
        """
        # Get panel with property information
        prop_info_panel = self.property_information_panel()

        # Get table with property information
        prop_info_table = prop_info_panel.find("table")

        # Get drop down menu
        dropdown_menu = prop_info_table.find("ul", "dropdown-menu")

        # href=True restricts the search to anchors that carry an href attribute
        return [link['href'] for link in dropdown_menu.find_all("a", href=True)]
        
    

In [175]:
# One PinDirectory per PIN folder name collected in `joe`
directories = [
    PinDirectory(f"./kane_county_tax_files/{pin_folder}")
    for pin_folder in joe
]

In [176]:
# Distinct latest years across all PIN folders
{pin_directory.latest_year for pin_directory in directories}

{'2017', '2018', '2021', '2022'}