# Kane County Tax Assessor Webscraper

## Imports

### Libraries

In [1]:
import requests
import pandas as pd
import os
import time
from bs4 import BeautifulSoup

pd.set_option("display.max_rows", None)

### PIN Data

In [2]:
tax_parcels = pd.read_csv("./kane_county_tax_parcels.csv")

  tax_parcels = pd.read_csv("./kane_county_tax_parcels.csv")


In [3]:
# Filter down for PINs starting with "15-27"
tax_parcels_15_27 = tax_parcels[tax_parcels["PIN"].str.startswith("15-27")]

In [4]:
pins_15_27 = list(tax_parcels_15_27["PIN"])

In [5]:
tax_parcels_15_28 = tax_parcels[tax_parcels["PIN"].str.startswith("15-28")]

In [6]:
pins_15_28 = list(tax_parcels_15_28["PIN"])

In [8]:
len(pins_15_28)

1300

## Duplicative PINs Analysis

In [6]:
tax_parcel_value_counts = tax_parcels["PIN"].value_counts()

In [11]:
duplicative_pins = list(tax_parcel_value_counts[tax_parcel_value_counts > 1].index)

In [15]:
tax_parcels[tax_parcels["PIN"].isin(duplicative_pins)].sort_values("PIN").to_csv("./duplicative_pins.csv", index=False)

In [4]:
[filename for filename in os.listdir("./kane_county_tax_files")]

[]

## Save Kane Tax Assessor Parcel Page to File

In [9]:
base_url = "https://kaneil.devnetwedge.com/parcel/view/"

def get_front_parcel_page(pin, max_retries=None, retry_delay=3, timeout=30):
    """Download the front parcel page for a PIN and save its HTML to disk.

    The page is requested from ``base_url`` followed by the PIN with its
    dashes removed, and the request is repeated until the server answers
    with HTTP 200. The HTML is then written to
    ``./kane_county_tax_files/<pin>/<year>.html``, where ``<year>`` is the
    last path segment of the final response URL.

    Args:
        pin: Parcel identification number, e.g. ``"15-27-104-009"``.
        max_retries: Number of retries allowed after the first attempt.
            ``None`` (the default) keeps retrying indefinitely.
        retry_delay: Seconds to sleep between attempts.
        timeout: Seconds to wait on the server before an attempt is treated
            as failed. Without a timeout a stalled connection would block
            the whole scrape forever.

    Returns:
        The path of the saved HTML file, or ``None`` if ``max_retries`` was
        exhausted without a successful response.
    """
    # Create string for pin to be used in URL
    url_pin = pin.replace("-", "")
    url = f"{base_url}{url_pin}"

    # Request loop for front tax parcel page
    attempt = 0
    while True:
        attempt += 1
        print(f'Making request at: {url}')
        try:
            res = requests.get(url, timeout=timeout)
        except requests.RequestException as err:
            # A dropped connection or timeout is treated like a bad status
            # code: report it and retry instead of aborting the whole run.
            print(f'Request failed: {err}')
        else:
            print(f'Request response code: {res.status_code}')
            if res.status_code == 200:
                print(f"Request successful for url: {res.url}")
                break

        if max_retries is not None and attempt > max_retries:
            print(f"Giving up on {url} after {attempt} attempts")
            return None

        print('Trying again')
        time.sleep(retry_delay)

    # Directory path for saving parcel page HTML
    directory = f"./kane_county_tax_files/{pin}"

    # Create the directory on first use; exist_ok guards against the folder
    # appearing between the existence check and the creation call.
    if not os.path.exists(directory):
        os.makedirs(directory, exist_ok=True)
        print(f"created folder : {directory}")

    # Pull year from response url
    # NOTE(review): the requested URL has no year segment, so this presumably
    # relies on the site redirecting to ".../<pin>/<year>" -- confirm.
    year = res.url.split("/")[-1]
    print(f"Year pulled from response url: {year}")

    filepath = f"{directory}/{year}.html"
    with open(filepath, "w") as f:
        f.write(res.text)
        print(f"Saved HTML to {f.name}")

    return filepath

def get_front_page_for_multiple_pins(pins_list):
    """Fetch and save the front parcel page for each PIN in ``pins_list``.

    PINs are processed in order. Before each one the function sleeps for a
    second (presumably to throttle requests to the site), prints some blank
    lines as a visual separator, and prints a banner naming the PIN.

    Args:
        pins_list: Iterable of PIN strings passed one at a time to
            ``get_front_parcel_page``.
    """
    blank_prints = 2
    for current_pin in pins_list:
        # Pause before every request, including the first one
        time.sleep(1)

        # Separate this PIN's log output from the previous one
        for _ in range(blank_prints):
            print("\n")
        print(f"***** Scraping for PIN: {current_pin} *****")

        get_front_parcel_page(current_pin)

In [None]:
get_front_page_for_multiple_pins(pins_15_27)

In [88]:
# Names of the per-PIN folders that already exist under the output directory
rootdir = "./kane_county_tax_files"
# DirEntry.name is already the bare entry name, so no path splitting is needed
complete = [entry.name for entry in os.scandir(rootdir) if entry.is_dir()]

In [89]:
len(complete)

2832

In [90]:
len(pins_15_27)

2831

## Parse HTML

In [16]:
rootdir = "./kane_county_tax_files"
complete = [it.name for it in os.scandir(rootdir) if it.is_dir()] 

In [21]:
joe = [filename for filename in os.listdir("./kane_county_tax_files") if filename != ".ipynb_checkpoints"]

In [23]:
len(joe)

2831

In [174]:
class PinDirectory:
    """A folder of saved parcel pages for a single PIN.

    Each entry in the folder is expected to be named ``<year>.html``. The
    folder is listed once at construction time.

    Attributes set in ``__init__``:
        directory_path: Path of the folder, as given.
        files: Names of the entries found in the folder.
        years: The part of each entry name before the first ``"."``.
        latest_year_filename: Last entry name in sorted order, or ``None``
            when the folder is empty.
        latest_year: Last year in sorted order, or ``None`` when the folder
            is empty.
        latest_parcel_page: Cached ``ParcelPage`` for the latest file;
            ``None`` until ``parse_latest_parcel_page`` is first called.
    """

    def __init__(self, directory_path):
        self.directory_path = directory_path
        self.files = os.listdir(directory_path)
        self.years = [file.split(".")[0] for file in self.files]
        # An empty folder must not blow up here, otherwise has_files() could
        # never report False; fall back to None for the "latest" values.
        self.latest_year_filename = max(self.files, default=None)
        self.latest_year = max(self.years, default=None)
        self.latest_parcel_page = None

    def has_files(self):
        """Return True if the folder contained at least one entry."""
        return len(self.files) > 0

    def parse_latest_parcel_page(self):
        """Return the ParcelPage for the latest-year HTML file in the folder.

        The ParcelPage is created on first use and cached afterwards.

        Raises:
            FileNotFoundError: If the folder contains no files.
        """
        if self.latest_year_filename is None:
            raise FileNotFoundError(
                f"No parcel pages found in {self.directory_path}"
            )

        # Check to see if ParcelPage for latest parcel page was made
        # if not, make it and set it
        if self.latest_parcel_page is None:
            self.latest_parcel_page = ParcelPage(
                f"{self.directory_path}/{self.latest_year_filename}"
            )

        return self.latest_parcel_page

    def tax_year_links(self):
        """Return the tax-year links found on the latest parcel page."""
        # Set latest parcel page if not set already
        parcel_page = self.parse_latest_parcel_page()

        return parcel_page.tax_year_links()
    

class ParcelPage:
    """A saved parcel page (HTML file) that is parsed lazily.

    Nothing is read from disk at construction time. The HTML is parsed with
    Beautiful Soup the first time a method needs it, and both the parsed
    document and the list of info panels are cached on the instance.
    """

    def __init__(self, filepath):
        self.filepath = filepath
        self.parsed_html = None   # BeautifulSoup document, set by parse_html()
        self.info_panels = None   # "panel-info" divs, set by parse_info_panels()

    def parse_html(self):
        """Parse the HTML file with Beautiful Soup unless already parsed."""
        if self.parsed_html is None:
            # Use a context manager so the file handle is closed right after
            # parsing; thousands of pages are parsed in one session.
            with open(self.filepath) as html_file:
                self.parsed_html = BeautifulSoup(html_file, "html.parser")

    def parcel_year_label(self):
        """Return the text of the ``#parcel-year-label`` element.

        The text is returned as-is, including surrounding whitespace.

        Raises:
            IndexError: If the page has no ``#parcel-year-label`` element.
        """
        # Check if html is parsed and if not do it
        self.parse_html()

        # Pull parcel-year-label from navbar
        return self.parsed_html.select("#parcel-year-label")[0].text

    def parse_info_panels(self):
        """Collect every ``div`` with class ``panel-info`` from the page."""
        # Check if html is parsed and if not do it
        self.parse_html()

        # Compare against None so that an empty result is cached as well
        # instead of being searched for again on every call.
        if self.info_panels is None:
            self.info_panels = self.parsed_html.find_all("div", "panel-info")

    def property_information_panel(self):
        """Return the info panel whose title contains "Property Information".

        The match is case-insensitive and the first matching panel wins.
        Panels without an ``h3.panel-title`` element are skipped.

        Raises:
            IndexError: If no panel has a matching title.
        """
        # Check if info panels are parsed
        self.parse_info_panels()

        for panel in self.info_panels:
            title = panel.find("h3", "panel-title")
            if title is not None and "property information" in title.text.lower():
                return panel

        raise IndexError(
            f"No 'Property Information' panel found in {self.filepath}"
        )

    def tax_year_links(self):
        """Return the ``href`` of every link in the tax-year dropdown menu.

        The dropdown is looked up inside the first table of the property
        information panel.
        """
        # Get panel with property information
        prop_info_panel = self.property_information_panel()

        # Get table with property information
        prop_info_table = prop_info_panel.find("table")

        # Get drop down menu
        dropdown_menu = prop_info_table.find("ul", "dropdown-menu")

        return [link['href'] for link in dropdown_menu.find_all("a")]
        
    

In [175]:
directories = [PinDirectory(f"./kane_county_tax_files/{directory}") for directory in joe]

In [176]:
# Distinct latest years found across all PIN directories
{pin_dir.latest_year for pin_dir in directories}

{'2017', '2018', '2021', '2022'}

In [177]:
directories[32].has_files()

True

In [180]:
sample_page = directories[32].parse_latest_parcel_page()

In [181]:
sample_page.tax_year_links()

['/parcel/view/1527104009/2022',
 '/parcel/view/1527104009/2021',
 '/parcel/view/1527104009/2020',
 '/parcel/view/1527104009/2019',
 '/parcel/view/1527104009/2018',
 '/parcel/view/1527104009/2017',
 '/parcel/view/1527104009/2016',
 '/parcel/view/1527104009/2015',
 '/parcel/view/1527104009/2014',
 '/parcel/view/1527104009/2013',
 '/parcel/view/1527104009/2012',
 '/parcel/view/1527104009/2011',
 '/parcel/view/1527104009/2010',
 '/parcel/view/1527104009/2009',
 '/parcel/view/1527104009/2008',
 '/parcel/view/1527104009/2007',
 '/parcel/view/1527104009/2006',
 '/parcel/view/1527104009/2005',
 '/parcel/view/1527104009/2004',
 '/parcel/view/1527104009/2003',
 '/parcel/view/1527104009/2002',
 '/parcel/view/1527104009/2001',
 '/parcel/view/1527104009/2000',
 '/parcel/view/1527104009/1999',
 '/parcel/view/1527104009/1992',
 '/parcel/view/1527104009/1988']

In [183]:
[directory.tax_year_links() for directory in directories[1322:1340]]

[['/parcel/view/1527281018/2022',
  '/parcel/view/1527281018/2021',
  '/parcel/view/1527281018/2020',
  '/parcel/view/1527281018/2019',
  '/parcel/view/1527281018/2018',
  '/parcel/view/1527281018/2017',
  '/parcel/view/1527281018/2016',
  '/parcel/view/1527281018/2015',
  '/parcel/view/1527281018/2014',
  '/parcel/view/1527281018/2013',
  '/parcel/view/1527281018/2012',
  '/parcel/view/1527281018/2011',
  '/parcel/view/1527281018/2010',
  '/parcel/view/1527281018/2009',
  '/parcel/view/1527281018/2008',
  '/parcel/view/1527281018/2007',
  '/parcel/view/1527281018/2006',
  '/parcel/view/1527281018/2005',
  '/parcel/view/1527281018/2004',
  '/parcel/view/1527281018/2003',
  '/parcel/view/1527281018/2002',
  '/parcel/view/1527281018/2001',
  '/parcel/view/1527281018/2000',
  '/parcel/view/1527281018/1999',
  '/parcel/view/1527281018/1996',
  '/parcel/view/1527281018/1993',
  '/parcel/view/1527281018/1985'],
 ['/parcel/view/1527281019/2022',
  '/parcel/view/1527281019/2021',
  '/parcel/vi

In [114]:
sample_page.parcel_year_label()

'\n                            15-27-104-009 : 2022\n                        '

In [127]:
[panel for panel in sample_page.panels() if "property information" in panel.find("h3", "panel-title").text.lower()][0]

<div class="panel panel-info">
<div class="panel-heading"><h3 class="panel-title">Property Information</h3></div>
<table class="table table-bordered">
<tr>
<td class="col-xs-4">
<div class="inner-label">Parcel Number</div>
<div class="inner-value">15-27-104-009</div>
</td>
<td class="col-xs-4" rowspan="3">
<div class="inner-label">Site Address</div>
<div class="inner-value">
                        208  S LASALLE ST<br/>
                        AURORA, IL 60505
                    </div>
</td>
<td class="col-xs-4" rowspan="3">
<div class="inner-label">Owner Name &amp; Address</div>
<div class="inner-value" style="white-space:pre-line"> MARTINEZ, HERIBERTO R &amp; SILVA, GUADALUPE A
208 S LASALLE ST APT 1
AURORA, IL, 60505-4689 </div>
</td>
</tr>
<tr>
<td>
<div class="inner-label" style="color:red">Tax Year</div>
<div class="inner-value">
<div style="font-weight:bold; color:red; display:inline;"> 2022 (Payable 2023)</div>
<div class="btn-group">
<button aria-expanded="false" aria-haspop

In [124]:
for panel in sample_page.panels():
    print(panel.find("h3", "panel-title"))

<h3 class="panel-title">Property Information</h3>
<h3 class="panel-title">No Billing Information</h3>
<h3 class="panel-title">Payment History</h3>
<h3 class="panel-title">Assessments</h3>
<h3 class="panel-title">Exemptions</h3>
<h3 class="panel-title">No Taxing Bodies Information</h3>
<h3 class="panel-title">No Redemptions</h3>
<h3 class="panel-title">No Forfeiture Information</h3>
<h3 class="panel-title">No Farmland Information</h3>
<h3 class="panel-title" style="font-family:Helvetica Neue,Helvetica,Arial,sans-serif; display:inline">
                Parcel Map
            </h3>
<h3 class="panel-title">Sales History</h3>


In [96]:
panels = sample_page.parsed_html.find_all("div", "panel-info")

In [97]:
type(panels)

bs4.element.ResultSet

In [98]:
len(panels)

11

In [86]:
directories[32].latest_parcel_page().parcel_year_label()

'\n                            15-27-104-009 : 2022\n                        '

In [54]:
parcel_page_321 = directories[321].latest_parcel_page()

In [55]:
parcel_page_321.parse_html()

In [59]:
parcel_page_321.parsed_html.select("#parcel-year-label")[0].text

'\n                            15-27-134-019 : 2022\n                        '

In [47]:
directories[322].files

['2022.html']

In [36]:
has_files_list = [pin_directory.has_files() for pin_directory in directories]

In [37]:
len(has_files_list)

2831

In [38]:
all(has_files_list)

True

In [39]:
directories[32].files

['2022.html']

In [25]:
soup = BeautifulSoup(open("./kane_county_tax_files/1521132020/2000.html"), "html.parser")

In [28]:
soup.find_all("table")[3]

<table class="table table-bordered">
<thead>
<tr>
<th>Level</th>
<th>Homesite</th>
<th>Dwelling</th>
<th>Farm Land</th>
<th>Farm Building</th>
<th>Mineral</th>
<th>Total</th>
</tr>
</thead>
<tbody>
<tr class="text-right">
<td class="text-center">DOR Equalized</td>
<td>5,110</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>5,110</td>
</tr>
<tr class="text-right">
<td class="text-center">Department of Revenue</td>
<td>5,110</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>5,110</td>
</tr>
<tr class="text-right">
<td class="text-center">Board of Review Equalized</td>
<td>5,110</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>5,110</td>
</tr>
<tr class="text-right">
<td class="text-center">Board of Review</td>
<td>5,110</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>5,110</td>
</tr>
<tr class="text-right">
<td class="text-center">S of A Equalized</td>
<td>5,110</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>0</td>
<td>5,110</td>
</tr>
<tr class="text-right">
<td class="text-ce