In [1]:
import pandas as pd
import numpy as np
import time
import random
import os
import re
from io import StringIO

from deepparse import download_from_public_repository
from deepparse.dataset_container import PickleDatasetContainer
from deepparse.parser import AddressParser

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile

# Import the exception class
from selenium.common.exceptions import NoSuchElementException,ElementNotInteractableException  
from selenium.webdriver.common.keys import Keys
import json

In [2]:
options = Options()
firefox_profile = FirefoxProfile()
firefox_profile.set_preference("javascript.enabled", True)
options.profile = firefox_profile
driver = webdriver.Firefox(options=options)

def get_free_proxies(driver):
    driver.get('https://sslproxies.org')

    table = driver.find_element(By.TAG_NAME, 'table')
    thead = table.find_element(By.TAG_NAME, 'thead').find_elements(By.TAG_NAME, 'th')
    tbody = table.find_element(By.TAG_NAME, 'tbody').find_elements(By.TAG_NAME, 'tr')

    headers = []
    for th in thead:
        headers.append(th.text.strip())

    proxies = []
    for tr in tbody:
        proxy_data = {}
        tds = tr.find_elements(By.TAG_NAME, 'td')
        for i in range(len(headers)):
            proxy_data[headers[i]] = tds[i].text.strip()
        proxies.append(proxy_data)
    driver.quit()
    return proxies


free_proxies = get_free_proxies(driver)

In [6]:
#URL of webscraping
BASE_URL = 'https://wedge.hcauditor.org/'

my_user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0'

# Year constraints
NUM_YEARS = 15
YEAR_START = 2009

# The radio button to select to search by sales. 
SALES_XPATH = '//*[@id="search_radio_sales"]'

# Form values to filter search.
query_values = ['375000', '100000', '1300', '800', '2']
query_ids = ['sale_date_high', 'sale_date_low', 'sale_price_high', 'sale_price_low', 'finished_sq_ft_high', 'finished_sq_ft_low', 'bedrooms_low']
CONVENTIONAL_XPATH = '//*[@id="ms-cama_style"]/div[1]/ul/li[1]'
AC_XPATH = '//*[@id="ac_yes"]'
SCHOOL_DISTRICT_XPATHS = [
    '//*[@id="ms-adv-school-dist"]/div[1]/ul/li[7]',
    '//*[@id="ms-adv-school-dist"]/div[1]/ul/li[8]',
    '//*[@id="ms-adv-school-dist"]/div[1]/ul/li[9]',
    '//*[@id="ms-adv-school-dist"]/div[1]/ul/li[10]',
    '//*[@id="ms-adv-school-dist"]/div[1]/ul/li[20]',
    '//*[@id="ms-adv-school-dist"]/div[1]/ul/li[23]',
]
# Form search button
SEARCH_BUTTON_XPATH = '//*[@id="sales-criteria"]/div[3]/button[1]'

# Search result tables XPATHs
RESULTS_TABLE_XPATH = '//*[@id="search-results"]'
NEXT_PAGE_XPATH = '//*[@id="search-results_next"]'
SEARCH_RESULTS_XPATH = '//*[@id="search-results_info"]'

# Search result tables "first page" XPATH button
FIRST_PAGE_XPATH = '//*[@id="search-results_paginate"]/span/a[1]'

# First row of table XPATH
HOUSE_TABLE_ROW1_XPATH = '//*[@id="search-results"]/tbody/tr[1]'

# Property summary individual cells XPATH
PARCEL_ID_XPATH = '//*[@id="parcel-header-info"]/div[1]'
APPRAISAL_AREA_XPATH = '//*[@id="property_information"]/tbody/tr[2]/td[1]/div[2]'
SCHOOL_DISTRICT_XPATH = '//*[@id="property_information"]/tbody/tr[1]/td[1]/div[4]'
TAX_RATE_XPATH = '//*[@id="property_information"]/tbody/tr[4]/td[2]/div[2]'
ANNUAL_TAX_XPATH = '//*[@id="property_information"]/tbody/tr[4]/td[3]/div[2]'

PROPERTY_SUMMARY_XPATH = '//*[@id="parcel-tabs"]/a[1]'
# Property summary tables XPATH
APPRAISAL_TABLE_XPATH = '//*[@id="property_overview_wrapper"]/table[1]'
TAX_TABLE_XPATH = '//*[@id="tax-credit-value-summary"]'

# On the property summary page, this is the Next button XPATH
NEXT_PROPERTY_XPATH = '//*[@id="results-nav"]/a[3]'

# This is the new search XPATH
NEW_SEARCH_XPATH = '//*[@id="sidebar"]/div[2]/a[1]'
    
# Initialize web driver
def init_driver():
    options = Options()
    firefox_profile = FirefoxProfile()
    firefox_profile.set_preference("javascript.enabled", True)
    options.profile = firefox_profile
    RAND_INT = random.randint(0,len(free_proxies))
    proxy_server_url = free_proxies[RAND_INT]['IP Address']
    options.add_argument(f'--proxy-server={proxy_server_url}')
    options.add_argument(f"--user-agent={my_user_agent}")
    driver = webdriver.Firefox(options=options)
    wait = WebDriverWait(driver, 10)
    driver.get(BASE_URL)
    return driver, wait

# Helper functions
def click_element(wait,xpath):
    wait.until(EC.element_to_be_clickable((By.XPATH, xpath))).click()

def fill_form(driver,wait,ids, values):
    for id, value in zip(ids, values):
        wait.until(EC.presence_of_element_located((By.ID, id)))
        driver.find_element(By.ID, id).send_keys(value)

def get_text(wait,xpath):
    text = wait.until(EC.presence_of_element_located((By.XPATH, xpath))).text
    return text
    
def scrape_table(wait,xpath):
    html = wait.until(EC.visibility_of_element_located((By.XPATH, xpath))).get_attribute("outerHTML")
    html_io = StringIO(html)
    return pd.read_html(html_io)[0]

def find_click_row(driver,wait,xpath):
    row = driver.find_element(By.XPATH,xpath)
    driver.execute_script("arguments[0].scrollIntoView(true);", row)
    wait.until(EC.element_to_be_clickable((By.XPATH, xpath)))
    try:
        row.click()
    except ElementNotInteractableException:
        driver.execute_script("arguments[0].click();", row)

def transform_appraisal_table(table):
    table = table.transpose().reset_index(drop=True)
    table.columns = table.iloc[0]
    return (table.drop(0)
                .drop(['Year Built', 'Deed Number'], axis=1)
                .rename(columns={'# Bedrooms': 'Bedrooms',
                                 '# Full Bathrooms': 'Full Baths',
                                 '# Half Bathrooms': 'Half Baths'}))
    
def transform_tax_table(table):
    table = table.transpose().reset_index(drop=True)
    table.columns = table.iloc[0]
    return table.drop(0)

#Main scraping logic
def main():
    driver, wait = init_driver()
    all_data = []
    appraisal_data = []
    tax_data = []

    for year in range(YEAR_START, YEAR_START + NUM_YEARS):
        click_element(wait,SALES_XPATH)  # Example XPath, replace with actual
    # Fill out the form
        fill_form(driver,wait,query_ids, [f'08/31/{year}',f'02/01/{year}'] + query_values)
    
        # Click conventional style
        click_element(wait,CONVENTIONAL_XPATH)

        # Click ac checklist
        click_element(wait,AC_XPATH)
    
        # Click on specific school districts
        for xpath in SCHOOL_DISTRICT_XPATHS:
            click_element(wait,xpath)
    
        # Click on the search button
        click_element(wait,SEARCH_BUTTON_XPATH)
        time.sleep(1)
        NUM_ENTRIES = pd.to_numeric(get_text(wait,SEARCH_RESULTS_XPATH).split(' ')[5])
        while True:
            # Scrape data from the current page
            table = scrape_table(wait,RESULTS_TABLE_XPATH)
            all_data.append(table)
        
            # Check if the 'Next' button exists and is clickable
            try:
                next_button = driver.find_element(By.XPATH, NEXT_PAGE_XPATH)
                if "disabled" in next_button.get_attribute('class'):
                    break  # 'Next' button is disabled, so break the loop
                else:
                    next_button.click()  # Click the 'Next' button
                    # Wait for the next page to load and then scrape the table
                    # (Assuming some wait condition is needed here)
                    table = scrape_table(wait,RESULTS_TABLE_XPATH)
                    all_data.append(table)
                    # WAIT_TIME = random.uniform(30,60)
                    # time.sleep(WAIT_TIME)
        
            except NoSuchElementException:
                # 'Next' button doesn't exist, so break the loop
                break
        CURRENT_PAGE = 0
        #Goes to the first search page        
        click_element(wait,FIRST_PAGE_XPATH)
    
        #Grabs info from the first row in the table and 
        #then scrolls through all properties grabbing the same info.
        find_click_row(driver,wait,HOUSE_TABLE_ROW1_XPATH)
        while CURRENT_PAGE < NUM_ENTRIES:
            appraisal_table = scrape_table(wait,APPRAISAL_TABLE_XPATH)
            # Scrape data from the current page
            appraisal_table = transform_appraisal_table(appraisal_table)
            appraisal_table['annual_tax'] = get_text(wait,ANNUAL_TAX_XPATH)
            appraisal_table['effective_tax_rate'] = get_text(wait,TAX_RATE_XPATH)
            appraisal_table['parcel_id'] = get_text(wait,PARCEL_ID_XPATH).split('\n')[1]
            appraisal_table['school_district'] = get_text(wait,SCHOOL_DISTRICT_XPATH)
            appraisal_table['appraisal_area'] = get_text(wait,APPRAISAL_AREA_XPATH)
            appraisal_data.append(appraisal_table)

            tax_table = transform_tax_table(scrape_table(wait,TAX_TABLE_XPATH))
            tax_table['parcel_id'] = get_text(wait,PARCEL_ID_XPATH).split('\n')[1]
            tax_data.append(tax_table)
                        
            driver.find_element(By.XPATH, NEXT_PROPERTY_XPATH).click()
            WAIT_TIME = random.uniform(5,15)
            time.sleep(WAIT_TIME)
            CURRENT_PAGE += 1           
            
        driver.find_element(By.XPATH, NEW_SEARCH_XPATH).click()

    all_data_df = (pd.concat(all_data,axis=0)
                        .reset_index()
                        .drop('index',axis=1))
    appraisal_data_df = (pd.concat(appraisal_data,axis=0)
                        .reset_index()
                        .drop('index',axis=1))
    tax_data_df = (pd.concat(tax_data,axis=0)
                        .reset_index()
                        .drop('index',axis=1))
    all_data_df = (all_data_df.merge(appraisal_data_df, left_on = 'Parcel Number', right_on = 'parcel_id', how = 'left')
                   .merge(tax_data_df, left_on = 'Parcel Number', right_on='parcel_id', how = 'left'))
    driver.quit()
    return all_data_df, all_data, appraisal_data, tax_data

In [7]:
if __name__ == "__main__":
    try:
        final_df, final_data, apprasial_data, tax_data = main()
        # Save or process final_data as needed
    except Exception as e:
        print(f"An error occurred: {e}")

An error occurred: list index out of range


In [5]:
# Function to format column names
def format_column_name(name):
    # Replace spaces with underscores
    name = name.replace(' ', '_')
    # Remove special characters
    name = re.sub(r'[^A-Za-z0-9_]+', '', name)
    # Convert to lowercase (optional)
    name = name.lower()
    return name

# Format each column name
formatted_column_names = [format_column_name(col) for col in final_df.columns]

# Assign the formatted names back to the DataFrame
final_df.columns = formatted_column_names


NameError: name 'final_df' is not defined

In [6]:
final_df = final_df.drop([
                'last_transfer_date',
               'last_sale_amount',
               'parcel_id_x',
               'parcel_id_y',
               'bbb'], axis = 1)

In [7]:
final_df = final_df.drop_duplicates()

In [8]:
final_df = final_df.rename(columns={
    '_of_parcels_sold': 'num_parcels_sold'
})

In [9]:
#510 - Single Family, 550 - Condominiums, 555 - PUD (Landominium), 680 - CHARITIES, HOSPITALS & RETIREMENT HOMES
final_df.use.unique()

array([510, 550], dtype=int64)

In [10]:
final_df = final_df[final_df.use!=680]

In [None]:
#final_df.to_excel('House Sales 2009 to 2023 Hamilton County.xlsx')