Background on the King County parcel data: [PARCEL_EXTR FAQ](https://www5.kingcounty.gov/sdc/FGDCDocs/PARCEL_EXTR_faq.htm)

In [119]:
import time
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Load the house sales table and preview its first three rows.
data = pd.read_csv('kc_house_data.csv')
display(data.iloc[:3])

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,10/13/2014,221900.0,3,1.0,1180,5650,1.0,,0.0,...,7,1180,0.0,1955,0.0,98178,47.5112,-122.257,1340,5650
1,6414100192,12/9/2014,538000.0,3,2.25,2570,7242,2.0,0.0,0.0,...,7,2170,400.0,1951,1991.0,98125,47.721,-122.319,1690,7639
2,5631500400,2/25/2015,180000.0,2,1.0,770,10000,1.0,0.0,0.0,...,6,770,0.0,1933,,98028,47.7379,-122.233,2720,8062


In [166]:
def _assessor_cell(table, label):
    ''' Returns, as a plain str, the first child of the <td> that follows the <td> labelled `label` '''
    # str() detaches the value from the soup; a NavigableString would keep a
    # reference to the entire parse tree alive for as long as the value is stored.
    return str(table.find("td", string=label).find_next('td').contents[0])


def scrape_assessors(parcel_num):
    ''' Scrapes King County Dept. of Assessments for additional housing data

    Parameters
    ----------
    parcel_num : anything accepted by str()
        Parcel number appended to the eReal Property detail URL.

    Returns
    -------
    entry : pd.Series
        One value per name in `column_names`. Fields that were not read
        (because the request or the parsing failed) are left as NaN.
    errors : int
        0 if every field was read, 1 if any exception was raised.
    '''

    column_names = ['Present Use','Waterfront','Sewer/Septic','Road Access','Parking',
                    'Street Surface','Topography','Traffic Noise','Airport Noise',
                    'Other Nuisances','Water Problems','Transportation Concurrency',
                    'Other Problems','Environmental','Heat Source','Heat System',
                    'Year Renovated','img_src','Power Lines']

    # (column name == label text on the page, index into the GridViewStyle tables),
    # listed in the order the fields are read.
    text_fields = (
        # street and parking
        ('Sewer/Septic', 5), ('Road Access', 5), ('Parking', 5), ('Street Surface', 5),
        # nuisances
        ('Topography', 9), ('Traffic Noise', 9), ('Airport Noise', 9),
        ('Power Lines', 9), ('Other Nuisances', 9),
        # problems
        ('Water Problems', 10), ('Transportation Concurrency', 10), ('Other Problems', 10),
        # assorted
        ('Present Use', 4), ('Environmental', 11), ('Heat Source', 12), ('Heat System', 12),
    )

    # Explicit object dtype: the entry holds strings and a bool, and the default
    # dtype of an empty Series differs between pandas versions.
    entry = pd.Series(index=column_names, dtype=object)

    base_url = 'https://blue.kingcounty.com/Assessor/eRealProperty/'
    parcel_url = 'Detail.aspx?ParcelNbr='
    headers = {'user-agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5)',
               'referer' : 'https://blue.kingcounty.com/Assessor/eRealProperty/'}
    errors = 0
    try:
        r = requests.get(base_url + parcel_url + str(parcel_num),
                         headers=headers, timeout=10)
        # Report HTTP failures as such instead of as an IndexError on `tables`.
        r.raise_for_status()
        soup = BeautifulSoup(r.content, 'html.parser')
        tables = soup.find_all("table",{"class":"GridViewStyle"})

        for column, table_idx in text_fields:
            entry[column] = _assessor_cell(tables[table_idx], column)

        picture_table = soup.find('table',{'id':'cphContent_FormViewResBldgPict'})
        entry['img_src'] = base_url + picture_table.find_next('img')['src']

        # for the nans
        entry['Waterfront'] = int(_assessor_cell(tables[7], "Waterfront Footage")) > 0
        entry['Year Renovated'] = _assessor_cell(tables[12], "Year Renovated")
    except Exception as e:
        print(e,end='\r')
        # Rather than put the try block in the parent function, a failed read
        # still returns an entry (with NaN for the unread fields). Then the
        # caller can't accidentally skip columns.
        errors = 1

    return entry,errors

In [167]:
from PIL import Image
from io import BytesIO

def scrape_image(src):
    ''' Downloads a building photo and returns it as a (1, 300, 400, 3) array

    Parameters
    ----------
    src : str
        Image URL, either absolute or relative to the eReal Property base URL.
        Any non-string or empty value (e.g. the NaN left by a failed scrape)
        yields an all-zero image without making a request.

    Returns
    -------
    np.ndarray
        The image converted to RGB and resized to 400x300, with a leading
        axis of length 1. All zeros if the image could not be fetched or decoded.
    '''
    from urllib.parse import urljoin

    # Defined locally: the original relied on a `base_url` that only exists
    # as a local variable of scrape_assessors.
    base_url = 'https://blue.kingcounty.com/Assessor/eRealProperty/'
    headers = {'user-agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5)',
               'referer' : 'https://blue.kingcounty.com/Assessor/eRealProperty/'}
    blank = np.zeros((1,300,400,3))

    if not isinstance(src, str) or not src:
        return blank

    try:
        # urljoin leaves an absolute `src` untouched (batch_scrape passes
        # img_src, which already has the base prepended) and resolves a
        # relative one against base_url.
        r = requests.get(urljoin(base_url, src), headers=headers, timeout=10)
        r.raise_for_status()
        # convert('RGB') guarantees three channels for grayscale/RGBA/palette files.
        picture = Image.open(BytesIO(r.content)).convert('RGB').resize((400,300))
        img = np.expand_dims(np.asarray(picture),axis=0)
    except Exception as e:
        print(e,end='\r')
        img = blank
    return img

In [168]:
from tqdm.notebook import tqdm

def batch_scrape(df):
    ''' Scrapes assessor data and a building photo for every parcel in `df`

    Parameters
    ----------
    df : pd.DataFrame
        Must have an 'id' column holding the parcel numbers.

    Returns
    -------
    new_panda : pd.DataFrame
        One row per parcel, in the order of `df`, with the columns listed in
        `column_names`. Rows for failed scrapes contain NaN.
    image_stack : np.ndarray
        Array of shape (len(df), 300, 400, 3); slot i holds the photo for
        parcel i, or zeros when no photo could be fetched.
    '''
    column_names = ['Present Use','Waterfront','Sewer/Septic','Road Access','Parking',
                    'Street Surface','Topography','Traffic Noise','Airport Noise',
                    'Other Nuisances','Water Problems','Transportation Concurrency',
                    'Other Problems','Environmental','Heat Source','Heat System',
                    'Year Renovated','img_src','Power Lines']
    n_rows = len(df)
    rows = []
    # zeros (not empty) so that slots skipped below are deterministic.
    # NOTE(review): float64 costs ~2.9 MB per image; consider uint8 for large runs.
    image_stack = np.zeros((n_rows,300,400,3))
    errors = []
    for i,parcel_num in tqdm(enumerate(df['id']),total=n_rows):
        result,err = scrape_assessors(parcel_num)
        errors.append(err)
        rows.append(result)
        time.sleep(1) # be nice

        src = result['img_src']
        if not pd.isna(src):
            image_stack[i] = scrape_image(src)
            time.sleep(1)
        # else: no image URL was scraped, so skip the request and the wait;
        # the slot keeps its zeros.

        if err:
            print("Errors: {}".format(sum(errors)),end='\r')

    # Build the frame once; DataFrame.append in a loop copies the frame on
    # every iteration and no longer exists in pandas 2.x.
    new_panda = pd.DataFrame(rows, columns=column_names)
    return new_panda,image_stack
             
# Smoke-test the scraper on the last five rows of the sales table.
data = pd.read_csv('kc_house_data.csv')
new_data, image_stack = batch_scrape(data.iloc[-5:])

print(image_stack.shape)
new_data.head()
        

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

cannot identify image file <_io.BytesIO object at 0x000000979D2077D8>
(5, 300, 400, 3)


Unnamed: 0,Present Use,Waterfront,Sewer/Septic,Road Access,Parking,Street Surface,Topography,Traffic Noise,Airport Noise,Other Nuisances,Water Problems,Transportation Concurrency,Other Problems,Environmental,Heat Source,Heat System,Year Renovated,img_src,Power Lines
0,,,,,,,,,,,,,,,,,,,
1,Single Family(Res Use/Zone),False,PUBLIC,PUBLIC,ADEQUATE,PAVED,,,,NO,NO,NO,NO,NO,Gas,Forced Air,0.0,https://blue.kingcounty.com/Assessor/eRealProp...,NO
2,Townhouse Plat,False,PUBLIC,PUBLIC,ADEQUATE,PAVED,,,,NO,NO,NO,NO,NO,Gas,Forced Air,0.0,https://blue.kingcounty.com/Assessor/eRealProp...,NO
3,,,,,,,,,,,,,,,,,,,
4,Townhouse Plat,False,PUBLIC,PUBLIC,ADEQUATE,PAVED,,,,NO,NO,NO,NO,NO,Gas,Forced Air,0.0,https://blue.kingcounty.com/Assessor/eRealProp...,NO


In [154]:
# Scratch check of how a carriage-return line ending renders in cell output.
print('hi\r', end='')
print('ho')

hiho
