# Web Scraper for Zillow

### Extracts information on houses and buildings that are listed for sale or rent.

# I. Scraper

In [1]:
# data munging 
import pandas as pd

# reading html 
import requests
from bs4 import BeautifulSoup as soup
from collections import Counter
import lxml
import chromedriver_binary

# dealing with dynamic loading of html
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# time each window is open before the next
import time 
from time import sleep 

In [2]:
# html parsing 

def process_zillset(html, results=None):
    """
    Extract the items of interest from the listing cards of one Zillow page.

    html    -- iterable of listing-card elements; each must provide
               find_all(tag, attrs) returning a list of elements with a
               .text attribute.
    results -- dict of lists to append to, with the keys 'address',
               'listingby', 'price', 'beds', 'baths', 'sqft' and
               'building_type'. When omitted, the module-level zill_dict
               is used.

    Returns the accumulator. Every card adds exactly one entry to each of
    the seven lists; a field that is missing from a card is recorded as the
    string 'NaN' so the lists stay the same length.
    """
    if results is None:
        # Default to the global accumulator set up by the scraping loop.
        results = zill_dict

    for home in html:
        # beds, baths and sqft are read from the same <li class=""> items
        # (positions 0, 1 and 2), so run that query once per card.
        details = home.find_all('li', {'class': ''})

        results['address'].append(
            _text_at(home.find_all('address'), 0))

        # listing brokerage and agent
        results['listingby'].append(
            _text_at(home.find_all('p', {'class': 'list-card-extra-info'}), 0))

        results['price'].append(
            _text_at(home.find_all('div', {'class': 'list-card-price'}), 0))

        results['beds'].append(_text_at(details, 0))
        results['baths'].append(_text_at(details, 1))
        results['sqft'].append(_text_at(details, 2))

        # type
        results['building_type'].append(
            _text_at(home.find_all('li', {'class': 'list-card-statusText'}), 0))

    return results


def _text_at(elements, index):
    """Return the stripped text of elements[index], or 'NaN' if it is absent."""
    try:
        return elements[index].text.strip()
    except IndexError:
        return 'NaN'
    

In [3]:
# url loop

def make_urls(max_page, base_url='https://www.zillow.com/san-francisco-ca/'):
    """
    Return the list of Zillow result-page urls for pages 1 through max_page.

    max_page -- number of the last page to include; a value below 1 gives
                an empty list.
    base_url -- search url the page suffix is appended to. It defaults to
                the San Francisco search, so existing calls are unchanged;
                pass another search url to scrape a different area.

    Each url has the form base_url + '<page number>_p?'.
    """
    return [base_url + str(page_number) + '_p?'
            for page_number in range(1, max_page + 1)]

In [4]:
# Set the number of result pages to scrape, then call make_urls to build one url per page.
# 20 is used because, per the original note, that is the most result pages the Zillow website returns.

max_page = 20
zillow_urls = make_urls(max_page)
print(zillow_urls)

['https://www.zillow.com/san-francisco-ca/1_p?', 'https://www.zillow.com/san-francisco-ca/2_p?', 'https://www.zillow.com/san-francisco-ca/3_p?', 'https://www.zillow.com/san-francisco-ca/4_p?', 'https://www.zillow.com/san-francisco-ca/5_p?', 'https://www.zillow.com/san-francisco-ca/6_p?', 'https://www.zillow.com/san-francisco-ca/7_p?', 'https://www.zillow.com/san-francisco-ca/8_p?', 'https://www.zillow.com/san-francisco-ca/9_p?', 'https://www.zillow.com/san-francisco-ca/10_p?', 'https://www.zillow.com/san-francisco-ca/11_p?', 'https://www.zillow.com/san-francisco-ca/12_p?', 'https://www.zillow.com/san-francisco-ca/13_p?', 'https://www.zillow.com/san-francisco-ca/14_p?', 'https://www.zillow.com/san-francisco-ca/15_p?', 'https://www.zillow.com/san-francisco-ca/16_p?', 'https://www.zillow.com/san-francisco-ca/17_p?', 'https://www.zillow.com/san-francisco-ca/18_p?', 'https://www.zillow.com/san-francisco-ca/19_p?', 'https://www.zillow.com/san-francisco-ca/20_p?']


In [None]:
# using a dictionary to store html items of interest: one list per column.
# process_zillset appends to every list except 'price/sqft', which stays
# empty during scraping.

zill_dict = {'address':[], 'listingby':[], 'price':[],'beds':[], 'baths':[],'sqft':[], 'building_type':[], 'price/sqft':[]}


for url in zillow_urls:

    # open up each webpage separately, in its own browser window
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        # pause so the page has time to finish loading before the html is read
        time.sleep(20)
        zillhtml = driver.page_source
    finally:
        # always close the window, even if loading failed; otherwise every
        # page leaves a Chrome process running
        driver.quit()

    # making into soup object and picking out the listing cards
    zill = soup(zillhtml, 'lxml')
    zill_content = zill.find_all('div', attrs={'class': 'list-card-info'})

    # prints each url once its page has been fetched
    print(url)

    # appends this page's listings to zill_dict; the result is not bound to
    # the name `dict`, so the builtin stays usable in later cells
    scraped = process_zillset(zill_content)

    # saving into dataframe; rebuilt after every page so `df` holds
    # everything scraped so far. orient='index' plus transpose tolerates
    # lists of unequal length (such as the empty 'price/sqft').
    df = pd.DataFrame.from_dict(scraped, orient='index').transpose()

In [5]:
# loading the scraped data from ZillowData.csv (its first column becomes the index)
# and previewing the first rows
# NOTE(review): nothing shown in this notebook writes ZillowData.csv; presumably the
# scraped `df` was saved to it in a separate step -- confirm.

zillow_fs = pd.read_csv("ZillowData.csv", index_col=[0])
zillow_fs.head()

Unnamed: 0,address,listingby,price,beds,baths,sqft,building_type
0,"710 London St, San Francisco, CA 94112","COMPASS, Darin J. Holwitz","$1,498,000",4 bds,3 ba,"2,047 sqft",- House for sale
1,"655 27th St, San Francisco, CA 94131","COMPASS SF, Dale T. Boutiette","$2,195,000",3 bds,3 ba,"1,791 sqft",- House for sale
2,"270 Sadowa St, San Francisco, CA 94112","PACIFIC EDGE, Cynthia M. Pagan","$1,049,000",3 bds,2 ba,"1,264 sqft",- House for sale
3,"437 Valley St, San Francisco, CA 94131","SOTHEBY'S INTERNATIONAL REALTY, Allison Fortin...","$2,000,000",3 bds,2 ba,"1,949 sqft",- House for sale
4,"585 Rockdale Dr, San Francisco, CA 94127","CORCORAN GLOBAL LIVING, Edward O'Connell","$1,295,000",2 bds,1 ba,"1,152 sqft",- House for sale


Looking at our data, we need to separate the information given in the `address` and `listingby` columns. We also need to strip the labels from the `price`, `beds`, `baths`, and `sqft` columns.

# II. Data Cleaning

In [6]:
# removing unwanted labels, commas, currency signs, etc.
#
# regex=False is passed to every replace so the patterns are always taken
# literally: '$' is an end-of-string anchor under regex matching, and the
# pandas default for `regex` differs between versions.
# rstrip() removes a set of trailing characters, which leaves the space that
# separated the number from its label ("4 bds" -> "4 "), so each column is
# stripped of surrounding whitespace at the end.

zillow_fs['sqft'] = (
    zillow_fs['sqft']
    .str.rstrip('sqft')
    .str.replace(',', '', regex=False)
    .str.replace('--', '0', regex=False)  # '--' is replaced with 0
    .str.strip()
)
zillow_fs['baths'] = zillow_fs['baths'].str.rstrip('ba').str.strip()
zillow_fs['beds'] = zillow_fs['beds'].str.rstrip('bds').str.strip()
zillow_fs['price'] = (
    zillow_fs['price']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.rstrip('+')
    .str.strip()
)

In [7]:
# separating out the address and listing info into multiple parts.

# zip is the last whitespace-separated token of the full address, so it has
# to be taken before the address column is cut down to the street part
zillow_fs['zip'] = zillow_fs['address'].str.split().str[-1]
zillow_fs['address'] = zillow_fs['address'].str.split(',').str[0]

# listingby is "<brokerage>, <agent>": split it once and index into the
# pieces. .str[1] gives NaN for a listing without a comma instead of raising
# KeyError, and strip() drops the space that follows the comma.
listing_parts = zillow_fs['listingby'].str.split(',')
zillow_fs['agent'] = listing_parts.str[1].str.strip()
zillow_fs['brokerage'] = listing_parts.str[0].str.strip()
zillow_fs = zillow_fs.drop(columns=['listingby'])

In [11]:
# dividing price and sqft column to create price per sqft column

# missing values become 0 so the integer casts below cannot fail on NaN
# (this fills every column of the frame, not only price and sqft)
zillow_fs = zillow_fs.fillna(0)
zillow_fs[['price', 'sqft']] = zillow_fs[['price', 'sqft']].apply(pd.to_numeric)

# a sqft of 0 means the size is unknown; mask it so price/sqft is NaN for
# those rows instead of inf from a division by zero
sqft_int = zillow_fs['sqft'].astype(int)
zillow_fs['price/sqft'] = zillow_fs['price'].astype(int) / sqft_int.where(sqft_int != 0)

#### Clean Data

In [9]:
# keep only the columns listed here, in this order, then preview the first rows
zillow_fs = zillow_fs[['address', 'zip', 'building_type', 'price', 'beds', 'baths', 'sqft', 'price/sqft', 'agent', 'brokerage']]
zillow_fs.head()

Unnamed: 0,address,zip,building_type,price,beds,baths,sqft,price/sqft,agent,brokerage
0,710 London St,94112,- House for sale,1498000,4,3,2047,731.802638,Darin J. Holwitz,COMPASS
1,655 27th St,94131,- House for sale,2195000,3,3,1791,1225.572306,Dale T. Boutiette,COMPASS SF
2,270 Sadowa St,94112,- House for sale,1049000,3,2,1264,829.905063,Cynthia M. Pagan,PACIFIC EDGE
3,437 Valley St,94131,- House for sale,2000000,3,2,1949,1026.167265,Allison Fortini Crawford,SOTHEBY'S INTERNATIONAL REALTY
4,585 Rockdale Dr,94127,- House for sale,1295000,2,1,1152,1124.131944,Edward O'Connell,CORCORAN GLOBAL LIVING


In [10]:
# save the cleaned data into a new csv that will be used for later analysis.

zillow_fs.to_csv('for_sale.csv')