In [2]:
import requests
from bs4 import BeautifulSoup as soup
from collections import Counter
import lxml
import pandas as pd
import chromedriver_binary
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time 
from time import sleep 

In [3]:
def _text_at(matches, index):
    """Return the stripped text of ``matches[index]``, or the string 'NaN' if that position is absent."""
    try:
        return matches[index].text.strip()
    except IndexError:
        return 'NaN'


def process_zillset(html, results=None):
    """
    Extract the fields of interest from each listing card and append them to an accumulator.

    Parameters
    ----------
    html : iterable
        Listing-card elements (the caller passes the result of ``find_all`` on the
        page soup). Each element must support ``find_all(name, attrs)``.
    results : dict of lists, optional
        Accumulator to append to. It needs the keys 'address', 'listingby',
        'price', 'beds', 'baths', 'sqft' and 'building_type'. When omitted, the
        module-level ``zill_dict`` is used, which is the original behaviour.

    Returns
    -------
    dict
        The same accumulator object, with one value appended per card to each of
        the seven keys above. A field that cannot be found is recorded as the
        string 'NaN'. The 'price/sqft' key, if present, is not touched here.
    """
    target = zill_dict if results is None else results

    for home in html:
        # Beds, baths and sqft are read from the <li> items matched with an empty
        # class filter, at positions 0, 1 and 2; look them up once per card.
        facts = home.find_all('li', {'class': ''})

        target['address'].append(_text_at(home.find_all('address'), 0))
        # listing brokerage and agent
        target['listingby'].append(
            _text_at(home.find_all('p', {'class': 'list-card-extra-info'}), 0))
        target['price'].append(
            _text_at(home.find_all('div', {'class': 'list-card-price'}), 0))
        target['beds'].append(_text_at(facts, 0))
        target['baths'].append(_text_at(facts, 1))
        target['sqft'].append(_text_at(facts, 2))
        target['building_type'].append(
            _text_at(home.find_all('li', {'class': 'list-card-statusText'}), 0))

    return target
    

In [4]:

def make_urls(max_page):
    """
    Build the Zillow San Francisco listing URLs for pages 1 through max_page (inclusive).

    Returns a list with one URL per page; the list is empty when max_page is below 1.
    """
    base = 'https://www.zillow.com/san-francisco-ca/'
    return [base + str(page) + '_p?' for page in range(1, max_page + 1)]

In [5]:
# Choose how many result pages to scrape, then build the matching list of page URLs.
max_page = 2
zillow_urls = make_urls(max_page=max_page)
print(zillow_urls)

['https://www.zillow.com/san-francisco-ca/1_p?', 'https://www.zillow.com/san-francisco-ca/2_p?']


In [6]:
# Accumulator that process_zillset appends to: one list per output column.
# 'price/sqft' is not filled while scraping; it is computed later during cleaning.
zill_dict = {'address':[], 'listingby':[], 'price':[],'beds':[], 'baths':[],'sqft':[], 'building_type':[], 'price/sqft':[]}

# Seconds to wait after loading a page before reading its source.
PAGE_LOAD_WAIT_SECONDS = 20

# Use a single browser session for every page and always shut it down,
# so Chrome processes are not left running after the cell finishes or fails.
driver = webdriver.Chrome()
try:
    for url in zillow_urls:
        driver.get(url)
        time.sleep(PAGE_LOAD_WAIT_SECONDS)
        zillhtml = driver.page_source

        # making into soup object
        zill = soup(zillhtml, 'lxml')
        zill_content = zill.find_all('div', attrs={'class': 'list-card-info'})
        print(url)

        # Appends this page's listings to the module-level zill_dict.
        process_zillset(zill_content)
finally:
    driver.quit()

# saving into dataframe (built once, after all pages have been collected)
# NOTE(review): the next cell reads "ZillowData.csv", but nothing here writes it —
# confirm how that file is produced before relying on a fresh Run All.
df = pd.DataFrame.from_dict(zill_dict, orient='index').transpose()

https://www.zillow.com/san-francisco-ca/1_p?
https://www.zillow.com/san-francisco-ca/2_p?


In [7]:
# Load the scraped listings from ZillowData.csv, using the file's first column as the index.
# NOTE(review): no visible cell writes ZillowData.csv — confirm where the file is produced.
zillow_fs = pd.read_csv("ZillowData.csv", index_col=[0])
#zillow_fs.shape

The data set must be cleaned and manipulated before it is usable for future analysis. It contains missing and mixed values.

In [8]:
# Remove unit labels, thousands separators and currency symbols from the raw text columns.
# Unit labels are removed as anchored suffixes: str.rstrip('sqft') strips any trailing
# run of the characters s/q/f/t rather than the word itself, and leaves the space
# in front of the label behind. Literal replacements pass regex=False so that '$'
# is never interpreted as a regular-expression anchor.
zillow_fs['sqft'] = (
    zillow_fs['sqft']
    .str.replace(r'\s*sqft$', '', regex=True)
    .str.replace(',', '', regex=False)
    # '--' marks an unknown size; it is recorded as 0
    .str.replace('--', '0', regex=False)
)
zillow_fs['baths'] = zillow_fs['baths'].str.replace(r'\s*ba$', '', regex=True)
# matches both the "bd" and "bds" labels
zillow_fs['beds'] = zillow_fs['beds'].str.replace(r'\s*bds?$', '', regex=True)
zillow_fs['price'] = (
    zillow_fs['price']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.rstrip('+')
)

In [9]:
# Split the combined text columns into their parts.
# zip: last whitespace-separated token of the full address
# (taken before the address column is shortened below).
zillow_fs['zip'] = zillow_fs['address'].str.split().str[-1]
# address: the text before the first comma
zillow_fs['address'] = zillow_fs['address'].str.split(',').str[0]

# listingby is split on commas once: part 0 is stored as the brokerage, part 1 as the agent.
# Indexing with .str[...] yields NaN for rows with fewer parts, instead of raising
# KeyError when no row contains a comma. Surrounding whitespace is trimmed.
listing_parts = zillow_fs['listingby'].str.split(',')
zillow_fs['agent'] = listing_parts.str[1].str.strip()
zillow_fs['brokerage'] = listing_parts.str[0].str.strip()
zillow_fs = zillow_fs.drop(columns=['listingby'])

In [10]:
# Divide the price column by the sqft column to create price per sqft.
# Missing values in every column (text columns included) are filled with 0 first.
zillow_fs = zillow_fs.fillna(0)
zillow_fs[['price', 'sqft']] = zillow_fs[['price', 'sqft']].apply(pd.to_numeric)

price_int = zillow_fs['price'].astype(int)
sqft_int = zillow_fs['sqft'].astype(int)
# A sqft of 0 means the size is unknown; mask it so the ratio is NaN rather than inf.
zillow_fs['price/sqft'] = price_int / sqft_int.where(sqft_int != 0)

## After cleaning the data

In [11]:
# Arrange the columns in their final order and preview the first rows.
final_columns = [
    'address', 'zip', 'building_type',
    'price', 'beds', 'baths', 'sqft', 'price/sqft',
    'agent', 'brokerage',
]
zillow_fs = zillow_fs[final_columns]
zillow_fs.head()

Unnamed: 0,address,zip,building_type,price,beds,baths,sqft,price/sqft,agent,brokerage
0,710 London St,94112,- House for sale,1498000,4,3,2047,731.802638,Darin J. Holwitz,COMPASS
1,655 27th St,94131,- House for sale,2195000,3,3,1791,1225.572306,Dale T. Boutiette,COMPASS SF
2,270 Sadowa St,94112,- House for sale,1049000,3,2,1264,829.905063,Cynthia M. Pagan,PACIFIC EDGE
3,437 Valley St,94131,- House for sale,2000000,3,2,1949,1026.167265,Allison Fortini Crawford,SOTHEBY'S INTERNATIONAL REALTY
4,585 Rockdale Dr,94127,- House for sale,1295000,2,1,1152,1124.131944,Edward O'Connell,CORCORAN GLOBAL LIVING


In [219]:
# Save the cleaned listings to a new CSV that will be used for later analysis.
zillow_fs.to_csv('for_sale.csv')