In [9]:
from lxml import html
import requests
import unicodecsv as csv
import argparse
import json

ModuleNotFoundError: ignored

In [10]:
!pip install beautifulsoup4



In [11]:
!pip install selenium



In [3]:
def clean(text):
    if text:
        return ' '.join(' '.join(text).split())
    return None


def get_headers():
    # Creating headers.
    headers = {'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
               'accept-encoding': 'gzip, deflate, sdch, br',
               'accept-language': 'en-GB,en;q=0.8,en-US;q=0.6,ml;q=0.4',
               'cache-control': 'max-age=0',
               'upgrade-insecure-requests': '1',
               'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
    return headers


def create_url(zipcode, filter):
    # Creating Zillow URL based on the filter.

    if filter == "newest":
        url = "https://www.zillow.com/homes/for_sale/{0}/0_singlestory/days_sort".format(zipcode)
    elif filter == "cheapest":
        url = "https://www.zillow.com/homes/for_sale/{0}/0_singlestory/pricea_sort/".format(zipcode)
    else:
        url = "https://www.zillow.com/homes/for_sale/{0}_rb/?fromHomePage=true&shouldFireSellPageImplicitClaimGA=false&fromHomePageTab=buy".format(zipcode)
    print(url)
    return url


def save_to_file(response):
    # saving response to `response.html`

    with open("response.html", 'w', encoding="utf-8") as fp:
        fp.write(response.text)


def write_data_to_csv(data):
    # saving scraped data to csv.

    with open("properties-%s.csv" % (zipcode), 'wb') as csvfile:
        fieldnames = ['title', 'address', 'city', 'state', 'postal_code', 'price', 'facts and features', 'real estate provider', 'url']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in data:
            writer.writerow(row)


def get_response(url):
    # Getting response from zillow.com.

    for i in range(5):
        response = requests.get(url, headers=get_headers())
        print("status code received:", response.status_code)
        if response.status_code != 200:
            # saving response to file for debugging purpose.
            save_to_file(response)
            continue
        else:
            save_to_file(response)
            return response
    return None

def get_data_from_json(raw_json_data):
    # getting data from json (type 2 of their A/B testing page)

    cleaned_data = clean(raw_json_data).replace('<!--', "").replace("-->", "")
    properties_list = []

    try:
        json_data = json.loads(cleaned_data)
        search_results = json_data.get('searchResults').get('listResults', [])

        for properties in search_results:
            address = properties.get('addressWithZip')
            property_info = properties.get('hdpData', {}).get('homeInfo')
            city = property_info.get('city')
            state = property_info.get('state')
            postal_code = property_info.get('zipcode')
            price = properties.get('price')
            bedrooms = properties.get('beds')
            bathrooms = properties.get('baths')
            area = properties.get('area')
            info = f'{bedrooms} bds, {bathrooms} ba ,{area} sqft'
            broker = properties.get('brokerName')
            property_url = properties.get('detailUrl')
            title = properties.get('statusText')

            data = {'address': address,
                    'city': city,
                    'state': state,
                    'postal_code': postal_code,
                    'price': price,
                    'facts and features': info,
                    'real estate provider': broker,
                    'url': property_url,
                    'title': title}
            properties_list.append(data)

        return properties_list

    except ValueError:
        print("Invalid json")
        return None


def parse(zipcode, filter=None):
    url = create_url(zipcode, filter)
    response = get_response(url)

    if not response:
        print("Failed to fetch the page, please check `response.html` to see the response received from zillow.com.")
        return None

    parser = html.fromstring(response.text)
    search_results = parser.xpath("//div[@id='search-results']//article")

    if not search_results:
        print("parsing from json data")
        # identified as type 2 page
        raw_json_data = parser.xpath('//script[@data-zrr-shared-data-key="mobileSearchPageStore"]//text()')
        return get_data_from_json(raw_json_data)

    print("parsing from html page")
    properties_list = []
    for properties in search_results:
        raw_address = properties.xpath(".//span[@itemprop='address']//span[@itemprop='streetAddress']//text()")
        raw_city = properties.xpath(".//span[@itemprop='address']//span[@itemprop='addressLocality']//text()")
        raw_state = properties.xpath(".//span[@itemprop='address']//span[@itemprop='addressRegion']//text()")
        raw_postal_code = properties.xpath(".//span[@itemprop='address']//span[@itemprop='postalCode']//text()")
        raw_price = properties.xpath(".//span[@class='zsg-photo-card-price']//text()")
        raw_info = properties.xpath(".//span[@class='zsg-photo-card-info']//text()")
        raw_broker_name = properties.xpath(".//span[@class='zsg-photo-card-broker-name']//text()")
        url = properties.xpath(".//a[contains(@class,'overlay-link')]/@href")
        raw_title = properties.xpath(".//h4//text()")

        address = clean(raw_address)
        city = clean(raw_city)
        state = clean(raw_state)
        postal_code = clean(raw_postal_code)
        price = clean(raw_price)
        info = clean(raw_info).replace(u"\xb7", ',')
        broker = clean(raw_broker_name)
        title = clean(raw_title)
        property_url = "https://www.zillow.com" + url[0] if url else None
        is_forsale = properties.xpath('.//span[@class="zsg-icon-for-sale"]')

        properties = {'address': address,
                      'city': city,
                      'state': state,
                      'postal_code': postal_code,
                      'price': price,
                      'facts and features': info,
                      'real estate provider': broker,
                      'url': property_url,
                      'title': title}
        if is_forsale:
            properties_list.append(properties)
    return properties_list

In [None]:
zipcode = '10040'
sort = 'newest'
print ("Fetching data for %s" % (zipcode))
scraped_data = parse(zipcode, sort)
if scraped_data:
    print("Writing data to output file")
    write_data_to_csv(scraped_data)

Fetching data for 10040
https://www.zillow.com/homes/for_sale/10040/0_singlestory/days_sort
status code received: 200
parsing from json data


AttributeError: 'NoneType' object has no attribute 'get'

In [None]:
def parse(zipcode,filter=None):
    if filter=="newest":
        url = "https://www.zillow.com/homes/for_sale/{0}/0_singlestory/days_sort".format(zipcode)
    elif filter == "cheapest":
        url = "https://www.zillow.com/homes/for_sale/{0}/0_singlestory/pricea_sort/".format(zipcode)
    else:
        url = "https://www.zillow.com/homes/for_sale/{0}_rb/?fromHomePage=true&shouldFireSellPageImplicitClaimGA=false&fromHomePageTab=buy".format(zipcode)

    for i in range(5):
        # try:
        headers= {
                'accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'accept-encoding':'gzip, deflate, sdch, br',
                'accept-language':'en-GB,en;q=0.8,en-US;q=0.6,ml;q=0.4',
                'cache-control':'max-age=0',
                'upgrade-insecure-requests':'1',
                'user-agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }
    response = requests.get(url,headers=headers)
    print(response.status_code)
    parser = html.fromstring(response.text)
    search_results = parser.xpath("//div[@id='search-results']//article")
    properties_list = []
		
    for properties in search_results:
        raw_address = properties.xpath(".//span[@itemprop='address']//span[@itemprop='streetAddress']//text()")
        raw_city = properties.xpath(".//span[@itemprop='address']//span[@itemprop='addressLocality']//text()")
        raw_state= properties.xpath(".//span[@itemprop='address']//span[@itemprop='addressRegion']//text()")
        raw_postal_code= properties.xpath(".//span[@itemprop='address']//span[@itemprop='postalCode']//text()")
        raw_price = properties.xpath(".//span[@class='zsg-photo-card-price']//text()")
        raw_info = properties.xpath(".//span[@class='zsg-photo-card-info']//text()")
        raw_broker_name = properties.xpath(".//span[@class='zsg-photo-card-broker-name']//text()")
        url = properties.xpath(".//a[contains(@class,'overlay-link')]/@href")
        raw_title = properties.xpath(".//h4//text()")

        address = ' '.join(' '.join(raw_address).split()) if raw_address else None
        city = ''.join(raw_city).strip() if raw_city else None
        state = ''.join(raw_state).strip() if raw_state else None
        postal_code = ''.join(raw_postal_code).strip() if raw_postal_code else None
        price = ''.join(raw_price).strip() if raw_price else None
        info = ' '.join(' '.join(raw_info).split()).replace(u"\xb7",',')
        broker = ''.join(raw_broker_name).strip() if raw_broker_name else None
        title = ''.join(raw_title) if raw_title else None
        property_url = "https://www.zillow.com"+url[0] if url else None 
        is_forsale = properties.xpath('.//span[@class="zsg-icon-for-sale"]')
        properties = {
                        'address':address,
                        'city':city,
                        'state':state,
                        'postal_code':postal_code,
                        'price':price,
                        'facts and features':info,
                        'real estate provider':broker,
                        'url':property_url,
                        'title':title
        }
        if is_forsale:
            properties_list.append(properties)
    return properties_list
    # except:
    # 	print ("Failed to process the page",url)

In [None]:
zipcode = '10040'
sort = 'Homes For You'
print ("Fetching data for %s"%(zipcode))
scraped_data = parse(zipcode,sort)
print ("Writing data to output file")
with open("properties-%s.csv"%(zipcode),'wb')as csvfile:
    fieldnames = ['title','address','city','state','postal_code','price','facts and features','real estate provider','url']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for row in  scraped_data:
        writer.writerow(row)

Fetching data for 10040
200
Writing data to output file


In [17]:
import os
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time
import sys
import numpy as np
import pandas as pd
import regex as re
import requests
import lxml
from lxml.html.soupparser import fromstring
import prettify
import numbers

#set some display settings for notebooks
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

#add headers in case you use chromedriver (captchas are no fun); namely used for chromedriver
req_headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.8',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}

#create url variables for each zillow page, at a time 10 page oepner
# accessing 10 page, sending request to 10 pages
with requests.Session() as s:
    city = 'atlanta/'  ## can be zip, or address, or street name
    
    url = 'https://www.zillow.com/homes/for_sale/'+city
    url2 = 'https://www.zillow.com/homes/for_sale/'+city+'/2_p/'
    url3 = 'https://www.zillow.com/homes/for_sale/'+city+'/3_p/'
    url4 = 'https://www.zillow.com/homes/for_sale/'+city+'/4_p/'
    url5 = 'https://www.zillow.com/homes/for_sale/'+city+'/5_p/'
    url6 = 'https://www.zillow.com/homes/for_sale/'+city+'/6_p/'
    url7 = 'https://www.zillow.com/homes/for_sale/'+city+'/7_p/'
    url8 = 'https://www.zillow.com/homes/for_sale/'+city+'/8_p/'
    url9 = 'https://www.zillow.com/homes/for_sale/'+city+'/9_p/'
    url10 = 'https://www.zillow.com/homes/for_sale/'+city+'/10_p/'

    r = s.get(url, headers=req_headers)
    r2 = s.get(url2, headers=req_headers)
    r3 = s.get(url3, headers=req_headers)
    r4 = s.get(url4, headers=req_headers)
    r5 = s.get(url5, headers=req_headers)
    r6 = s.get(url6, headers=req_headers)
    r7 = s.get(url7, headers=req_headers)
    r8 = s.get(url8, headers=req_headers)
    r9 = s.get(url9, headers=req_headers)
    r10 = s.get(url10, headers=req_headers)
    
    url_links = [url, url2, url3, url4, url5, url6, url7, url8, url9, url10]

#add contents of urls to soup variable from each url
soup = BeautifulSoup(r.content, 'html.parser')
soup1 = BeautifulSoup(r2.content, 'html.parser')
soup2 = BeautifulSoup(r3.content, 'html.parser')
soup3 = BeautifulSoup(r4.content, 'html.parser')
soup4 = BeautifulSoup(r5.content, 'html.parser')
soup5 = BeautifulSoup(r6.content, 'html.parser')
soup6 = BeautifulSoup(r7.content, 'html.parser')
soup7 = BeautifulSoup(r8.content, 'html.parser')
soup8 = BeautifulSoup(r9.content, 'html.parser')
soup9 = BeautifulSoup(r10.content, 'html.parser')

# page_links = [soup, soup1, soup2, soup3, soup4, soup5, soup6, soup7, soup8, soup9]

#create the first two dataframes
df = pd.DataFrame()
df1 = pd.DataFrame()

#all for loops are pulling the specified variable using beautiful soup and inserting into said variable
for i in soup:
    address = soup.find_all (class_= 'list-card-addr')
    price = list(soup.find_all (class_='list-card-price'))
    beds = list(soup.find_all("ul", class_="list-card-details"))
    details = soup.find_all ('div', {'class': 'list-card-details'})
    home_type = soup.find_all ('div', {'class': 'list-card-footer'})
    last_updated = soup.find_all ('div', {'class': 'list-card-top'})
    brokerage = list(soup.find_all(class_= 'list-card-brokerage list-card-img-overlay',text=True))
    link = soup.find_all (class_= 'list-card-link')
    
    #create dataframe columns out of variables
    df['prices'] = price
    df['address'] = address
    df['beds'] = beds

#create empty url list
urls = []

#loop through url, pull the href and strip out the address tag a
for link in soup.find_all("article"):
    href = link.find('a',class_="list-card-link")
    addresses = href.find('address')
    addresses.extract()
    urls.append(href)

#import urls into a links column
df['links'] = urls
df['links'] = df['links'].astype('str')

#remove html tags
df['links'] = df['links'].replace('<a class="list-card-link" href="', ' ', regex=True)
df['links'] = df['links'].replace('" tabindex="0"></a>', ' ', regex=True)


for i in soup1:
    address1 = soup1.find_all (class_= 'list-card-addr')
    price1 = list(soup1.find_all (class_='list-card-price'))
    beds1 = list(soup.find_all("ul", class_="list-card-details"))
    details1 = soup1.find_all ('div', {'class': 'list-card-details'})
    home_type1 = soup1.find_all ('div', {'class': 'list-card-footer'})
    last_updated1 = soup1.find_all ('div', {'class': 'list-card-top'})
    brokerage1 = list(soup1.find_all(class_= 'list-card-brokerage list-card-img-overlay',text=True))
    link1 = soup1.find_all (class_= 'list-card-link')

    #create dataframe columns out of variables
    df1['prices'] = price1
    df1['address'] = address1
    df1['beds'] = beds

#create empty url list
urls = []

#loop through url, pull the href and strip out the address tag
for link in soup1.find_all("article"):
    href = link.find('a',class_="list-card-link")
    addresses = href.find('address')
    addresses.extract()
    urls.append(href)

#import urls into a links column
df1['links'] = urls
df1['links'] = df1['links'].astype('str')

#remove html tags
df1['links'] = df1['links'].replace('<a class="list-card-link" href="', ' ', regex=True)
df1['links'] = df1['links'].replace('" tabindex="0"></a>', ' ', regex=True)

#append first two dataframes
df = df.append(df1, ignore_index = True) 

#create empty dataframes
df2 = pd.DataFrame()
df3 = pd.DataFrame()
df4 = pd.DataFrame()
df5 = pd.DataFrame()
df6 = pd.DataFrame()
df7 = pd.DataFrame()
df8 = pd.DataFrame()
df9 = pd.DataFrame()

for i in soup2:
    soup = soup2
    address = soup.find_all (class_= 'list-card-addr')
    price = list(soup.find_all (class_='list-card-price'))
    beds = list(soup.find_all("ul", class_="list-card-details"))
    details = soup.find_all ('div', {'class': 'list-card-details'})
    home_type = soup.find_all ('div', {'class': 'list-card-footer'})
    last_updated = soup.find_all ('div', {'class': 'list-card-top'})
    brokerage = list(soup.find_all(class_= 'list-card-brokerage list-card-img-overlay',text=True))
    link = soup.find_all (class_= 'list-card-link')
    
    #create dataframe columns out of variables
    df2['prices'] = price
    df2['address'] = address
    df2['beds'] = beds

#create empty url list
urls = []

#loop through url, pull the href and strip out the address tag
for link in soup2.find_all("article"):
    href = link.find('a',class_="list-card-link")
    addresses = href.find('address')
    addresses.extract()
    urls.append(href)

#import urls into a links column
df2['links'] = urls
df2['links'] = df2['links'].astype('str')

#remove html tags
df2['links'] = df2['links'].replace('<a class="list-card-link" href="', ' ', regex=True)
df2['links'] = df2['links'].replace('" tabindex="0"></a>', ' ', regex=True)

    
for i in soup3:
    soup = soup3
    address1 = soup.find_all (class_= 'list-card-addr')
    price1 = list(soup.find_all (class_='list-card-price'))
    beds1 = list(soup.find_all("ul", class_="list-card-details"))
    details1 = soup.find_all ('div', {'class': 'list-card-details'})
    home_type1 = soup.find_all ('div', {'class': 'list-card-footer'})
    last_updated1 = soup.find_all ('div', {'class': 'list-card-top'})
    brokerage1 = list(soup.find_all(class_= 'list-card-brokerage list-card-img-overlay',text=True))
    link1 = soup.find_all (class_= 'list-card-link')

    #create dataframe columns out of variables
    df3['prices'] = price1
    df3['address'] = address1
    df3['beds'] = beds1

#create empty url list
urls = []

#loop through url, pull the href and strip out the address tag
for link in soup3.find_all("article"):
    href = link.find('a',class_="list-card-link")
    addresses = href.find('address')
    addresses.extract()
    urls.append(href)

#import urls into a links column
df3['links'] = urls
df3['links'] = df3['links'].astype('str')

#remove html tags
df3['links'] = df3['links'].replace('<a class="list-card-link" href="', ' ', regex=True)
df3['links'] = df3['links'].replace('" tabindex="0"></a>', ' ', regex=True)

for i in soup4:
    soup = soup4
    address1 = soup.find_all (class_= 'list-card-addr')
    price1 = list(soup.find_all (class_='list-card-price'))
    beds1 = list(soup.find_all("ul", class_="list-card-details"))
    details1 = soup.find_all ('div', {'class': 'list-card-details'})
    home_type1 = soup.find_all ('div', {'class': 'list-card-footer'})
    last_updated1 = soup.find_all ('div', {'class': 'list-card-top'})
    brokerage1 = list(soup.find_all(class_= 'list-card-brokerage list-card-img-overlay',text=True))
    link1 = soup.find_all (class_= 'list-card-link')

    #create dataframe columns out of variables
    df4['prices'] = price1
    df4['address'] = address1
    df4['beds'] = beds

#create empty url list
urls = []

#loop through url, pull the href and strip out the address tag
for link in soup4.find_all("article"):
    href = link.find('a',class_="list-card-link")
    addresses = href.find('address')
    addresses.extract()
    urls.append(href)

#import urls into a links column
df4['links'] = urls
df4['links'] = df4['links'].astype('str')

#remove html tags
df4['links'] = df4['links'].replace('<a class="list-card-link" href="', ' ', regex=True)
df4['links'] = df4['links'].replace('" tabindex="0"></a>', ' ', regex=True)

for i in soup5:
    soup = soup5
    address1 = soup.find_all (class_= 'list-card-addr')
    price1 = list(soup.find_all (class_='list-card-price'))
    beds1 = list(soup.find_all("ul", class_="list-card-details"))
    details1 = soup.find_all ('div', {'class': 'list-card-details'})
    home_type1 = soup.find_all ('div', {'class': 'list-card-footer'})
    last_updated1 = soup.find_all ('div', {'class': 'list-card-top'})
    brokerage1 = list(soup.find_all(class_= 'list-card-brokerage list-card-img-overlay',text=True))
    link1 = soup.find_all (class_= 'list-card-link')

    #create dataframe columns out of variables
    df5['prices'] = price1
    df5['address'] = address1
    df5['beds'] = beds

#create empty url list
urls = []

#loop through url, pull the href and strip out the address tag
for link in soup5.find_all("article"):
    href = link.find('a',class_="list-card-link")
    addresses = href.find('address')
    addresses.extract()
    urls.append(href)

#import urls into a links column
df5['links'] = urls
df5['links'] = df5['links'].astype('str')

#remove html tags
df5['links'] = df5['links'].replace('<a class="list-card-link" href="', ' ', regex=True)
df5['links'] = df5['links'].replace('" tabindex="0"></a>', ' ', regex=True)
    
for i in soup6:
    soup = soup6
    address1 = soup.find_all (class_= 'list-card-addr')
    price1 = list(soup.find_all (class_='list-card-price'))
    beds1 = list(soup.find_all("ul", class_="list-card-details"))
    details1 = soup.find_all ('div', {'class': 'list-card-details'})
    home_type1 = soup.find_all ('div', {'class': 'list-card-footer'})
    last_updated1 = soup.find_all ('div', {'class': 'list-card-top'})
    brokerage1 = list(soup.find_all(class_= 'list-card-brokerage list-card-img-overlay',text=True))
    link1 = soup.find_all (class_= 'list-card-link')

    #create dataframe columns out of variables
    df6['prices'] = price1
    df6['address'] = address1
    df6['beds'] = beds

#create empty url list
urls = []

#loop through url, pull the href and strip out the address tag
for link in soup6.find_all("article"):
    href = link.find('a',class_="list-card-link")
    addresses = href.find('address')
    addresses.extract()
    urls.append(href)

#import urls into a links column
df6['links'] = urls
df6['links'] = df6['links'].astype('str')

#remove html tags
df6['links'] = df6['links'].replace('<a class="list-card-link" href="', ' ', regex=True)
df6['links'] = df6['links'].replace('" tabindex="0"></a>', ' ', regex=True)
    
for i in soup7:
    soup = soup7
    address1 = soup.find_all (class_= 'list-card-addr')
    price1 = list(soup.find_all (class_='list-card-price'))
    beds1 = list(soup.find_all("ul", class_="list-card-details"))
    details1 = soup.find_all ('div', {'class': 'list-card-details'})
    home_type1 = soup.find_all ('div', {'class': 'list-card-footer'})
    last_updated1 = soup.find_all ('div', {'class': 'list-card-top'})
    brokerage1 = list(soup.find_all(class_= 'list-card-brokerage list-card-img-overlay',text=True))
    link1 = soup.find_all (class_= 'list-card-link')

    #create dataframe columns out of variables
    df7['prices'] = price1
    df7['address'] = address1
    df7['beds'] = beds

#create empty url list
urls = []

#loop through url, pull the href and strip out the address tag
for link in soup7.find_all("article"):
    href = link.find('a',class_="list-card-link")
    addresses = href.find('address')
    addresses.extract()
    urls.append(href)

#import urls into a links column
df7['links'] = urls
df7['links'] = df7['links'].astype('str')

#remove html tags
df7['links'] = df7['links'].replace('<a class="list-card-link" href="', ' ', regex=True)
df7['links'] = df7['links'].replace('" tabindex="0"></a>', ' ', regex=True)
    
for i in soup8:
    soup = soup8
    address1 = soup.find_all (class_= 'list-card-addr')
    price1 = list(soup.find_all (class_='list-card-price'))
    beds1 = list(soup.find_all("ul", class_="list-card-details"))
    details1 = soup.find_all ('div', {'class': 'list-card-details'})
    home_type1 = soup.find_all ('div', {'class': 'list-card-footer'})
    last_updated1 = soup.find_all ('div', {'class': 'list-card-top'})
    brokerage1 = list(soup.find_all(class_= 'list-card-brokerage list-card-img-overlay',text=True))
    link1 = soup.find_all (class_= 'list-card-link')

    #create dataframe columns out of variables
    df8['prices'] = price1
    df8['address'] = address1
    df8['beds'] = beds

#create empty url list
urls = []

#loop through url, pull the href and strip out the address tag
for link in soup8.find_all("article"):
    href = link.find('a',class_="list-card-link")
    addresses = href.find('address')
    addresses.extract()
    urls.append(href)

#import urls into a links column
df8['links'] = urls
df8['links'] = df8['links'].astype('str')

#remove html tags
df8['links'] = df8['links'].replace('<a class="list-card-link" href="', ' ', regex=True)
df8['links'] = df8['links'].replace('" tabindex="0"></a>', ' ', regex=True)

for i in soup9:
    soup = soup9
    address1 = soup.find_all (class_= 'list-card-addr')
    price1 = list(soup.find_all (class_='list-card-price'))
    beds1 = list(soup.find_all("ul", class_="list-card-details"))
    details1 = soup.find_all ('div', {'class': 'list-card-details'})
    home_type1 = soup.find_all ('div', {'class': 'list-card-footer'})
    last_updated1 = soup.find_all ('div', {'class': 'list-card-top'})
    brokerage1 = list(soup.find_all(class_= 'list-card-brokerage list-card-img-overlay',text=True))
    link1 = soup.find_all (class_= 'list-card-link')

    #create dataframe columns out of variables
    df9['prices'] = price1
    df9['address'] = address1
    df9['beds'] = beds

#create empty url list
urls = []

#loop through url, pull the href and strip out the address tag
for link in soup9.find_all("article"):
    href = link.find('a',class_="list-card-link")
    addresses = href.find('address')
    addresses.extract()
    urls.append(href)

#import urls into a links column
df9['links'] = urls
df9['links'] = df9['links'].astype('str')

#remove html tags
df9['links'] = df9['links'].replace('<a class="list-card-link" href="', ' ', regex=True)
df9['links'] = df9['links'].replace('" tabindex="0"></a>', ' ', regex=True)

df = df.append(df2, ignore_index = True) 
df = df.append(df3, ignore_index = True) 
df = df.append(df4, ignore_index = True) 
df = df.append(df5, ignore_index = True) 
df = df.append(df6, ignore_index = True) 
df = df.append(df7, ignore_index = True) 
df = df.append(df8, ignore_index = True) 
df = df.append(df9, ignore_index = True) 

#convert columns to str
df['prices'] = df['prices'].astype('str')
df['address'] = df['address'].astype('str')
df['beds'] = df['beds'].astype('str')

#remove html tags
df['prices'] = df['prices'].replace('<div class="list-card-price">', ' ', regex=True)
df['address'] = df['address'].replace('<address class="list-card-addr">', ' ', regex=True)
df['prices'] = df['prices'].replace('</div>', ' ', regex=True)
df['address'] = df['address'].replace('</address>', ' ', regex=True)
df['prices'] = df['prices'].str.replace(r'\D', '')

#remove html tags from beds column
df['beds'] = df['beds'].replace('<ul class="list-card-details"><li>', ' ', regex=True)
df['beds'] = df['beds'].replace('<abbr class="list-card-label"> <!-- -->bds</abbr></li><li>', ' ', regex=True)
df['beds'] = df['beds'].replace('<abbr class="list-card-label"> <!-- -->ba</abbr></li><li>', ' ', regex=True)
df['beds'] = df['beds'].replace('<abbr class="list-card-label"> <!-- -->bd</abbr></li><li>', ' ', regex=True)
df['beds'] = df['beds'].replace('<abbr class="list-card-label"> <!-- -->sqft</abbr></li></ul>', ' ', regex=True)
df['beds'] = df['beds'].replace('Studio</li><li>', '0 ', regex=True)

#split beds column into beds, bath and sq_feet
#df[['beds']] = df.beds.str.split(expand=True)

#remove commas from sq_feet and convert to float
df.replace(',','', regex=True, inplace=True)

#drop nulls
df = df[(df['prices'] != '') & (df['prices']!= ' ')]

#convert column to float
df['prices'] = df['prices'].astype('float')
#df['sq_feet'] = df['sq_feet'].astype('float')

print('The column datatypes are:')
print(df.dtypes)
print('The dataframe shape is:', df.shape)

#rearrange the columns
#df = df[['prices', 'address', 'links', 'beds', 'baths', 'sq_feet']]

df.to_csv("PIE.zillow.csv")

The column datatypes are:
prices     float64
address     object
beds        object
links       object
dtype: object
The dataframe shape is: (400, 4)


In [13]:
!pip install prettify

Collecting prettify
  Downloading https://files.pythonhosted.org/packages/bf/41/4b8104d757fa78949eb38eaa74a6e4046fcb8d67ea85e762361c175423a8/prettify-0.1.1.tar.gz
Building wheels for collected packages: prettify
  Building wheel for prettify (setup.py) ... [?25l[?25hdone
  Created wheel for prettify: filename=prettify-0.1.1-cp36-none-any.whl size=8424 sha256=020932a88684bc10a4ea8669843ad1bcc86018806e1250d45978441c6aa47c36
  Stored in directory: /root/.cache/pip/wheels/49/44/c3/f997cb138383a4a0399a6b2f75012746c533a24ff6b02fafe0
Successfully built prettify
Installing collected packages: prettify
Successfully installed prettify-0.1.1


In [7]:
!pip install selenium

Collecting selenium
[?25l  Downloading https://files.pythonhosted.org/packages/80/d6/4294f0b4bce4de0abf13e17190289f9d0613b0a44e5dd6a7f5ca98459853/selenium-3.141.0-py2.py3-none-any.whl (904kB)
[K     |████████████████████████████████| 911kB 5.5MB/s 
Installing collected packages: selenium
Successfully installed selenium-3.141.0
