In [6]:
# To get latest file in directory
import glob
import os

import pandas as pd
import datetime
from time import sleep
import time

# Beautiful soup for web scraping
import requests
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup

from random import randrange

In [4]:
"""
Functions to retrieve data from Zoopla
"""

def simple_get(url, timeout=30):
    """
    Attempts to get the content at `url` by making an HTTP GET request.

    `timeout` is the number of seconds handed to `requests.get`; without
    it a server that stops answering would stall the scrape indefinitely.
    A timeout surfaces as a RequestException and is handled like any
    other request failure.

    Returns the raw response body (`resp.content`) when
    `is_good_response` accepts the response. Returns None when the
    response is rejected or when the request raises a RequestException
    (the error is passed to `log_error` first).
    """
    try:
        # closing() guarantees the streamed connection is released even
        # when the response is rejected.
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            if is_good_response(resp):
                return resp.content
            return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns true if the response seems to be HTML, false otherwise.

    A response is accepted when its status code is 200 and its
    Content-Type header contains "html" (case-insensitive). A response
    with no Content-Type header is rejected instead of raising KeyError.
    """
    # `or ''` covers both a missing header and an explicit None value.
    content_type = resp.headers.get('Content-Type') or ''
    return resp.status_code == 200 and 'html' in content_type.lower()


def log_error(e):
    """
    Report an error.

    The error is simply printed; replace the body with a real logger
    if the messages need to be kept.
    """
    print(e)
    
def _bedroom_count(d):
    """Return the bedroom count to look up for listing `d`."""
    num_beds = d['num_bedrooms']
    try:
        # A whole-number float (2.0) or numeric string would otherwise be
        # rendered as "2.0-bedroom" and never match a link on the page.
        as_float = float(num_beds)
        if as_float == int(as_float):
            num_beds = int(as_float)
    except (TypeError, ValueError, OverflowError):
        # NaN, infinity or non-numeric values are left untouched.
        pass
    # Listings with no bedrooms are looked up under the 1-bedroom figure.
    if num_beds == 0:
        num_beds = 1
    return num_beds


def _property_category(d):
    """Map the listing's property type onto the "flats"/"houses" link segment."""
    if d['property_type'] in ("Flat", "Studio", "Block of flats"):
        return "flats"
    return "houses"


def _average_price(listing_path, d, end_marker):
    """
    Find the first table cell linking to `listing_path` + category + bedrooms
    and return the pound amount in it, read up to `end_marker`, as an int.
    Returns None when no cell matches.
    """
    needle = (listing_path + _property_category(d) + '/'
              + str(_bedroom_count(d)) + '-bedroom')
    for cell in html.select('td'):
        markup = str(cell)
        if needle in markup:
            return int(markup.split("\xa3")[1].split(end_marker)[0].replace(',', ''))
    return None


def rental_price(d):
    """
    Return the monthly rent shown for properties like listing `d`.

    Reads the module-level `html` soup and looks for a table cell that
    links to '/to-rent/<flats|houses>/<n>-bedroom'. `d` must provide
    'num_bedrooms' and 'property_type'. "Flat", "Studio" and
    "Block of flats" count as flats (the same rule `sale_price` uses);
    everything else counts as houses.

    Returns the amount between the pound sign and " pcm" as an int, or
    None when no cell matches. Raises IndexError when the matching cell
    contains no pound sign.
    """
    return _average_price('/to-rent/', d, " pcm")


def sale_price(d):
    """
    Return the sale price shown for properties like listing `d`.

    Reads the module-level `html` soup and looks for a table cell that
    links to '/for-sale/<flats|houses>/<n>-bedroom'. `d` must provide
    'num_bedrooms' and 'property_type'.

    Returns the amount between the pound sign and "</strong>" as an int,
    or None when no cell matches. Raises IndexError when the matching
    cell contains no pound sign.
    """
    return _average_price('/for-sale/', d, "</strong>")
        
def price_put_on_market():
    """
    Return the price from the page's "First listed" paragraph as an int.

    Reads the module-level `html` soup. The price is the text between
    the pound sign and " on", with thousands separators removed.
    Returns None when no paragraph carries the "First listed" heading.
    """
    heading = "<strong>First listed</strong>"
    for paragraph in html.select('p'):
        markup = str(paragraph)
        if heading not in markup:
            continue
        after_pound = markup.split("\xa3")[1]
        digits = after_pound.split(" on")[0].replace(',', '')
        return int(digits)
    
def date_put_on_market():
    """
    Return the date text from the page's "First listed" paragraph.

    Reads the module-level `html` soup. Within a paragraph carrying the
    "First listed" heading, the text after the first " on" is split into
    lines and the first line longer than 8 characters is returned
    unchanged. Returns None when no such line is found.
    """
    for paragraph in html.select('p'):
        markup = str(paragraph)
        if "<strong>First listed</strong>" not in markup:
            continue
        tail = markup.split(" on")[1]
        long_lines = [line for line in tail.splitlines() if len(line) > 8]
        if long_lines:
            return long_lines[0]
                
def page_views():
    """
    Return the view counts from the page's "Page views" paragraph(s).

    Reads the module-level `html` soup. For each paragraph carrying the
    "Page views" heading, the text after the first "days:" is scanned
    line by line and the content of the first <strong> tag on each line
    is collected, with commas removed. The counts are returned as a list
    of strings, empty when nothing matched.
    """
    return [
        line.split("<strong>")[1].split("</strong>")[0].replace(',', '')
        for paragraph in html.select('p')
        if "<strong>Page views</strong>" in str(paragraph)
        for line in str(paragraph).split("days:")[1].splitlines()
        if "<strong>" in line
    ]

def local_area_ratings():
    """
    Return the star ratings found on the page as a list of floats.

    Reads the module-level `html` soup. Every <li> whose markup contains
    "current-rating" contributes the number written between
    "currently " and " stars", in page order.
    """
    return [
        float(str(item).split("currently ")[1].split(" stars")[0])
        for item in html.select('li')
        if "current-rating" in str(item)
    ]

def missing_zed_index():
    """
    Return the first value of the market-stats average as an int.

    Reads the module-level `html` soup and takes the first <span> whose
    markup contains "js-market-stats-average-value". The result is the
    text of its data-value-all attribute up to the first comma.

    Raises IndexError when no such span exists, or when the first
    matching span has no data-value-all attribute directly after the
    class name. The scan stops at the first match, so later spans are
    never parsed.
    """
    attribute = 'js-market-stats-average-value" data-value-all="'
    for span in html.select('span'):
        markup = str(span)
        if "js-market-stats-average-value" in markup:
            return int(markup.split(attribute)[1].split(',')[0])
    # Same exception type the caller already handles for missing page elements.
    raise IndexError("no js-market-stats-average-value span on the page")

In [9]:
"""
Script to parse property listings and pull extra information from property URL.
To fill in blocked keys, run again with the `blocked_keys = []` line and the `for key in l.keys():` line commented out, and the `# for key in blocked_keys:` line uncommented.
"""

# Listing fields filled from local_area_ratings(): the rating at index i
# is stored under RATING_FIELDS[i].
RATING_FIELDS = [
    'overall_rating',
    'community_and_safety',
    'entertainment_and_nightlife',
    'parks_and_recreation',
    'restaurants_and_shopping',
    'schools_and_public_services',
    'transport_and_travel',
]


def _is_missing(value):
    """Return True when `value` is NaN, the only value that differs from itself."""
    return value != value


def _scrape_listing(listing):
    """
    Download the listing's details page and copy the scraped fields into
    `listing` (a dict, modified in place).

    Raises TypeError when the page could not be fetched (simple_get
    returned None) and IndexError when the page lacks an expected
    element. Fields written before the error stay filled in.
    """
    global html  # the parsing helpers read the current page from this global
    page = simple_get(listing['details_url'])
    if page is None:
        raise TypeError
    html = BeautifulSoup(page, 'html.parser')
    listing['rental_price'] = rental_price(listing)
    listing['sale_price'] = sale_price(listing)
    listing['price_put_on_market'] = price_put_on_market()
    listing['date_put_on_market'] = date_put_on_market()
    # Parse each section once instead of once per field.
    views = page_views()
    listing['page_views_30_days'] = int(views[0])
    if len(views) > 1:
        listing['page_views_all_time'] = int(views[1])
    ratings = local_area_ratings()
    for position, field in enumerate(RATING_FIELDS):
        listing[field] = ratings[position]


# Reads file from previous run
list_of_files = glob.glob('.../zedindex/*')
latest_file = max(list_of_files, key=os.path.getctime)
df = pd.read_csv(latest_file, index_col=0)
df = df.transpose()
l = df.to_dict()

t0 = time.time()
count = 0
# To retry only the listings blocked on a previous run: comment out the next
# line and the `for key in l.keys():` line, and uncomment `# for key in blocked_keys:`.
blocked_keys = [] # Comment out when running to fill in blocked keys
print(len(l.keys()))

# for key in blocked_keys:
for key in l.keys():
    listing = l[key]
    print(listing['details_url'])
    print(count)
    count += 1
    try:
        if 'rental_price' in listing.keys():
            # Only listings whose rental price or overall rating is still NaN are scraped.
            if any([_is_missing(listing['rental_price']), _is_missing(listing['overall_rating'])]):
                print("not equal")
                _scrape_listing(listing)
                if listing['zed_index'] < 1:
                    listing['zed_index'] = missing_zed_index()
                # Random 1-2 second pause between successful requests.
                time.sleep(randrange(100,200)/100.0)
    except IndexError:
        # The page lacked an expected element; keep what was scraped so far.
        pass
    except TypeError:
        # The page could not be fetched: wait, then try once more.
        print("Waiting 15 seconds")
        time.sleep(15)
        try:
            _scrape_listing(listing)
        except IndexError:
            # Same tolerance as the first attempt; previously this escaped
            # the handler and aborted the run before anything was saved.
            pass
        except TypeError:
            print("Blocked.")
            blocked_keys.append(key)

t1 = time.time()
print("{0}  minutes.".format((t1-t0)/60.))
df = pd.DataFrame.from_dict(l)
df = df.transpose()
now = datetime.datetime.now().strftime("%H%M%d%m%y")
df.to_csv(".../scraper/"+str(now)+".csv",encoding='utf-8')

6272
https://www.zoopla.co.uk/for-sale/details/46891011?utm_source=v1:_aWMGz_eWRnA-0qvd6g55GL0LE1r1baI&utm_medium=api
0
not equal
https://www.zoopla.co.uk/for-sale/details/46858255?utm_source=v1:_aWMGz_eWRnA-0qvd6g55GL0LE1r1baI&utm_medium=api
1
https://www.zoopla.co.uk/for-sale/details/46792736?utm_source=v1:_aWMGz_eWRnA-0qvd6g55GL0LE1r1baI&utm_medium=api
2
not equal
https://www.zoopla.co.uk/for-sale/details/45482017?utm_source=v1:_aWMGz_eWRnA-0qvd6g55GL0LE1r1baI&utm_medium=api
3
not equal
0.153276817004  minutes.


In [None]:
# Keys of the listings whose page could still not be fetched after the retry.
blocked_keys