# A complete pipeline for pulling current property listings from Zoopla, adding commute times to two workplaces using the Google Maps API, and scraping extra information from the web

In [None]:
"""
All imports
"""

import zoopla # Python wrapper for Zoopla API. Installed through pip.
import pandas as pd
# Pandas display settings: floats are formatted with six decimal places, and
# max_columns / max_rows are set to None (per the pandas docs, no display limit).
pd.set_option('display.float_format', lambda x: '%.6f' % x,'display.max_columns', None, 'display.max_rows', None)

# To get the latest file in a directory
import glob
import os

import datetime
from time import sleep
import googlemaps # Python wrapper for Google Maps API. Installed through pip.
import time

# Beautiful soup for web scraping
import requests
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup

from random import randrange

# For simple linear regression on area values
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression


## Zoopla Property Listing

In [None]:
"""
API keys
"""

current = []  # Listing IDs seen during the current Zoopla search; used afterwards to delete old listings from `l`
c=0 # Counter of fetched listings that were already present in `l` (other cells re-use `c` as a row counter)
api = {'API_1':'...','API_2':'...'}  # Zoopla API keys, looked up by name

In [None]:
"""
Search parameters. The search must be expressed as a radius around a point, so here it is centred on Waterloo Station's latitude/longitude with a 20 mile radius.
"""

# Zoopla property-listing query: up to two bedrooms, for sale, priced between
# 250,000 and 400,000, within `radius` of the given latitude/longitude,
# ordered by age (descending), 100 results per page starting at page 1.
params = dict(
    maximum_beds=2,
    page_size=100,
    page_number=1,
    listing_status='sale',
    latitude='51.5032',
    longitude='-0.1123',
    radius=20,
    maximum_price=400000,
    minimum_price=250000,
    order_by='age',
    ordering='descending',
)

In [None]:
"""
ONLY NEEDS TO BE RUN IF LOADING FROM PREVIOUS FILE
"""

# Joins the latest full search (with commute times) to the latest "final" subsection
# (the < 60 minute listings carrying the extra scraped details) before updating,
# so that Google Maps times and the other details do not have to be appended again.

# Latest full result set: most recently created file in the gmaps folder
list_of_files = glob.glob('.../gmaps/*')
latest_file = max(list_of_files, key=os.path.getctime)
df = pd.read_csv(latest_file,index_col=0)
# df = df.transpose()
# Drop the columns that are not carried through to the final output
df = df.drop(['agent_address','agent_logo','category','country','country_code','image_url','street_name'],axis=1) 

# Latest subsection: most recently created file in the final folder
list_of_files = glob.glob('.../final/*')
latest_file = max(list_of_files, key=os.path.getctime)
d2 = pd.read_csv(latest_file,index_col=0) 

print "df shape: ",df.shape
print "d2 shape: ",d2.shape

# Outer merge on every column of the full set. Listings in the subsection end up twice:
# once from the full set without the extra details (e.g. zed index) and once with them.
d3 = df.merge(d2, on=df.columns.tolist(),how='outer')
print "original d3 shape: ",d3.shape

# Sort on zed index so that rows with a value come before the empty ones (assumes the
# pandas default of placing NaN last), then keep only the first row per listing_id.
d3 = d3.sort_values('zed_index')
# listing_id is normalised int -> string so duplicates compare equal
d3['listing_id'] = d3['listing_id'].astype('int')
d3['listing_id'] = d3['listing_id'].astype('string')
d3 = d3.drop_duplicates(subset='listing_id',keep='first')
print "deduplicated d3 shape: ",d3.shape
# Index by listing_id (keeping the column too) and remove the index name
d3 = d3.set_index('listing_id',drop=False)
del d3.index.name
# Output file is named after the current time, formatted HHMMddmmyy
now = datetime.datetime.now().strftime("%H%M%d%m%y")
d3.to_csv(".../join/"+str(now)+".csv",encoding='utf-8')

In [None]:
"""
Won't work if there are more than 20,000 results: each API key can return at most 10,000 listings (100 pages * page size 100).
Luckily there are roughly 13,000 results within my search parameters, so I can page through them in descending order with one API key and then in ascending order with the other to pick up the remaining 3,000.
"""


# From scratch
l = {}

# Reads file from previous run

# list_of_files = glob.glob('.../join/*')
# latest_file = max(list_of_files, key=os.path.getctime)
# df = pd.read_csv(latest_file,index_col=0)
# df.index = df.index.astype('string')
# df = df.transpose()
# l = df.to_dict()


class GetOutOfLoop( Exception ):
    pass

try:
    try:
        zoop = zoopla.Zoopla(api_key=api['API_1'])
        search = zoop.property_listings(params)
        while len(search.listing) != 0:
            print "Page number: ",params['page_number']
            print "Result count : ",search.result_count
            print "API_1"
            search = zoop.property_listings(params)
            for i in range(len(search.listing)):
                if search.listing[i].listing_id in current:
                    pass
                else:
                    current.append(search.listing[i].listing_id)
                if search.listing[i].listing_id in l.keys():
                    c+=1
                else:
                    l[search.listing[i].listing_id]=search.listing[i]
            params['page_number']+=1
        else:
            raise GetOutOfLoop
        
    except zoopla.exceptions.ZooplaAPIException: 
        if params['page_number'] < 100:
            print "API_1 Page number ",params['page_number'], " less than expected. API limit reached, waiting until next full hour."
            now = datetime.datetime.now()
            h = int(now.strftime("%H"))+1
            t = now.replace(hour=h,minute=0,second=0)
            wait = t - now
            print "Waiting for ",wait.seconds/60," minutes."
            sleep(wait.seconds)
            pass 
        
        try:
            zoop = zoopla.Zoopla(api_key=api['API_2'])
            params['ordering']='ascending'
            params['page_number']=1
            while len(search.listing) != 0:
                print "Page number: ",params['page_number']
                print "Result count : ",search.result_count
                print "API_2"
                search = zoop.property_listings(params)
                for i in range(len(search.listing)):
                    if search.listing[i].listing_id in current:
                        pass
                    else:
                        current.append(search.listing[i].listing_id)
                    if search.listing[i].listing_id in l.keys():
                        c+=1
                    else:
                        l[search.listing[i].listing_id]=search.listing[i]
                params['page_number']+=1
            else:
                raise GetOutOfLoop
                
        except zoopla.exceptions.ZooplaAPIException:
            if params['page_number'] < 100:
                print "API_2 Page number ",params['page_number'], " less than expected. API limit reached, waiting until next full hour."
                now = datetime.datetime.now()
                h = int(now.strftime("%H"))+1
                t = now.replace(hour=h,minute=0,second=0)
                wait = t - now
                print now
                print "Waiting for ",wait.seconds/60," minutes."
                sleep(wait.seconds)
                pass
            
except GetOutOfLoop:
    print "Reached end."
    pass      

for x in l.keys():
    if x in current:
        pass
    else:
        l.pop(x, None)
    
print "Finished both 10,000 cycles."
print "Added ", search.result_count - c
print "Length l: ",len(l)

# Save dictionary using pandas
now = datetime.datetime.now().strftime("%H%M%d%m%y")
print str(datetime.datetime.now())
df = df.from_dict(l)
df = df.transpose()
df.to_csv(".../Zoopla/"+str(now)+".csv",encoding='utf-8')

In [None]:
"""
Checks difference in new property listings
"""

list_of_files = glob.glob('.../join/*') # Old
latest_file = max(list_of_files, key=os.path.getctime)
df = pd.read_csv(latest_file,index_col=0)
# df = df.transpose()

list_of_files = glob.glob('.../Zoopla/*') # New
latest_file = max(list_of_files, key=os.path.getctime)
d2 = pd.read_csv(latest_file,index_col=0)
# d2 = d2.transpose()

print "df shape: ",df.shape
print "d2 shape: ",d2.shape
dfi = df.index.tolist()
d2i = d2.index.tolist()
dfi = [str(i) for i in dfi]
d2i = [str(i) for i in d2i]
print dfi[:5]
print d2i[:5]
print "Difference: ", len(set(dfi).symmetric_difference(set(d2i)))
new = set(dfi)-set(d2i)
print "Length: new ",len(new)

# print new
old = set(d2i)-set(dfi)
print "Length: old ",len(old)

# print old
now = datetime.datetime.now().strftime("%H%M%d%m%y")
new_df = pd.DataFrame(list(new))
new_df.to_csv(".../new/"+str(now)+".csv",encoding='utf-8',header=False,index=False)
print "Written new list." # This is used to conditionally format spreadsheet with new properties this run.

## Adds google maps times to zoopla property listings

In [None]:
# Arrival time passed to the Google Maps directions requests below (16 April 2018, 09:00).
# NOTE(review): the original author notes that t can only be 8 days in the past -
# presumably a Google Maps API restriction; confirm before re-using an old date.
t = datetime.datetime(2018,4,16,9)
print t

In [None]:
def distance_work_1(address):
    directions_result = gmaps.directions(address,
                                         "...",
                                         mode="transit",
                                         arrival_time=t)
    try:
        a = directions_result[0]["legs"][0]["duration"]["text"]
        return convert_to_mins(a)
    except IndexError:
        print "Address empty             ", address

In [None]:
def distance_work_2(address):
    directions_result = gmaps.directions(address,
                                         "...",
                                         mode="transit",
                                         arrival_time=t)
    try:
        a = directions_result[0]["legs"][0]["duration"]["text"]
        return convert_to_mins(a)
    except IndexError:
        print "Address empty             ", address

In [None]:
def cycling_work_2(address):
    directions_result = gmaps.directions(address,
                                         "...",
                                         mode="bicycling",
                                         arrival_time=t)
    try:
        a = directions_result[0]["legs"][0]["duration"]["text"]
        return convert_to_mins(a)
    except IndexError:
        print "Address empty             ", address

In [None]:
def convert_to_mins(s):
    """Convert a duration string such as "1 hour 5 mins" to whole minutes.

    The text is read as "<number> <unit>" pairs, where the unit is min(s),
    hour(s) or day(s).  Each number is scaled by its own unit, so "1 hour"
    gives 60 and "1 day 2 hours" gives 1560 (the previous version ignored
    the unit and returned 1 and 62 for these).

    Returns:
        The total number of minutes as an int, or None when the text is
        empty or not made up of recognised number/unit pairs.
    """
    minutes_per_unit = {'min': 1, 'hour': 60, 'day': 24 * 60}
    tokens = str(s).split()
    if not tokens or len(tokens) % 2:
        return None
    total = 0
    for amount, unit in zip(tokens[::2], tokens[1::2]):
        # "mins"/"hours"/"days" -> singular form used as dictionary key
        factor = minutes_per_unit.get(unit.rstrip('s'))
        if factor is None or not amount.isdigit():
            return None
        total += int(amount) * factor
    return total

In [None]:
"""
Google Maps API keys
"""

# Google Maps API keys, looked up by a short name
gmapsAPI = {'API_1':'...', 'API_2':'...'}

In [None]:
"""
Adds commute times
"""

# Reads file from previous run
list_of_files = glob.glob('.../Zoopla/*')
latest_file = max(list_of_files, key=os.path.getctime)
df = pd.read_csv(latest_file, index_col=0)
df = df.transpose()
l = df.to_dict()

c = 0
written = 0
passed = 0

a = 0 # API key list iterator
a_list = ['gmaps_API_1', 'gmaps_API_2']
gmaps = googlemaps.Client(key=gmapsAPI[a_list[a]])

class GetOutOfLoop( Exception ):
    pass

t0 = time.time()

try:
    for i in l.keys():
        print c
        c+=1
        address = l[i]['displayable_address']
        length = len(address.split(","))
        while True:
            try:
                if any( [ l[i]['distance_work_1'] != l[i]['distance_work_1'], 
                          l[i]['distance_work_2'] != l[i]['distance_work_2'],
                          l[i]['distance_work_1'] == l[i]['distance_work_2'],
                          any(x not in l[i].keys() for x in ['distance_work_1','distance_work_2']) ] ):
                    if l[i]['distance_work_1'] == l[i]['distance_work_2']:
                        for x in range(length):
                            try:
                                l[i]['distance_work_2'] = distance_work_2(','.join(address.split(",")[x:]))
                            except (googlemaps.exceptions.ApiError, IndexError, TypeError) as e:
                                print "Address failed:          ", ','.join(address.split(",")[x:])
                            else:
                                written += 1
                                break
                    else:
                        for x in range(length):
                            try:
                                l[i]['distance_work_1'] = distance_work_1(','.join(address.split(",")[x:]))
                            except (googlemaps.exceptions.ApiError, IndexError, TypeError) as e:
                                print "Address failed:          ", ','.join(address.split(",")[x:])
                            else:
                                l[i]['distance_work_2'] = distance_work_2(','.join(address.split(",")[x:]))
                                written += 1
                                break
                else:
                    passed += 1
     
                break
        
            except (googlemaps.exceptions.Timeout, googlemaps.exceptions.TransportError, googlemaps.exceptions.HTTPError) as e:
                if a <= 3:
                    a += 1
                    gmaps = googlemaps.Client(key=gmapsAPI[a_list[a]])
                    print "Changed API key to: ", a_list[a]
                else:
                    raise GetOutOfLoop
    raise GetOutOfLoop
                
except GetOutOfLoop:
    print "API limits reached"
    t1 = time.time()
    total = t1-t0
    total_length = len(l.keys())
    print "Minutes: ", total/60
    print "Length of dictionary: ", total_length
    print "Length covered: ", c
    print "Passed: ", passed
    print "Written: ", written 
    df = df.from_dict(l)
    df = df.transpose()
    now = datetime.datetime.now().strftime("%H%M%d%m%y")
    df.to_csv(".../gmaps/"+str(now)+".csv",encoding='utf-8')


Next, filter the listings down to those whose commute to each workplace is 60 minutes or less.

In [None]:
# Load the most recently created file from the gmaps folder
list_of_files = glob.glob('.../gmaps/*')
latest_file = max(list_of_files, key=os.path.getctime)
df = pd.read_csv(latest_file,index_col=0)
# Work on a copy so `df` keeps the unfiltered data
d4 = df.copy()
d4['distance_work_2'] = pd.to_numeric(d4['distance_work_2'])
d4['distance_work_1'] = pd.to_numeric(d4['distance_work_1'])
# Keep only listings where both commute times are at most 60 (minutes)
d4 = d4[(d4['distance_work_2']<=60)&(d4['distance_work_1']<=60)]
d4.shape
# Output file is named after the current time, formatted HHMMddmmyy
now = datetime.datetime.now().strftime("%H%M%d%m%y")
d4.to_csv(".../filtered/"+str(now)+".csv",encoding='utf-8')

In [None]:
"""
Adds work 2 cycling time
"""

# Reads file from previous run
list_of_files = glob.glob('.../filtered/*')
latest_file = max(list_of_files, key=os.path.getctime)
df = pd.read_csv(latest_file, index_col=0)
df = df.transpose()
l = df.to_dict()

c = 0
written = 0
passed = 0

a = 0
a_list = ['gmaps_API_1', 'gmaps_API_2']
gmaps = googlemaps.Client(key=gmapsAPI[a_list[a]])

class GetOutOfLoop( Exception ):
    pass

t0 = time.time()

try:
    for i in l.keys():
        print c
        c+=1
        address = l[i]['displayable_address']
        length = len(address.split(","))
        while True:
            try:
                if any( [ l[i]['cycling_work_2'] != l[i]['cycling_work_2'], 
                          'cycling_work_2' not in l[i].keys() ] ):
                    for x in range(length):
                        try:
                            l[i]['cycling_work_2'] = cycling_work_2(','.join(address.split(",")[x:]))
                        except (googlemaps.exceptions.ApiError, IndexError, TypeError) as e:
                            print "Address failed:          ", ','.join(address.split(",")[x:])
                        else:
                            written += 1
                            break

                else:
                    passed += 1
     
                break
        
            except (googlemaps.exceptions.Timeout, googlemaps.exceptions.TransportError, googlemaps.exceptions.HTTPError) as e:
                if a <= 3:
                    a += 1
                    gmaps = googlemaps.Client(key=gmapsAPI[a_list[a]])
                    print "Changed API key to: ", a_list[a]
                else:
                    raise GetOutOfLoop
    raise GetOutOfLoop
                
except GetOutOfLoop:
    print "API limits reached"
    t1 = time.time()
    total = t1-t0
    total_length = len(l.keys())
    print "Minutes: ", total/60
    print "Length of dictionary: ", total_length
    print "Length covered: ", c
    print "Passed: ", passed
    print "Written: ", written 
    df = df.from_dict(l)
    df = df.transpose()
    now = datetime.datetime.now().strftime("%H%M%d%m%y")
    df.to_csv(".../filtered/"+str(now)+"CYCLING.csv",encoding='utf-8')


## Zed Index Area Value

In [None]:
"""
Keeps running until all from original data have a zed index. Each API key can manage 100/hour so recommended to filter down to only the ones most interested in 
e.g. Commute times both within a range
"""

import zoopla
list_of_files = glob.glob('.../filtered/*') # * means all if need specific format then *.csv
latest_file = max(list_of_files, key=os.path.getctime)
df = pd.read_csv(latest_file,index_col=0)
# df = df.transpose()

c = -1

t0 = time.time()

for i, row in df.iterrows():
    c += 1
    print c
    address = row["outcode"]
    if 'zed_index' not in row.keys():
        try:
            zoop = zoopla.Zoopla(api_key=api['API_1'])
            print "API_1 Row: ",c
            zed_index = zoop.zed_index({'area': address,'output_type': 'outcode'})
            df.loc[i,"zed_index"] = zed_index["zed_index"]
            df.loc[i,"zed_index_3month"] = zed_index["zed_index_3month"]
            df.loc[i,"zed_index_6month"] = zed_index["zed_index_6month"]
            df.loc[i,"zed_index_1year"] = zed_index["zed_index_1year"]
            df.loc[i,"zed_index_2year"] = zed_index["zed_index_2year"]
            df.loc[i,"zed_index_3year"] = zed_index["zed_index_3year"]
            df.loc[i,"zed_index_4year"] = zed_index["zed_index_4year"]
            df.loc[i,"zed_index_5year"] = zed_index["zed_index_5year"]

        except zoopla.exceptions.ZooplaAPIException: 
            try:
                zoop = zoopla.Zoopla(api_key=api['API_2'])
                print "API_2 Row: ",c
                zed_index = zoop.zed_index({'area': address,'output_type': 'outcode'})
                df.loc[i,"zed_index"] = zed_index["zed_index"]
                df.loc[i,"zed_index_3month"] = zed_index["zed_index_3month"]
                df.loc[i,"zed_index_6month"] = zed_index["zed_index_6month"]
                df.loc[i,"zed_index_1year"] = zed_index["zed_index_1year"]
                df.loc[i,"zed_index_2year"] = zed_index["zed_index_2year"]
                df.loc[i,"zed_index_3year"] = zed_index["zed_index_3year"]
                df.loc[i,"zed_index_4year"] = zed_index["zed_index_4year"]
                df.loc[i,"zed_index_5year"] = zed_index["zed_index_5year"]


            except zoopla.exceptions.ZooplaAPIException:
                now = datetime.datetime.now()
                h = int(now.strftime("%H"))+1
                t = now.replace(hour=h,minute=0,second=0)
                wait = t - now
                print now
                print "Waiting for 60 minutes."
                sleep(3600)

                zoop = zoopla.Zoopla(api_key=api['API_1'])
                print "API_1 Row: ",c
                zed_index = zoop.zed_index({'area': address,'output_type': 'outcode'})
                df.loc[i,"zed_index"] = zed_index["zed_index"]
                df.loc[i,"zed_index_3month"] = zed_index["zed_index_3month"]
                df.loc[i,"zed_index_6month"] = zed_index["zed_index_6month"]
                df.loc[i,"zed_index_1year"] = zed_index["zed_index_1year"]
                df.loc[i,"zed_index_2year"] = zed_index["zed_index_2year"]
                df.loc[i,"zed_index_3year"] = zed_index["zed_index_3year"]
                df.loc[i,"zed_index_4year"] = zed_index["zed_index_4year"]
                df.loc[i,"zed_index_5year"] = zed_index["zed_index_5year"]
    if 'zed_index' in row.keys():
        if row["zed_index"] != row["zed_index"]:
            try:
                zoop = zoopla.Zoopla(api_key=api['API_1'])
                print "API_1 Row: ",c
                zed_index = zoop.zed_index({'area': address,'output_type': 'outcode'})
                df.loc[i,"zed_index"] = zed_index["zed_index"]
                df.loc[i,"zed_index_3month"] = zed_index["zed_index_3month"]
                df.loc[i,"zed_index_6month"] = zed_index["zed_index_6month"]
                df.loc[i,"zed_index_1year"] = zed_index["zed_index_1year"]
                df.loc[i,"zed_index_2year"] = zed_index["zed_index_2year"]
                df.loc[i,"zed_index_3year"] = zed_index["zed_index_3year"]
                df.loc[i,"zed_index_4year"] = zed_index["zed_index_4year"]
                df.loc[i,"zed_index_5year"] = zed_index["zed_index_5year"]

            except zoopla.exceptions.ZooplaAPIException: 
                try:
                    zoop = zoopla.Zoopla(api_key=api['API_2'])
                    print "API_2 Row: ",c
                    zed_index = zoop.zed_index({'area': address,'output_type': 'outcode'})
                    df.loc[i,"zed_index"] = zed_index["zed_index"]
                    df.loc[i,"zed_index_3month"] = zed_index["zed_index_3month"]
                    df.loc[i,"zed_index_6month"] = zed_index["zed_index_6month"]
                    df.loc[i,"zed_index_1year"] = zed_index["zed_index_1year"]
                    df.loc[i,"zed_index_2year"] = zed_index["zed_index_2year"]
                    df.loc[i,"zed_index_3year"] = zed_index["zed_index_3year"]
                    df.loc[i,"zed_index_4year"] = zed_index["zed_index_4year"]
                    df.loc[i,"zed_index_5year"] = zed_index["zed_index_5year"]


                except zoopla.exceptions.ZooplaAPIException:
                    now = datetime.datetime.now()
                    h = int(now.strftime("%H"))+1
                    t = now.replace(hour=h,minute=0,second=0)
                    wait = t - now
                    print now
                    print "Waiting for 60 minutes."
                    sleep(3600)

                    zoop = zoopla.Zoopla(api_key=api['API_1'])
                    print "API_1 Row: ",c
                    zed_index = zoop.zed_index({'area': address,'output_type': 'outcode'})
                    df.loc[i,"zed_index"] = zed_index["zed_index"]
                    df.loc[i,"zed_index_3month"] = zed_index["zed_index_3month"]
                    df.loc[i,"zed_index_6month"] = zed_index["zed_index_6month"]
                    df.loc[i,"zed_index_1year"] = zed_index["zed_index_1year"]
                    df.loc[i,"zed_index_2year"] = zed_index["zed_index_2year"]
                    df.loc[i,"zed_index_3year"] = zed_index["zed_index_3year"]
                    df.loc[i,"zed_index_4year"] = zed_index["zed_index_4year"]
                    df.loc[i,"zed_index_5year"] = zed_index["zed_index_5year"]

print "Finished both 10,000 cycles."
# df = df.transpose()
now = datetime.datetime.now().strftime("%H%M%d%m%y")
df.to_csv(".../zedindex/"+str(now)+".csv",encoding='utf-8')
t1 = time.time()
total = t1-t0
print "Time: ", total


In [None]:
# Check how many listings are still left without a zed index
# (number of null values in the zed_index column)

# df = df.transpose()
df['zed_index'].isnull().sum()

In [None]:
# Show the (rows, columns) dimensions of the current DataFrame
df.shape

## Web scraper to add extra information such as rental price, area ratings etc.

In [None]:
"""
Functions to retrieve data from Zoopla
"""

def simple_get(url):
    """
    Fetch `url` with an HTTP GET request.

    Returns the raw response body when `is_good_response` accepts the
    response. Returns None when it does not, or when the request raises a
    RequestException (which is reported through `log_error`).
    """
    try:
        with closing(get(url, stream=True)) as response:
            return response.content if is_good_response(response) else None
    except RequestException as err:
        log_error('Error during requests to {0} : {1}'.format(url, str(err)))
        return None


def is_good_response(resp):
    """
    Returns True if the response has status 200 and an HTML content type,
    False otherwise.

    The Content-Type header is read with `.get`, so a response without that
    header is reported as not good instead of raising KeyError.
    """
    content_type = resp.headers.get('Content-Type')
    return (resp.status_code == 200
            and content_type is not None
            and 'html' in content_type.lower())


def log_error(e):
    """
    Report an error message.

    Currently the message `e` is simply printed; this is the single place to
    change if errors should go somewhere else (e.g. a log file).
    """
    print(e)
    
def rental_price(d):
    """Return the rental price (pcm) shown on the page for this kind of property.

    Reads the module-level `html` (the parsed page currently being scraped) and
    looks for a table cell linking to '/to-rent/<flats|houses>/<n>-bedroom'.

    d: listing record providing 'num_bedrooms' and 'property_type'.
       Zero bedrooms is looked up as one bedroom.  "Flat", "Studio" and
       "Block of flats" are looked up as flats (the same grouping `sale_price`
       uses; previously "Block of flats" was looked up as a house here);
       everything else as houses.

    Returns the price as an int, or None when no matching cell exists.
    Raises IndexError when the matching cell contains no pound sign.
    """
    num_beds = d['num_bedrooms']
    if num_beds == 0:
        num_beds = 1
    if d['property_type'] in ("Flat", "Studio", "Block of flats"):
        property_type = "flats"
    else:
        property_type = "houses"
    link = '/to-rent/'+property_type+'/'+str(num_beds)+'-bedroom'
    for x in html.select('td'):
        cell = str(x)
        if link in cell:
            # Text between the pound sign and " pcm", without thousands separators
            return int(cell.split("\xa3")[1].split(" pcm")[0].replace(',',''))
    return None
        
def sale_price(d):
    """Return the sale price shown on the page for this kind of property.

    Reads the module-level `html` and looks for a table cell linking to
    '/for-sale/<flats|houses>/<n>-bedroom', where zero bedrooms is looked up
    as one and "Flat", "Studio" and "Block of flats" count as flats.
    Returns the price as an int, or None when no cell matches.
    """
    bedrooms = 1 if d['num_bedrooms'] == 0 else d['num_bedrooms']
    flat_like = ("Flat", "Studio", "Block of flats")
    category = "flats" if d['property_type'] in flat_like else "houses"
    needle = '/for-sale/{0}/{1}-bedroom'.format(category, bedrooms)
    for cell in (str(td) for td in html.select('td')):
        if needle in cell:
            amount = cell.split("\xa3")[1].split("</strong>")[0]
            return int(amount.replace(',',''))
    return None
        
def price_put_on_market():
    """Return the price the property was first listed at.

    Reads the module-level `html` and uses the first paragraph containing the
    "First listed" heading, taking the number between the pound sign and " on".
    Returns an int, or None when no such paragraph exists.
    """
    for paragraph in html.select('p'):
        text = str(paragraph)
        if "<strong>First listed</strong>" not in text:
            continue
        amount = text.split("\xa3")[1].split(" on")[0]
        return int(amount.replace(',',''))
    return None
    
def date_put_on_market():
    """Return the date text from the page's "First listed" paragraph.

    Reads the module-level `html`.  In a paragraph containing the "First listed"
    heading, the text after " on" is split into lines and the first line
    longer than 8 characters is returned.  Returns None when nothing matches.
    """
    for paragraph in html.select('p'):
        if "<strong>First listed</strong>" not in str(paragraph):
            continue
        after_price = str(paragraph).split(" on")[1]
        for line in after_price.splitlines():
            if len(line) > 8:
                return line
    return None
                
def page_views():
    """Return the page-view counts shown on the page, as strings.

    Reads the module-level `html`.  For each paragraph containing the
    "Page views" heading, every line after "days:" that holds a <strong>
    element contributes that element's text with commas removed.
    Returns an empty list when nothing matches.
    """
    counts = []
    for paragraph in html.select('p'):
        text = str(paragraph)
        if "<strong>Page views</strong>" not in text:
            continue
        counts.extend(
            line.split("<strong>")[1].split("</strong>")[0].replace(',','')
            for line in text.split("days:")[1].splitlines()
            if "<strong>" in line)
    return counts

def local_area_ratings():
    """Return the star ratings found on the page as a list of floats.

    Reads the module-level `html`; every list item containing "current-rating"
    contributes the number between "currently " and " stars", in page order.
    """
    return [float(str(item).split("currently ")[1].split(" stars")[0])
            for item in html.select('li')
            if "current-rating" in str(item)]

def missing_zed_index():
    """Return the average-value figure embedded in the page's market stats.

    Reads the module-level `html`; for every span mentioning
    "js-market-stats-average-value" the text of its data-value-all attribute
    up to the first comma is collected, and the first one is returned as an int.
    Raises IndexError when the page has no such span.
    """
    marker = 'js-market-stats-average-value" data-value-all="'
    values = [str(span).split(marker)[1].split(',')[0]
              for span in html.select('span')
              if "js-market-stats-average-value" in str(span)]
    return int(values[0])

In [None]:
# Load the most recently created file from the zedindex folder and show its dimensions
list_of_files = glob.glob('.../zedindex/*')
latest_file = max(list_of_files, key=os.path.getctime)
df = pd.read_csv(latest_file, index_col=0)
df.shape

In [None]:
"""
Script to parse property listings and pull extra information from property URL.
Run again with line 15 and 19 commented out, and 18 uncommented to fill in blocked keys.
"""

# Reads file from previous run
list_of_files = glob.glob('.../zedindex/*')
latest_file = max(list_of_files, key=os.path.getctime)
df = pd.read_csv(latest_file, index_col=0)
df = df.transpose()
l = df.to_dict()

t0 = time.time()
count = 0
blocked_keys = [] # Comment out when running to fill in blocked keys
print len(l.keys())

# for key in blocked_keys:
for key in l.keys():
    print l[key]['details_url']
    print count
    count += 1
    try:
        if 'rental_price' in l[key].keys():
            if any( [ l[key]['rental_price'] != l[key]['rental_price'], l[key]['overall_rating'] != l[key]['overall_rating'] ] ):
                print "not equal"
                zoopla = simple_get(l[key]['details_url'])
                if zoopla is None:
                    raise TypeError
                else:
                    html = BeautifulSoup(zoopla, 'html.parser')
                    l[key]['rental_price'] = rental_price(l[key])
                    l[key]['sale_price'] = sale_price(l[key])
                    l[key]['price_put_on_market'] = price_put_on_market()
                    l[key]['date_put_on_market'] = date_put_on_market()
                    l[key]['page_views_30_days'] = int(page_views()[0])
                    if len(page_views()) > 1:
                        l[key]['page_views_all_time'] = int(page_views()[1])
                    l[key]['overall_rating'] = local_area_ratings()[0]
                    l[key]['community_and_safety'] = local_area_ratings()[1]
                    l[key]['entertainment_and_nightlife'] = local_area_ratings()[2]
                    l[key]['parks_and_recreation'] = local_area_ratings()[3]
                    l[key]['restaurants_and_shopping'] = local_area_ratings()[4]
                    l[key]['schools_and_public_services'] = local_area_ratings()[5]
                    l[key]['transport_and_travel'] = local_area_ratings()[6]
                    if l[key]['zed_index'] < 1:
                        l[key]['zed_index'] = missing_zed_index()
                    time.sleep(randrange(100,200)/100.0)
    except IndexError:
        pass
    except TypeError:
        try:
            print "Waiting 15 seconds"
            time.sleep(15)
            zoopla = simple_get(l[key]['details_url'])
            if zoopla is None:
                raise TypeError
            else:
                html = BeautifulSoup(zoopla, 'html.parser')
                l[key]['rental_price'] = rental_price(l[key])
                l[key]['sale_price'] = sale_price(l[key])
                l[key]['price_put_on_market'] = price_put_on_market()
                l[key]['date_put_on_market'] = date_put_on_market()
                l[key]['page_views_30_days'] = int(page_views()[0])
                if len(page_views()) > 1:
                    l[key]['page_views_all_time'] = int(page_views()[1])
                l[key]['overall_rating'] = local_area_ratings()[0]
                l[key]['community_and_safety'] = local_area_ratings()[1]
                l[key]['entertainment_and_nightlife'] = local_area_ratings()[2]
                l[key]['parks_and_recreation'] = local_area_ratings()[3]
                l[key]['restaurants_and_shopping'] = local_area_ratings()[4]
                l[key]['schools_and_public_services'] = local_area_ratings()[5]
                l[key]['transport_and_travel'] = local_area_ratings()[6]
        except TypeError:
            print "Blocked."
            blocked_keys.append(key)
            pass
    
t1 = time.time()
print (t1-t0)/60., " minutes."
df = df.from_dict(l)
df = df.transpose()
now = datetime.datetime.now().strftime("%H%M%d%m%y")
df.to_csv(".../scraper/"+str(now)+".csv",encoding='utf-8')

In [None]:
# Show the listing keys whose pages could not be scraped in the previous cell
blocked_keys

## Regression on zed indices

In [None]:
"""
Regression on zed indices to find areas of increasing value. Also adds whether auction or not
"""

list_of_files = glob.glob('.../scraper/*') # * means all if need specific format then *.csv
latest_file = max(list_of_files, key=os.path.getctime)
df = pd.read_csv(latest_file,index_col=0)
# df = df.transpose()

# Adding columns
df[['5_year_reg','4_year_reg','3_year_reg','2_year_reg','1_year_reg','6_month_reg','3_month_reg']] = df[['zed_index_5year','zed_index_4year','zed_index_3year','zed_index_2year','zed_index_1year','zed_index_6month','zed_index_3month']].apply(pd.to_numeric)
df['average_sale_price-price'] = df['sale_price'].apply(pd.to_numeric)
df['price']=df['price'].apply(pd.to_numeric)
df['average_sale_price-price'] = df['sale_price'].sub(df['price'], axis=0 )

c = 0
drop = 0

for i, row in df.iterrows():
    c += 1
    #print "---- NEW -----"
    print c
    if float(row['zed_index']) == 0.0:
        df.drop(i,inplace=True)
        print "DROP"
        drop += 1
    else:
        if "auction" in row['description']:
            df.loc[i,'auction'] = 1
        Y = np.array([float(row['zed_index_5year']),float(row['zed_index_4year']),float(row['zed_index_3year']),float(row['zed_index_2year']),float(row['zed_index_1year']),float(row['zed_index_6month']),float(row['zed_index_3month']),float(row['zed_index'])])
        X = np.array([0,1,2,3,4,5,5.5,5.75])
        X = np.reshape(X,(8,1))
        Y = np.reshape(Y,(8,1))
        col_names = ['5_year_reg','4_year_reg','3_year_reg','2_year_reg','1_year_reg','6_month_reg','3_month_reg']
        col_names_counter = 0
        model = LinearRegression()
        model.fit(X,Y)
#         if float(row['zed_index']) == 0.0:
#             continue
        df.loc[i,col_names[col_names_counter]] = float(model.coef_[0])/float(row['zed_index'])
        col_names_counter += 1
        for x in range(len(X)- 2):
            X = np.delete(X, 0)
            Y = np.delete(Y, 0)
            X = np.reshape(X,(8-(x+1),1))
            Y = np.reshape(Y,(8-(x+1),1))
            model = LinearRegression()
            model.fit(X,Y)
            df.loc[i,col_names[col_names_counter]] = float(model.coef_[0])/float(row['zed_index'])
            col_names_counter += 1
        df.loc[i,'1_year_reg_rate'] = (row['1_year_reg'] - row['2_year_reg'])
        df.loc[i,'6_month_reg_rate'] = (row['6_month_reg'] - row['1_year_reg'])/2. # Divided by two to regularise for time period of 6 months
        df.loc[i,'3_month_reg_rate'] = (row['3_month_reg'] - row['6_month_reg'])/4. # Divided by four to regularise for time period of 3 months

# Ordering
df = df[['displayable_address','details_url','distance_work_2','distance_work_1','cycling_work_2','agent_name','agent_phone','county','description','first_published_date','last_published_date','latitude','longitude','listing_status','listing_id','num_bathrooms','num_bedrooms','num_recepts','outcode','post_town','property_type','price_put_on_market','page_views_30_days','page_views_all_time','overall_rating','community_and_safety','entertainment_and_nightlife','parks_and_recreation','restaurants_and_shopping','schools_and_public_services','transport_and_travel', 'price', 'rental_price','sale_price', 'average_sale_price-price' ,'zed_index','zed_index_3month','zed_index_6month','zed_index_1year','zed_index_2year','zed_index_3year','zed_index_4year','zed_index_5year','5_year_reg','4_year_reg','3_year_reg','2_year_reg','1_year_reg','6_month_reg','3_month_reg','1_year_reg_rate','6_month_reg_rate','3_month_reg_rate','auction']]
print "Dropped: ",drop
now = datetime.datetime.now().strftime("%H%M%d%m%y")
df.to_csv(".../final/"+str(now)+".csv",encoding='utf-8')