# Zoopla Property Listings API

In [2]:
# All imports

import zoopla # Python wrapper for Zoopla API. Installed through pip.
import pandas as pd
# Pandas display settings: render floats with six decimal places and never
# truncate the columns or rows of a displayed frame.
pd.set_option('display.float_format', lambda value: '%.6f' % value)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# To get latest file in directory
import glob
import os

import datetime
from time import sleep
import time

In [5]:
"""
ONLY NEEDS TO BE RUN IF LOADING FROM PREVIOUS FILE
"""


# latest full
list_of_files = glob.glob('.../gmaps/*')
latest_file = max(list_of_files, key=os.path.getctime)
df = pd.read_csv(latest_file,index_col=0)
# df = df.transpose()
df = df.drop(['agent_address','agent_logo','category','country','country_code','image_url','street_name'],axis=1) 

# latest subsection
list_of_files = glob.glob('.../final/*')
latest_file = max(list_of_files, key=os.path.getctime)
d2 = pd.read_csv(latest_file,index_col=0) 

print "df shape: ",df.shape
print "d2 shape: ",d2.shape

# Combines subsection with extra details with full set containing all distances for comparison against new search. Leaves duplicates of subsection, one old without extra details e.g. zed index and one with.
d3 = df.merge(d2, on=df.columns.tolist(),how='outer')
print "original d3 shape: ",d3.shape

# Sort zed index to place filled subsection first, then delete the empty subsection to deduplicate
d3 = d3.sort_values('zed_index')
d3['listing_id'] = d3['listing_id'].astype('int')
d3['listing_id'] = d3['listing_id'].astype('string')
d3 = d3.drop_duplicates(subset='listing_id',keep='first')
print "deduplicated d3 shape: ",d3.shape
d3 = d3.set_index('listing_id',drop=False)
del d3.index.name
now = datetime.datetime.now().strftime("%H%M%d%m%y")
d3.to_csv(".../join/"+str(now)+".csv",encoding='utf-8')

df shape:  (7723, 55)
d2 shape:  (6203, 55)
original d3 shape:  (11074, 55)
deduplicated d3 shape:  (7723, 55)


In [6]:
"""
API keys, plus the list and counter used for bookkeeping while fetching
"""

# Zoopla API keys, looked up by name when a client is created.
api = {
    'API_1': '...',
    'API_2': '...',
}

# Ids of the listings returned by the current search; used afterwards to
# delete stored listings that are no longer returned.
current = []

# Counter of returned listings that were already stored.
c = 0

In [7]:
"""
Search parameters. The search has to be expressed as a radius around a point, so here it is centred on Waterloo Station's lat/long with a 20 mile radius.
"""

# Query sent with every property_listings call; 'page_number' (and later
# 'ordering') are mutated in place while paging through the results.
params = {
    'maximum_beds': 2,
    # Paging
    'page_size': 100,
    'page_number': 1,
    'listing_status': 'sale',
    # Search centre and radius
    'latitude': '51.5032',
    'longitude': '-0.1123',
    'radius': 20,
    # Price band
    'maximum_price': 400000,
    'minimum_price': 250000,
    # Result ordering
    'order_by': 'age',
    'ordering': 'descending',
}

In [10]:
"""
Won't work if > 20,000 results - max limit 10,000 (page num 100 * page size 100) for each API key.
Luckily within my search parameters there are roughly 13,000 results, so I am able to order by descending with one API key, then by ascending with the other key to get the remaining 3,000.
"""


# Store of listings keyed by listing id. Start empty for a run from scratch.
l = dict()

# Alternative start: seed the store from the newest file of a previous run.
# Uncomment the lines below to use it instead of the empty store.

# list_of_files = glob.glob('.../join/*')
# latest_file = max(list_of_files, key=os.path.getctime)
# df = pd.read_csv(latest_file,index_col=0)
# df.index = df.index.astype('string')
# df = df.transpose()
# l = df.to_dict()

class GetOutOfLoop(Exception):
    """Signal raised to leave the paging loops once a search returns an empty page."""

try:
    try:
        zoop = zoopla.Zoopla(api_key=api['API_1'])
        search = zoop.property_listings(params)
        while len(search.listing) != 0:
            print "Page number: ",params['page_number']
            print "Result count : ",search.result_count
            print "API_1"
            search = zoop.property_listings(params)
            for i in range(len(search.listing)):
                if search.listing[i].listing_id in current:
                    pass
                else:
                    current.append(search.listing[i].listing_id)
                if search.listing[i].listing_id in l.keys():
                    c+=1
                else:
                    l[search.listing[i].listing_id]=search.listing[i]
            params['page_number']+=1
        else:
            raise GetOutOfLoop
        
    except zoopla.exceptions.ZooplaAPIException: 
        if params['page_number'] < 100:
            print "API_1 Page number ",params['page_number'], " less than expected. API limit reached, waiting until next full hour."
            now = datetime.datetime.now()
            h = int(now.strftime("%H"))+1
            t = now.replace(hour=h,minute=0,second=0)
            wait = t - now
            print "Waiting for ",wait.seconds/60," minutes."
            sleep(wait.seconds)
            pass 
        
        try:
            zoop = zoopla.Zoopla(api_key=api['API_2'])
            params['ordering']='ascending'
            params['page_number']=1
            while len(search.listing) != 0:
                print "Page number: ",params['page_number']
                print "Result count : ",search.result_count
                print "API_2"
                search = zoop.property_listings(params)
                for i in range(len(search.listing)):
                    if search.listing[i].listing_id in current:
                        pass
                    else:
                        current.append(search.listing[i].listing_id)
                    if search.listing[i].listing_id in l.keys():
                        c+=1
                    else:
                        l[search.listing[i].listing_id]=search.listing[i]
                params['page_number']+=1
            else:
                raise GetOutOfLoop
                
        except zoopla.exceptions.ZooplaAPIException:
            if params['page_number'] < 100:
                print "API_2 Page number ",params['page_number'], " less than expected. API limit reached, waiting until next full hour."
                now = datetime.datetime.now()
                h = int(now.strftime("%H"))+1
                t = now.replace(hour=h,minute=0,second=0)
                wait = t - now
                print now
                print "Waiting for ",wait.seconds/60," minutes."
                sleep(wait.seconds)
                pass
            
except GetOutOfLoop:
    print "Reached end."
    pass      

for x in l.keys():
    if x in current:
        pass
    else:
        l.pop(x, None)
    
print "Finished both 10,000 cycles."
print "Added ", search.result_count - c
print "Length l: ",len(l)

# Save dictionary using pandas
now = datetime.datetime.now().strftime("%H%M%d%m%y")
print str(datetime.datetime.now())
df = df.from_dict(l)
df = df.transpose()
df.to_csv(".../Zoopla/"+str(now)+".csv",encoding='utf-8')

Page number:  1
Result count :  14669
API_1
Page number:  2
Result count :  14669
API_1
Page number:  3
Result count :  14669
API_1


In [11]:
"""
Checks difference in new property listings
"""

list_of_files = glob.glob('.../join/*') # Old
latest_file = max(list_of_files, key=os.path.getctime)
df = pd.read_csv(latest_file,index_col=0)
# df = df.transpose()

list_of_files = glob.glob('.../Zoopla/*') # New
latest_file = max(list_of_files, key=os.path.getctime)
d2 = pd.read_csv(latest_file,index_col=0)
# d2 = d2.transpose()

print "df shape: ",df.shape
print "d2 shape: ",d2.shape
dfi = df.index.tolist()
d2i = d2.index.tolist()
dfi = [str(i) for i in dfi]
d2i = [str(i) for i in d2i]
print dfi[:5]
print d2i[:5]
print "Difference: ", len(set(dfi).symmetric_difference(set(d2i)))
new = set(dfi)-set(d2i)
print "Length: new ",len(new)

# print new
old = set(d2i)-set(dfi)
print "Length: old ",len(old)

# print old
now = datetime.datetime.now().strftime("%H%M%d%m%y")
new_df = pd.DataFrame(list(new))
new_df.to_csv(".../new/"+str(now)+".csv",encoding='utf-8',header=False,index=False)
print "Written new list." # This is used to conditionally format spreadsheet with new properties this run.

df shape:  (14263, 54)
d2 shape:  (14402, 61)
['46573687', '46230731', '46297513', '45524046', '46104634']
['11358738', '12518294', '13648928', '13776615', '14124892']
Difference:  3145
Length: new  1503
Length: old  1642
Written new list.
