In [14]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
import time
import xmltodict
import datetime
from api_keys import a_keys
from api_keys import b_keys 
from api_keys import c_keys 
from api_keys import d_keys 
from api_keys import e_keys 
from api_keys import f_keys 
from api_keys import g_keys
from api_keys import h_keys
from api_keys import i_keys

ModuleNotFoundError: No module named 'api_keys'

In [None]:
# Check if the row returned from Zillow API contains both 'rentzestimate' and 'lastSoldPrice' information
def check_data(address):
    """Tell whether a Zillow result carries both rent and last-sold data.

    Args:
        address: mapping for one Zillow result; only its ``get`` method is used.

    Returns:
        The falsy ``'rentzestimate'`` value when that entry is missing or empty,
        otherwise whatever is stored under ``'lastSoldPrice'`` (``None`` when
        absent). The result is meant to be used for its truthiness.
    """
    rent_info = address.get('rentzestimate')
    if not rent_info:
        # Short-circuit exactly like `and`: hand back the falsy left operand.
        return rent_info
    return address.get('lastSoldPrice')

In [None]:
# Check if the row returned from Zillow API contains 'rentzestimate' but 'lastSoldPrice' is blank
def need_rent_estimate(address):
    """Tell whether a Zillow result has rent data but no last-sold price.

    Args:
        address: mapping for one Zillow result; only its ``get`` method is used.

    Returns:
        The falsy ``'rentzestimate'`` value when that entry is missing or empty;
        otherwise ``True`` if ``'lastSoldPrice'`` is missing/empty and ``False``
        if it is present.
    """
    rent_info = address.get('rentzestimate')
    if not rent_info:
        # Short-circuit exactly like `and`: hand back the falsy left operand.
        return rent_info
    return not address.get('lastSoldPrice')

In [None]:
# Check if the row returned from Zillow API contains 'lastSoldPrice' but 'rentzestimate' is blank
def need_sale_estimate(address):
    """Tell whether a Zillow result has a last-sold price but no rent data.

    Args:
        address: mapping for one Zillow result; only its ``get`` method is used.

    Returns:
        ``False`` when ``'rentzestimate'`` is present and non-empty; otherwise
        whatever is stored under ``'lastSoldPrice'`` (``None`` when absent).
        The result is meant to be used for its truthiness.
    """
    if address.get('rentzestimate'):
        return False
    return address.get('lastSoldPrice')

In [None]:
# If 'check_data' is true, execute this function to store data from Zillow API
def get_estimate(address, zipcode):
    """Build one output row from a Zillow result that has both rent and sale data.

    Args:
        address: nested mapping for one Zillow result. The keys read are
            ``rentzestimate -> amount -> #text``, ``lastSoldPrice -> #text``,
            ``lastSoldDate`` (a ``MM/DD/YYYY`` string), ``finishedSqFt`` and
            ``address -> street``.
        zipcode: zip code to store alongside the result.

    Returns:
        pd.Series with the labels ``address``, ``zipcode``, ``sold``, ``rent``,
        ``sold year``, ``sqft`` and ``var`` (always 0 for this record shape).

    Raises:
        KeyError: if any of the keys listed above is missing.
        ValueError: if ``lastSoldDate`` does not match ``%m/%d/%Y``.
    """
    print('Entering get_estimate')
    # Lookups stay in this order so the first missing key is the one reported.
    monthly_rent = address['rentzestimate']['amount']['#text']
    last_price = address['lastSoldPrice']['#text']
    sold_on = datetime.datetime.strptime(address['lastSoldDate'], '%m/%d/%Y')
    floor_area = address['finishedSqFt']
    street = address['address']['street']

    record = dict(address=street, zipcode=zipcode, sold=last_price, rent=monthly_rent)
    record['sold year'] = sold_on.year
    record['sqft'] = floor_area
    record['var'] = 0
    return pd.Series(record)

In [None]:
# If 'need_rent_estimate' is true, execute this function to store data from Zillow API
def get_rent_estimate(address, zipcode):
    """Build one output row from a Zillow result that only has rent data.

    Args:
        address: nested mapping for one Zillow result. The keys read are
            ``rentzestimate -> amount -> #text`` and ``address -> street``.
        zipcode: zip code to store alongside the result.

    Returns:
        pd.Series with the labels ``address``, ``zipcode``, ``rent`` and
        ``var`` (always 2 for this record shape).

    Raises:
        KeyError: if any of the keys listed above is missing.
    """
    print('Entering get_rent_estimate')
    monthly_rent = address['rentzestimate']['amount']['#text']
    street = address['address']['street']
    return pd.Series(dict(address=street, zipcode=zipcode, rent=monthly_rent, var=2))

In [None]:
# If 'need_sale_estimate' is true, execute this function to store data from Zillow API
def get_property_estimate(address, zipcode):
    """Build one output row from a Zillow result that only has sale data.

    Args:
        address: nested mapping for one Zillow result. The keys read are
            ``lastSoldPrice -> #text``, ``lastSoldDate`` (a ``MM/DD/YYYY``
            string), ``finishedSqFt`` and ``address -> street``.
        zipcode: zip code to store alongside the result.

    Returns:
        pd.Series with the labels ``address``, ``zipcode``, ``sold``,
        ``sold year``, ``sqft`` and ``var`` (always 1 for this record shape).

    Raises:
        KeyError: if any of the keys listed above is missing.
        ValueError: if ``lastSoldDate`` does not match ``%m/%d/%Y``.
    """
    print('Entering get_property_estimate')
    # Lookups stay in this order so the first missing key is the one reported.
    last_price = address['lastSoldPrice']['#text']
    sold_on = datetime.datetime.strptime(address['lastSoldDate'], '%m/%d/%Y')
    floor_area = address['finishedSqFt']
    street = address['address']['street']

    record = dict(address=street, zipcode=zipcode, sold=last_price)
    record['sold year'] = sold_on.year
    record['sqft'] = floor_area
    record['var'] = 1
    return pd.Series(record)

In [None]:
# Function to determine which key to use during Zillow API query process. According to doc, there's a limitation of
# 1000 calls per key per day
def get_key(counter):
    """Pick the Zillow API key for the ``counter``-th request.

    Each key covers a band of 999 consecutive counter values (1-999, 1000-1998,
    ...), which stays below the 1000-calls-per-key limit quoted above. Anything
    past the eighth band falls through to the last key.

    Args:
        counter: running number of the request being made.

    Returns:
        One of the module-level ``a_keys`` ... ``i_keys`` values.
    """
    # (inclusive upper bound of the band, key serving that band)
    bands = (
        (999, a_keys),
        (1998, b_keys),
        (2997, c_keys),
        (3996, d_keys),
        (4995, e_keys),
        (5994, f_keys),
        (6993, g_keys),
        (7992, h_keys),
    )
    for upper_bound, band_key in bands:
        if counter <= upper_bound:
            return band_key
    return i_keys

In [4]:
# Read NY Finance Department file
ny18_df = pd.read_csv('Files/2018NYSales.csv')

# The sale-price header in the file is padded with a blank on each side;
# rename it so the rest of the notebook can refer to plain 'SALE PRICE'
ny18_df = ny18_df.rename(columns={' SALE PRICE ': 'SALE PRICE'})

# Check the amount of rows. This has to be the cell's last expression:
# a bare expression in the middle of a cell is evaluated but never displayed.
len(ny18_df)

In [6]:
# Display all columns from file (the sale-price column appears under its renamed label 'SALE PRICE')
ny18_df.columns

Index(['BOROUGH', 'NEIGHBORHOOD', 'BUILDING CLASS CATEGORY',
       'TAX CLASS AT PRESENT', 'BLOCK', 'LOT', 'EASE-MENT',
       'BUILDING CLASS AT PRESENT', 'ADDRESS', 'APARTMENT NUMBER', 'ZIP CODE',
       'RESIDENTIAL UNITS', 'COMMERCIAL UNITS', 'TOTAL UNITS',
       'LAND SQUARE FEET', 'GROSS SQUARE FEET', 'YEAR BUILT',
       'TAX CLASS AT TIME OF SALE', 'BUILDING CLASS AT TIME OF SALE',
       'SALE PRICE', 'SALE DATE'],
      dtype='object')

In [8]:
# Generate descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): the saved output shows a minimum of 0 for ZIP CODE and YEAR BUILT —
# presumably placeholders for missing values; confirm before relying on those columns.
ny18_df.describe()

Unnamed: 0,BOROUGH,BLOCK,LOT,ZIP CODE,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,YEAR BUILT,TAX CLASS AT TIME OF SALE
count,16369.0,16369.0,16369.0,16369.0,16369.0,16369.0,16369.0,16369.0,16369.0
mean,1.0,1134.10416,736.36899,10015.058281,1.837681,0.341438,2.305761,1702.245586,2.120716
std,0.0,526.607997,910.316535,385.473717,13.408085,4.091756,14.379706,652.533099,0.5415
min,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,1.0,798.0,29.0,10013.0,0.0,0.0,0.0,1910.0,2.0
50%,1.0,1171.0,1002.0,10022.0,0.0,0.0,1.0,1931.0,2.0
75%,1.0,1469.0,1185.0,10028.0,1.0,0.0,1.0,1973.0,2.0
max,1.0,2250.0,9101.0,10463.0,597.0,180.0,601.0,2018.0,4.0


In [12]:
# Sales at or below this price are excluded from the analysis (see the filter below)
MIN_SALE_PRICE = 100000

# Prepare Address data to use into Zillow API: trim surrounding blanks and put '+'
# wherever the address has a space. Using the vectorized .str.replace keeps missing
# addresses as NaN instead of failing inside a Python-level join.
ny18_df['ADDRESS'] = ny18_df['ADDRESS'].str.strip().str.replace(' ', '+', regex=False)

# Clean Sale Price data: drop '$' and thousands separators, then turn the '-'
# placeholder into '0' so the column can be converted to a number.
# The pattern is a raw string because '\$' is not a valid escape in a normal string.
ny18_df['SALE PRICE'] = (
    ny18_df['SALE PRICE']
    .replace(r'[\$,]', '', regex=True)
    .replace('[-,]', '0', regex=True)
)

# Keep only sales above MIN_SALE_PRICE to avoid a bias after preliminary research on what the
# lower sale prices represent, like: internal family sale transfer at $0, garage sale etc.
ny18_df = ny18_df[ny18_df['SALE PRICE'].astype(float) > MIN_SALE_PRICE]

# Removing duplicated addresses (the first occurrence of each address is kept)
ny18_df = ny18_df.drop_duplicates(subset=['ADDRESS'], keep='first')

In [13]:
# Display how many rows are left after cleaning. Duplicate ADDRESS values were dropped above,
# so this is the number of unique addresses that will be sent to the Zillow API.
len(ny18_df)

8951

In [None]:
# Variables to be used during Zillow query process
ZILLOW_SEARCH_URL = "https://www.zillow.com/webservice/GetDeepSearchResults.htm"
REQUEST_TIMEOUT_SECONDS = 30  # do not let one stalled connection hang the whole run
series = []    # one pd.Series per Zillow result that carried rent and/or sale data
curdoc = {}    # not used in this cell; kept so any later cell that references it still runs
counter = 0    # how many addresses were processed; also drives API key selection

# Loop through Address and Zip Code from NY spreadsheet data
for counter, (address, zipcode) in enumerate(zip(ny18_df['ADDRESS'], ny18_df['ZIP CODE']), start=1):
    # Call get_key function to return the API Key we'll use
    api_key = get_key(counter)
    try:
        # Log the lookup WITHOUT the zws-id so API keys never end up in saved notebook output
        print(f"Querying Zillow: address={address} citystatezip={zipcode} counter={counter}")
        response_main = requests.get(
            ZILLOW_SEARCH_URL,
            params={
                'zws-id': api_key,
                # ADDRESS was prepared with '+' between words. Turn those back into spaces and
                # let requests build the query string, so characters such as '#' or '&' in an
                # address are escaped instead of corrupting the URL (spaces are sent as '+').
                'address': address.replace('+', ' '),
                'citystatezip': zipcode,
                'rentzestimate': 'true',
            },
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
        # force_list makes 'result' a list even when Zillow returns a single match
        doc = xmltodict.parse(response_main.text, force_list={'result': True})['SearchResults:searchresults']['response']['results']['result']

        # Loop through each element returned from the API call
        for i, result in enumerate(doc):
            try:
                print(f"Record {i}: {result['address']['street']}")
                if check_data(result):
                    series.append(get_estimate(result, zipcode))
                elif need_rent_estimate(result):
                    series.append(get_rent_estimate(result, zipcode))
                elif need_sale_estimate(result):
                    series.append(get_property_estimate(result, zipcode))
                else:
                    print("NO ESTIMATED FUNCTIONS ENTERED!!")
            except Exception as e:
                # Best effort: one malformed record must not discard the other
                # results returned for the same address.
                print(f"Skipping record {i} for {address}: {e!r}")

            print(f'End of record {i + 1}')
            print('-------------------------------\n')
    except requests.RequestException as e:
        # Only the exception type is shown: requests error messages can embed the
        # full URL, which includes the API key.
        print(f"Request failed for {address} ({zipcode}): {type(e).__name__}")
    except Exception as e:
        # Best effort: responses without results (or unparsable ones) are reported and skipped.
        print(f"Lookup failed for {address} ({zipcode}): {e!r}")
    finally:
        # Sleep required due to Socket Error even though we didn't reach 1000 calls per API Key.
        # Done in `finally` so failed lookups are throttled as well.
        time.sleep(1)

# Assemble the result table once, after the loop, instead of rebuilding it on every iteration.
# `axis` is passed by keyword (positional use is rejected by recent pandas), and an empty
# frame is produced when nothing was collected so the export cell still has a `df` to write.
df = pd.concat(series, axis=1, sort=True).T if series else pd.DataFrame()

In [None]:
# Save all data gathered from Zillow API into a csv file.
# `df` is the DataFrame assembled by the query loop above; the path is relative,
# so the file is written to the directory the notebook is running from.
df.to_csv("OutputData.csv")