In [4]:
import os

mypath = os.path.join('zillow-data')
json_files = [os.path.join(mypath, f) for f in os.listdir(mypath) if os.path.isfile(os.path.join(mypath, f))]

In [5]:
import json
import pandas as pd


# Utility function to convert currency string to float.
def value_to_float(x):
    
    # Remove the $ and , characters from x.
    x = x.replace('$', '')
    x = x.replace(',', '')
        
    # Handle thousands.
    if 'K' in x:
        if len(x) > 1:
            return float(x.replace('K', '')) * 1000
        return 1000.0
    
    # Handle millions.
    if 'M' in x:
        if len(x) > 1:
            return float(x.replace('M', '')) * 1000000
        return 1000000.0
    
    return float(x)

# Initialize a list to keep track of all homes.
all_homes = []

for fn in json_files:
    
    home_info = None
    
    try:
        f = open(fn)
        home_info = json.load(f)
    except Exception as e:
        print (f'Exception {e} encountered!')
        
    current_homes = home_info['cat1']['searchResults']['mapResults']
    
    for home in current_homes:
        
        # If the price field is not filled in, continue.
        if 'price' not in home or home['price'] == '':
            continue
        
        # If the hdpData field does not exist, continue.
        if 'hdpData' not in home:
            continue
        
        h = home['hdpData']['homeInfo'].copy()
        h['price'] = value_to_float(home['price'])
        
        # Parse the address.
        h['addr'] = home['detailUrl'].split('/')[2].replace('-', ' ').lower()
    
        # Insert this home in all homes.
        all_homes.append(h)

# Load in the dataframe.
sold_homes_df = pd.DataFrame(all_homes)

sold_homes_df = sold_homes_df.drop(['providerListingID', 
                                    'newConstructionType',
                                    'unit',
                                    'videoCount',
                                    'contingentListingType',
                                    'isRentalWithBasePrice',
                                    'datePriceChanged',
                                    'priceReduction',
                                    'priceChange',
                                    'openHouse',
                                    'open_house_info',
                                    'group_type',
                                    'grouping_name',
                                    'priceSuffix'],
                                  axis=1)

# We do not need some columns and do not care if they contain NaN values. Drop those columns.

pd.set_option('display.max_columns', None)
print (f'All sold homes: {sold_homes_df.shape}')
sold_homes_df.head(10)

All sold homes: (15818, 32)


Unnamed: 0,zpid,zipcode,city,state,latitude,longitude,price,dateSold,bathrooms,bedrooms,livingArea,homeType,homeStatus,daysOnZillow,isFeatured,shouldHighlight,zestimate,rentZestimate,listing_sub_type,isUnmappable,isPreforeclosureAuction,homeStatusForHDP,priceForHDP,isNonOwnerOccupied,isPremierBuilder,isZillowOwned,currency,country,taxAssessedValue,lotAreaValue,lotAreaUnit,addr
0,37151520,20854,Potomac,MD,39.043986,-77.256456,800000.0,1625641200000,4.0,5.0,3650.0,SINGLE_FAMILY,RECENTLY_SOLD,-1,False,False,809700.0,4250.0,{},False,False,RECENTLY_SOLD,800000.0,True,False,False,USD,USA,1081300.0,5.1,acres,12400 beall mountain ln potomac md 20854
1,246554034,20814,Bethesda,MD,39.022005,-77.107586,800000.0,1617606000000,1.0,2.0,1400.0,SINGLE_FAMILY,RECENTLY_SOLD,-1,False,False,636600.0,2499.0,{},False,False,RECENTLY_SOLD,800000.0,True,False,False,USD,USA,507900.0,0.25,acres,10155 laureate way bethesda md 20814
2,37181535,20816,Bethesda,MD,38.958795,-77.11189,800000.0,1624604400000,3.0,4.0,2390.0,SINGLE_FAMILY,RECENTLY_SOLD,-1,False,False,812000.0,4364.0,{},False,False,RECENTLY_SOLD,800000.0,True,False,False,USD,USA,721567.0,8750.0,sqft,5516 massachusetts ave bethesda md 20816
3,37182564,20817,Bethesda,MD,38.974071,-77.131494,800000.0,1604563200000,3.0,4.0,1826.0,SINGLE_FAMILY,RECENTLY_SOLD,-1,False,False,867900.0,4058.0,{},False,False,RECENTLY_SOLD,800000.0,True,False,False,USD,USA,697767.0,9148.0,sqft,6504 marjory ln bethesda md 20817
4,37096069,20854,Potomac,MD,39.041634,-77.172128,800000.0,1620370800000,3.0,4.0,2810.0,SINGLE_FAMILY,RECENTLY_SOLD,-1,False,False,815700.0,3249.0,{},False,False,RECENTLY_SOLD,800000.0,True,False,False,USD,USA,611000.0,9811.0,sqft,11502 regency dr potomac md 20854
5,37172594,20817,Bethesda,MD,38.999271,-77.11395,800000.0,1627282800000,5.0,5.0,2700.0,SINGLE_FAMILY,RECENTLY_SOLD,-1,False,False,806500.0,5269.0,{'is_newHome': True},False,False,FOR_SALE,1625000.0,True,True,False,USD,USA,801500.0,6900.0,sqft,5616 southwick st bethesda md 20817
6,37286594,20910,Silver Spring,MD,39.003488,-77.038573,800000.0,1613980800000,4.0,3.0,2304.0,SINGLE_FAMILY,RECENTLY_SOLD,-1,False,False,842700.0,3270.0,{},False,False,RECENTLY_SOLD,800000.0,True,False,False,USD,USA,534800.0,6570.0,sqft,1701 highland dr silver spring md 20910
7,37282676,20910,Silver Spring,MD,38.998174,-77.016036,800000.0,1610092800000,3.0,4.0,1900.0,SINGLE_FAMILY,RECENTLY_SOLD,-1,False,False,848800.0,2856.0,{},False,False,RECENTLY_SOLD,800000.0,True,False,False,USD,USA,617567.0,6604.0,sqft,505 deerfield ave silver spring md 20910
8,37173681,20815,Chevy Chase,MD,39.0,-77.076248,800000.0,1609142400000,2.0,5.0,2115.0,SINGLE_FAMILY,RECENTLY_SOLD,-1,False,False,858700.0,3652.0,{},False,False,RECENTLY_SOLD,800000.0,True,False,False,USD,USA,658567.0,0.379293,acres,8801 kensington pkwy chevy chase md 20815
9,37181682,20815,Chevy Chase,MD,38.980442,-77.063683,800000.0,1616050800000,3.0,3.0,1913.0,SINGLE_FAMILY,RECENTLY_SOLD,-1,False,False,836000.0,3599.0,{},False,False,RECENTLY_SOLD,800000.0,True,False,False,USD,USA,715233.0,8679.0,sqft,3200 winnett rd chevy chase md 20815


In [6]:
pd.reset_option('display.max_columns', None)

In [7]:
# Drop null rows.
sold_homes_df.dropna()

print (f'All sold homes after dropping null values: {sold_homes_df.shape}')
sold_homes_df.head(10)

All sold homes after dropping null values: (15818, 32)


Unnamed: 0,zpid,zipcode,city,state,latitude,longitude,price,dateSold,bathrooms,bedrooms,...,priceForHDP,isNonOwnerOccupied,isPremierBuilder,isZillowOwned,currency,country,taxAssessedValue,lotAreaValue,lotAreaUnit,addr
0,37151520,20854,Potomac,MD,39.043986,-77.256456,800000.0,1625641200000,4.0,5.0,...,800000.0,True,False,False,USD,USA,1081300.0,5.1,acres,12400 beall mountain ln potomac md 20854
1,246554034,20814,Bethesda,MD,39.022005,-77.107586,800000.0,1617606000000,1.0,2.0,...,800000.0,True,False,False,USD,USA,507900.0,0.25,acres,10155 laureate way bethesda md 20814
2,37181535,20816,Bethesda,MD,38.958795,-77.11189,800000.0,1624604400000,3.0,4.0,...,800000.0,True,False,False,USD,USA,721567.0,8750.0,sqft,5516 massachusetts ave bethesda md 20816
3,37182564,20817,Bethesda,MD,38.974071,-77.131494,800000.0,1604563200000,3.0,4.0,...,800000.0,True,False,False,USD,USA,697767.0,9148.0,sqft,6504 marjory ln bethesda md 20817
4,37096069,20854,Potomac,MD,39.041634,-77.172128,800000.0,1620370800000,3.0,4.0,...,800000.0,True,False,False,USD,USA,611000.0,9811.0,sqft,11502 regency dr potomac md 20854
5,37172594,20817,Bethesda,MD,38.999271,-77.11395,800000.0,1627282800000,5.0,5.0,...,1625000.0,True,True,False,USD,USA,801500.0,6900.0,sqft,5616 southwick st bethesda md 20817
6,37286594,20910,Silver Spring,MD,39.003488,-77.038573,800000.0,1613980800000,4.0,3.0,...,800000.0,True,False,False,USD,USA,534800.0,6570.0,sqft,1701 highland dr silver spring md 20910
7,37282676,20910,Silver Spring,MD,38.998174,-77.016036,800000.0,1610092800000,3.0,4.0,...,800000.0,True,False,False,USD,USA,617567.0,6604.0,sqft,505 deerfield ave silver spring md 20910
8,37173681,20815,Chevy Chase,MD,39.0,-77.076248,800000.0,1609142400000,2.0,5.0,...,800000.0,True,False,False,USD,USA,658567.0,0.379293,acres,8801 kensington pkwy chevy chase md 20815
9,37181682,20815,Chevy Chase,MD,38.980442,-77.063683,800000.0,1616050800000,3.0,3.0,...,800000.0,True,False,False,USD,USA,715233.0,8679.0,sqft,3200 winnett rd chevy chase md 20815


In [9]:
sold_homes_df.to_csv(os.path.join('sale_price.csv'), index=False)