## Imports

In [12]:
import datetime
import pandas as pd
import json

## Global Constants

In [1]:
DATA_FILENAME = "data.json"

## Helper Functions

In [119]:
def extract_relevant_data(full_zillow_data):
    """
    Pull the fields this notebook tracks out of one Zillow property payload.

    Argument: `full_zillow_data` is the parsed JSON dictionary for a sold house. The
              listing itself is read from `full_zillow_data["data"]["property"]`.

    Returns: a flat dict with the keys id, price, address, location, beds, baths,
             acres, sqft, year, lat, lon, realtor and date_sold (in that order).

    Any failed lookup or conversion (e.g. a KeyError for a missing field) propagates
    to the caller unchanged.
    """
    # Alternative payload layout (search results), kept for reference:
    #prop = full_zillow_data["cat1"]["searchResults"]["listResults"][0]
    prop = full_zillow_data["data"]["property"]

    record = {
        "id": int(prop["zpid"]),
        "price": float(prop["lastSoldPrice"]),
        "address": prop["abbreviatedAddress"],
        # The second entry of the location chip holds the locality string.
        "location": prop["formattedChip"]["location"][1]["fullValue"],
        "beds": float(prop["bedrooms"]),
        "baths": float(prop["bathrooms"]),
        "acres": float(prop["lotAreaValue"]),
    }

    # Lot size given in square feet is converted to acres (43560 sqft per acre);
    # any other unit is left untouched (assumed to already be acres -- TODO confirm).
    if prop["lotAreaUnits"] == "Square Feet":
        record["acres"] = record["acres"] / 43560.

    record.update({
        "sqft": int(prop["livingAreaValue"]),
        "year": int(prop["yearBuilt"]),
        "lat": float(prop["latitude"]),
        "lon": float(prop["longitude"]),
        "realtor": prop["brokerageName"],
        # dateSold is scaled down by 1000 (presumably milliseconds -> seconds).
        "date_sold": int(prop["dateSold"] / 1000),
    })

    return record

def add_entry(json_string, ids):
    """
    Parse one raw Zillow response and append its relevant fields to the data file.

    Arguments:
        `json_string` should be a non-pretty-printed raw JSON string copied from the
              Network > Response section of the web-browser's inspection window. It could be
              named 'async-create-page-state' or '?extensions=%7B...'.
        `ids` is a collection of the house ids already stored in the data file (as
              returned by `load_present_ids`). It is only read here, never updated.

    Side effects: appends one JSON line to DATA_FILENAME. If the house id is already in
    `ids`, a message is printed and nothing is written.
    """
    # strict=False allows control characters inside JSON strings; the pasted payload
    # crashes the parser with the default strict=True.
    full_house_data = json.loads(json_string, strict=False)
    relevant_data = extract_relevant_data(full_house_data)

    # If the id is a duplicate, don't write it to the file
    if relevant_data["id"] in ids:
        print("id %d is a duplicate" % (relevant_data["id"]))
        return

    # `with` guarantees the handle is closed even if the write raises.
    with open(DATA_FILENAME, 'a') as f:
        f.write(json.dumps(relevant_data) + "\n")

def load_present_ids():
    """
    Collect the ids of every house already stored in DATA_FILENAME.

    Returns: a set holding the "id" value of each non-blank JSON line in the file.
             An empty set is returned when the file does not exist yet, so the very
             first entry can be added without creating the file by hand.
    """
    ids = set()
    try:
        f = open(DATA_FILENAME, 'r')
    except FileNotFoundError:
        # Nothing stored yet; add_entry creates the file on its first append.
        return ids

    # `with` closes the handle even if a line fails to parse.
    with f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. left behind by manual edits).
                continue
            house_data = json.loads(line)
            ids.add(house_data["id"])
    return ids

# Add a New Entry Here

In [121]:
# Ids already stored in the data file, used to reject duplicates below.
ids = load_present_ids()

# Before running this cell, replace the contents of raw_json.txt with the raw JSON
# response of the house to add.
with open("raw_json.txt", 'r') as f:  # closed automatically, even if the read fails
    raw_json = f.read()

# Uncomment this code to print the relevant JSON fields of the pasted response
#js = json.loads(raw_json)["data"]["property"]
#for key in js:
#    if not "Photo" in key and not "View" in key and not "Map" in key and not "Json" in key and not "nearby" in key and not "Valuation" in key:
#        print(key, js[key])

add_entry(raw_json, ids)

## Inspect the Data

In [127]:
# Read every stored record into per-column lists and show them as a table.
all_data = {}
with open(DATA_FILENAME, 'r') as f:  # the handle was previously never closed
    for line in f:
        if not line.strip():
            # Tolerate blank lines instead of failing in json.loads.
            continue
        house_data = json.loads(line)
        for key in house_data:
            if key == "date_sold":
                # Print the sale date in readable form; fromtimestamp interprets the
                # stored epoch seconds in the machine's local time zone.
                print(datetime.datetime.fromtimestamp(house_data[key]))
            # NOTE(review): the table keeps the raw stored values, so date_sold stays
            # as epoch seconds; only the printout above is human-readable.
            all_data.setdefault(key, []).append(house_data[key])
df = pd.DataFrame.from_dict(all_data)
df

2023-09-07 20:00:00
2023-08-23 20:00:00
2022-09-07 20:00:00


Unnamed: 0,id,price,address,location,beds,baths,sqft,year,lat,lon,realtor,acres,date_sold
0,31765002,280000.0,4202 Henneberry Rd,"Manlius, NY 13104",3.0,2.0,1616,1977,42.975567,-76.0025,Howard Hanna Real Estate,1.24,1694131200
1,31728477,201000.0,4515 Apulia Rd,"Jamesville, NY 13078",4.0,3.0,2100,1971,42.990322,-76.07226,Bell Home Team,0.57,1692835200
2,61998985,400000.0,8195 Trellis Brook Ln,"Liverpool, NY 13090",4.0,3.0,2552,2003,43.169643,-76.21038,NextHome CNY Realty,0.229,1662595200
