In [360]:
import datetime
import math
import os
import random
import re
import sys
import time
from urllib.request import urlopen

import geopy.geocoders as gc
import googlemaps
import matplotlib.pyplot as plt
import names
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from faker import Factory
from faker import Faker
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from pyzillow.pyzillow import ZillowWrapper, GetDeepSearchResults, GetUpdatedPropertyDetails
from scipy.spatial.distance import cdist, euclidean
from scipy.stats import uniform, norm, beta, weibull_min, rv_discrete
from selenium import webdriver


%matplotlib inline

In [399]:
# Load the raw building-damage records from the Excel export.
# `np.int` was only an alias for the builtin `int` and has been removed from
# NumPy, so the builtin is used directly.
# NOTE(review): `datetime.datetime` is not a pandas/NumPy datetime dtype; confirm
# 'Entry' is parsed as intended (`parse_dates=['Entry']` may be clearer).
napa_data_raw = pd.read_excel(
    'Napa Building Damage Raw Dataset.xlsx',
    dtype={
        'Object ID': int,
        'Address': str,
        'Latitude': np.float32,
        'Longitude': np.float32,
        'Entry': datetime.datetime,
        'Tag': str,
        'Description': str,
    },
)

In [400]:
# Preview the first rows of the raw dataset.
napa_data_raw.head()

Unnamed: 0,Object ID,Address,Latitude,Longitude,Entry,Tag,Description
0,981,1 BURNETTE CT,38.315235,-122.308174,2014-09-05,Yellow,"chimney broken at roof line, avoid side yard u..."
1,809,1 EDITH CT,38.314804,-122.310585,2014-09-23,Yellow,chimney cracked at roofline - do not use firep...
2,2379,1 FIRST ST,38.303623,-122.273598,2014-09-05,Yellow,"separation of chimney bricks, Restriction arou..."
3,1196,1 HUNTINGTON CT,38.304222,-122.323288,2014-09-05,Green,no posting - property owner structural engr
4,1242,1 LUKE DR,38.343555,-122.327278,2014-09-05,Green,structure good


In [401]:
# Load the hand-modified building dataset from its Excel workbook.
# `np.int` / `np.float` were only aliases for the builtins and have been removed
# from NumPy, so the builtins are used directly.
# NOTE(review): `datetime.datetime` is not a pandas/NumPy datetime dtype; confirm
# the permit/year columns are parsed as intended.
napa_data_hua = pd.read_excel(
    'Napa Building Damage Modified by Henry.xlsx',
    dtype={
        'Occupancy': str,
        'Address': str,
        'Latitude': np.float32,
        'Longitude': np.float32,
        'Permit Issued': datetime.datetime,
        'Permit Finaled': datetime.datetime,
        'Year': datetime.datetime,
        'Chimney': int,
        'Value': float,
        'Area': float,
        'Foundation': int,
        'Damage Level': int,
    },
)

In [402]:
# Preview the first rows of the modified dataset.
napa_data_hua.head()

Unnamed: 0,Address,Occupancy,Area,Year,Value,Chimney,Foundation,Damage Level,Permit Issued,Permit Finaled
0,1 1ST ST,Residential,1800.0,,,0,0,2,NaT,NaT
1,1 BURNETTE CT,Residential,1106.0,1954.0,372800.0,0,0,2,NaT,NaT
2,1 EDITH CT,Residential,2200.0,,,0,0,2,NaT,NaT
3,1 ROSE LN,Residential,1292.0,1951.0,505800.0,0,0,1,2014-11-21,NaT
4,100 LILIENTHAL AVE,Residential,1572.0,1955.0,507700.0,0,0,2,NaT,NaT


In [416]:
# Outer-join the two datasets on the street address so that rows present in
# only one of them are kept (their missing columns become NaN/NaT).
napa_data = napa_data_raw.merge(napa_data_hua, how='outer', on='Address')

In [412]:
# Number of rows in the merged frame.
len(napa_data)

3462

In [405]:
# Preview the first rows of the merged frame.
napa_data.head()

Unnamed: 0,Object ID,Address,Latitude,Longitude,Entry,Tag,Description,Occupancy,Area,Year,Value,Chimney,Foundation,Damage Level,Permit Issued,Permit Finaled
0,981.0,1 BURNETTE CT,38.315235,-122.308174,2014-09-05,Yellow,"chimney broken at roof line, avoid side yard u...",Residential,1106.0,1954.0,372800.0,0.0,0.0,2.0,NaT,NaT
1,809.0,1 EDITH CT,38.314804,-122.310585,2014-09-23,Yellow,chimney cracked at roofline - do not use firep...,Residential,2200.0,,,0.0,0.0,2.0,NaT,NaT
2,2379.0,1 FIRST ST,38.303623,-122.273598,2014-09-05,Yellow,"separation of chimney bricks, Restriction arou...",,,,,,,,NaT,NaT
3,1196.0,1 HUNTINGTON CT,38.304222,-122.323288,2014-09-05,Green,no posting - property owner structural engr,,,,,,,,NaT,NaT
4,1242.0,1 LUKE DR,38.343555,-122.327278,2014-09-05,Green,structure good,,,,,,,,NaT,NaT


In [None]:
# Shared Faker generator used to synthesize names.
fake = Faker()


def set_name(x):
    """Return a newly generated fake full name.

    ``x`` is ignored; presumably it is accepted so the function can be passed
    to ``Series.apply``.
    """
    generated_name = fake.name()
    return generated_name

def set_income(income_series, count_series):
    """Draw one income at random, weighted by how often each income occurs.

    Parameters
    ----------
    income_series : array-like of int
        Candidate income values.
    count_series : array-like
        Count for each candidate; normalized here into probabilities.

    Returns
    -------
    A single value sampled from ``income_series``.
    """
    probabilities = count_series / count_series.sum()
    sampler = rv_discrete(name='Income', values=(income_series, probabilities))
    return sampler.rvs()

def set_savings(income):
    """Estimate savings as an income-dependent fraction of income.

    The savings rate is 25% scaled by the density of a Beta(2, 1) distribution
    (shifted to start at 5,000 and stretched over 200,000) at ``income``,
    relative to the density at 205,000. Incomes outside that support have zero
    density and therefore yield zero savings.

    Returns ``None`` when ``income`` is missing.
    """
    if pd.isnull(income):
        return None

    reference = beta(a=2, b=1, loc=5000, scale=200000)
    peak_density = reference.pdf(205000)
    relative_density = reference.pdf(income) / peak_density
    return 0.25 * relative_density * income

def set_house_value(income):
    """Synthesize a house value as a random multiple (2x-3x) of income.

    Parameters
    ----------
    income : float
        Household income; may be NaN/None.

    Returns
    -------
    float or None
        ``income`` times a uniform draw from [2, 3], or ``None`` when
        ``income`` is missing.
    """
    # The original guard was `~pd.isnull(income)`, which made the function
    # return early for every non-null income.
    if pd.isnull(income):
        return None
    min_house_price_multiplier = 2
    return income * uniform.rvs(loc=min_house_price_multiplier, scale=1)

def set_house_area(value):
    """Derive a floor area (sq ft) from a house value at $150 per square foot.

    Parameters
    ----------
    value : float
        House value in dollars; may be NaN/None.

    Returns
    -------
    float, int or None
        ``value / 150``, floored at 500 sq ft; ``None`` when ``value`` is
        missing.
    """
    # `pd.isnum` does not exist (AttributeError); the null check is `pd.isnull`.
    if pd.isnull(value):
        return None
    dollar_per_sf = 150
    min_area = 500
    area = value / dollar_per_sf
    if int(area) < min_area:
        return min_area
    return area

def set_mortgage_payment(value, annual_rate=0.05, years=30, down_payment=0.1):
    """Compute the fixed monthly payment on a mortgage for a house.

    Parameters
    ----------
    value : float
        House value in dollars; may be NaN/None.
    annual_rate : float, default 0.05
        Nominal annual interest rate, compounded monthly.
    years : int, default 30
        Loan term in years.
    down_payment : float, default 0.1
        Fraction of ``value`` paid up front and not financed.

    Returns
    -------
    float or None
        Positive monthly payment, or ``None`` when ``value`` is missing.
    """
    if pd.isnull(value):
        return None
    monthly_rate = annual_rate / 12
    num_payments = years * 12
    loan_value = value - value * down_payment
    if monthly_rate == 0:
        # With no interest the payment is an even split of the principal.
        return loan_value / num_payments
    # `np.pmt` was removed from NumPy; this is the same end-of-period annuity
    # formula, written out so no extra dependency is needed.
    growth = (1 + monthly_rate) ** num_payments
    return loan_value * monthly_rate * growth / (growth - 1)

def set_occupancy(income):
    """Pick a dwelling type for a household.

    Incomes of 50,000 or more are always 'Single Family Dwelling'. Otherwise a
    uniform [0, 1) draw decides: 0.2 or above gives 'Single Family Dwelling',
    below 0.2 gives 'Mobile Home'.
    """
    # `or` short-circuits, so the random draw only happens for lower incomes.
    if income >= 50000 or uniform.rvs(0, 1) >= 0.2:
        return 'Single Family Dwelling'
    return 'Mobile Home'

def set_listing():
    """Randomly flag a listing: ``True`` when a uniform [0, 1) draw is below 0.3333."""
    draw = uniform.rvs(0, 1)
    # `not` keeps the result a plain Python bool, as before.
    return not draw >= 0.3333

def set_credit(x):
    """Return a random integer credit score; ``x`` is ignored.

    The score is a uniform draw starting at 300 with a width of 350 (that is,
    between 300 and 650), truncated to an integer.
    """
    raw_score = uniform.rvs(loc=300, scale=350)
    return int(raw_score)

def set_insurance(income):
    """Return the insurance coverage level for a household: 0.8 or 0.0.

    Coverage is 0.8 when the Beta(2, 1) density (shifted to 5,000, stretched
    over 200,000) at ``income`` exceeds half of the density at 205,000;
    otherwise it is 0.0.
    """
    reference = beta(a=2, b=1, loc=5000, scale=200000)
    peak_density = reference.pdf(205000)
    relative_density = reference.pdf(income) / peak_density
    return 0.8 if relative_density > 0.5 else 0.0

def set_bedrooms(area):
    """Estimate the bedroom count from floor area.

    Homes of 500 sq ft or less get 0 bedrooms. Larger homes dedicate 30% of
    their area to bedrooms at 200 sq ft each, truncated to an integer.

    Returns ``None`` when ``area`` is missing, matching the other generators,
    instead of raising on ``int(nan)``.
    """
    if pd.isnull(area):
        return None
    if area <= 500:
        return 0
    bedrooms_pct = 0.3
    avg_sf = 200
    return int((bedrooms_pct * area) / avg_sf)

def set_bathrooms(area):
    """Estimate the bathroom count from floor area.

    Homes of 500 sq ft or less get 1 bathroom. Larger homes dedicate 10% of
    their area to bathrooms at 100 sq ft each, truncated to an integer, with a
    minimum of 1.

    Returns ``None`` when ``area`` is missing, matching the other generators,
    instead of raising on ``int(nan)``.
    """
    if pd.isnull(area):
        return None
    if area <= 500:
        return 1
    bathrooms_pct = 0.1
    avg_sf = 100
    return max(int((bathrooms_pct * area) / avg_sf), 1)

In [61]:
# napa_data['Name'] = np.nan
# napa_data['Credit'] = np.nan
# napa_data['Savings'] = np.nan
# napa_data['Bedrooms'] = napa_data['Area']
# napa_data['Bathrooms'] = napa_data['Area']
# napa_data['Mortgage Payment'] = napa_data['Value']
# napa_data['Listing'] = np.nan
# napa_data['Insurance'] = np.nan


In [None]:
# Fill only the missing house values with ones synthesized from income.
# NOTE(review): no visible cell creates an 'Income' column on napa_data, so
# this raises KeyError unless it is added elsewhere - confirm.
napa_data.loc[napa_data['Value'].isna(), 'Value'] = napa_data['Income'].apply(set_house_value)

In [None]:
# napa_data['Credit'].apply(set_credit);
# napa_data['Name'].apply(set_name);
# napa_data['Savings'].apply(set_savings);

Example: assigning new values to a filtered slice of a DataFrame with `.loc`

In [63]:
# Small toy frame for trying out slice assignment: a = 1..5, b = 10..50.
f = pd.DataFrame({
    'a': list(range(1, 6)),
    'b': [10 * n for n in range(1, 6)],
})

In [64]:
# Divide 'b' by 10, but only on the rows where 'a' is at most 3.
f.loc[f['a'].le(3), 'b'] = f.loc[f['a'].le(3), 'b'].div(10)

In [71]:
# Writes the 'Chimney' column into every row whose 'Value' is missing.
# NOTE(review): this replaces missing house values with the chimney column's
# values; it looks like a trial of the slice-assignment pattern rather than a
# real imputation - confirm it should run in the actual pipeline.
napa_data.loc[napa_data['Value'].isnull(),'Value'] = napa_data.loc[napa_data['Value'].isnull(),'Chimney']

In [72]:
# Show the 'Value' entries that are still missing (empty when none remain).
napa_data.loc[napa_data['Value'].isnull(),'Value']

Series([], Name: Value, dtype: object)

In [None]:
# napa_data['Mortgage Payment'].apply(set_mortgage_payment)

# Geocoding

In [6]:
# Read the Google Maps API key from the environment rather than storing it in
# the notebook. The key previously hardcoded in this cell should be treated as
# exposed and revoked.
google_api_key = os.environ.get('GOOGLE_API_KEY')
if not google_api_key:
    raise RuntimeError(
        'Set the GOOGLE_API_KEY environment variable before running the geocoding cells.'
    )

# geopy geocoder used by the batch ZIP-code lookup.
geolocator = gc.GoogleV3(api_key=google_api_key)

In [None]:
# Geocode one hard-coded address as a smoke test of the Google Maps client.
street_address, city, state = '3525 SILVERADO TRL', "NAPA", 'CA'
full_address = ', '.join((street_address, city, state))

gmaps = googlemaps.Client(key=google_api_key)
geocode_results = gmaps.geocode(full_address)

In [9]:
def get_city(geocode_results):
    """Return the city name from one geocoding result.

    Looks for the first entry of ``geocode_results['address_components']``
    whose ``'types'`` include ``'locality'`` and returns its ``'long_name'``.
    Returns ``None`` if the key is absent or no component matches.
    """
    if 'address_components' not in geocode_results:
        return None
    matches = (
        component['long_name']
        for component in geocode_results['address_components']
        if 'locality' in component['types']
    )
    return next(matches, None)
            
def get_zip(geocode_results):
    """Return the postal code from one geocoding result.

    Looks for the first entry of ``geocode_results['address_components']``
    whose ``'types'`` include ``'postal_code'`` and returns its ``'long_name'``.
    Returns ``None`` if the key is absent or no component matches.
    """
    if 'address_components' not in geocode_results:
        return None
    matches = (
        component['long_name']
        for component in geocode_results['address_components']
        if 'postal_code' in component['types']
    )
    return next(matches, None)
                
def get_state(geocode_results):
    """Return the state name from one geocoding result.

    Looks for the first entry of ``geocode_results['address_components']``
    whose ``'types'`` include ``'administrative_area_level_1'`` and returns its
    ``'long_name'``. Returns ``None`` if the key is absent or nothing matches.
    """
    if 'address_components' not in geocode_results:
        return None
    matches = (
        component['long_name']
        for component in geocode_results['address_components']
        if 'administrative_area_level_1' in component['types']
    )
    return next(matches, None)

In [None]:
# Apply the extractors to the first geocoding hit. The city lookup is left
# disabled, so `city` below is still the value typed in by hand.
# city = get_city(geocode_results[0]) 
zipcode, state = get_zip(geocode_results[0]), get_state(geocode_results[0])
print(city, ' ', state, ' ', zipcode)

In [417]:
# Geocode every address to obtain its ZIP code, one entry per row of napa_data.
# Used when the lookup fails for any reason.
FALLBACK_ZIPCODE = '94559'

zipcode = []

# napa_data = napa_data[:4]

for street_address in napa_data['Address']:
    time.sleep(0.1)  # pause between requests
    try:
        raw_results = geolocator.geocode(query=street_address + ', NAPA, CA', exactly_one=True).raw
        zipcode.append(get_zip(raw_results))
    except Exception:
        # Best effort: a failed lookup (no match, network error, missing
        # address) gets the fallback ZIP. The original appended
        # get_zip('94559'), which evaluates to None, and its bare `except`
        # also swallowed KeyboardInterrupt.
        zipcode.append(FALLBACK_ZIPCODE)

In [418]:
import pickle

# Cache the geocoded ZIP codes on disk, then read them straight back so the
# slow geocoding loop need not be repeated.
# NOTE: only unpickle files you created yourself; pickle can execute code.
with open('napa_zipcodes.pkl', 'wb') as zip_file:
    pickle.dump(zipcode, zip_file, protocol=pickle.HIGHEST_PROTOCOL)

with open('napa_zipcodes.pkl', 'rb') as zip_file:
    zipcode = pickle.load(zip_file)

In [419]:
# Store the geocoded ZIP codes in the 'Zipcode' column; `zipcode` must have
# one entry per row of napa_data.
napa_data.loc[:, 'Zipcode'] = zipcode

In [448]:
import pickle

# Snapshot the frame (now including ZIP codes) to disk.
with open('napa_data_w_zip.pkl', 'wb') as data_file:
    pickle.dump(napa_data, data_file, protocol=pickle.HIGHEST_PROTOCOL)

# Uncomment to restore the snapshot instead of re-running the cells above.
# with open('napa_data_w_zip.pkl', 'rb') as f:
#     napa_data = pickle.load(f)

In [431]:
# A quarter of the row count, used to size the four slices below.
len(napa_data) / 4

865.5

In [420]:
# Preview the first rows, now including the 'Zipcode' column.
napa_data.head()

Unnamed: 0,Object ID,Address,Latitude,Longitude,Entry,Tag,Description,Occupancy,Area,Year,Value,Chimney,Foundation,Damage Level,Permit Issued,Permit Finaled,Zipcode
0,981.0,1 BURNETTE CT,38.315235,-122.308174,2014-09-05,Yellow,"chimney broken at roof line, avoid side yard u...",Residential,1106.0,1954.0,372800.0,0.0,0.0,2.0,NaT,NaT,94558
1,809.0,1 EDITH CT,38.314804,-122.310585,2014-09-23,Yellow,chimney cracked at roofline - do not use firep...,Residential,2200.0,,,0.0,0.0,2.0,NaT,NaT,94558
2,2379.0,1 FIRST ST,38.303623,-122.273598,2014-09-05,Yellow,"separation of chimney bricks, Restriction arou...",,,,,,,,NaT,NaT,94559
3,1196.0,1 HUNTINGTON CT,38.304222,-122.323288,2014-09-05,Green,no posting - property owner structural engr,,,,,,,,NaT,NaT,94558
4,1242.0,1 LUKE DR,38.343555,-122.327278,2014-09-05,Green,structure good,,,,,,,,NaT,NaT,94558


In [445]:
# Split the frame into four consecutive, roughly equal row ranges.
# The slice size is derived from the current length (ceiling of len / 4)
# instead of being hard-coded; for the 3462-row frame this gives the same
# 866-row boundaries as before.
slice_size = -(-len(napa_data) // 4)

# .copy() makes each slice an independent frame, so adding columns to a slice
# later does not raise SettingWithCopyWarning or touch napa_data.
napa_data_slice1 = napa_data.iloc[:slice_size].copy()
napa_data_slice2 = napa_data.iloc[slice_size:2 * slice_size].copy()
napa_data_slice3 = napa_data.iloc[2 * slice_size:3 * slice_size].copy()
napa_data_slice4 = napa_data.iloc[3 * slice_size:].copy()

# Zillow Data

In [246]:
# Read the Zillow API key from the environment rather than storing it in the
# notebook. The key previously hardcoded in this cell should be treated as
# exposed and revoked.
zillow_api_key = os.environ.get('ZILLOW_API_KEY')
if not zillow_api_key:
    raise RuntimeError(
        'Set the ZILLOW_API_KEY environment variable before running the Zillow cells.'
    )

# pyzillow client used for the deep-search lookups.
zillow_data = ZillowWrapper(zillow_api_key)

In [337]:
# Look up every address of napa_data_slice1 on Zillow and attach the results
# as new columns.

# Parsed results of the successful lookups only (failed rows are skipped here).
deep_outputs = []

# Output column -> attribute of the parsed deep-search result that feeds it.
# NOTE: 'Longitude' and 'Latitude' replace the slice's existing coordinate
# columns with the Zillow values (NaN where the lookup failed).
zillow_columns = {
    'Home Type': 'home_type',
    'Home Size': 'home_size',
    'Year Built': 'year_built',
    'Bathrooms': 'bathrooms',
    'Bedrooms': 'bedrooms',
    'Zestimate': 'zestimate_amount',
    'Longitude': 'longitude',
    'Latitude': 'latitude',
    'Tax Value': 'tax_value',
}

# One dict per input row, always with every column present, so the per-column
# lists cannot drift out of alignment if a lookup fails partway through.
zillow_rows = []

for address, row_zipcode in zip(napa_data_slice1['Address'], napa_data_slice1['Zipcode']):
    print(address, ' ', row_zipcode)

    try:
        time.sleep(0.1)  # pause between requests
        deep_search_response = zillow_data.get_deep_search_results(address, row_zipcode)
        deep_output = GetDeepSearchResults(deep_search_response)
        zillow_row = {
            column: getattr(deep_output, attribute)
            for column, attribute in zillow_columns.items()
        }
    except Exception as e:
        # Best effort: report the failure and record NaN for every field.
        print('Zillow lookup failed for', address, '-', repr(e))
        zillow_row = dict.fromkeys(zillow_columns, np.nan)
    else:
        deep_outputs.append(deep_output)

    zillow_rows.append(zillow_row)

# Per-field lists, kept under their original names for the inspection cells.
home_type = [r['Home Type'] for r in zillow_rows]
home_size = [r['Home Size'] for r in zillow_rows]
year_built = [r['Year Built'] for r in zillow_rows]
bathrooms = [r['Bathrooms'] for r in zillow_rows]
bedrooms = [r['Bedrooms'] for r in zillow_rows]
zestimate = [r['Zestimate'] for r in zillow_rows]
long = [r['Longitude'] for r in zillow_rows]
lat = [r['Latitude'] for r in zillow_rows]
tax_value = [r['Tax Value'] for r in zillow_rows]

# Work on an independent copy so the column assignments do not trigger
# SettingWithCopyWarning on a slice of napa_data.
napa_data_slice1 = napa_data_slice1.copy()
napa_data_slice1['Home Type'] = home_type
napa_data_slice1['Home Size'] = home_size
napa_data_slice1['Year Built'] = year_built
napa_data_slice1['Bathrooms'] = bathrooms
napa_data_slice1['Bedrooms'] = bedrooms
napa_data_slice1['Zestimate'] = zestimate
napa_data_slice1['Longitude'] = long
napa_data_slice1['Latitude'] = lat
napa_data_slice1['Tax Value'] = tax_value

987 LIBERTY DR   94559
986 KAISER RD   94558
Crap
980 LIBERTY DR   94559
980 LIBERTY DR   94559


In [329]:
# Bedroom count of the third stored Zillow result; needs at least three
# entries in deep_outputs.
deep_outputs[2].bedrooms

'3'

In [331]:
# Print the home type of every stored Zillow result, one per line.
for result in deep_outputs:
    print(result.home_type)

SingleFamily
SingleFamily
SingleFamily


In [317]:
# Show the collected home types (NaN marks a failed lookup).
home_type

[nan, 'SingleFamily', 'SingleFamily', 'SingleFamily']

In [311]:
# NOTE(review): `d` is not assigned in any cell visible here, so this fails on
# a fresh kernel - confirm where it comes from or remove the cell.
d.tax_value

'585000.0'

In [195]:
# Call the deep-search endpoint directly for one hard-coded address and ZIP;
# the value displayed is the unparsed response object.
zillow_data.get_deep_search_results('620 N 34TH ST', '98105')

<Element '{http://www.zillow.com/static/xsd/SearchResults.xsd}searchresults' at 0x1a156f6048>

In [289]:
# Address and ZIP code used for the single-lookup test in the next cell.
# address = '10 ENTERPRISE CT'
address, zipcode = '987 LIBERTY DR', '94559'

In [290]:
# Look up one address on Zillow and parse the response into `z`.
try:
    z = GetDeepSearchResults(zillow_data.get_deep_search_results(address, zipcode))
except Exception as err:
    # Reset `z` so later cells cannot silently reuse a result from an earlier
    # address, and report what went wrong.
    z = None
    print('Zillow lookup failed for', address, zipcode, '-', repr(err))

In [291]:
# Tax value reported for the address looked up above.
z.tax_value

'647700.0'

In [218]:
# Attributes available on the parsed result. Jupyter displays only the value
# of the last expression in a cell, so the earlier lines show nothing here.
z.bathrooms
z.bedrooms
z.home_size
z.home_type
z.latitude
z.longitude
z.property_size
z.tax_value
z.year_built
z.zestimate_amount

'2.5'

SqFt
{'bedrooms': 'result/bedrooms', 'home_detail_link': 'result/links/homedetails', 'year_built': 'result/yearBuilt', 'zestimate_percentile': 'result/zestimate/percentile', 'longitude': 'result/address/longitude', 'home_size': 'result/finishedSqFt', 'bathrooms': 'result/bathrooms', 'zestimate_last_updated': 'result/zestimate/last-updated', 'zestimate_valuation_range_high': 'result/zestimate/valuationRange/high', 'property_size': 'result/lotSizeSqFt', 'last_sold_date': 'result/lastSoldDate', 'map_this_home_link': 'result/links/mapthishome', 'home_type': 'result/useCode', 'zillow_id': 'result/zpid', 'last_sold_price': 'result/lastSoldPrice', 'graph_data_link': 'result/links/graphsanddata', 'tax_value': 'result/taxAssessment', 'latitude': 'result/address/latitude', 'zestimate_value_change': 'result/zestimate/valueChange', 'zestimate_amount': 'result/zestimate/amount', 'tax_year': 'result/taxAssessmentYear', 'zestimate_valuationRange_low': 'result/zestimate/valuationRange/low'}
2.5
4
<Ele