In [1]:
import pandas as pd
from sklearn import preprocessing
import numpy as np
import math
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm 
from scipy import stats
from geopy.distance import geodesic
import itertools as it

#web scraping
import json
import requests
import time
import http.client, urllib.parse
import lxml.html as lx
from bs4 import BeautifulSoup

# Turn off pandas' chained-assignment check (SettingWithCopyWarning) for the
# whole notebook; None disables both the warning and the error mode.
pd.options.mode.chained_assignment = None

# GET DATA

#### Generate houses sample

In [None]:
'''
Generates a sample of up to 500 houses from each city
'cities' is a list of tuples where the first value is the city name and the second value is the state code
Returns a dataframe
'''
def getHouses(cities):
    """Fetch for-sale listings for each city and return them as one dataframe.

    Parameters
    ----------
    cities : iterable of (city, state) tuples
        City name and two-letter state code; up to 500 listings are
        requested per city.

    Returns
    -------
    pandas.DataFrame
        Columns 'Address', 'Price', 'City', 'State', 'SquareFootage',
        'PricePerFt', 'Latitude', 'Longitude'. Rows with any missing value
        and rows repeating an ('Address', 'City') pair are dropped. The
        frame is empty (but still has these columns) when nothing usable
        was returned.
    """
    import os  # local import: only needed for the optional key override

    url = "https://realty-mole-property-api.p.rapidapi.com/saleListings"
    columns = ['Address', 'Price', 'City', 'State', 'SquareFootage',
               'PricePerFt', 'Latitude', 'Longitude']

    headers = {
        # NOTE(security): the embedded key is kept only as a fallback so the
        # notebook keeps running; set RAPIDAPI_KEY and rotate this key.
        "X-RapidAPI-Key": os.environ.get(
            "RAPIDAPI_KEY", "43686bd243mshe8a0be6f9e0556cp10a1bbjsn58033bf0f546"),
        "X-RapidAPI-Host": "realty-mole-property-api.p.rapidapi.com"
    }

    houses_list = []

    for city, state in cities: #for each city, get up to 500 houses for sale
        querystring = {"city": city, "state": state, "limit": "500"}

        response = requests.request("GET", url, headers=headers, params=querystring)
        payload = response.json()

        # A successful call yields a list of listings. Anything else (e.g. an
        # error object) carries no houses, so skip it instead of iterating
        # over its keys.
        if not isinstance(payload, list):
            continue

        for house in payload: #get the info for each house
            if not isinstance(house, dict):
                continue

            price = house.get('price')
            sqf = house.get('squareFootage')
            try:
                pricePerSqft = price / sqf
            except (TypeError, ZeroDivisionError):
                # missing or zero square footage; the row is dropped by dropna below
                pricePerSqft = None

            # Append only inside the dict branch so a malformed element can
            # never re-append the previous row (or hit an unbound `row`).
            houses_list.append({
                'Address': house.get('formattedAddress'),
                'Price': price,
                'City': house.get('city'),
                'State': house.get('state'),
                'SquareFootage': sqf,
                'PricePerFt': pricePerSqft,
                'Latitude': house.get('latitude'),
                'Longitude': house.get('longitude'),
            })

    # Passing `columns` keeps the schema even when no rows were collected, so
    # drop_duplicates(subset=...) below cannot fail on an empty result.
    houses = pd.DataFrame(houses_list, columns=columns)

    #remove na and duplicates
    houses = houses.dropna(axis=0)
    houses = houses.drop_duplicates(subset=['Address', 'City'], keep='first', ignore_index=True)

    return houses


In [None]:

# Build the houses sample: query every (city, state) pair below through
# getHouses and persist the combined result as a CSV.
cities = [
    # California
    ('Los Angeles', 'CA'), ('Anaheim', 'CA'), ('Long Beach', 'CA'),
    # Illinois
    ('Chicago', 'IL'), ('Naperville', 'IL'), ('Elgin', 'IL'),
    # Texas
    ('Dallas', 'TX'), ('Fort Worth', 'TX'), ('Arlington', 'TX'),
    # DC and Virginia
    ('Washington', 'DC'), ('Arlington', 'VA'), ('Alexandria', 'VA'),
]

houses = getHouses(cities)

houses.to_csv('houses.csv', index=False)

#### Generate landmarks sample

In [None]:
'''
Generates the landmarks from each city and stores them in a dataframe
scrapes data from YellowPages.com
cities is a list of tuples where the first value is the city name and the second value is the state code
places is a list of landmark types
'''

def getPlaces(cities, places, max_pages=2):
    """Scrape YellowPages.com search results for every city / landmark type.

    Parameters
    ----------
    cities : iterable of (city, state) tuples
        City name and two-letter state code.
    places : iterable of str
        Search terms (landmark types). The three terms 'elementary school',
        'middle school' and 'high school' are searched separately but stored
        under the single label 'school'.
    max_pages : int, default 2
        Number of result pages requested per (city, place) pair.

    Returns
    -------
    pandas.DataFrame
        Columns 'Landmark', 'Name', 'Address', 'City', 'State', with missing
        values removed and rows repeating a ('Name', 'Address') pair dropped.
    """
    search_url = "https://www.yellowpages.com/search"
    request_headers = {
        "accept" : "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "en-US,en;q=0.9",
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36",
    }
    #these three search terms were used instead of just "school" because they yielded more relevant results
    school_terms = ('elementary school', 'middle school', 'high school')
    columns = ['Landmark', 'Name', 'Address', 'City', 'State']

    rows = []

    for city, state in cities: #for each city and each landmark type, get the yellow pages search results
        for place in places:
            # Keep the stored label separate from the search term: overwriting
            # `place` here would make every later page search for "school".
            label = 'school' if place in school_terms else place

            for page_number in range(1, max_pages + 1):
                # Let requests build and escape the query string (search terms
                # and city names may contain spaces).
                params = {"search_terms": place,
                          "geo_location_terms": f"{city}, {state}"}
                if page_number > 1:
                    params["page"] = page_number

                time.sleep(0.05)
                response = requests.get(search_url, headers=request_headers, params=params)
                page = BeautifulSoup(response.text, "html.parser")

                #get content
                addresses = [tag.get_text(strip=True)
                             for tag in page.find_all("div", class_="street-address")]
                names = [tag.get_text(strip=True)
                         for tag in page.find_all("a", class_="business-name")]

                # The page can list more names than addresses; the original code
                # assumed the surplus names sit at the end and trimmed them.
                # zip() applies the same rule per page and also guards the
                # opposite case, so the columns can never drift out of step.
                # NOTE(review): pairing by position assumes listings without an
                # address only appear last — confirm against the live markup.
                for name, address in zip(names, addresses):
                    rows.append((label, name, address, city, state))

    #now that we have all the data, turn the collected rows into a dataframe
    landmarks = pd.DataFrame(rows, columns=columns)

    #remove na and duplicates
    landmarks = landmarks.dropna(axis=0)
    landmarks = landmarks.drop_duplicates(subset=['Name', 'Address'], keep='first', ignore_index=True)

    return landmarks
    
    
        

In [None]:
# Scrape the landmark sample: every landmark type below is searched in every
# (city, state) pair, and the combined result is written to disk.
cities = [
    # California
    ('Los Angeles', 'CA'), ('Anaheim', 'CA'), ('Long Beach', 'CA'),
    # Illinois
    ('Chicago', 'IL'), ('Naperville', 'IL'), ('Elgin', 'IL'),
    # Texas
    ('Dallas', 'TX'), ('Fort Worth', 'TX'), ('Arlington', 'TX'),
    # DC and Virginia
    ('Washington', 'DC'), ('Arlington', 'VA'), ('Alexandria', 'VA'),
]

# Search terms passed verbatim to the scraper.
places = [
    'hospital', 'grocery', 'park', 'beach', 'cemetary',
    'shopping', 'restaurant', 'golf course', 'prison', 'worship',
    'elementary school', 'middle school', 'high school',
]

landmarks = getPlaces(cities, places)
landmarks.to_csv('landmarks_raw.csv', index=False)

#### Get the landmark coordinates

In [2]:
#GET COORDINATES FOR LANDMARKS 

'''
Converts addresses into GPS coordinates.
Takes a landmarks dataframe (with 'Address', 'City' and 'State' columns) and adds two new columns ('Latitude' and 'Longitude') containing the coordinates.
'''

def getCoords(df, delay=1):
    """Geocode every row of `df` through the positionstack forward API.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Address', 'City' and 'State' columns. The frame is
        modified in place: 'Latitude' and 'Longitude' columns are added.
    delay : float, default 1
        Seconds to wait after each request. If queries are sent too quickly,
        false coordinates will be returned; unfortunately, this necessary
        delay means the function takes hours on a large frame.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with the two coordinate columns filled in.
        Rows whose lookup returned unexpected content hold missing values,
        making them easy to filter out.
    """
    import os  # local import: only needed for the optional key override

    #api info
    # NOTE(security): the embedded key is kept only as a fallback so the
    # notebook keeps running; set POSITIONSTACK_KEY and rotate this key.
    key = os.environ.get('POSITIONSTACK_KEY', '614513ff11a7392f2a8c5c2ed0f88cfa')
    conn = http.client.HTTPConnection('api.positionstack.com')

    def query(address, state):
        """Send one query to the api and return (latitude, longitude), or None.

        'address' is a string in the form "street address, city, state";
        'state' is passed separately as the region code.
        """
        #api query parameters
        params = urllib.parse.urlencode(
            {
            'access_key': key,
            'query': address,
            'region_code': state,
            'country': 'US',
            'limit': 1
            })

        conn.request('GET', '/v1/forward?{}'.format(params)) #query
        results = conn.getresponse() #json data

        #occasionally, the query returns unexpected content
        #only the parsing failures that signal such content are caught here;
        #connection errors still propagate to the caller
        try:
            data = json.loads(results.read())['data'][0] #extracts the info we want from json, 'data' is a dict
            return (data['latitude'], data['longitude'])
        except (KeyError, IndexError, TypeError, ValueError):
            return None

    #lists to store coordinates
    latitude = []
    longitude = []

    #to monitor progress
    start_time = time.time()

    try:
        #call query() for each address and store return values
        rows = zip(df['Address'], df['City'], df['State'])
        for counter, (address, city, state) in enumerate(rows, start=1):
            result = query(address + ', ' + city + ', ' + state, state)

            lat, lon = result if result is not None else (None, None)
            latitude.append(lat)
            longitude.append(lon)

            #display progress
            if (counter % 2000) == 0:
                print(f'progress: {counter}/{len(df)}   time: {time.time() - start_time} secs')

            time.sleep(delay)
    finally:
        # release the socket even when a request fails part-way through
        conn.close()

    #add coordinates to dataframe
    df['Latitude'] = latitude
    df['Longitude'] = longitude

    return df
    

In [4]:
# Load the landmark list from disk.
# NOTE(review): the scraping cell writes 'landmarks_raw.csv' while this cell
# reads 'landmarks.csv' — presumably produced by a step not shown here; confirm.
landmarks = pd.read_csv('landmarks.csv')

# Geocode every row (one API request per row, so this is slow). getCoords adds
# 'Latitude'/'Longitude' to the frame it is given and returns that same frame.
landmarks_coords = getCoords(landmarks)

# Persist the geocoded landmarks without the index column.
landmarks_coords.to_csv('landmarks_coords.csv', index=False)


progress: 2000/17540   time: 2376.1752648353577 secs
progress: 4000/17540   time: 4746.536860466003 secs
progress: 6000/17540   time: 7117.204186201096 secs
progress: 8000/17540   time: 9484.64375281334 secs
progress: 10000/17540   time: 11842.947708845139 secs
progress: 12000/17540   time: 14202.179571390152 secs
progress: 14000/17540   time: 16543.056074142456 secs
progress: 16000/17540   time: 18892.102096796036 secs


"\nca_coords = getCoords(landmarks_ca)\nca_coords.to_csv('ca_coords.csv', index=False, float_format=f'%.{6}f')\n\ndc_coords = getCoords(landmarks_dc)\ndc_coords.to_csv('dc_coords.csv', index=False, float_format=f'%.{6}f')\n"

#### Find the minimum distance from each house to each landmark type

In [53]:
def test(house, landmark):

    def iterate_house(row):
        landmark_types = ['hospital', 'cemetary', 'park', 'beach', 'shopping', 'grocery', 'restaurant', 'golf course', 'worship', 'school']
        all_dist = []

        coord1 = (row['Latitude'], row['Longitude'])

        min_dist = np.empty(10)
        for place in landmark_types:
            places = landmark[landmark['Landmark']==place]
            all_dist = []

            latitude = places['Latitude'].tolist() 
            longitude = places['Longitude'].tolist()
            
            for lat, lon in zip(latitude, longitude):
                coord2 = (lat, lon)
                curr_dist = geodesic(coord1, coord2).miles
                all_dist.append(curr_dist)

            np.append(min_dist, min(all_dist))

        return min_dist
            
    distances = house.apply(iterate_house, axis=1) 
    return distances

    houses = [(house['Latitude'][i], house['Longitude'][i]) for i in range(len(house['Latitude']))]
    landmarks = [(landmark['Latitude'][i], landmark['Longitude'][i]) for i in range(len(landmark['Latitude']))]

    for house_coord in houses:
        dist = []
        for landmark_coord in landmarks:
            curr_dist = geodesic(house_coord, landmark_coord).miles
            dist.append(curr_dist)
        



In [45]:
# Load the saved houses and the Texas landmark coordinates, then cut both down
# to small samples for trying out the distance computation.
#
# header=0 is needed whenever `names=` is used on a file that already starts
# with a header line; without it pandas keeps that line as a data row and the
# numeric columns are read as strings. 'houses.csv' is written with a header by
# the houses cell.
# NOTE(review): 'tx_coords.csv' is assumed to have a header line too (the
# previous `[1:]` slice on the landmark sample was removing it) — confirm.
house = pd.read_csv('houses.csv', header=0, names=['Address', 'Price', 'City', 'State', 'SqFt', 'PricePerSqft', 'Latitude', 'Longitude'])
house = house[house['State']=='TX']
landmark = pd.read_csv('tx_coords.csv', header=0, names=['Landmark', 'Name', 'Address', 'City', 'State', 'Latitude', 'Longitude'])
landmark = landmark.dropna()  # landmarks whose geocoding failed have no coordinates
house_sample = house.head(5).reset_index(drop=True)
# first five landmarks of every type
landmark_sample = landmark.groupby('Landmark').head(5).reset_index(drop=True)

In [54]:
# Try the distance computation on the two small samples; the bare expression
# on the last line displays the result as the cell output.
distances = test(house_sample, landmark_sample)
distances

0    [1.2120567758953e-311, 1.212056775421e-311, 1....
1    [3.873e-321, 3.937e-320, 3.725e-321, 9.027e-32...
2    [1.2120175286484e-311, 1.2120175288856e-311, 1...
3    [118.262785, 118.277756, 118.283209, 118.23708...
4    [1.212015601587e-311, 1.212015601587e-311, 1.2...
dtype: object