## Shelter Location Data Scrape

### Imports

In [1]:
import os
import glob
import numpy as np
from bs4 import BeautifulSoup
from lxml import html
import lxml
import re
import time
import pandas as pd
from selenium import webdriver
import requests
import geopandas
import geopandas.tools
from shapely.geometry import Point
import pickle

def find_hrefs_by_regex(bs, reg_exp):
    """Return all ``<a>`` elements of ``bs`` whose ``href`` matches ``reg_exp``.

    ``bs`` is any object exposing a BeautifulSoup-style ``find_all``;
    ``reg_exp`` is a regular-expression pattern handed to ``re.compile``.
    The result is whatever ``find_all`` returns.
    """
    href_pattern = re.compile(reg_exp)
    return bs.find_all("a", href=href_pattern)

### Scrape website

In [2]:
# Set up the browser: start a Chrome session through Selenium and load the
# San Francisco shelter listing page.
# Path of the chromedriver executable handed to Selenium below.
chrome_driver = "/Applications/chromedriver"
os.environ["webdriver.chrome.driver"] = chrome_driver
# `driver` is reused by the later cells that visit each detail page.
driver = webdriver.Chrome(chrome_driver)
base_url = 'http://www.shelterlistings.org/city/san_francisco-ca.html'
driver.get(base_url)

In [3]:
# Parse the HTML currently loaded in the browser with the lxml parser.
bs = BeautifulSoup(driver.page_source, 'lxml')

In [4]:
# Collect the href of every link pointing at a '/details/' page; building a
# set collapses links that appear more than once on the listing.
list_of_urls = find_hrefs_by_regex(bs, '/details/')
set_of_urls = {link['href'] for link in list_of_urls}

In [5]:
# Visit every detail page and collect the text of its 'address' element.
addresses = []

for url in set_of_urls:
    driver.get(url)
    # Random 4-6 second pause between page loads (randint's `high` is exclusive).
    time.sleep(np.random.randint(low=4, high=7))
    # Click the "seeaddress" button before reading the page; presumably this
    # is what reveals the address -- TODO confirm against the site.
    driver.find_element_by_xpath('//button[@id="seeaddress"]').click()
    time.sleep(2)
    bs = BeautifulSoup(driver.page_source, 'lxml')
    address_tag = bs.find(id='address')
    if address_tag is None:
        # No address element on this page: skip it explicitly instead of
        # hiding every possible error behind a bare `except:`.
        continue
    address_text = str(address_tag.text).strip()
    addresses.append(address_text)

In [6]:
# Scraping is finished: end the Selenium browser session.
driver.quit()

### Process Scraped Data

In [7]:
# Display the raw scraped address strings for inspection.
addresses

['4651 Mission St',
 '85 Second Street',
 '940 Washington Street',
 '1735 Mission St',
 '2100 Webster St #100',
 '201 8th Street',
 '703 Market Street',
 '1175 Howard St.',
 '180 Howard Street',
 '763 Jerrold Ave',
 '3543 18th St',
 '1021 Mission Street',
 '150 Golden Gate Ave.',
 '474 Valencia St',
 '172 6th Street',
 '260 Golden Gate Avenue',
 '350 Golden Gate Ave',
 '899 Guerrero Street',
 '1175 Howard St.',
 '49 Powell Street',
 '489 Clementina Street',
 '1815 Egbert Avenue',
 '146 Leavenworth St.',
 '965 Mission Street',
 '126 Hyde St',
 '1663 Mission Street, Suite 225',
 '125 Gilbert St',
 '525 5th Street',
 '201 8th St.',
 'C/o Marc Gold',
 '720 Market St Ste 500',
 '1167 Mission Street, 4th Floor',
 '938 Valencia St',
 '890 Hayes Street',
 '149 - 9th Street',
 '900 Pennsylvania Ave',
 '833 Market St',
 '701 Sutter Street, Suite 2 (administrative Office)',
 '260 Golden Gate Avenue',
 '164 6th Street',
 '2500 18th St',
 '164 6th Street',
 '146 Leavenworth Street',
 '350 Golden Ga

In [15]:
# De-duplicate the scraped strings and load them into a one-column DataFrame.
# NOTE(review): row labels follow the set's iteration order, which for strings
# may differ between interpreter runs; the hard-coded label drops in later
# cells depend on it -- confirm before re-running.
addresses_set = set(addresses)
address_df = pd.DataFrame(list(addresses_set))

In [16]:
# Name the single column holding the scraped street address.
address_df.columns = ['street']

In [20]:
# Remove the row labelled 129 in place.
# NOTE(review): the label was presumably chosen by inspecting the frame
# (a non-address entry?) -- verify it still points at the intended row.
address_df.drop([129], inplace=True)

In [23]:
# Remove the rows labelled 0, 123 and 17 in place.
# NOTE(review): hard-coded labels, presumably picked by inspection -- verify
# they still point at the intended rows before re-running.
address_df.drop([0, 123, 17], inplace=True)

In [27]:
# Build the geocoder query string for each street: spaces become '+', and the
# city/state suffix is appended.
address_df['street_formatted'] = [
    str(street).replace(' ', '+') + ',San+Francisco,+CA'
    for street in address_df['street']
]
address_df

Unnamed: 0,street,street_formatted
1,1249 Scott Street,"1249+Scott+Street,San+Francisco,+CA"
2,180 Howard Street,"180+Howard+Street,San+Francisco,+CA"
3,333 7th Street,"333+7th+Street,San+Francisco,+CA"
4,1815 Egbert Avenue,"1815+Egbert+Avenue,San+Francisco,+CA"
6,1275 Mission Street,"1275+Mission+Street,San+Francisco,+CA"
7,"730 Polk St., Fl. 3","730+Polk+St.,+Fl.+3,San+Francisco,+CA"
8,1899 Mission Street,"1899+Mission+Street,San+Francisco,+CA"
9,100 Masonic Avenue,"100+Masonic+Avenue,San+Francisco,+CA"
10,1021 Mission Street,"1021+Mission+Street,San+Francisco,+CA"
11,4439 Third St,"4439+Third+St,San+Francisco,+CA"


In [30]:
def get_lat_lng(input_addr):
    """Geocode an address through the Google Maps Geocoding JSON endpoint.

    Parameters
    ----------
    input_addr : str
        Address query with spaces already replaced by '+', e.g.
        '180+Howard+Street,San+Francisco,+CA'.

    Returns
    -------
    shapely.geometry.Point
        ``Point(lng, lat)`` of the first result, or ``Point(nan, nan)`` when
        the response holds no usable location.

    Raises
    ------
    requests.RequestException
        If the HTTP request fails or times out.
    ValueError
        If the response body is not valid JSON.

    NOTE(review): no API key is sent; the service may reject keyless requests,
    in which case every lookup yields ``Point(nan, nan)`` -- confirm.
    """
    from urllib.parse import quote

    # Percent-encode characters such as '#' ("2100 Webster St #100"): left raw,
    # '#' starts a URL fragment and silently truncates the query. '+' and ','
    # are kept so the pre-formatted separators survive unchanged.
    encoded_addr = quote(str(input_addr), safe='+,')
    url = 'https://maps.googleapis.com/maps/api/geocode/json?address={}'.format(encoded_addr)
    # A timeout keeps one stalled request from hanging a whole DataFrame.apply.
    r = requests.get(url, timeout=10)
    r_json = r.json()
    try:
        location = r_json['results'][0]['geometry']['location']
        lat = location['lat']
        lng = location['lng']
    except (KeyError, IndexError, TypeError):
        # Empty result list or unexpected payload shape: no coordinates.
        lat = np.nan
        lng = np.nan
    return Point(lng, lat)

# Geocode every formatted address (one HTTP request per row) and store the
# resulting Point in a 'geometry' column.
address_df['geometry'] = address_df['street_formatted'].apply(get_lat_lng)

In [49]:
# Keep a string rendering of each geometry so rows can be compared as text,
# then show the row count.
address_df['geo_text'] = address_df['geometry'].map(str)
len(address_df)

133

In [52]:
# Keep only the first row for each distinct geometry string.
is_repeat = address_df.duplicated(['geo_text'], keep='first')
address_df = address_df[~is_repeat]

In [54]:
# Wrap the geometry column in a GeoDataFrame, using it as the active geometry.
# NOTE(review): no CRS is set here; the points are built as (lng, lat) --
# confirm downstream code assigns the expected CRS.
shelter_locations = geopandas.GeoDataFrame(address_df['geometry'], 
                                          geometry='geometry')

In [57]:
# Remove the row labelled 20.
# NOTE(review): hard-coded label, presumably a bad or failed geocode found by
# inspection -- verify it still points at the intended row.
shelter_locations = shelter_locations.drop([20])

In [58]:
# Display the final shelter geometries.
shelter_locations

Unnamed: 0,geometry
1,POINT (-122.4375855 37.781571)
2,POINT (-122.3933527 37.7910388)
3,POINT (-122.4072079 37.7765976)
4,POINT (-122.3990467 37.7265131)
6,POINT (-122.4141134 37.7764266)
7,POINT (-122.419095 37.7838224)
8,POINT (-122.4196788 37.7668046)
9,POINT (-122.4471199 37.7806658)
10,POINT (-122.409086 37.7804935)
11,POINT (-122.3894788 37.7373504)


### Save Data

In [59]:
# Persist the GeoDataFrame; the context manager guarantees the file handle is
# flushed and closed even if pickling fails part-way.
with open('shelter_locations.pkl', 'wb') as pkl_file:
    pickle.dump(shelter_locations, pkl_file)