In [132]:
import pandas as pd
import numpy as np
import requests
import json
import re
import time
import dill

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

from api_keys import GOOGLE_API_KEY
from csv_pkl_sql import save_it, pkl_it

pd.options.mode.chained_assignment = None

# Get latitude and longitude for locations

In [8]:
# Load the cleaned location table produced by the previous notebook.
# Pickle/dill streams are binary data, so the file must be opened in 'rb'
# mode; text mode ('r') fails under Python 3.
# NOTE(review): unpickling executes arbitrary code -- only load trusted files.
with open('../pkl/00_cleaned_city_names.pkl', 'rb') as fh:
    location_key = dill.load(fh)

In [9]:
# Preview the first two rows of the loaded location table.
location_key.head(2)

Unnamed: 0,location,location_type,country,province,county,city
0,Argentina-Buenos_Aires,province,Argentina,Buenos Aires,,
1,Argentina-CABA,province,Argentina,Ciudad de Buenos Aires,,


## First try Google Maps API

In [10]:
def get_latitude_longitude(df_row):
    """Geocode one location row with the Google Maps Geocoding API.

    Parameters
    ----------
    df_row : pandas.Series
        A row holding a ``location_type`` field that names the column
        containing the place name, plus a ``country`` field.

    Returns
    -------
    pandas.Series
        Series indexed by ``lat`` and ``lng``. Both values are NaN when the
        row has no usable place name, the request fails, or the API returns
        no result.
    """
    not_found = pd.Series({'lat': np.nan, 'lng': np.nan})

    try:
        place = df_row[df_row.location_type]
        address = '{}, {}'.format(place.strip(), df_row.country.strip())
    except (AttributeError, KeyError, TypeError):
        # Missing (NaN) place name or location type: nothing to look up, and
        # no request is made, so there is nothing to throttle either.
        return not_found

    try:
        # Let requests build the query string so spaces, accents and
        # characters such as '&' or '#' are encoded correctly.
        response = requests.get(
            'https://maps.googleapis.com/maps/api/geocode/json',
            params={'address': address, 'key': GOOGLE_API_KEY},
            timeout=10,
        )
        lat_lng = json.loads(response.text)['results'][0]['geometry']['location']
        lat_lng_df = pd.Series({'lat': lat_lng['lat'], 'lng': lat_lng['lng']})
    except Exception:
        # Best effort: network errors, malformed JSON and empty result lists
        # all become NaN. KeyboardInterrupt is deliberately not caught so a
        # long run can still be stopped.
        lat_lng_df = not_found

    # Throttle to stay under the API rate limit.
    time.sleep(1)
    return lat_lng_df

In [11]:
# Geocode every row; the function is passed directly since it already takes a
# single row argument.
location_key[['latitude','longitude']] = location_key.apply(get_latitude_longitude, axis=1)

## Then scrape Google's first search hit
We tried to play nice. Now bring out the big guns for the non-matches.

In [129]:
def google_search_scrape(df_row, driver_list=None, page_load_wait=3, throttle_wait=5):
    """Scrape latitude/longitude strings for one row from a Google search page.

    Parameters
    ----------
    df_row : pandas.Series
        Row with ``location_type``, ``country``, the column named by
        ``location_type``, and the current ``latitude_s`` / ``longitude_s``.
    driver_list : list, optional
        Selenium drivers to choose from at random. When omitted, the
        module-level ``driver_list`` is looked up at call time (not at
        definition time, so this cell can run before the drivers exist).
    page_load_wait : float, optional
        Seconds to wait after loading the page before reading it.
    throttle_wait : float, optional
        Seconds to wait after reading the page, to space out requests.

    Returns
    -------
    pandas.Series
        Series indexed by ``latitude_s`` and ``longitude_s``. Existing values
        are returned unchanged; failed scrapes yield NaN for both.

    Raises
    ------
    ValueError
        If a scrape is needed but no drivers are available.
    """
    def _is_missing(value):
        # NaN (a float), None, or a blank string all mean "not scraped yet".
        if value is None or isinstance(value, float):
            return True
        return hasattr(value, 'strip') and not value.strip()

    if not (_is_missing(df_row.latitude_s) and _is_missing(df_row.longitude_s)):
        # Already have this data so return existing values
        return pd.Series({'latitude_s': df_row.latitude_s,
                          'longitude_s': df_row.longitude_s})

    if driver_list is None:
        driver_list = globals().get('driver_list')
    if not driver_list:
        raise ValueError('google_search_scrape needs a non-empty driver_list')

    # Randomized driver selected from a list where each is running through
    # different ports. Index into the list so the driver objects are never
    # coerced into a numpy array.
    driver = driver_list[np.random.randint(len(driver_list))]

    subtype = df_row.location_type
    second_str = df_row[subtype].replace(' ', '+')
    country_str = df_row.country.replace(' ', '+')
    total_str = country_str + '+' + second_str

    url = 'https://www.google.com/search?client=safari&rls=en&q={}&ie=UTF-8&oe=UTF-8#q={}+latitude+longitude'.format(total_str, total_str)

    text_array = [np.nan, np.nan]
    try:
        driver.get(url)
        time.sleep(page_load_wait)
        text = driver.find_element_by_class_name('kp-header').text
        time.sleep(throttle_wait)
        parts = re.split(r""", """, text)
        if len(parts) < 2:
            # Header found but it is not a "lat, lng" pair.
            raise ValueError('unexpected header text: {!r}'.format(text))
        text_array = parts
    except Exception:
        # Log the search string of the location that could not be scraped.
        print(total_str)

    return pd.Series({'latitude_s': text_array[0], 'longitude_s': text_array[1]})

In [97]:
# Rows the Geocoding API could not resolve. Take an explicit copy so the new
# columns are added to an independent frame rather than to a slice of
# location_key.
location_key_scrape = location_key[(location_key.latitude.isnull()|location_key.longitude.isnull())].copy()

# NaN (a float) is the "not scraped yet" marker that google_search_scrape
# checks for; an empty string would make it treat every row as already done.
location_key_scrape['latitude_s'] = np.nan
location_key_scrape['longitude_s'] = np.nan

In [111]:
# location_key_scrape[['latitude_s','longitude_s']] = (scrape_results.loc[scrape_results!='']
#                                                      .str.split(r""", """, expand=True)
#                                                      )

In [131]:
# ssh -ND 8081 nestanmr
# ssh -ND 8082 backupsy
# ssh -ND 8083 aws
# ssh -ND 8084 vultr

# One local SOCKS5 port per SSH tunnel listed in the comments above.
port_list = [8081, 8082, 8083, 8084]

# Location of the chromedriver binary on the machine running this notebook.
CHROMEDRIVER_PATH = '/Volumes/Files/homebrew/bin/chromedriver'


def _proxied_chrome_options(port):
    """Return Chrome options routing traffic through the local SOCKS5 proxy on `port`."""
    # add_argument() mutates the Options object and returns None, so the
    # object has to be kept in a variable; chaining the call onto Options()
    # would hand None to webdriver.Chrome and silently disable the proxy.
    options = Options()
    options.add_argument("--proxy-server=socks5://127.0.0.1:" + str(port))
    return options


chrome_options_list = [_proxied_chrome_options(port) for port in port_list]
chrome_options1, chrome_options2, chrome_options3, chrome_options4 = chrome_options_list

# One browser per proxy port; the SSH tunnels must be up before this runs.
driver_list = [webdriver.Chrome(CHROMEDRIVER_PATH, chrome_options=options)
               for options in chrome_options_list]
driver1, driver2, driver3, driver4 = driver_list

location_key_scrape[['latitude_s','longitude_s']] = location_key_scrape.apply(lambda x: google_search_scrape(x, driver_list), axis=1)

Colombia+Puerto+Rico
Colombia+Colon
Colombia+Magsi
Colombia+La+Playa
Colombia+Salazar
Colombia+Santiago
Colombia+Toledo
Colombia+Albania
Colombia+Cabrera
Colombia+Cerrito
Colombia+Galan
Colombia+Gsepsa
Colombia+Jordan
Colombia+La+Paz
Colombia+Alvarado
Colombia+Ortega
Colombia+San+Antonio
Colombia+Suarez
Colombia+Dagua
Colombia+Palmira
Colombia+Trinidad
Colombia+Colon
Colombia+Leguizamo
Colombia+Santiago
Colombia+Mapiripana
Colombia+Desconocido
Colombia+Usaquen+Los+Cedros
Colombia+Usaquen+Santa+Barbara
Colombia+Barrios+Unidos+Los+Andes
Colombia+Barrios+Unidos+12+De+Octubre
Colombia+San+Cristobal+Sociego
Colombia+Puente+Aranda+San+Rafael
Colombia+Kennedy+Americas
Colombia+Kennedy+Kennedy+Central
Colombia+Tunjuelito+Tunjuelito
Colombia+Ciudad+Bolivar+San+Francisco
Colombia+Bosa+Bosa+Central
Colombia+Bosa+Tintal+Sur
Colombia+La+Candelaria+La+Candelaria
Colombia+Chapinero+Chico+Lago
Colombia+Teusaquillo+Teusaquillo
Colombia+Teusaquillo+La+Esmeralda
Colombia+Teusaquillo+Quinta+Paredes
Colomb

In [149]:
# Save a temporary checkpoint of this dataframe
# Save a temporary checkpoint of both dataframes (scrape subset first, then
# the main table).
for checkpoint_frame, checkpoint_name in (
        (location_key_scrape, '01_latitude_longitude_checkpoint_location_key_scrape'),
        (location_key, '01_latitude_longitude_checkpoint_location_key'),
):
    pkl_it(checkpoint_frame, checkpoint_name)

In [195]:
# Convert the latitude and longitude strings to floats

def _coordinate_to_float(coord_text, negative_hemisphere):
    """Convert stripped coordinate strings to signed floats.

    `coord_text` is a Series of strings whose first numeric run is the
    magnitude and whose last character is the hemisphere letter. Entries
    ending in `negative_hemisphere` ('S' or 'W') are negated.
    """
    # expand=False returns a Series (not a one-column DataFrame) on every
    # pandas version that supports the argument.
    magnitude = (coord_text
                 .str.extract(r"""([0-9.-]+)""", expand=False)
                 .astype(float))
    flip = (coord_text.str[-1] == negative_hemisphere)
    magnitude.loc[flip] *= -1.0
    return magnitude


location_key_scrape['latitude_s'] = location_key_scrape.latitude_s.str.strip()
location_key_scrape['longitude_s'] = location_key_scrape.longitude_s.str.strip()

# Southern latitudes and western longitudes are negative.
location_key_scrape['latitude'] = _coordinate_to_float(location_key_scrape.latitude_s, 'S')
location_key_scrape['longitude'] = _coordinate_to_float(location_key_scrape.longitude_s, 'W')



In [196]:
# Copy the scraped coordinates into the matching rows of the main dataframe,
# one column at a time (rows are matched on the shared index).
for coordinate_column in ('latitude', 'longitude'):
    location_key.loc[location_key_scrape.index, coordinate_column] = location_key_scrape[coordinate_column]

In [197]:
# Second checkpoint of location_key, taken after the scraped coordinates were
# copied into it (pkl_it is the project helper imported from csv_pkl_sql).
pkl_it(location_key, '01_latitude_longitude_checkpoint_location_key_2')

In [198]:
# Keep only the location identifier and its coordinates for export.
output_columns = ['location','latitude','longitude']
location_key_out = location_key[output_columns]

In [199]:
# Persist the location/latitude/longitude table with the project helper
# save_it (imported from csv_pkl_sql; output format is defined there).
save_it(location_key_out, '01_latitude_longitude_google')

In [200]:
# (total number of locations, number still lacking a latitude)
len(location_key_out['latitude']), location_key_out['latitude'].isnull().sum()

(1606, 104)