In [1]:
import os
import warnings

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# import shapely
# from shapely.geometry import Point

warnings.filterwarnings('ignore')

In [2]:
# Column names assigned to the schools CSV, in the order passed to read_csv.
cols = [
    'Jurisdicción', 'CUE Anexo', 'Nombre', 'Sector', 'Estado', 'Ámbito',
    'Domicilio', 'CP', 'Teléfono', 'Código Localidad', 'Localidad',
    'Departamento', 'E-mail',
    'Ed. Común', 'Ed. Especial', 'Ed. de Jóvenes y Adultos', 'Ed. Artística',
    'Ed. Hospitalaria Domiciliaria', 'Ed. Intercultural Bilingüe',
    'Ed. Contexto de Encierro',
    'Jardín maternal', 'Jardín de infantes', 'Primaria', 'Secundaria',
    'Secundaria Técnica (INET)', 'Superior no Universitario',
    'Superior No Universitario (INET)',
]

# Missing cells become False and cells holding 'X' become True (applied to
# every column of the frame, not only the level/modality flags).
escuelas = (
    pd.read_csv('../../datos/escuelas_arg.csv', names=cols)
    .fillna(False)
    .replace('X', True)
)

# dpto_link: the locality code left-padded with zeros to 8 characters, cut to
# its first 5 characters. The column name is spelled exactly as in `cols`
# instead of as escaped UTF-8 bytes, so the lookup does not depend on
# Python 2 byte-string semantics.
escuelas['dpto_link'] = escuelas['Código Localidad'].astype(str).str.zfill(8).str[:5]


In [3]:
# Census-tract table for AMBA, read with every column as text so that the
# 'prov' and 'depto' codes can be concatenated as strings.
radios_censales_AMBA = pd.read_csv('../../datos/AMBA_datos', dtype=object)
dpto_links_AMBA = (radios_censales_AMBA['prov'] + radios_censales_AMBA['depto']).unique()

# Schools whose dpto_link matches an AMBA prov+depto code, followed by every
# school whose jurisdiction is 'Ciudad de Buenos Aires'. The column name is
# written as in `cols` rather than as escaped UTF-8 bytes.
en_dpto_AMBA = escuelas['dpto_link'].isin(dpto_links_AMBA)
en_CABA = escuelas['Jurisdicción'] == 'Ciudad de Buenos Aires'
escuelas_AMBA = pd.concat([escuelas.loc[en_dpto_AMBA], escuelas.loc[en_CABA]])

# Keep state-run secondary schools. reset_index returns a new frame here
# instead of mutating a .loc slice in place, so later column assignments
# operate on an independent object with a fresh 0..n-1 index.
es_secundaria_estatal = escuelas_AMBA['Secundaria'] & (escuelas_AMBA[u'Sector'] == 'Estatal')
escuelas_AMBA_secundaria_estatal = escuelas_AMBA.loc[es_secundaria_estatal].reset_index(drop=True)

In [4]:
# Full address text: street address, locality, department and jurisdiction
# joined with ', ' and suffixed with ', Argentina'. Every part is cast to str
# first. The jurisdiction column is named as in `cols` rather than through
# escaped UTF-8 bytes.
escuelas_AMBA_secundaria_estatal['Address'] = (
    escuelas_AMBA_secundaria_estatal['Domicilio'].astype(str) + ', ' +
    escuelas_AMBA_secundaria_estatal['Localidad'].astype(str) + ', ' +
    escuelas_AMBA_secundaria_estatal['Departamento'].astype(str) + ', ' +
    escuelas_AMBA_secundaria_estatal['Jurisdicción'].astype(str) + ', Argentina'
)

In [5]:
import re

def filtrar_entre_calles(string):
    """Remove an 'e/ ...' cross-street fragment from an address.

    The text from the first 'e/' (matched case-insensitively) up to, but not
    including, the first digit that follows it is dropped.

    Parameters
    ----------
    string : str
        Address text.

    Returns
    -------
    str
        The address without the fragment. The input is returned unchanged
        when it has no 'e/' or when no digit follows the 'e/'.
    """
    s = string.lower()

    start = s.find('e/')
    if start == -1:
        return string

    # Look for the digit only after the marker. Searching from the beginning
    # of the string would pick up a digit located before 'e/' and the slices
    # below would then overlap, duplicating part of the address.
    m = re.compile(r"\d").search(s, start)
    if m is None:
        return string

    return string[:start] + string[m.start():]
    
def filtrar_barrio(string):
    """Trim the text after the street number and drop the second address field.

    The input is treated as comma-separated text. Within the part before the
    first comma:

    * if it contains 's/n' (any case), everything from the last 's/n' onwards
      is removed, the 's/n' itself included;
    * otherwise everything after the last digit is removed.

    The kept prefix is then joined with the tail of the string starting at
    its third comma counted from the end.

    Parameters
    ----------
    string : str
        Address text.

    Returns
    -------
    str
        The shortened address. The input is returned unchanged when it has
        fewer than three commas, or when the first part has neither 's/n'
        nor a digit.
    """
    comas = [m.start() for m in re.finditer(',', string)]
    if len(comas) < 3:
        # Not enough fields to locate the tail; leave the address as it is
        # instead of failing with an IndexError.
        return string

    coma_partido_jurisdiccion = comas[-3]
    coma_direccion = comas[0]

    # Work on the reversed first field so that the first match found is the
    # last occurrence in the original text.
    s = string[:coma_direccion][::-1]
    s_lower = s.lower()

    if "n/s" in s_lower:
        # 'n/s' is 's/n' reversed; `cut` ends up at the start of that 's/n'
        # in the original orientation.
        start = s_lower.index('n/s')
        cut = len(s) - len('n/s') - start
    else:
        m = re.search(r"\d", s)
        if m is None:
            return string
        # One position past the last digit in the original orientation.
        cut = len(s) - m.start(0)

    return string[:cut] + string[coma_partido_jurisdiccion:]

# Two progressively simplified variants of the address:
#   Address_2 = Address with the 'e/ ...' fragment filtered out
#   Address_3 = Address_2 passed through filtrar_barrio
direcciones_sin_entre_calles = escuelas_AMBA_secundaria_estatal['Address'].apply(filtrar_entre_calles)
escuelas_AMBA_secundaria_estatal['Address_2'] = direcciones_sin_entre_calles
escuelas_AMBA_secundaria_estatal['Address_3'] = escuelas_AMBA_secundaria_estatal['Address_2'].apply(filtrar_barrio)

# Save the table, including the three address columns, without the index.
escuelas_AMBA_secundaria_estatal.to_csv(
    '../../datos/escuelas_AMBA_secundaria_estatal.csv',
    index = False,
)

In [6]:
import json
import time
import urllib
import urllib2

def geolocate(inp, API_key = None, BACKOFF_TIME = 30, MAX_RETRIES = 3):
    """Geocode an address through the Google Maps Geocoding web service.

    Parameters
    ----------
    inp :
        Address to look up; formatted with "%s" before being URL-encoded.
    API_key : str or None
        Google Maps API key. Left out of the request when None.
        See https://developers.google.com/maps/documentation/timezone/get-api-key
    BACKOFF_TIME : number
        Minutes to sleep before retrying when the service answers
        OVER_QUERY_LIMIT.
    MAX_RETRIES : int
        Number of additional attempts made after a retryable failure
        (IOError while fetching, UNKNOWN_ERROR or OVER_QUERY_LIMIT status).

    Returns
    -------
    dict or None
        The first element of the response's 'results' list when the status
        is 'OK'. None when the status cannot be fixed by retrying or when
        all attempts have been used.
    """
    base_url = 'https://maps.googleapis.com/maps/api/geocode/json'

    # This joins the parts of the URL together into one string.
    params = {'address': "%s" % (inp)}
    if API_key is not None:
        params['key'] = API_key
    url = base_url + '?' + urllib.urlencode(params)

    retry_delay = 1  # seconds; doubled after every transient failure

    for attempt in range(MAX_RETRIES + 1):
        last_attempt = (attempt == MAX_RETRIES)

        try:
            # Get the API response.
            response = str(urllib2.urlopen(url).read())
        except IOError:
            pass  # Transient network problem: fall through to the retry delay.
        else:
            # If we didn't get an IOError then parse the result. Literal
            # backslash-n sequences are stripped first, as before.
            result = json.loads(response.replace('\\n', ''))
            status = result['status']

            if status == 'OK':
                return result['results'][0]

            # This check has to come before the generic "not retryable" test
            # below, otherwise it can never be reached.
            if status == 'OVER_QUERY_LIMIT':
                if last_attempt:
                    return None
                print("Hit Query Limit! Backing off for "+str(BACKOFF_TIME)+" minutes...")
                time.sleep(BACKOFF_TIME * 60)
                continue

            if status != 'UNKNOWN_ERROR':
                # Many API errors cannot be fixed by a retry, e.g.
                # INVALID_REQUEST or ZERO_RESULTS. There is no point retrying
                # these requests.
                return None

        if not last_attempt:
            time.sleep(retry_delay)
            retry_delay *= 2

    return None

def set_geolocation_values(df, loc, idx = None):
    """Write one geocoding result into a row of ``df`` (modified in place).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that receives the values; the 'lng', 'lat' and 'id' columns
        are created if they do not exist yet.
    loc : dict
        Geocoding result; ``loc['geometry']['location']['lng']``,
        ``loc['geometry']['location']['lat']`` and ``loc['place_id']`` are
        read.
    idx : index label, optional
        Row label to write to. When omitted, the module-level variable ``i``
        is used, which is what the existing two-argument calls inside the
        ``for i, row in ...`` loop rely on.
    """
    if idx is None:
        # Backward-compatible fallback to the global loop variable.
        idx = i

    location = loc['geometry']['location']

    # .at is the label-based scalar setter; DataFrame.set_value was
    # deprecated and later removed from pandas.
    df.at[idx, 'lng'] = location['lng']
    df.at[idx, 'lat'] = location['lat']
    df.at[idx, 'id'] = loc['place_id']

In [7]:
dataframe = escuelas_AMBA_secundaria_estatal
col, col_2, col_3 = 'Address', 'Address_2', 'Address_3'

# The API key is read from the environment so that it is never stored in the
# notebook. Export GOOGLE_MAPS_API_KEY before starting the kernel.
API_key = os.environ.get('GOOGLE_MAPS_API_KEY')
if not API_key:
    raise RuntimeError('Set the GOOGLE_MAPS_API_KEY environment variable before running this cell.')

# Try the full address first, then the two simplified variants, and keep the
# first one that geocodes. The loop variable must stay named `i`:
# set_geolocation_values is called without a row label and reads it.
for i, row in dataframe.iterrows():
    for address_col in (col, col_2, col_3):
        loc = geolocate(row[address_col], API_key)
        if loc:
            set_geolocation_values(dataframe, loc)
            break

    if i%50 == 0:
        print('processed row '+str(i))

dataframe.to_csv('../../datos/esc_sec_AMBA_geoloc.csv', index = False, encoding = 'utf8')


processed row 0
processed row 50
processed row 100
processed row 150
processed row 200
processed row 250
processed row 300
processed row 350
processed row 400
processed row 450
processed row 500
processed row 550
processed row 600
processed row 650
processed row 700
processed row 750
processed row 800
processed row 850
processed row 900
processed row 950
processed row 1000
processed row 1050
processed row 1100
processed row 1150
processed row 1200
processed row 1250
processed row 1300
processed row 1350
processed row 1400
processed row 1450
processed row 1500
processed row 1550
processed row 1600
processed row 1650
processed row 1700
processed row 1750
processed row 1800
processed row 1850
processed row 1900
processed row 1950
processed row 2000
processed row 2050


In [8]:
# # dataframe.lat

# for i, row in dataframe.iterrows():
#     print row[col]
#     loc = geolocate(row[col], API_key=API_key)
# dataframe

In [9]:
# loc
# set_geolocation_values(dataframe, loc)

In [10]:
# dataframe

In [11]:
# dataframe = escuelas_AMBA_secundaria_estatal
# col, col_2, col_3 = 'Address', 'Address_2', 'Address_3'

# for i, row in dataframe.iterrows():
#     loc = geolocate(row[col])
#     if loc:
#         set_geolocation_values(dataframe, loc)
#     else:
#         loc = geolocate(row[col_2])
#         if loc:
#             set_geolocation_values(dataframe, loc)
#         else:
#             loc = geolocate(row[col_3])
#             if loc:
#                 set_geolocation_values(dataframe, loc)
             
#     if i%10 == 0:
#         print 'processed row '+str(i)
        
# dataframe.to_csv('esc_sec_AMBA_geoloc.csv', index = False, encoding = 'utf8')
# # dataframe = dataframe.dropna()