In [1]:
import numpy as np
import pandas as pd

from geopy.geocoders import Nominatim, Yandex
from geopy.exc import GeocoderServiceError, GeocoderTimedOut

In [2]:
def update_locations_geodata(geolocator, unique_locations, locations_geodata, bad_locations):
    for city in unique_locations:
        if (city in locations_geodata) or (city in bad_locations):
            continue

        try:
            location = geolocator.geocode(city)
        except GeocoderServiceError as e:
            print('GeocoderServiceError: {}'.format(e))
            break
        except GeocoderTimedOut as e:
            print('GeocoderTimedOut: {}'.format(e))
            break

        if location is None:
            bad_locations.append(city)
        else:
            locations_geodata[city] = location

In [3]:
user_locations = pd.read_csv('./../user_id_to_from.csv')
user_locations.rename({'from': 'city'}, axis=1, inplace=True)
user_locations.head()

Unnamed: 0,id,city
0,UE7T3UC1M,Москва
1,UE61U6DCL,Москва
2,UEF068197,Moscow
3,UE7JRC006,Краснодар
4,UE7M36F7Y,Samara


In [4]:
user_locations['city'].unique().shape

(2101,)

In [5]:
locations_geodata = dict()
bad_locations = []

In [44]:
# execute until get all data (remember to geolocator limits!)

#geolocator = Nominatim(user_agent='aborisihin')
geolocator = Yandex()

update_locations_geodata(geolocator, user_locations['city'].unique(), locations_geodata, bad_locations)

print('locations_geodata: {}, bad_locations: {}'.format(len(locations_geodata), len(bad_locations)))

complete_flag = (len(locations_geodata) + len(bad_locations) == len(user_locations['city'].unique()))
print('complete flag: {}'.format(complete_flag))

locations_geodata: 2041, bad_locations: 60
complete flag: True


In [32]:
def geodata_value_osm(location_string, key):
    if location_string in locations_geodata:
        return locations_geodata[location_string].raw[key]
    else:
        return None
    
def geodata_value_yandex(location_string, key):
    if location_string in locations_geodata:
        if (key == 'text') or (key == 'kind'):
            return locations_geodata[location_string].raw['metaDataProperty']['GeocoderMetaData'][key]
        elif key == 'lat':
            return locations_geodata[location_string].raw['Point']['pos'].split(' ')[0]
        elif key == 'lon':
            return locations_geodata[location_string].raw['Point']['pos'].split(' ')[1]
        
    return None

In [33]:
# OpenStreetMaps
# user_locations['geolocation_name'] = user_locations['city'].apply(lambda x: geodata_value_osm(x, 'display_name'))
# user_locations['geolocation_type'] = user_locations['city'].apply(lambda x: geodata_value_osm(x, 'type'))
# user_locations['geolocation_lat'] = user_locations['city'].apply(lambda x: geodata_value_osm(x, 'lat'))
# user_locations['geolocation_lon'] = user_locations['city'].apply(lambda x: geodata_value_osm(x, 'lon'))

# Yandex
user_locations['geolocation_name'] = user_locations['city'].apply(lambda x: geodata_value_yandex(x, 'text'))
user_locations['geolocation_type'] = user_locations['city'].apply(lambda x: geodata_value_yandex(x, 'kind'))
user_locations['geolocation_lat'] = user_locations['city'].apply(lambda x: geodata_value_yandex(x, 'lat'))
user_locations['geolocation_lon'] = user_locations['city'].apply(lambda x: geodata_value_yandex(x, 'lon'))

In [39]:
user_locations.head()

Unnamed: 0,id,city,geolocation_name,geolocation_type,geolocation_lat,geolocation_lon
0,UE7T3UC1M,Москва,"Россия, Москва",province,37.622504,55.753215
1,UE61U6DCL,Москва,"Россия, Москва",province,37.622504,55.753215
2,UEF068197,Moscow,"Россия, Москва",locality,37.617635,55.755814
3,UE7JRC006,Краснодар,"Россия, Краснодар",locality,38.975313,45.03547
4,UE7M36F7Y,Samara,"Россия, Самара",locality,50.101783,53.195538


In [42]:
#CSV_PATH = './user_locations_osm.csv'
CSV_PATH = './user_locations_yandex.csv'

user_locations.to_csv(CSV_PATH, index=False)