This notebook was used to build geo database. A csv from the allCountries1500.txt (geographical info from places with more than 1500 people) was downloaded from http://www.geonames.org/ and 4 dictionries are built:
        

geonameid_info_dict = id vs all info related to id (country name, population, etc.)
name_geonameidLIST_dict = oficial name of place vs ids of all places with that name
asciiname_geonameidLIST_dict = ascii name of place vs ids of all places with that name
altnames_geonameidLIST_dict = alternative names of places vs ids of all places with that name

In [1]:
import os
from collections import namedtuple, Counter, OrderedDict
import re
import datetime
import pickle
from itertools import groupby

In [2]:
geonameid_info_dict = dict()
sep='\t'
encoding='utf-8'
filename = '../geonames/allCountries1500.txt'
with open(filename, 'rb') as f:    
    for line in f:
        line = map(lambda x: x.decode(encoding), line.replace('\n', '').split(sep))
        fields = ['geonameid', 'name', 'asciiname', 'alternatenames', 'latitude', 'longitude', 
                  'feature class', 'feature code', 'country code', 'cc2', 'admin1 code', 
                  'admin2 code', 'admin3 code', 'admin4 code', 'population', 'elevation', 
                  'dem', 'timezone', 'modification date']
        insert_dict = {}
        geonameid = int(line[0])
        if not line[1]=='': insert_dict[fields[1]] = line[1]
        if not line[2]=='': insert_dict[fields[2]] = line[2]
        if line[3]=='':
            insert_dict[fields[3]] = []
        else:
            insert_dict[fields[3]] = line[3].split(',')
        if not line[4]=='': insert_dict[fields[4]] = float(line[4])
        if not line[5]=='': insert_dict[fields[5]] = float(line[5])
        if not line[6]=='': insert_dict[fields[6]] = line[6]
        if not line[7]=='': insert_dict[fields[7]] = line[7]
        if not line[8]=='': insert_dict[fields[8]] = line[8]
        if not line[9]=='': insert_dict[fields[9]] = line[9].split(',')
        if not line[10]=='': insert_dict[fields[10]] = line[10]
        if not line[11]=='': insert_dict[fields[11]] = line[11]
        if not line[12]=='': insert_dict[fields[12]] = line[12]
        if not line[13]=='': insert_dict[fields[13]] = line[13]
        if not line[14]=='': insert_dict[fields[14]] = int(line[14])
        if not line[15]=='': insert_dict[fields[15]] = int(line[15])
        if not line[16]=='': insert_dict[fields[16]] = int(line[16])
        if not line[17]=='': insert_dict[fields[17]] = line[17]
        if not line[18]=='': insert_dict[fields[18]] = datetime.datetime(*map(lambda x: int(x), line[18].split('-')))
        geonameid_info_dict[geonameid] = insert_dict

In [3]:
print("There are {} registries in dictionary".format(len(geonameid_info_dict.keys())))

There are 47151 registries in dictionary


# Create 3 search name dictionary: name, asciiname, alternative names

name_geonameidLIST_dict is a dictionary with key = name of geographycal point and value = list of geonameids

In [9]:
gid_n_list = [(x, geonameid_info_dict[x]['name'].lower()) for x in  geonameid_info_dict.keys()]
gid_n_list.sort(key = lambda s: s[1])

name_geonameidLIST_dict = dict()
for key,valuesiter in groupby(gid_n_list, key=lambda s: s[1]):
    name_geonameidLIST_dict[key] = [v[0] for v in valuesiter]

asciiname_geonameidLIST_dict is a dictionary with key = ascii name of geographycal point and value = list of geonameids

In [10]:
gid_n_list = [(x, geonameid_info_dict[x]['asciiname'].lower()) for x in  geonameid_info_dict.keys()]
gid_n_list.sort(key = lambda s: s[1])

asciiname_geonameidLIST_dict = dict()
for key,valuesiter in groupby(gid_n_list, key=lambda s: s[1]):
    asciiname_geonameidLIST_dict[key] = [v[0] for v in valuesiter]

altnames_geonameidLIST_dict is a dictionary with key = alternative name of geographycal point and value = list of geonameids

In [11]:
gid_n_list = []
for key in geonameid_info_dict.keys():
    for altname in geonameid_info_dict[key]['alternatenames']:
        gid_n_list += [(key, altname.lower())]

gid_n_list.sort(key = lambda s: s[1])

altnames_geonameidLIST_dict = dict()
for key,valuesiter in groupby(gid_n_list, key=lambda s: s[1]):
    altnames_geonameidLIST_dict[key] = [v[0] for v in valuesiter]

In [13]:
print("there are {}, {}, {} items in name, asciiname and altnames dictionaries".format(len(name_geonameidLIST_dict.keys()), 
                                                                                     len(asciiname_geonameidLIST_dict.keys()), 
                                                                                     len(geonameid_info_dict.keys())))


there are 39092, 38911, 47151 items in name, asciiname and altnames dictionaries


Finally, lets save all dictionaries to file in pickle format

In [14]:
pickle.dump(geonameid_info_dict, open('../geonames/geonameid_info_dict.p','w'))
pickle.dump(name_geonameidLIST_dict, open('../geonames/name_geonameidLIST_dict.p','w'))
pickle.dump(asciiname_geonameidLIST_dict, open('../geonames/asciiname_geonameidLIST_dict.p','w'))
pickle.dump(altnames_geonameidLIST_dict, open('../geonames/altnames_geonameidLIST_dict.p','w'))

In [None]:
#To load!
geonameid_info_dict = pickle.load(open('../geonames/geonameid_info_dict.p'))
name_geonameidLIST_dict = pickle.dump(open('../geonames/name_geonameidLIST_dict.p','w'))
asciiname_geonameidLIST_dict = pickle.dump(open('../geonames/asciiname_geonameidLIST_dict.p','w'))
altnames_geonameidLIST_dict = pickle.dump(open('../geonames/altnames_geonameidLIST_dict.p','w'))