In [1]:
import re
import pandas as pd
from geonamescache import GeonamesCache
from unidecode import unidecode
from operator import itemgetter

headlines = './data/headlines.txt'

class LocationExtractor:
    """Class to pull City and Country names from a string"""
    
    geocache = GeonamesCache()
    city_regex = None
    country_regex = None
    countries = {}
    country_codes = {}
    
    def __init__(self):
        gc = GeonamesCache()
        self.countries = self.geocache.get_countries_by_names()
        cities = self.geocache.get_cities()
        
        country_names = []
        city_names = []
        
        for c in cities.values():
            city_names.append( r"\b" + unidecode( c['name'].strip() ).replace(' ', r"\s" ) + r"\b" )

        for cname in self.countries.keys():
            country_names.append( r"\b" + unidecode( cname.strip() ).replace( ' ', r"\s" ) + r"\b" )
            self.country_codes[self.countries[cname]['iso']] = cname
            
        self.city_regex = self.regex_from_array( city_names )
        self.country_regex = self.regex_from_array( country_names )
        #print( f"Regex has len {len(str(self.city_regex))} and looks like\n\n{self.city_regex}" )
    
    def find_cities( self, city_str ):
        found_cities = []
        matches = self.city_regex.search( city_str )
        if( matches ):
            for c in matches.groups():
                for x in self.geocache.get_cities_by_name( c ):
                    for a, b in x.items():
                        found_cities.append( b )
        return found_cities
    
    def find_countries( self, country_str ):
        """Searches a string for names of countries and returns the name"""
        found_countries = []
        matches = self.country_regex.search( country_str )
        if( matches ):
            for c in matches.groups():
                found_countries.append( self.countries[c] )
        return found_countries
            
    def regex_from_array( self, rarray ):
        # ensure unique & sort by length (so Witchita Falls is tried before Wichita)
        rarray = sorted( list( set( rarray ) ), key=len)
        rarray.reverse()

        regex = "("
        regex += "|".join( rarray )
        regex += ')'
        rc = re.compile( regex, re.IGNORECASE )
        return rc
    
    def select_key_city( self, cities ):
        """Determines most important from a list of cities. 
        Prioritizes capital cities 1st, then greatest population"""
        prospects = []
        # See which are capitals
        for c in cities:
            country = self.countries[ self.country_codes[ c['countrycode'] ] ]
            if( c['name'] == country['capital']):
                prospects.append( c )
        if( len( prospects ) > 0 ):
            cities = prospects
        
        if( len( cities ) > 1):
            # sort by pop, but later
            cities = sorted( cities, key=itemgetter('population') )
        key = cities.pop()
        return key
    
    def find_location(self, sentence ):
        """Given a string return a tuple (string, city, country, latitude, longitude) of the location in it"""
        retval = (sentence, None, None, None, None)
        key_city = None
        
        countries = self.find_countries( sentence )
        cities = self.find_cities( sentence )
        select_cities = []
        if( countries ):
            if( len(cities) < 1 ):
                #print( f"No city found for {countries}")
                caps = [c['capital'] for c in countries]
                for c in caps:
                    #FIXME - breaks on multiple countries
                    cities = self.find_cities( c )
            codes = [c['iso'] for c in countries]
            select_cities = [ct for ct in cities if ct['countrycode'] in codes ]
            #print( f"Cities in {codes}:\n{select_cities}")
        elif(cities):
            #city, but no country found
            select_cities = cities
            
        if( len( select_cities) == 1 ):
            key_city = select_cities.pop()
        elif( len(select_cities) > 1):
            # need to find the actual real city
            #print( f"{len(select_cities)} cities found!")
            key_city = self.select_key_city( select_cities )
        
        if( key_city):
             retval = (sentence, key_city['name'], key_city['countrycode'], key_city['latitude'], key_city['longitude'])
        
        return retval


In [2]:
le = LocationExtractor()
rows = []
count = 0
found = 0

In [3]:
for line in open( headlines ):
    tup = le.find_location( unidecode( line.strip() ) )
    if( tup[1] ):
        found += 1
    count += 1
    rows.append( tup )

In [4]:
df = pd.DataFrame( rows, columns = ( "headline", "city", "country", "latitude", "longitude" ) )
df.head(30)

Unnamed: 0,headline,city,country,latitude,longitude
0,Zika Outbreak Hits Miami,Miami,US,25.77427,-80.19366
1,Could Zika Reach New York City?,New York City,US,40.71427,-74.00597
2,First Case of Zika in Miami Beach,,,,
3,"Mystery Virus Spreads in Recife, Brazil",Recife,BR,-8.05389,-34.88111
4,Dallas man comes down with case of Zika,Dallas,US,32.78306,-96.80667
5,Trinidad confirms first Zika case,Trinidad,BO,-14.83333,-64.9
6,Zika Concerns are Spreading in Houston,Houston,US,29.76328,-95.36327
7,Geneve Scientists Battle to Find Cure,,,,
8,The CDC in Atlanta is Growing Worried,Atlanta,US,33.749,-84.38798
9,Zika Infested Monkeys in Sao Paulo,,,,


In [5]:
# need some dataframe metrics in here
print( f"Found {found} cities in {count} rows")

Found 502 cities in 650 rows
