In [1]:
import re
import pandas as pd
from geonamescache import GeonamesCache
from unidecode import unidecode

headlines = './data/headlines.txt'

class LocationExtractor:
    """Class to pull City and Country names from a string"""
    
    geocache = GeonamesCache()
    city_regex = None
    country_regex = None
    countries = {}
    
    def __init__(self):
        gc = GeonamesCache()
        self.countries = self.geocache.get_countries_by_names()
        cities = self.geocache.get_cities()
        
        country_names = []
        city_names = []
        
        for c in cities.values():
            city_names.append( r"\b" + unidecode( c['name'] + r"\b" ) )

        for cname in self.countries.keys():
            country_names.append( r"\b" + unidecode( cname ).replace( ' ', r"\s" ) + r"\b" )
            
        self.city_regex = self.regex_from_array( city_names )
        self.country_regex = self.regex_from_array( country_names )
    
    def find_cities( self, city_str ):
        found_cities = []
        matches = self.city_regex.search( city_str )
        if( matches ):
            for c in matches.groups():
                for x in self.geocache.get_cities_by_name( c ):
                    for a, b in x.items():
                        found_cities.append( b )
        return found_cities
    
    def find_countries( self, country_str ):
        """Searches a string for names of countries and returns the name"""
        found_countries = []
        matches = self.country_regex.search( country_str )
        if( matches ):
            for c in matches.groups():
                found_countries.append( self.countries[c] )
        return found_countries
            
    def regex_from_array( self, rarray ):
        # ensure unique & sort by length (so Witchita Falls is tried before Wichita)
        rarray = sorted( list( set( rarray ) ), key=len)
        rarray.reverse()

        regex = "("
        regex += "|".join( rarray )
        regex += ')'
        rc = re.compile( regex, re.IGNORECASE )
        return rc
    
    def find_location(self, sentence ):
        """Given a string return a tuple (city, country, latitude, longitude) of the location in it"""
        retval = (sentence, None, None, None, None)
        key_city = None
        
        countries = self.find_countries( sentence )
        cities = self.find_cities( sentence )
        select_cities = []
        if( countries ):
            if( len(cities) < 1 ):
                #print( f"No city found for {countries}")
                caps = [c['capital'] for c in countries]
                for c in caps:
                    #FIXME - breaks on multiple countries
                    cities = self.find_cities( c )
            codes = [c['iso'] for c in countries]
            select_cities = [ct for ct in cities if ct['countrycode'] in codes ]
            #print( f"Cities in {codes}:\n{select_cities}")
        elif(cities):
            #city, but no country found
            select_cities = cities
            
        if( len( select_cities) == 1 ):
            key_city = select_cities.pop()
        elif( len(select_cities) > 1):
            # need to find the actual real city
            print( f"{len(select_cities)} cities found!")
        
        if( key_city):
             retval = (sentence, key_city['name'], key_city['countrycode'], key_city['latitude'], key_city['longitude'])
        
        return retval


In [2]:
le = LocationExtractor()
rows = []

In [3]:
for line in open( headlines ):
    rows.append( le.find_location( line.strip() ) )

2 cities found!
3 cities found!
3 cities found!
5 cities found!
2 cities found!
2 cities found!
2 cities found!
2 cities found!
2 cities found!
6 cities found!
5 cities found!
2 cities found!
2 cities found!
2 cities found!
5 cities found!
4 cities found!
2 cities found!
2 cities found!
4 cities found!
5 cities found!
5 cities found!
2 cities found!
5 cities found!
2 cities found!
3 cities found!
2 cities found!
2 cities found!
2 cities found!
2 cities found!
2 cities found!
2 cities found!
5 cities found!
3 cities found!
4 cities found!
2 cities found!
2 cities found!
2 cities found!
3 cities found!
2 cities found!
2 cities found!
3 cities found!
2 cities found!
4 cities found!
4 cities found!
4 cities found!
2 cities found!
3 cities found!
2 cities found!
4 cities found!
5 cities found!
3 cities found!
5 cities found!
2 cities found!
2 cities found!
3 cities found!
4 cities found!
3 cities found!
2 cities found!
3 cities found!
2 cities found!
5 cities found!
7 cities found!
3 cities

In [4]:
df = pd.DataFrame( rows, columns = ( "headline", "city", "country", "latitude", "longitude" ) )
print( df )

                                              headline           city country  \
0                             Zika Outbreak Hits Miami          Miami      US   
1                      Could Zika Reach New York City?  New York City      US   
2                    First Case of Zika in Miami Beach           None    None   
3              Mystery Virus Spreads in Recife, Brazil         Recife      BR   
4              Dallas man comes down with case of Zika           None    None   
..                                                 ...            ...     ...   
645  Rumors about Rabies spreading in Jerusalem hav...      Jerusalem      IL   
646              More Zika patients reported in Indang         Indang      PH   
647  Suva authorities confirmed the spread of Rotav...           Suva      FJ   
648         More Zika patients reported in Bella Vista           None    None   
649                     Zika Outbreak in Wichita Falls  Wichita Falls      US   

     latitude  longitude  


In [None]:
# need some dataframe metrics in here