In [1]:
import re
import pandas as pd
from geonamescache import GeonamesCache
from unidecode import unidecode

headlines = './data/headlines.txt'

class LocationExtractor:
    """Class to pull City and Country names from a string"""
    
    geocache = GeonamesCache()
    city_regex = None
    country_regex = None
    
    def __init__(self):
        gc = GeonamesCache()
        countries = self.geocache.get_countries()
        cities = self.geocache.get_cities()
        
        country_names = []
        city_names = []
        
        for c in cities.values():
            city_names.append( r"\b" + unidecode( c['name'] + r"\b" ) )

        for c in countries.values():
            country_names.append( r"\b" + unidecode( c['name'] ).replace( ' ', r"\s" ) + r"\b" )

        self.city_regex = self.regex_from_array( city_names )
        self.country_regex = self.regex_from_array( country_names )
    
    def find_cities( self, city_str ):
        found_cities = []
        matches = self.city_regex.search( city_str )
        if( matches ):
            for c in matches.groups():
                if( len( self.geocache.get_cities_by_name( c ) ) > 0 ):
                    found_cities.append( c )
        return found_cities
    
    def find_countries( self, country_str ):
        found_countries = []
        matches = self.country_regex.search( country_str )
        if( matches ):
            for c in matches.groups():
                found_countries.append( c )
        return found_countries
            
    def regex_from_array( self, rarray ):
        # ensure unique & sort by length (so Witchita Falls is tried before Wichita)
        rarray = sorted( list( set( rarray ) ), key=len)
        rarray.reverse()

        regex = "("
        regex += "|".join( rarray )
        regex += ')'
        rc = re.compile( regex, re.IGNORECASE )
        return rc
    


In [2]:
le = LocationExtractor()
rows = []

In [3]:
for line in open( headlines ):
    cities = le.find_cities( line )
    countries = le.find_countries( line )
    rows.append( (line, ",".join( countries), ",".join( cities ) ) )

In [4]:
df = pd.DataFrame( rows, columns = ( "headline", "countries", "cities" ) )
print( df )

                                              headline countries  \
0                           Zika Outbreak Hits Miami\n             
1                    Could Zika Reach New York City?\n             
2                  First Case of Zika in Miami Beach\n             
3            Mystery Virus Spreads in Recife, Brazil\n    Brazil   
4            Dallas man comes down with case of Zika\n             
..                                                 ...       ...   
645  Rumors about Rabies spreading in Jerusalem hav...             
646            More Zika patients reported in Indang\n             
647  Suva authorities confirmed the spread of Rotav...             
648       More Zika patients reported in Bella Vista\n             
649                   Zika Outbreak in Wichita Falls\n             

            cities  
0            Miami  
1    New York City  
2                   
3           Recife  
4           Dallas  
..             ...  
645      Jerusalem  
646         Ind