In [1]:
import geonamescache
import re
from unidecode import unidecode
import pandas as pd
import numpy as np

# Shared GeonamesCache instance; get_cities() and get_US_states() read from it.
gc = geonamescache.GeonamesCache()
# Country records from the cache (not referenced by the other cells visible here).
countries = gc.get_countries()

In [4]:
# City names that get_cities() skips when building its lookup and regex.
EXCLUDE_CITIES = [
    'Bay',
    'New',
    'San',
    'Can',
    'Hit',
    'Spring',
    'North',
    'South',
    'East',
    'West',
    'Spanish',
]

# add_city() ignores any name shorter than this many characters.
MIN_NAME_LENGTH = 3

def add_city(city_keys, city_names, key, name):
    """Register one city name, unless it is shorter than MIN_NAME_LENGTH.

    Args:
        city_keys: dict updated in place; maps the lower-cased name to `key`
            (a later call with the same lower-cased name overwrites the entry).
        city_names: list updated in place; `name` is appended with its
            original casing.
        key: identifier stored for the city.
        name: city name to register.

    Returns:
        None.
    """
    long_enough = len(name) >= MIN_NAME_LENGTH
    if long_enough:
        city_keys[name.lower()] = key
        city_names.append(name)
        
def get_cities(udecode=True, verbose=False):
    """Build the city lookup table and the regexes used to find cities in text.

    Every city returned by `gc.get_cities()` is registered through `add_city`
    unless its name is in EXCLUDE_CITIES. A name containing ' city' (or,
    failing that, ' town') is additionally registered in shortened form, cut
    at the last occurrence of that substring, so "Kansas City" also matches
    as "Kansas".

    Args:
        udecode: when True, names are passed through `unidecode` first;
            when False, the raw name from the cache is used.
        verbose: when True, print each processed name and each shortened
            name that is added.

    Returns:
        Tuple `(cities, city_keys, cities_regex, compiled_re, icase_compiled_re)`:
        the raw dict from `gc.get_cities()`, the lower-cased-name -> key dict,
        the regex source string, and that regex compiled case-sensitively and
        with re.IGNORECASE.
    """
    cities = gc.get_cities()

    city_keys = {}
    city_names = []
    for key, entry in cities.items():
        # Always bind `name`; previously it was only assigned when udecode was
        # True, so udecode=False raised NameError / reused a stale value.
        name = entry['name']
        if udecode:
            name = unidecode(name)
        if name in EXCLUDE_CITIES:
            continue

        if verbose:
            print(name)

        # Special case kept from the original: also register "Richland Center".
        if name == 'Richland':
            add_city(city_keys, city_names, key, name + ' Center')
        add_city(city_keys, city_names, key, name)

        # Look for ' city' first, then ' town'; cut at the last occurrence.
        lowered = name.lower()
        suffix_at = lowered.rfind(' city')
        if suffix_at == -1:
            suffix_at = lowered.rfind(' town')

        if suffix_at != -1:
            short_name = name[:suffix_at]
            if short_name not in EXCLUDE_CITIES:
                add_city(city_keys, city_names, key, short_name)
                if verbose:
                    print("ADDDING town/city: " + short_name)

    # Drop repeated names (they add nothing to an alternation), then put the
    # longest names first so "Miami Beach" is tried before "Miami".
    unique_names = list(dict.fromkeys(city_names))
    unique_names.sort(key=len, reverse=True)

    # Escape each name: characters such as '.', '(' and ')' must match
    # literally, otherwise the pattern can match text that is not a key in
    # city_keys (or fail to compile).
    cities_regex = r'\b(' + "|".join(map(re.escape, unique_names)) + r')\b'
    compiled_re = re.compile(cities_regex)
    icase_compiled_re = re.compile(cities_regex, re.IGNORECASE)

    return (cities, city_keys, cities_regex, compiled_re, icase_compiled_re)
        
# Build the module-level city table and both compiled city regexes once.
(
    cities,
    city_keys,
    cities_regex,
    compiled_cities_re,
    icase_compiled_cities_re,
) = get_cities(udecode=True, verbose=False)

In [6]:
def get_city_country(line, match):
    """Resolve a regex match on `line` to its city text and country code.

    The matched text is lower-cased and looked up in the module-level
    `city_keys`; the resulting key indexes `cities` to read 'countrycode'.

    Args:
        line: the string the match was produced from.
        match: regex match object whose span selects the city text in `line`.

    Returns:
        Tuple `(city, country, start, end)` where `city` is the matched text
        as it appears in `line`, `country` is the country code (NaN when the
        code is 'US'), and `start`/`end` are the match offsets within `line`.

    Raises:
        KeyError: if the matched text is not a key of `city_keys`.
    """
    start = match.start()
    end = match.end()
    city = line[start:end]

    city_key = city_keys[city.lower()]
    country = cities[city_key]['countrycode']
    if country == 'US':
        # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
        country = np.nan

    return (city, country, start, end)

def get_US_states():
    """Collect US state names and a case-insensitive regex that matches them.

    Returns:
        Tuple `(US_states, US_states_regex, US_states_compiled_re)`: the
        lower-cased keys of `gc.get_us_states_by_names()`, the regex source
        string alternating over them between word boundaries, and that regex
        compiled with re.IGNORECASE.
    """
    state_names = [state.lower() for state in gc.get_us_states_by_names()]
    pattern = r'\b(' + "|".join(state_names) + r')\b'
    return (state_names, pattern, re.compile(pattern, re.IGNORECASE))

#l = list(map(str.lower,["A","B","C"]))
#print(l)

# Build the module-level state list and its case-insensitive regex once.
(
    US_states,
    US_states_regex,
    US_states_compiled_re,
) = get_US_states()

In [7]:
def read_headlines(regex, iregex, state_iregex, ifile="data/headlines.txt"):
    """Scan a headlines file and tabulate the cities found on each line.

    Each line is passed through `unidecode` and right-stripped, then searched
    with `regex`; if that fails, with `iregex`. Lines with no city match are
    reported on stdout (noting whether `state_iregex` matched) and skipped.
    After a first match, the remainder of the line is searched once more with
    `regex`; a second city is recorded when it differs from the first and its
    lower-cased text is not in the module-level `US_states`.

    Args:
        regex: compiled case-sensitive city pattern.
        iregex: compiled case-insensitive city pattern, used as a fallback.
        state_iregex: compiled state pattern, used only to label unmatched
            lines in the printed report.
        ifile: path of the text file to read, one headline per line.

    Returns:
        DataFrame with columns 'headline', 'countries', 'cities'; one row per
        recorded city. 'countries' holds NaN wherever the code is 'US'.
    """
    data = []
    # `with` closes the file even if a lookup raises part-way through.
    with open(ifile, 'r') as file:
        for line in file:
            line = unidecode(line).rstrip()

            # Exact-case match first; fall back to the case-insensitive pattern.
            match = regex.search(line)
            if match is None:
                match = iregex.search(line)

            if match is None:
                # A state-only mention is reported but deliberately not recorded.
                if state_iregex.search(line) is None:
                    print("No city match on line <" + line + ">")
                else:
                    print("Matched state - but ignoring anyway on line <" + line + ">")
                continue

            (city, country, s1, e1) = get_city_country(line, match)
            data.append([line, country, city])

            # Look for a second city in the text after the first match.
            remainder = line[e1:]
            match2 = regex.search(remainder)
            if match2 is None:
                continue

            (city2, country2, _, _) = get_city_country(remainder, match2)
            if city2.lower() not in US_states and city2 != city:
                # Offsets of the second match relative to the full line.
                s2 = e1 + match2.start()
                e2 = e1 + match2.end()
                print("====> Two cities seen in <" + line + ">")
                # f-strings: `country` may be a float NaN, which cannot be
                # concatenated to a str with `+`.
                print(f"1st<{s1}:{e1}>:{city}, {country}")
                print(f"2nd<{s2}:{e2}>:{city2}, {country2}")
                data.append([line, country2, city2])

    df = pd.DataFrame(columns=['headline', 'countries', 'cities'], data=data)
    # Blank out any remaining 'US' codes; the earlier df.replace(...) call
    # discarded its result and therefore did nothing.
    df['countries'] = df['countries'].mask(df['countries'] == 'US')

    return df

# Parse the headlines file into a DataFrame; unmatched lines are reported on stdout.
df = read_headlines(
    regex=compiled_cities_re,
    iregex=icase_compiled_cities_re,
    state_iregex=US_states_compiled_re,
    ifile="data/headlines.txt",
)

Matched state - but ignoring anyway on line <Louisiana Zika cases up to 26>
No city match on line <Spanish Flu Sighted in Antigua>
No city match on line <Zika case reported in Oton>
No city match on line <Hillsborough uses innovative trap against Zika 20 minutes ago>
No city match on line <Maka City Experiences Influenza Outbreak>
No city match on line <West Nile Virus Outbreak in Saint Johns>
No city match on line <Malaria Exposure in Sussex>
No city match on line <Greenwich Establishes Zika Task Force>
No city match on line <Will West Nile Virus vaccine help Parsons?>
No city match on line <Yulee takes a hit from Spreading Sickness>
No city match on line <The Spread of Chikungunya in Davidson has been Confirmed>
No city match on line <Zika case reported in Los Fresnos>
No city match on line <More people in Boucau are infected with HIV every year>
No city match on line <Bronchitis Outbreak in Manhasset>
No city match on line <Rumors about Influenza Spreading in Dobbs Ferry have been R

In [8]:
# Display the headline / countries / cities frame built by read_headlines().
df

Unnamed: 0,headline,countries,cities
0,Zika Outbreak Hits Miami,,Miami
1,Could Zika Reach New York City?,,New York City
2,First Case of Zika in Miami Beach,,Miami Beach
3,"Mystery Virus Spreads in Recife, Brazil",BR,Recife
4,Dallas man comes down with case of Zika,,Dallas
5,Trinidad confirms first Zika case,UY,Trinidad
6,Zika Concerns are Spreading in Houston,,Houston
7,Geneve Scientists Battle to Find Cure,CH,Geneve
8,The CDC in Atlanta is Growing Worried,,Atlanta
9,Zika Infested Monkeys in Sao Paulo,BR,Sao Paulo
