In [1]:
import pandas as pd
import re
from geonamescache import GeonamesCache
import unidecode
import numpy as np

In [2]:
gc = GeonamesCache()

### Getting the headlines into a list

In [3]:
headlines_list = []
with open("headlines.txt", "r+") as f:
    for line in f:
        pattern_n = r"[^\n]+"
        search = re.search(pattern_n, line)
        headlines_list.append(search.group())

### Extracting Cities Case one

A common pattern in the heaslines is that a city or place name is followed by the word 'in', the following code extracts such places.

In [4]:
pattern = re.compile(r'([\w]+)(\b in \b)([\w|St.]+)(\s*)([\w]*)')

In [5]:
preliminary_cities = []
preliminary_headlines = []
for headline in headlines_list:
    p_search = re.search(pattern, headline)
    if (p_search is not None):
        preliminary_cities.append(p_search.group().split(' in ')[-1])
        preliminary_headlines.append(headline)

In [6]:
preliminary_cities = np.array(preliminary_cities)
preliminary_cities = np.where(preliminary_cities == 'Atlanta is', 'Atlanta', preliminary_cities)
preliminary_cities = preliminary_cities.tolist()

In [7]:
len(preliminary_cities)

410

In [8]:
len(preliminary_headlines)

410

In [9]:
len(headlines_list)

650

#### Cleaning up the cities from the intial search

In [10]:
cleaned_cities_one = []
for city in preliminary_cities:
    pattern = r'\b[A-Z]\w*\b'
    search = re.findall(pattern, city)
    cleaned_cities_one.append(' '.join(search))
    

### Extracting Cities : Case Two

For the headlines that don't contain the pattern mentioned above, it can be noted that they are either the first or the last word, this next search will be using geonamescache to find such cities.

In [11]:
headlines_without_in = []

for headline in headlines_list:
    if ' in ' not in headline:
        headlines_without_in.append(headline)



In [12]:
pattern = r'[^\bin\b]'


In [13]:
string = "I am in trouble"

In [14]:
re.search(pattern, string)

<re.Match object; span=(0, 1), match='I'>

In [15]:
len(headlines_without_in)

240

In [16]:
first_word_cities = []
good_headlines = []
for headline in headlines_without_in:
    if headline == 'San Juan reports 1st U.S. Zika-related death amid outbreak':
        continue
    f = headline.split(" ")[0]
    cf = re.search(r'[\w]+', f).group()
    if gc.get_cities_by_name(cf):
        first_word_cities.append(cf)
        good_headlines.append(headline)

In [17]:
chi = "Chicago's"

In [18]:
pattern = r'[\w]+'

In [19]:
re.search(pattern, chi).group()

'Chicago'

In [20]:
len(first_word_cities)

94

In [21]:
last_word_cities = []
good_headlines_2 = []
for headline in headlines_without_in:
    f = re.findall(r'[^\W]+', headline)[-1]
    
    
    if gc.get_cities_by_name(f):
        last_word_cities.append(f)
        good_headlines_2.append(headline)

In [22]:
good_headlines += good_headlines_2

In [23]:
len(good_headlines)

161

In [24]:
len(good_headlines_2)

67

In [25]:
cities_second_case = first_word_cities + last_word_cities

In [26]:
cities_second_case = np.array(cities_second_case)
cities_second_case = np.where(cities_second_case == 'Orleans', 'New Orleans', cities_second_case)
cities_second_case = np.where(cities_second_case == 'Kingston', 'New Kingston', cities_second_case)
cities_second_case = np.where(cities_second_case == 'Bethesda', 'Norht Bethesda', cities_second_case)
cities_second_case = np.where(cities_second_case == 'San', 'San Juan', cities_second_case)
cities_second_case = cities_second_case.tolist()

ValueError: list.remove(x): x not in list

In [28]:
len(cities_second_case)

161

In [29]:
test_dict ={'Headline': good_headlines, 'City': cities_second_case}
test_df = pd.DataFrame.from_dict(test_dict)
test_df.to_csv('test.csv')

That concludes the extraction of cities which were either first words of the headlines, or the last word.

### Extracting Cities : Case 3

The city names that have not been extracted so far are the ones that contain spaces between them, and their headline didn't have the pattern of 'in'. Such headlines will be used for city extraction

In [30]:
from unidecode import unidecode

In [31]:
headlines_case_three = [unidecode(item) for item in headlines_without_in if item not in good_headlines]

In [32]:
len(headlines_case_three)

79

#### Headlines which won't yeild results

The following manual extraction was done since these cities don't meat any pattern.

In [33]:
cities_with_spaces = [city['name'] for city in gc.get_cities().values() if ' ' in city['name']]

In [34]:
len(cities_with_spaces)

5225

In [35]:
len(cities_with_spaces)

5225

In [36]:
big_p = ''
for city in cities_with_spaces:
    big_p = big_p + unidecode(city)
    big_p = big_p + '|'


In [37]:
re.search(big_p[:-1], 'Zika spreads to Kamphaeng Phet')

<re.Match object; span=(16, 30), match='Kamphaeng Phet'>

In [38]:
cities_case_three = []
good_headlines_3 = []
for headline in headlines_case_three:
    search = re.search(big_p[:-1], headline)
    if search is not None:
        cities_case_three.append(search.group())
        good_headlines_3.append(headline)

In [39]:
len(cities_case_three)

44

In [40]:
left_out_headlines = [item for item in headlines_case_three if item not in good_headlines_3]

In [41]:
len(left_out_headlines)

35

In [42]:
left_out = ['Geneva', 'Manila', 'New Orleans', 'New Delhi', 'Cucuta', 'Mecca', 'Columbus', 'Cancun', 'Cancun', 'Charlotte', 'Archus', 'Niteroi', 'Lafayette', 'Windsor', 'Valdosta', 'Jefferson City', 'None', 'Greenwich', 'Parsons', 'Yulee', 'Jefferson City', 'Brighton', 'Las Casas', 'Denver', 'Martinsville', 'Bonn', 'Zambaonga', 'Zanzibar', 'Fairfax County', 'Seminole', 'New York City', 'Kedougou', 'Davos', 'Pismo Beach', 'La Joya']

In [43]:
len(left_out)

35

## Creating the Dataframe

In [44]:
final_headlines = preliminary_headlines + good_headlines + good_headlines_3 + left_out_headlines

In [45]:
len(final_headlines)

650

In [46]:
final_cities = cleaned_cities_one + cities_second_case + cities_case_three + left_out

In [47]:
len(final_cities)

650

In [48]:
final_dict = {'Headline' : final_headlines, 'City': final_cities}

outbreak_df = pd.DataFrame.from_dict(final_dict)

In [49]:
outbreak_df

Unnamed: 0,Headline,City
0,First Case of Zika in Miami Beach,Miami Beach
1,"Mystery Virus Spreads in Recife, Brazil",Recife
2,Zika Concerns are Spreading in Houston,Houston
3,The CDC in Atlanta is Growing Worried,Atlanta
4,Zika Infested Monkeys in Sao Paulo,Sao Paulo
...,...,...
645,Stony Brook University experts discuss Zika virus,New York City
646,Kedougou tests new cure for Hepatitis C,Kedougou
647,Measles Hits Davos,Davos
648,Chikungunya has not Left Pismo Beach,Pismo Beach


## Getting Country Names

When extracting country names from the city name, there are three possibiliities:
<ul>
    <li>The city is found and is a single match.</li>
    <li>The city is found but contains multiple matches.</li>
    <li>The city is not found.
</ul>

A further possibility is that some cities have names with accents.

In [50]:
cities = list(outbreak_df['City'])

In [51]:
all_countries = gc.get_countries()
accented_names = [city['name'] for city in gc.get_cities().values() if city['name'] != unidecode(city['name'])]

In [52]:
def singular_country(possible_cities, countries):
    selected_city_dict = possible_cities[0]
    iterator = iter(selected_city_dict.keys())
    inner_dict = selected_city_dict[next(iterator)]
    ca = inner_dict["countrycode"]
    return countries[ca]['name']

In [53]:
def country_name(city_name, countries, accent_list):
    possible_cities = gc.get_cities_by_name(city_name)
    if len(possible_cities) == 1:
        return singular_country(possible_cities, countries)
    elif len(possible_cities) > 1:
        return multiple_countries(possible_cities, countries)
    else:
        return accented_city(city_name, countries, accent_list)
        
        

In [54]:
def multiple_countries(city_list, countries):
    best_sf = max(city_list,
              key=lambda x: list(x.values())[0]['population'])
    iterator = iter(best_sf.keys())
    inner_dict = best_sf[next(iterator)]
    ca = inner_dict["countrycode"]
    return countries[ca]['name']

In [55]:
def accented_city(city_name, countries, accented_cities):
    for city in accented_cities:
        if unidecode(city) == city_name:
            
            return country_name(city, countries, accented_cities)

        else:
            continue

In [56]:
ny = gc.get_cities_by_name('Miami Beach')[0]

In [57]:
for key in ny.keys():
    print(key)

4164143


In [58]:
country_name('Sao Paulo', all_countries, accented_names)

'Brazil'

The list of found countries.

In [59]:
found_countries = [country_name(city, all_countries, accented_names) for city in cities]

In [60]:
len(found_countries)

650

In [61]:
outbreak_df['Country'] = found_countries

In [62]:
outbreak_df.to_csv('outbreak_cities.csv')

In [63]:
list_of_nulls = len([item for item in found_countries if item is None])

In [64]:
list_of_nulls

73

## Dealing with not found country names

There are 73 cities which did not return a result from geonamescache, these will be filled in manually by searching on google and filling up on excel, the resulting csv will be the dataframe for submission.

In [70]:
final_df = pd.read_csv('outbreak_cities.csv', index_col=0)

In [72]:
final_df

Unnamed: 0,Headline,City,Country
0,First Case of Zika in Miami Beach,Miami Beach,United States
1,"Mystery Virus Spreads in Recife, Brazil",Recife,Brazil
2,Zika Concerns are Spreading in Houston,Houston,United States
3,The CDC in Atlanta is Growing Worried,Atlanta,United States
4,Zika Infested Monkeys in Sao Paulo,Sao Paulo,Brazil
...,...,...,...
645,Stony Brook University experts discuss Zika virus,New York City,United States
646,Kedougou tests new cure for Hepatitis C,Kedougou,Senegal
647,Measles Hits Davos,Davos,Switzerland
648,Chikungunya has not Left Pismo Beach,Pismo Beach,United States
