In [4]:
#Install pip install geonamescache
!pip install geonamescache



In [5]:
import numpy as np
import pandas as pd
import re
import geonamescache
import unidecode

In [6]:
dataset=[headline.strip() for headline in open('headlines.txt','r')]
dataset[:4]

['Zika Outbreak Hits Miami',
 'Could Zika Reach New York City?',
 'First Case of Zika in Miami Beach',
 'Mystery Virus Spreads in Recife, Brazil']

Get country names

In [7]:
gc =geonamescache.GeonamesCache()
countries =[country['name'] for country in gc.get_countries().values()]
countries[:10]

['Andorra',
 'United Arab Emirates',
 'Afghanistan',
 'Antigua and Barbuda',
 'Anguilla',
 'Albania',
 'Armenia',
 'Angola',
 'Antarctica',
 'Argentina']

In [8]:
cities =[city['name'] for city in gc.get_cities().values()]
cities[:10]

['Andorra la Vella',
 'Umm Al Quwain City',
 'Ras Al Khaimah City',
 'Zayed City',
 'Khawr Fakkān',
 'Dubai',
 'Dibba Al-Fujairah',
 'Dibba Al-Hisn',
 'Sharjah',
 'Ar Ruways']

### Remove duplicate values if any

In [9]:
countries=list(set(countries))
len(countries)

252

In [10]:
len(cities)

25286

In [11]:
cities=list(set(cities))
len(cities)

23958

## Removing Accent Marks

In [12]:
# source: solution
country_accent_mapping = {
    unidecode.unidecode(country): country for country in countries
}

city_accent_mapping = {
    unidecode.unidecode(city): city for city in cities
}
city_accent_mapping["Asmar"]

'Āsmār'

Remove accept marks for headlines as well

In [13]:
df_data = [unidecode.unidecode(headline) for headline in dataset]
df_data[-10:]

['Authorities are Worried about the Spread of Varicella in Clovis',
 'More Zika patients reported in Fort Worth',
 'Zika symptoms spotted in Boynton Beach',
 'Outbreak of Zika in Portoviejo',
 'Influenza Exposure in Muscat',
 'Rumors about Rabies spreading in Jerusalem have been refuted',
 'More Zika patients reported in Indang',
 'Suva authorities confirmed the spread of Rotavirus',
 'More Zika patients reported in Bella Vista',
 'Zika Outbreak in Wichita Falls']


# Total Cities and Countries

In [14]:
# Create list of cities and countries
unaccented_cities = list(city_accent_mapping.keys())
unaccented_countries = set(country_accent_mapping.keys())

print("Total countries = ",len(unaccented_countries))
print('Total cities = ',len(unaccented_cities))

Total countries =  252
Total cities =  23888


###  Regular Expressions

We construct the regular expressions by joining together the list of strings. The words are separated with a `|` for the or symbol in a regular expression. We also use the `\b` tag to make sure to match on entire words (beginning and end.)

In [15]:
city_regex = r'\b|\b'.join(unaccented_cities)
city_regex[1500:1800]

"Pensacola\\b|\\bP'ot'i\\b|\\bLobatse\\b|\\bFaaa\\b|\\bBamusso\\b|\\bPort Harcourt\\b|\\bMakum\\b|\\bJeremie\\b|\\bCairns\\b|\\bQuellon\\b|\\bAtimonan\\b|\\bDivnoye\\b|\\bHuixquilucan\\b|\\bKumla\\b|\\bGevrai\\b|\\bJarash\\b|\\bSchiebroek\\b|\\bSopron\\b|\\bTashkent\\b|\\bSao Paulo\\b|\\bPalmar de Varela\\b|\\bNiagara\\b|\\bSalamanca\\b|\\bArden"

In [16]:
# Apply regs to city 
np.random.seed(50)

test_headlines = np.random.choice(dataset, 10)

for test_headline in test_headlines:
    print(test_headline)
    match = re.search(city_regex, test_headline)
    if match:
        print(match.group(0), "\n")

More Zika patients reported in Custodia
Custodia 

Tokyo Encounters Severe Symptoms of Meningitis
Tokyo 

Zika Troubles come to Kampong Cham
Kampong Cham 

19 new Zika Cases in Sengkang
Sengkang 

Mumbai's Health Minister warns of more Zika cases
Mumbai 

Varicella re-emerges in Lagos
Lagos 

Mumbai's Health Minister warns of more Zika cases
Mumbai 

Milwaukee authorities confirmed the spread of Rhinovirus
Milwaukee 

Zika cases concern Charlotte residents
Charlotte 

Four cases of Zika in Hidalgo County
Hidalgo 



In [17]:
country_regex = r"\b|\b".join(unaccented_countries)
country_regex[:100]

'Niue\\b|\\bU.S. Virgin Islands\\b|\\bMyanmar\\b|\\bNew Zealand\\b|\\bGuam\\b|\\bTuvalu\\b|\\bMartinique\\b|\\bBelg'

In [18]:
np.random.seed(100)
test_headlines = np.random.choice(dataset, 10)

for test_headline in test_headlines:
    print(test_headline)
    match = re.search(country_regex, test_headline)
    if match:
        print(match.group(0), "\n")

Longwood volunteers spreading Zika awareness
More Zika cases in Soyapango
Spike of Dengue Cases in Stockholm
Case of Measles Reported in Vancouver
Zika arrives in Belmopan
Outbreak of Zika in Colombo
Zika symptoms spotted in Arlington
Malaria re-emerges in Boise
Southampton Patient in Critical Condition after Contracting Tuberculosis
Manassas Encounters Severe Symptoms of Measles


In [19]:
test_headline = dataset[3]
print(test_headline)
print(re.search(city_regex, test_headline).group(0))
print(re.search(country_regex, test_headline).group(0))

Mystery Virus Spreads in Recife, Brazil
Recife
Brazil


In [20]:
def map_city_and_country_in_headline(dataset):
    df_headline=[]
    for headline in dataset:
        city_match = re.search(city_regex, headline)
        country_match = re.search(country_regex, headline)
        cities = None if not city_match else city_match.group(0)
        countries = None if not country_match else country_match.group(0)
        df_headline.append(dict(headline=headline, countries=countries, cities=cities))
    
    return df_headline

In [21]:
map_city_and_country_in_headline_all=map_city_and_country_in_headline(dataset)

## Saving data as a json format


In [22]:
import json

filename = "headline_cities_and_countries.json"
with open(filename, "w") as fout:
    fout.write(json.dumps(map_city_and_country_in_headline_all))

# read file
with open("headline_cities_and_countries.json", "r") as fin:
    check_data = json.loads(fin.read())
check_data[:10]

[{'headline': 'Zika Outbreak Hits Miami',
  'countries': None,
  'cities': 'Miami'},
 {'headline': 'Could Zika Reach New York City?',
  'countries': None,
  'cities': 'New York City'},
 {'headline': 'First Case of Zika in Miami Beach',
  'countries': None,
  'cities': 'Miami Beach'},
 {'headline': 'Mystery Virus Spreads in Recife, Brazil',
  'countries': 'Brazil',
  'cities': 'Recife'},
 {'headline': 'Dallas man comes down with case of Zika',
  'countries': None,
  'cities': 'Dallas'},
 {'headline': 'Trinidad confirms first Zika case',
  'countries': None,
  'cities': 'Trinidad'},
 {'headline': 'Zika Concerns are Spreading in Houston',
  'countries': None,
  'cities': 'Houston'},
 {'headline': 'Geneve Scientists Battle to Find Cure',
  'countries': None,
  'cities': 'Geneve'},
 {'headline': 'The CDC in Atlanta is Growing Worried',
  'countries': None,
  'cities': 'Atlanta'},
 {'headline': 'Zika Infested Monkeys in Sao Paulo',
  'countries': None,
  'cities': 'Sao Paulo'}]

## Loading data as a DataFrame

In [23]:
df_headlines=pd.read_json('headline_cities_and_countries.json')
df_headlines.head(10)

Unnamed: 0,headline,countries,cities
0,Zika Outbreak Hits Miami,,Miami
1,Could Zika Reach New York City?,,New York City
2,First Case of Zika in Miami Beach,,Miami Beach
3,"Mystery Virus Spreads in Recife, Brazil",Brazil,Recife
4,Dallas man comes down with case of Zika,,Dallas
5,Trinidad confirms first Zika case,,Trinidad
6,Zika Concerns are Spreading in Houston,,Houston
7,Geneve Scientists Battle to Find Cure,,Geneve
8,The CDC in Atlanta is Growing Worried,,Atlanta
9,Zika Infested Monkeys in Sao Paulo,,Sao Paulo


In [24]:
# replace None to NaN
df_headlines=df_headlines.replace({None:np.nan})
df_headlines

Unnamed: 0,headline,countries,cities
0,Zika Outbreak Hits Miami,,Miami
1,Could Zika Reach New York City?,,New York City
2,First Case of Zika in Miami Beach,,Miami Beach
3,"Mystery Virus Spreads in Recife, Brazil",Brazil,Recife
4,Dallas man comes down with case of Zika,,Dallas
...,...,...,...
645,Rumors about Rabies spreading in Jerusalem hav...,,Jerusalem
646,More Zika patients reported in Indang,,Indang
647,Suva authorities confirmed the spread of Rotav...,,Suva
648,More Zika patients reported in Bella Vista,,Bella Vista


### Add country names using city names

In [25]:
cities =[{city['name'], city['countrycode']} for city in gc.get_cities().values()]

df_cities = pd.DataFrame.from_records(cities,columns=['City','Iso'])
print(df_cities.head())
df_cities.shape

                  City                 Iso
0                   AD    Andorra la Vella
1                   AE  Umm Al Quwain City
2  Ras Al Khaimah City                  AE
3                   AE          Zayed City
4         Khawr Fakkān                  AE


(25286, 2)