# Chapter1

#### import libraries

In [1]:
import numpy as np
import pandas as pd 
import re
import geonamescache
import unidecode
import json

#### import all headlines from the file

In [2]:
# Read the headlines file into a list with one entry per line. The context
# manager guarantees the handle is closed even if reading raises.
with open('./data/headlines.txt', "r") as f:
    headlines = f.read().splitlines()

#### create lists of country and city names (plus the full country lookup)

In [3]:
gc = geonamescache.GeonamesCache()

# Full country records as returned by geonamescache; later cells treat the
# keys of this dict as country codes.
allcountries = gc.get_countries()

# Plain lists holding just the display names of every country and city.
countries = [record["name"] for record in gc.get_countries().values()]
cities = [record["name"] for record in gc.get_cities().values()]

#### create mappings from unaccented (ASCII) country and city names to their accented originals

In [4]:
# Unaccented name -> original name. When several names collapse to the same
# unaccented form, the one appearing last in the source list wins.
country_accent_mapping = dict(
    zip(map(unidecode.unidecode, countries), countries)
)

city_accent_mapping = dict(
    zip(map(unidecode.unidecode, cities), cities)
)

##### create mappings between country code and country name 

In [5]:
# Two-way lookup between the keys of `allcountries` and each record's name.
country_to_code = {
    record['name']: code for code, record in allcountries.items()
}

code_to_country = {
    code: record['name'] for code, record in allcountries.items()
}

#### sort countries and cities by name length (longest first)

In [6]:
# Longest names first, ties broken alphabetically, so that a regex alternation
# built from these lists tries the longer name before any shorter name it
# starts with.
unaccented_cities = sorted(
    city_accent_mapping.keys(), key=lambda name: (-len(name), name)
)
unaccented_countries = sorted(
    country_accent_mapping.keys(), key=lambda name: (-len(name), name)
)

#### create regexes for country and city names

In [7]:
# Build one alternation per name list, wrapped in word boundaries so a name is
# not matched inside a longer word.
# Each name is passed through re.escape so that characters such as "." or "("
# are matched literally. Without escaping, a name containing a metacharacter
# could match text that is not a key of the accent mappings, which would make
# the later dictionary lookup on the matched text fail.
_cities = "|".join(re.escape(name) for name in unaccented_cities)
regexCity = "\\b(" + _cities + ")\\b"
compiledReCity = re.compile(regexCity)

_countries = "|".join(re.escape(name) for name in unaccented_countries)
regexCountry = "\\b(" + _countries + ")\\b"
compiledReCountry = re.compile(regexCountry)

#### create function extracting city and country from a headline

In [8]:
def city_and_country_from_a_headline(headline):
    '''
    Find a city name and a country name in a given headline.

    The headline is searched with the module-level `compiledReCountry` and
    `compiledReCity` patterns; only the first match of each is used. When a
    city is found, every geonamescache record sharing its (accented) name is
    considered and the most populous one supplies the coordinates. If the
    headline also names a country, records located in that country are
    preferred so the coordinates agree with the reported country; when no
    record is in that country, the most populous record overall is used.

    input: headline (string)
    output: dict with keys `headline`, `country`, `city`, `latitude` and
            `longitude`; any value that could not be determined is None.
            `country` is the matched headline text when a country was found
            in the headline, otherwise the country name looked up from the
            chosen city's country code.
    '''

    latitude = None
    longitude = None

    # country found
    country_match = compiledReCountry.search(headline)
    country = country_match.group(0) if country_match else None

    # city found
    city_match = compiledReCity.search(headline)
    city = city_match.group(0) if city_match else None

    if city is not None:
        # geonamescache is queried by the accented name; fall back to the
        # matched text itself if it is somehow missing from the mapping.
        accented_city = city_accent_mapping.get(city, city)

        # get_cities_by_name yields a list of {id: record} dicts; flatten it
        # into a plain list of city records.
        candidates = [
            record
            for entry in gc.get_cities_by_name(accented_city)
            for record in entry.values()
        ]

        if country is not None:
            # Narrow to namesakes inside the country named in the headline,
            # keeping the full list when none of them is located there.
            country_code = country_to_code.get(
                country_accent_mapping.get(country, country)
            )
            in_country = [
                record for record in candidates
                if record['countrycode'] == country_code
            ]
            candidates = in_country or candidates

        if candidates:
            # max() keeps the first record on population ties, as the
            # original strict ">" comparison loop did.
            biggestCity = max(candidates, key=lambda record: record['population'])
            if country is None:
                country = code_to_country.get(biggestCity['countrycode'])
            latitude = biggestCity['latitude']
            longitude = biggestCity['longitude']

    return dict(headline=headline, country=country, city=city, latitude=latitude, longitude=longitude)

#### test to see results for the last 100 headlines

In [9]:
# Run the extractor over every headline, then display the final 100 results.
headline_cities_and_countries = list(
    map(city_and_country_from_a_headline, headlines)
)
headline_cities_and_countries[-100:]

[{'headline': 'Zika case reported in Limoeiro',
  'country': 'Brazil',
  'city': 'Limoeiro',
  'latitude': -7.87472,
  'longitude': -35.45028},
 {'headline': 'Ibadan tests new cure for Malaria',
  'country': 'Nigeria',
  'city': 'Ibadan',
  'latitude': 7.37756,
  'longitude': 3.90591},
 {'headline': 'Gonorrhea has Arrived in Avon Lake',
  'country': 'United States',
  'city': 'Avon Lake',
  'latitude': 41.50532,
  'longitude': -82.0282},
 {'headline': 'Pneumonia has not Left Kinshasa',
  'country': 'Democratic Republic of the Congo',
  'city': 'Kinshasa',
  'latitude': -4.32758,
  'longitude': 15.31357},
 {'headline': 'Respiratory Syncytial Virus Hits Henderson',
  'country': 'United States',
  'city': 'Henderson',
  'latitude': 36.0397,
  'longitude': -114.98194},
 {'headline': 'More Zika patients reported in Lakeland',
  'country': 'United States',
  'city': 'Lakeland',
  'latitude': 28.03947,
  'longitude': -81.9498},
 {'headline': 'Malaria Vaccine is now Required in Winona',
  'cou

##### save to  json

In [10]:
# Persist the extraction results as a single JSON document.
save_file = "./data/headline_cities_and_countries.json"
with open(save_file, mode="w") as fout:
    serialized_results = json.dumps(headline_cities_and_countries)
    fout.write(serialized_results)

#### save mappings

In [11]:
# Write each lookup table to its own JSON file, in the same order as before.
for mapping_path, mapping in (
    ("./data/country_accent_mapping.json", country_accent_mapping),
    ("./data/city_accent_mapping.json", city_accent_mapping),
    ("./data/country_to_code.json", country_to_code),
    ("./data/code_to_country.json", code_to_country),
):
    with open(mapping_path, "w") as fout:
        fout.write(json.dumps(mapping))

#### read the DataFrame from JSON and show the first 10 rows

In [13]:
import pandas as pd

# Load the saved results and replace None placeholders with NaN in one chain.
data = pd.read_json(
    "./data/headline_cities_and_countries.json"
).replace({None: np.nan})

data.head(n=10)

Unnamed: 0,headline,country,city,latitude,longitude
0,Zika Outbreak Hits Miami,United States,Miami,25.77427,-80.19366
1,Could Zika Reach New York City?,United States,New York City,40.71427,-74.00597
2,First Case of Zika in Miami Beach,United States,Miami Beach,25.79065,-80.13005
3,"Mystery Virus Spreads in Recife, Brazil",Brazil,Recife,-8.05389,-34.88111
4,Dallas man comes down with case of Zika,United States,Dallas,32.78306,-96.80667
5,Trinidad confirms first Zika case,Bolivia,Trinidad,-14.83333,-64.9
6,Zika Concerns are Spreading in Houston,United States,Houston,29.76328,-95.36327
7,Geneve Scientists Battle to Find Cure,Switzerland,Geneve,46.20222,6.14569
8,The CDC in Atlanta is Growing Worried,United States,Atlanta,33.749,-84.38798
9,Zika Infested Monkeys in Sao Paulo,Brazil,Sao Paulo,-23.5475,-46.63611


# Chapter2