# Parsing the News Headlines

# STEP 1

# IMPORTS

In [1]:
import pandas as pd
import numpy as np
import geonamescache
from collections import Counter
import unidecode
import re
import json

## DataFrame

In [2]:
# Read the text file and make a dataframe 

In [3]:
# Read one headline per line. '\n' is the line terminator, and recent pandas
# releases refuse it as a read_csv delimiter; any real delimiter (e.g. ',')
# could split headlines that contain it. Reading the lines directly avoids both.
# Blank lines are skipped, matching read_csv's default behaviour.
with open('discovering-disease-outbreaks-base/data/headlines.txt',
          encoding='utf-8') as f:
    headlines = pd.DataFrame(
        {'Headlines': [line.rstrip('\n') for line in f if line.strip()]}
    )



In [4]:
# Preview five randomly chosen headlines
headlines.sample(5)

Unnamed: 0,Headlines
368,More Zika patients reported in Calumpang
362,Laventille authorities confirmed the spread of...
434,Case of Mad Cow Disease Reported in Hilden
218,Zika spreads to Daytona Beach
72,Hospitals in Hanoi fill up with Zika patients


In [5]:
# Summarise the headlines: how many there are and how long they are.
# The character length of every headline is computed once and reused.
headline_lengths = [len(row[0]) for row in headlines.values]
print("Number of headlines: {}".format(len(headlines)))
print("Max lenght of headlines: {}".format(max(headline_lengths)))
print("Min lenght of headlines: {}".format(min(headline_lengths)))
print("Average lenght of headlines: {:.2f}".format(np.mean(headline_lengths)))

Number of headlines: 650
Max lenght of headlines: 87
Min lenght of headlines: 16
Average lenght of headlines: 40.77


## Countries and Cities

In [6]:
# Create the geonamescache instance used for the city and country lookups below
gc = geonamescache.GeonamesCache()

In [7]:
# Collect the name of every city record held by geonamescache
# (the same name can appear several times, once per city record)
cities = [record['name'] for record in gc.get_cities().values()]
# Collect the name of every country record held by geonamescache
countries = [record['name'] for record in gc.get_countries().values()]

In [8]:
# Report how many city and country names were collected
# (the city count includes repeated names)
print("Total number of cities: {}".format(len(cities)))
print("Total number of countries: {}".format(len(countries)))

Total number of cities: 24336
Total number of countries: 252


In [9]:
# Count how often each city name occurs to find names shared by several cities
city_counter = Counter(cities)
# Show the ten most frequently repeated city names
city_counter.most_common(10)

[('Springfield', 8),
 ('San Pedro', 7),
 ('Richmond', 7),
 ('San Fernando', 7),
 ('Mercedes', 6),
 ('La Paz', 6),
 ('Victoria', 6),
 ('San Francisco', 6),
 ('Auburn', 6),
 ('Santa Cruz', 6)]

In [10]:
# Map the accent-free (ASCII) spelling of each name back to its original form.
# Names that share the same ASCII spelling collapse into a single key, and the
# last one seen is the value that is kept.
country_accent_mapping = {
    unidecode.unidecode(name): name for name in countries
}
city_accent_mapping = {unidecode.unidecode(name): name for name in cities}

In [11]:
# Build a copy of the headlines with accent marks transliterated to ASCII
ascii_headlines = [unidecode.unidecode(row[0]) for row in headlines.values]
headlines_clean = pd.DataFrame({"Headlines": ascii_headlines})

In [12]:
# Keep only the accent-free spellings (the mapping keys) for matching
clean_cities = list(city_accent_mapping)
clean_countries = set(country_accent_mapping)

In [13]:
# Order names from longest to shortest so that, in the alternation built from
# them, a longer name is tried before any shorter name it contains
clean_cities = sorted(clean_cities, key=len, reverse=True)
clean_countries = sorted(clean_countries, key=len, reverse=True)

## Match cities and countries

In [14]:
# Build one alternation per entity type.
# * Every name is passed through re.escape so characters such as '.', '(' or
#   ')' inside a name are matched literally instead of acting as regex syntax.
# * The whole alternation sits inside a single \b(?:...)\b group. Joining the
#   names with r'\b|\b' left the first name without a leading boundary and the
#   last name without a trailing one, so those two could match inside words.
city_regex = r'\b(?:' + '|'.join(map(re.escape, clean_cities)) + r')\b'
country_regex = r'\b(?:' + '|'.join(map(re.escape, clean_countries)) + r')\b'

In [15]:
def find_country_and_city(headline):
    """Return the first city and country name found in *headline*.

    The text is searched with the module-level ``city_regex`` and
    ``country_regex`` patterns.

    Returns:
        dict with keys ``headline`` (the input text), ``countries`` and
        ``cities`` (the matched name, or ``None`` when nothing matched).
    """
    city_hit = re.search(city_regex, headline)
    country_hit = re.search(country_regex, headline)
    return {
        "headline": headline,
        "countries": country_hit.group(0) if country_hit else None,
        "cities": city_hit.group(0) if city_hit else None,
    }

In [16]:
# Match against the accent-stripped headlines: the city/country patterns are
# built from accent-free names, so an accented name in a raw headline would
# never match them.
headlines_extracted_cities_countries = [
    find_country_and_city(headline) for headline in headlines_clean["Headlines"]
]

In [17]:
# Tabulate the extracted matches, one row per headline
final_df = pd.DataFrame(headlines_extracted_cities_countries)

## Save files for later use

In [18]:
# Persist the per-headline matches as JSON for the later steps
save_file = "headlines_with_city_country.json"
with open(save_file, 'w') as f:
    json.dump(headlines_extracted_cities_countries, f)

In [19]:
# Persist the accent-free -> original name mapping for cities
with open("city_accent_mapping.json", 'w') as f:
    json.dump(city_accent_mapping, f)

# Persist the accent-free -> original name mapping for countries
with open("country_accent_mapping.json", 'w') as f:
    json.dump(country_accent_mapping, f)


# STEP 2

In [80]:
def find_country_and_city_v2(headline):
    """Extract the city/country named in *headline* and geolocate the city.

    The first matches of the module-level ``city_regex`` and ``country_regex``
    are taken. When geonamescache knows a city of that name, the most populous
    one supplies the coordinates and geoname id, and its country replaces the
    country matched in the text.

    Args:
        headline: Headline text to scan.

    Returns:
        dict with keys ``headline``, ``country``, ``city``, ``id``,
        ``latitude`` and ``longitude``. ``id`` is the geoname id as a string.
        ``id`` and the coordinates are ``None`` (not the string ``'None'``)
        when no city could be resolved.
    """
    city_match = re.search(city_regex, headline)
    country_match = re.search(country_regex, headline)
    city = city_match.group(0) if city_match else None
    country = country_match.group(0) if country_match else None

    lat = lon = geoname_id = None
    if city is not None:
        # The patterns are built from accent-free names, while geonamescache
        # is queried by original name, so also try the accented spelling.
        lookup_names = [city]
        accented = city_accent_mapping.get(city, city)
        if accented != city:
            lookup_names.append(accented)

        # Each lookup result is a one-entry {geonameid: record} dict; unwrap it.
        candidates = [
            next(iter(entry.values()))
            for name in lookup_names
            for entry in gc.get_cities_by_name(name)
        ]
        if candidates:
            # Several cities can share a name; pick the most populous one.
            best = max(candidates, key=lambda record: record['population'])
            lat = best.get('latitude')
            lon = best.get('longitude')
            geoname_id = best.get('geonameid')
            # Keep the country matched in the text if the code is unknown,
            # instead of failing on a missing country record.
            country_record = gc.get_countries().get(best.get('countrycode'))
            if country_record is not None:
                country = country_record.get('name')

    return dict(
        headline=headline,
        country=country,
        city=city,
        id=None if geoname_id is None else str(geoname_id),
        latitude=lat,
        longitude=lon,
    )

In [81]:
# Match against the accent-stripped headlines: the city/country patterns are
# built from accent-free names, so an accented name in a raw headline would
# never match them.
headlines_extracted_cities_countries_v2 = [
    find_country_and_city_v2(headline) for headline in headlines_clean["Headlines"]
]

In [83]:
# Tabulate the geolocated matches.
# NOTE(review): .sample(10) means headlines_v2 holds only 10 random rows, not
# the full table — confirm this is intended and not just meant for display.
headlines_v2 = pd.DataFrame(headlines_extracted_cities_countries_v2).sample(10)