In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
! pip install unidecode geonamescache

Collecting unidecode
[?25l  Downloading https://files.pythonhosted.org/packages/d0/42/d9edfed04228bacea2d824904cae367ee9efd05e6cce7ceaaedd0b0ad964/Unidecode-1.1.1-py2.py3-none-any.whl (238kB)
[K     |████████████████████████████████| 245kB 3.5MB/s 
[?25hCollecting geonamescache
[?25l  Downloading https://files.pythonhosted.org/packages/32/c1/efb823270c8526b2f4f3eb8c804c5a0a55277267ad2312f5eb47bd9cc370/geonamescache-1.1.0-py3-none-any.whl (830kB)
[K     |████████████████████████████████| 839kB 7.7MB/s 
[?25hInstalling collected packages: unidecode, geonamescache
Successfully installed geonamescache-1.1.0 unidecode-1.1.1


# Discovering Disease Outbreaks from News Headlines

Identifying and mapping epidemics is crucial to prevent or respond to deadly disease outbreaks. Your first assignment for the WHO is as follows:

- Extract the locations (city and/or country name) from each news headline.
- Find the geographic coordinates of each headline using the city/country.
- Cluster (group) the headlines based on the geographic location.
- Visualize the clusters on a map and analyze them for patterns indicating an epidemic.
- Investigate the largest clusters for signs of disease outbreaks.
- Review headlines in the largest clusters within the United States and around the world. If any disease outbreak is   particularly dominant, visualize all worldwide mentions of that disease.
- Provide a summary of your findings to your superiors at the WHO so they can direct resources.

# Parsing the News Headlines

- **Objective**

    Find any city and/or country names mentioned in each of the news headlines.

- **Workflow**

    1. Load in the headline data and examine it for any data quality issues.
    1. Use any library/data structure to read in the headlines
    1. Read through some of the headlines and identify potential problems
    1. Using regular expressions and the cities and countries within the geonamescache library, match any cities/countries within each headline.
    1. Make sure to normalize headlines and city/country names by removing accent marks. This can be done with the unidecode library.
    1. Watch out for multiple cities in a headline and matches on short words! We want the match to be on the entire city—for example San Marino—and not a partial match—San.
    1. Put the extracted data into a pandas DataFrame with three columns: headline, city, country.
    1. Make sure there were no issues with the extraction by sampling some of the headlines and examining the city and country names.
    1. One method for finding problems is to look for the most common names and see if there are any issues.
    1. Once you are confident you’ve found all the cities/countries in each headline, save the DataFrame for the next part.


* **Importance to project**

 We can’t do much with just the headlines; although they contain the city/country names, they do not contain the geographic information—latitude and longitude—we need to find clusters of disease outbreaks. The first step in getting the geographic information is to isolate the names.

 Later, we will use the names to find the location of each headline, which requires bringing in external data (through geonamescache).

 This workflow is common in data science. First, we separate the useful information from the noise—data mining—and then we augment it with external data—data engineering.

## Import all relevent libraries

In [3]:
import pandas.util.testing as tm
# Regular expression
import re

# Display all cell outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Data analysis and wrangling
import numpy as np
import pandas as pd

# Set display options for pandas
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('max_colwidth', 200)

# Geoname data
import geonamescache
# Normalized unicode data (to remove accents)
import unidecode
 
## Visualization
# matplotlib
import matplotlib.pyplot as plt
# get_ipython().magic('matplotlib inline')
import seaborn as sns

# Ignore warning
import warnings
warnings.filterwarnings('ignore')

  """Entry point for launching an IPython kernel.


## **Functions**

### Load dataset and create a dataframe function

In [0]:
def headline_df():
    '''
    import headline.csv and create DataFrame
    '''
    # Read and unidecode dataset 
    file_contents = ""
    path = '/content/drive/My Drive/Data science/A  Python - Data Science Practices/Discovering Disease Outbreaks from News Headlines/headlines.txt'
    with open(path, "r") as file_handle:
        for line in file_handle.readlines():
            file_contents += line.replace("-", " ")
            unidecode.unidecode(line)

    # Create dataframe of headlines
    df = pd.DataFrame(file_contents.split('\n'),columns=['headline'])        
    
    print(f'Import a DataFrame of news headlines successfully!','\n')
    print(df.info(),'\n')
    print(f'*** Value counts: Top fives ***\n{df.headline.value_counts()[:5]}','\n')
    display(df.head())
    
    return df

### Functions to pull ref. city and country names from GeonamesCache

In [0]:
def cities():
    '''
    '''
    gc = geonamescache.GeonamesCache()
    
    # Retrive city names data and create a dataframe.
    cities = pd.DataFrame(gc.get_cities()).T.reset_index(drop=True)
    cities = cities.sort_values(by='name').reset_index(drop=True)

    # transform all accented strings to English alphabets'
    for index, city in enumerate(cities.loc[:, 'name']):
        cities.loc[index, 'name'] = unidecode.unidecode(city)
    # Rename name collumn    
    cities.rename({'name':'city'},axis=1,inplace=True)        
    # View dataframe    
    print(f'cities shape = {cities.shape}')
    display(cities.sample(5))
    print('\n')    
    
    return cities

def countries():
    '''
    '''
    gc = geonamescache.GeonamesCache()
    # Retrive country names data and create a dataframe.
    countries = pd.DataFrame(gc.get_countries_by_names()).T.reset_index(drop=True)
    countries = countries.sort_values(
        by='name', ascending=False).reset_index(drop=True)
    # countries.info()
    # Rename name collumn    
    countries.rename({'name':'country'},axis=1,inplace=True)
    # View dataframe
    print(f'countries shape = {countries.shape}')
    display(countries.head())
    print('\n')
    
    return countries

def states():
    '''
    '''
    gc = geonamescache.GeonamesCache()
    # Retrive country names data and create a dataframe.
    states = pd.DataFrame(gc.get_us_states_by_names()).T.reset_index(drop=True)
    states = states.sort_values(by='name', ascending=False).reset_index(drop=True)
    # Rename name collumn    
    states.rename({'name':'state'},axis=1,inplace=True)    
    # View dataframe
    print(f'states shape = {states.shape}')
    display(states.head())
    print('\n')
    
    return states


def counties():
    '''
    '''
    gc = geonamescache.GeonamesCache()
    # Retrive county names data and create a dataframe.
    counties = pd.DataFrame(gc.get_us_counties()).T.reset_index(drop=True).T
    # Rename columns
    counties.columns = ['code', 'name', 'state']

    counties = (counties.sort_values(by='state', 
                                     ascending=False
                                    )
                    .reset_index(drop=True
                                )
               )
    # Remove general suffixs from county names have only certain names
    s = ['County', 'Municipio', 'Island', 'Census Area',
         'City and Borough', 'Borough', 'Parish']
    regexs = '|'.join(s)

    column = []
    co = []
    counties['county'] = pd.Series()

    for county in counties.name:
        if type(counties['county']) != str:
            compiled_uscounty = re.compile(regexs)
            name_only = compiled_uscounty.sub('', county)
            co.append(name_only)
            column.append(co)

    counties['county'] = pd.Series(co)
    counties = counties.sort_values(by='state')
    
    # View dataframe
    print(f'counties shape = {counties.shape}')
    display(counties.head())
    print('\n')
    
    return counties

def us_city():
    '''
    '''
    state_dict = dict(zip(states.code,states.name))

    us_city = (cities[cities.countrycode == 'US'][['name', 'latitude', 'longitude', 'countrycode',
                                                  'population', 'admin1code']]
                            .rename({'admin1code':'us_state', 'name':'city'},axis=1
                                   )
              )
    us_city['statecode'] = us_city.us_state.values

    us_city.us_state = us_city.us_state.replace(state_dict)

    us_city = us_city[['city','us_state','population','countrycode', 'statecode','latitude', 'longitude']]    

    print(f'us_city shape = {us_city.shape}')
    display(us_city.head())
    print('\n')
    
    return us_city

### Data manipulating functions

In [0]:
def flatten(xs):
    '''
    flatten a potentially deep list using recursion.
    '''
    flat_list = []
    [flat_list.extend(flatten(x)) for x in xs] if isinstance(xs, list) else flat_list.append(xs)
    return flat_list

def difference(a, b):
    '''
    finds the difference between two iterables
    by keeping only the values that are in the first one.
    '''
    set_a = set(a)
    set_b = set(b)
    comparison = set_a.difference(set_b)
    return list(comparison)

def var_name(**variables):
    'retrive variable name '        
    return [x for x in variables]

### Styling pandas dataframe functions

In [0]:
def highlight_max(data, color='yellow'):
    '''
    highlight the maximum in a Series or DataFrame
    '''
    attr = 'background-color: {}'.format(color)
    if data.ndim == 1:  # Series from .apply(axis=0) or axis=1
        city_county = pd.concat([data.city,data.county])
        names = pd.concat([city_county,data.state]).dropna()
        names == names.max()
        is_max = names == names.max() 
        return [attr if v else '' for v in is_max]
    else:  # from .apply(axis=None)
        is_max = data == data.max().max()
        return pd.DataFrame(np.where(is_max, attr, ''),
                            index=data.index, columns=data.columns)

def highlight_len_max(data, color='lightgreen'):
    '''
    highlight the maximum in a Series or DataFrame
    '''
    attr = 'background-color: {}'.format(color)
    if data.ndim == 1:  # Series from .apply(axis=0) or axis=1
        is_max = data == data.max()
        return [attr if v else '' for v in is_max]
    else:  # from .apply(axis=None)
        is_max = data == data.max().max()
        return pd.DataFrame(np.where(is_max, attr, ''),
                            index=data.index, columns=data.columns)        
    
def color_null_red(val):
    """
    Takes a scalar and returns a string with
    the css property `'color: red'` for NaN
    , black otherwise.
    """
    color = 'red' if str(val) == 'nan' else 'white'
    return 'color: %s' % color    

### Function for matching city names in news headlines using regex

In [0]:
def prep_regex():
    '''
    Prepare regexs from names
    return tuple of four lists
    (expr_regexs, city_regexs, state_regexs, county_regexs)
    '''

    a = '*' * 100

    # list of instants with no word boundaries '\b'
    expr_regexs = []  # Some names are in brackets and after slash signs.

    # below are lists of instants for short words matching
    city_regexs = []  # Some names are in brackets and after slash signs.
    state_regexs = []
    county_regexs = []
    country_regexs = []

    to_split = []  # list of values that needed to be extracted

    for city in cities.city:
        expr_regexs.append(city.strip())
        city_regexs.append('\\b' + city.strip() + '\\b')
        if ('(' in city) or (
                '/' in city
        ):  # filtering values that have bracket or slash signs
            to_split.append(city)


#     print(f"City names in brackets and after slash signs.\n\t  {drop_val[:10]}")

# Need to add boundary '\\b' after value spliting
    bracket = {}  # dict of names inside the brackets.
    slash = {}  # dict of names after a slash

    # No need '\\b' after value spliting
    sub_bracket = {}  # dict of names in front of the brackets.
    sub_slash = {}  # dict of names in front of the a slash
    sub_bk_sl = {}  # dict of names after a slash that comes after a bracket

    for state in states.state:
        state_regexs.append('\\b' + state.strip() + '\\b')

    for county in counties.county:
        county_regexs.append('\\b' + county.strip() + '\\b')

    for country in countries.country:
        country_regexs.append('\\b' + country.strip() + '\\b')

    # Summary of Regexs preparation:
    print(f'{a}\nPreparing Regex instances has completed.......\n{a}\n')

    result = {
        'expr': expr_regexs,
        'city': city_regexs,
        'state': state_regexs,
        'county': county_regexs,
        'country': country_regexs
    }

    return result['expr'], result['city'], result['state'], result[
        'county'], result['country']


def extract_names(regex):
    '''
    extract city names in bractket and after slash

    '''
    bracket = {}
    sub_bracket = {}
    slash = {}
    sub_slash = {}
    sub_bk_sl = {}

    # for v in expr_regexs:
    for v in regex:
        if '/' in v:
            # extract words after slashes
            slash[v] = re.findall('\s*/\s*(\w*\s*\w*)', v)
            # extract words after slashes
            sub_slash[v] = re.sub('\s*/\s*(\w*\s*\w*)', '', v)

        if '(' in v:
            # extract words in brackets
            bracket[v] = re.findall('\s*\((\w*-*\s*\w+\d*\D*)\)', v)
            # extract words in brackets
            sub_bracket[v] = re.sub('\s*\((\w*-*\s*\w+\d*\D*)\)', '', v)

        if ('/' in v) and ('(' in v):
            # extract words after a slash that comes after a bracket.
            sub_bk_sl[v] = re.sub('\s*\((\w*\s*\w*\w*\D*\)\s*/\s*\w*)', '', v)

    # create blank list for unique names extracted from values in brackets  and after slashes
    extracted_names = []

    # Append values from bracket and slash lists to extracted_names list
    for v in [
            bracket, slash
    ]:  # the values in this couple lists need to add the boundary expression

        # means that the argument is city_regexs list
        if '\\b' in list(v.keys())[0]:
            [
                extracted_names.append('\\b' + word + '\\b')
                for word in list(set(flatten(list(v.values()))))
                if ('(' not in word) and ('/' not in word)
            ]

        else:  # means that the argument is exper_regex list
            [
                extracted_names.append(word)
                for word in list(set(flatten(list(v.values()))))
                if ('(' not in word) and ('/' not in word)
            ]

    # Append values from another three remaining lists to extracted_names list
    [
        extracted_names.append(v) for v in list(
            set(
                flatten([
                    list(sub_bracket.values()),
                    list(sub_slash.values()),
                    list(sub_bk_sl.values())
                ]))) if ('(' not in v) and ('/' not in v)
    ]

    global city_regexs, expr_regexs

    # Add values back to the regex list.
    [city_regexs.append(v) for v in extracted_names if '\\b' in v]
    [expr_regexs.append(v) for v in extracted_names if '\\b' not in v]


def search_headline(regex_dict):
    '''Match names in headlines using regex_dict'''
    
    a = '*' * 100

    # regex_dict = {'expr':expr_regexs, 'city':city_regexs, 'state':state_regexs, 'county':county_regexs}

    item = list(regex_dict.items())

    print(f'{a}\nMatching in progress.......\n{a}\nTotal headlines = 648\n')

    result = []

    for n in range(len(item)):
        regex_name, regex_list = item[n]

        ind = []
        name = []
        hline = []

        excluded_result = []

        for regex in regex_list:
            compiled_city = re.compile(regex, flags=re.IGNORECASE)

            for index, headline in enumerate(df.headline):
                match = compiled_city.search(headline)

                if match is not None:
                    start, end = match.start(), match.end()
                    matched_string = headline[start:end]

                    if matched_string.istitle():
                        ind.append(index)
                        name.append(matched_string)
                        hline.append(headline)
                    else:
                        excluded_result.append(
                            [index, matched_string, headline])
                        # print(index,matched_string, '<<<', headline)

        # Create dataframe of matched results and sort values by item_no
        matched = {'item_no': ind, 'headline': hline, str(regex_name): name}
        matched_name = pd.DataFrame(matched)
        matched_name = matched_name.sort_values(by='item_no').reset_index(
            drop=True)
        result.append(matched_name)

        print(
            f"The {regex_name} regexs output had total of {matched_name[str(regex_name)].notnull().sum()} matches of {len(set(matched_name[str(regex_name)]))} different names\n"
        )
        display(matched_name.head())

    print(f'{a}\nMatching has completed.......\n{a}\n')

    return tuple(result)


def merge_names(tuple_of_df):
    '''merge all matched name results
    arg = a list of dataframes
    output = a merged dataframe 
    '''
    df, city, state, county, country = tuple_of_df
    # df of headlines as a baseline
    df.reset_index(drop=False, inplace=True)
    # rename collumns to be consistant to output df
    df.rename({'index': 'item_no'}, axis=1, inplace=True)
    # Join matched results from city
    cityname = df.merge(city,
                        on=['headline', 'item_no'],
                        suffixes=('', '_city'),
                        how='outer').drop_duplicates().reset_index(drop=True)
    # Join matched results from the US state and county df
    usa = county.drop_duplicates().merge(state.drop_duplicates(),
                                         on=['headline', 'item_no'],
                                         suffixes=('county', '_state'),
                                         how='outer')
    result = (cityname.merge(
        usa, on=['headline', 'item_no'], suffixes=('', '_usa'),
        how='outer').drop_duplicates().reset_index(drop=True).merge(
            country,
            on=['headline', 'item_no'],
            suffixes=('', '_country'),
            how='outer').drop_duplicates().reset_index(drop=True))
    return result


def remove_incompat(df):
    '''remove all incompatible names that caused the multiple matched names for each headline'''
    dropped = {}
    kept = {}
    city_df = df.copy()
    headlines = (df.headline.value_counts().index.tolist())
    for hl in headlines:
        df_city = df[df.headline == hl]
        if (len(df_city) > 1) and (df_city.city.isnull().sum() !=
                                   len(df_city)):
            nan_idx = df_city.city[df_city.city.isnull() ==
                                   True].index.tolist()
            df_city.drop(nan_idx, axis=0, inplace=True)
            city_df.drop(nan_idx, axis=0, inplace=True)
            for i, v in list(zip(df_city.index, df_city.city)):
                if len(v) == max(df_city.city.str.len()):
                    kept[i] = v, 'city'
                    continue
                else:
                    dropped[i] = v, 'city'
                    city_df.drop(i, axis=0, inplace=True)
        else:
            continue

    headline = (city_df.headline.value_counts().index.tolist())

    for hl in headline:
        df_county = city_df[city_df.headline == hl]
        if (len(df_county) > 1) and (df_county.county.isnull().sum() !=
                                     len(df_county)):
            nan_idx = df_county.county[df_county.county.isnull() ==
                                       True].index.tolist()
            df_county.drop(nan_idx, axis=0, inplace=True)
            city_df.drop(nan_idx, axis=0, inplace=True)
            for i, v in list(zip(df_county.index, df_county.county)):
                if len(v) == max(df_county.county.str.len()):
                    kept[i] = v, 'county'
                    continue
                else:
                    dropped[i] = v, 'county'
                    city_df.drop(i, axis=0, inplace=True)
        else:
            continue

    city_df.reset_index(drop=True, inplace=True)

    return city_df

## Load DataFrames of cities, countries  from GeonamesCache

In [10]:
cities = cities()

counties = counties()

states = states()

countries = countries()


cities shape = (24336, 8)


Unnamed: 0,geonameid,city,latitude,longitude,countrycode,population,timezone,admin1code
20048,2786344,Soumagne,50.6138,5.74679,BE,15237,Europe/Brussels,WAL
24072,2624652,Arhus,56.1567,10.2108,DK,237551,Europe/Copenhagen,18
15810,4717232,Palestine,31.7621,-95.6308,US,18288,America/Chicago,TX
12394,590447,Maardu,59.4653,24.9822,EE,16630,Europe/Tallinn,01
15795,3114566,Palamos,41.8484,3.12912,ES,18161,Europe/Madrid,56




counties shape = (3234, 4)


Unnamed: 0,code,name,state,county
3233,2070,Dillingham Census Area,AK,Dillingham
3205,2180,Nome Census Area,AK,Nome
3206,2290,Yukon-Koyukuk Census Area,AK,Yukon-Koyukuk
3207,2282,Yakutat City and Borough,AK,Yakutat
3208,2275,Wrangell City and Borough,AK,Wrangell




states shape = (51, 4)


Unnamed: 0,code,state,fips,geonameid
0,WY,Wyoming,56,5843591
1,WI,Wisconsin,55,5279468
2,WV,West Virginia,54,4826850
3,WA,Washington,53,5815135
4,VA,Virginia,51,6254928




countries shape = (252, 17)


Unnamed: 0,geonameid,country,iso,iso3,isonumeric,fips,continentcode,capital,areakm2,population,tld,currencycode,currencyname,phone,postalcoderegex,languages,neighbours
0,878675,Zimbabwe,ZW,ZWE,716,ZI,AF,Harare,390580,13061000,.zw,ZWL,Dollar,263,,"en-ZW,sn,nr,nd","ZA,MZ,BW,ZM"
1,895949,Zambia,ZM,ZMB,894,ZA,AF,Lusaka,752614,13460305,.zm,ZMW,Kwacha,260,^(\d{5})$,"en-ZM,bem,loz,lun,lue,ny,toi","ZW,TZ,MZ,CD,NA,MW,AO"
2,69543,Yemen,YE,YEM,887,YM,AS,Sanaa,527970,23495361,.ye,YER,Rial,967,,ar-YE,"SA,OM"
3,2461445,Western Sahara,EH,ESH,732,WI,AF,El-Aaiun,266000,273008,.eh,MAD,Dirham,212,,"ar,mey","DZ,MR,MA"
4,4034749,Wallis and Futuna,WF,WLF,876,WF,OC,Mata Utu,274,16025,.wf,XPF,Franc,681,^(986\d{2})$,"wls,fud,fr-WF",






## Load outbreak news headline dataframe

In [11]:
df = headline_df()

# View duplicated headlines
for headline in df.headline.value_counts().index.tolist()[:2]:
    display(df[df.headline == headline])

Import a DataFrame of news headlines successfully! 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 650 entries, 0 to 649
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   headline  650 non-null    object
dtypes: object(1)
memory usage: 5.2+ KB
None 

*** Value counts: Top fives ***
Spanish Flu Spreading through Madrid       2
Spanish Flu Outbreak in Lisbon             2
Flu season hits Boston                     1
Zika case reported in Rizal                1
Tourist Perishes from Malaria in Arusha    1
Name: headline, dtype: int64 



Unnamed: 0,headline
0,Zika Outbreak Hits Miami
1,Could Zika Reach New York City?
2,First Case of Zika in Miami Beach
3,"Mystery Virus Spreads in Recife, Brazil"
4,Dallas man comes down with case of Zika


Unnamed: 0,headline
484,Spanish Flu Spreading through Madrid
487,Spanish Flu Spreading through Madrid


Unnamed: 0,headline
483,Spanish Flu Outbreak in Lisbon
486,Spanish Flu Outbreak in Lisbon


In [12]:
# Drop duplicated rows
df = df.drop_duplicates('headline')
print(f'*** Headline value counts: Top fives ***\n{df.headline.value_counts()[:5]}','\n')
df.info()

*** Headline value counts: Top fives ***
Providence Encounters Severe Symptoms of Dengue     1
Rotavirus Vaccine is now Required in Las Cumbres    1
Zika spreads to Daytona Beach                       1
Tourist Perishes from Malaria in Arusha             1
Mad Cow Disease Spreads to Margate                  1
Name: headline, dtype: int64 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 648 entries, 0 to 649
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   headline  648 non-null    object
dtypes: object(1)
memory usage: 10.1+ KB


## Prepare city names regex instances from GeonamesCache and match to city names in news headlines

In [15]:
# Prepare regexs from names by unpack tuple from prep_regex function
expr_regexs, city_regexs, state_regexs, county_regexs, country_regexs = prep_regex()

# Extract names from values that have brackets and slash signs
extract_names(city_regexs)
extract_names(expr_regexs)

regex_dict = {'expr':expr_regexs, 'city':city_regexs, 'state':state_regexs, 'county':county_regexs, 'country':country_regexs}

# Matching names using regular expressions 
expr, city, state, county, country = search_headline(regex_dict)


****************************************************************************************************
Preparing Regex instances has completed.......
****************************************************************************************************

****************************************************************************************************
Matching in progress.......
****************************************************************************************************
Total headlines = 648

The expr regexs output had total of 1505 matches of 712 different names



Unnamed: 0,item_no,headline,expr
0,0,Zika Outbreak Hits Miami,Miami
1,0,Zika Outbreak Hits Miami,Hit
2,1,Could Zika Reach New York City?,New York City
3,1,Could Zika Reach New York City?,York
4,1,Could Zika Reach New York City?,York


The city regexs output had total of 1057 matches of 607 different names



Unnamed: 0,item_no,headline,city
0,0,Zika Outbreak Hits Miami,Miami
1,1,Could Zika Reach New York City?,York
2,1,Could Zika Reach New York City?,York
3,1,Could Zika Reach New York City?,New York City
4,2,First Case of Zika in Miami Beach,Miami Beach


The state regexs output had total of 10 matches of 8 different names



Unnamed: 0,item_no,headline,state
0,1,Could Zika Reach New York City?,New York
1,13,"Flu outbreak in Galveston, Texas",Texas
2,17,Louisiana Zika cases up to 26,Louisiana
3,22,"Bad Water Leads to Sickness in Flint, Michigan",Michigan
4,60,Madison lab developing vaccine against Zika virus [The Wisconsin State Journal],Wisconsin


The county regexs output had total of 520 matches of 139 different names



Unnamed: 0,item_no,headline,county
0,0,Zika Outbreak Hits Miami,Miami
1,0,Zika Outbreak Hits Miami,Miami
2,0,Zika Outbreak Hits Miami,Miami
3,1,Could Zika Reach New York City?,York
4,1,Could Zika Reach New York City?,York


The country regexs output had total of 15 matches of 10 different names



Unnamed: 0,item_no,headline,country
0,3,"Mystery Virus Spreads in Recife, Brazil",Brazil
1,25,Zika cases in Vietnam's Ho Chi Minh City surge,Vietnam
2,30,Thailand Zika Virus in Bangkok,Thailand
3,44,"Zika outbreak in Piracicaba, Brazil",Brazil
4,58,"Zika surfaces in Klang, Malaysia",Malaysia


****************************************************************************************************
Matching has completed.......
****************************************************************************************************



### Regex matching result

In [0]:
city[city.city == 'York']

Unnamed: 0,item_no,headline,city
1,1,Could Zika Reach New York City?,York
2,1,Could Zika Reach New York City?,York


Note: We found some interesting patterns of matched results.

For example
 There were three matched results on a headline "Could Zika Reach New York City?"
 The first result was "New York City" which was a correct matched. 
 The rest two were "York" which could either be matched but why did we have two "York"?
 Check out on the `cities` dataframe then we found that there are two different locations of "York", one in the US and another one in the GB.

## Merge all dataframes of matched name result

In [0]:
# Merge all df of matched name
df_to_merge = (df, city, state, county, country)
merged = merge_names(df_to_merge)

## Remove all incompatible names that caused the multiple matched names for each headline

In [17]:
# Remove all incompatible names that caused the multiple matched names for each headline
result = remove_incompat(merged).sort_values(by='item_no')
# View result dataframe
result.style.applymap(color_null_red)

Unnamed: 0,item_no,headline,city,county,state,country
0,0,Zika Outbreak Hits Miami,Miami,Miami,,
1,1,Could Zika Reach New York City?,New York City,New York,New York,
2,2,First Case of Zika in Miami Beach,Miami Beach,Miami,,
3,3,"Mystery Virus Spreads in Recife, Brazil",Recife,,,Brazil
4,4,Dallas man comes down with case of Zika,Dallas,Dallas,,
5,5,Trinidad confirms first Zika case,Trinidad,,,
6,6,Zika Concerns are Spreading in Houston,Houston,Houston,,
7,7,Geneve Scientists Battle to Find Cure,Geneve,,,
8,8,The CDC in Atlanta is Growing Worried,Atlanta,,,
9,9,Zika Infested Monkeys in Sao Paulo,Sao Paulo,,,


In [18]:
print("\n\n" + "=" * 20 + "\n\n result.nunique() \n")
result.nunique()
print("\n\n" + "=" * 20 + "\n\n result.info() \n")
result.info()




 result.nunique() 



item_no     636
headline    648
city        577
county      134
state         8
country      10
dtype: int64




 result.info() 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 648 entries, 0 to 646
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   item_no   648 non-null    int64 
 1   headline  648 non-null    object
 2   city      609 non-null    object
 3   county    170 non-null    object
 4   state     10 non-null     object
 5   country   15 non-null     object
dtypes: int64(1), object(5)
memory usage: 55.4+ KB


In [0]:
result.to_csv('city_result.csv', index=False)

In [0]:
DF = df.copy()
CITY = city.copy()
STATE = state.copy()
COUNTY = county.copy()
COUNTRY = country.copy()