In [1]:
# input a list of all possible locations
# output a json file that contains a dict of location to its actual country
import geonamescache
import json
import unidecode

gc = geonamescache.GeonamesCache()

In [2]:
def get_country_from_city(city: str) -> str:
    """If city is a recognized city, return the country the city is in."""

    search_result = gc.search_cities(city, contains_search=True)
    if search_result:
        return search_result[0]["countrycode"]
    else:
        return ""


def is_us_state(state: str, states_list: list) -> bool:
    """Given a list of valid US states, return True if input state is a valid US state.
    State input is lowercase."""

    return state in states_list


def get_country_code_from_country_name(name: str, countries_dict: dict) -> str:
    """Given a lowercase country name, return a country code if the country name is valid,
    an empty string otherwise."""

    # format country to have a capitalized name
    name = name.title()

    country_info = countries_dict.get(name)

    if not country_info:
        return ""

    code = country_info.get("iso")

    if code:
        return code
    else:
        return ""


def get_country_from_string(string: str, countries_list: list, states_list: str, countries_dict: dict) -> str:
    """Given an input string, return the associated country code if country can be found."""

    string = string.lower().strip()

    # check if full string contains a country
    for country in countries_list:
        # print(country)
        if country in string:
            return get_country_code_from_country_name(country, countries_dict)

    # check if full string is a city
    city = get_country_from_city(string)
    if city:
        return city

    # check by token
    tokens = string.split()

    for token in tokens:        
        if is_us_state(token, states_list):
            return "US"
        
        country = get_country_from_city(token)
        if country:
            return country

    return ""


def create_country_key_dict(location_list: list) -> dict:
    """Given a list of location strings, create a dictionary mapping 
    each location to the country it is in. If no country is found, 
    assign it an empty string."""

    country_names = list(gc.get_countries_by_names().keys())
    country_names = [country.lower() for country in country_names]

    state_names = list(gc.get_us_states_by_names().keys())
    state_names = [state.lower() for state in state_names]

    countries_dict = gc.get_countries_by_names()

    country_key = dict()
    
    for location in location_list:

        current_country = ""
        country = get_country_from_string(unidecode.unidecode(location), country_names, state_names, countries_dict)

        if country:
            country_key.update({location: country})
        else:
            country_key.update({location: ""})

    return country_key

In [3]:
with open("all_countries.json", "r") as fp:
    loaded_countries = json.load(fp)

country_names = [country.lower() for country in list(gc.get_countries_by_names().keys())]
state_names = list(gc.get_us_states_by_names().keys())
cd = gc.get_countries_by_names()

# print(get_country_from_string(" South Africa", country_names, state_names, cd))

confirmed_countries = []

for c in loaded_countries:
    result = get_country_from_string(c, country_names, state_names, cd)
    if result:
        confirmed_countries.append(result)
    # else:
    #     confirmed_countries.append("NAN")

print(f"number of confirmed countries: {len(confirmed_countries)}")
print(f"number of input countries: {len(loaded_countries)}")
print(f"fraction of countries confirmed: {len(confirmed_countries) / len(loaded_countries)}")

print(confirmed_countries)

number of confirmed countries: 1026
number of input countries: 1290
fraction of countries confirmed: 0.7953488372093023
['US', 'IT', 'AZ', 'IT', 'MX', 'BE', 'FR', 'IR', 'MX', 'CL', 'IN', 'VE', 'ES', 'ZA', 'FR', 'SE', 'AD', 'HU', 'CA', 'VE', 'FR', 'AT', 'IT', 'AR', 'AD', 'AU', 'AE', 'IT', 'AE', 'FR', 'CO', 'FI', 'FR', 'BF', 'NL', 'CL', 'MX', 'CO', 'AE', 'IN', 'FI', 'MY', 'UA', 'EE', 'UA', 'TR', 'AE', 'AE', 'BO', 'JE', 'AT', 'IN', 'MX', 'MX', 'BG', 'CO', 'NL', 'CO', 'AR', 'SE', 'BO', 'US', 'AR', 'AR', 'US', 'AD', 'US', 'DE', 'CA', 'CO', 'IT', 'RU', 'US', 'BS', 'AR', 'AD', 'FI', 'AM', 'FR', 'IN', 'FR', 'DE', 'IN', 'AD', 'BO', 'FR', 'DE', 'AE', 'VE', 'PA', 'MX', 'TZ', 'JP', 'IT', 'FI', 'MX', 'UA', 'BE', 'FR', 'AE', 'EC', 'MX', 'FR', 'IE', 'BE', 'AD', 'AD', 'CU', 'BA', 'NP', 'VE', 'AI', 'AF', 'CO', 'IT', 'AR', 'AD', 'CA', 'VE', 'AE', 'US', 'SE', 'CA', 'MX', 'FI', 'BD', 'IT', 'EG', 'MX', 'MX', 'AE', 'AD', 'AL', 'US', 'AD', 'AD', 'PY', 'AR', 'PA', 'MX', 'MX', 'IT', 'RU', 'DM', 'BR', 'MX', 'FR

In [4]:
with open("all_countries.json", "r") as fp:
    loaded_countries = json.load(fp)

key = create_country_key_dict(loaded_countries)

print(
    key
)

with open("location_key.json", "w") as fp:
    json.dump(key, fp)

{'Evanston': 'US', 'Rusia ': 'IT', 'caba ': 'AZ', 'Sudamérica': '', 'Rome, Lazio': 'IT', 'Coacalco, Mexico': 'MX', 'Mars': 'BE', 'Antibes ☀': 'FR', 'Jambudweep': '', 'Islamic Republic of Iran': 'IR', 'Pachuca de Soto, Hidalgo': 'MX', 'Chile': 'CL', 'Everywhere around your mind.': 'IN', 'Chacao, Venezuela': 'VE', 'La laguna': 'ES', 'Johannesburg, South Africa': 'ZA', 'FRANCE': 'FR', 'Stockholm, Sverige': 'SE', 'Langley, VA (Adjacent)': 'AD', 'Polcenigo, Friuli-Venezia Giul': 'HU', 'Estados Unidos': '', 'Liège, Belgique': '', 'mondo': 'CA', 'Estado Trujillo Venezuela.': 'VE', 'QRO': 'FR', 'Vienna, Austria': 'AT', 'Rome, Italy': 'IT', 'COMO 🇲🇽 NO HAY DOS. DF.': 'AR', 'l': 'AD', 'Somewhere off Sevastopol coast': 'AU', 'Somewhere in the world': 'AE', 'Piossasco': 'IT', 'in giro': 'AE', "Sous l'escalier, France": 'FR', 'Bogota': 'CO', 'Helsinki Finland': 'FI', 'France Occitanie': 'FR', 'Burkina Faso': 'BF', 'Schagen The Netherlands': 'NL', 'Wano': 'CL', 'Durango, México': 'MX', 'Tierra Perdi