In [1]:
import numpy as np
import pandas as pd
import json
import glob
import pycountry
from html.parser import HTMLParser
from bs4 import BeautifulSoup
import re

## parse html tables to dataframe

countries names source:
https://www.omniglot.com/countries/

In [2]:
def clean_fun(s):
    
    # remove text in (brackets)
    s = re.sub('\(.*\)', '', s)
    
    # remove multiple and trailing whitespaces
    s = ' '.join(s.split())
    
    # remove [] chars
    for bracket in '[]':
        s = s.replace(bracket, '')
        
    # to lower case
    s = s.lower()
    
    return s

def parse_country(soup, clean_fun=clean_fun):
    english_name, native_names = soup.find_all('td')
    english_name = english_name.get_text().lower()
    native_names = native_names.get_text().lower().split('\n')
    
    if clean_fun is not None:
        english_name = clean_fun(english_name)
        native_names = [clean_fun(n) for n in native_names]
        native_names = [n for n in native_names if n != '']
    
    return english_name, native_names

def country_mapping_from_html(html_file, continent):
    # parse html file, create dataframe with english name, native names and alpha 2 code
    
    with open(html_file, 'r') as file:
        html_string = file.read()

    soup = BeautifulSoup(html_string, 'html.parser')
    countries = soup.find_all('tr')
    
    records = []
    for c in countries:
        english_name, native_names = parse_country(c)
        
        alpha_2 = None
        
        for pattern in [english_name] + native_names:
            try:
                country_data = pycountry.countries.search_fuzzy(pattern)
                alpha_2 = country_data[0].alpha_2
            
            except LookupError:
                pass
            
            if alpha_2 is not None:
                break
        
        record = {
            'continent': continent,
            'english_name': english_name,
            'native_names': native_names,
            'alpha_2': alpha_2
        }
        
        records.append(record)
            
    return pd.DataFrame(records)

In [3]:
%%time
html_files = glob.glob('./countries-translation/*.html')
continents = [file.split('/')[-1].split('.')[0] for file in html_files]

continent_mappings = [country_mapping_from_html(f, c) for f, c in zip(html_files, continents)]

CPU times: user 14.6 s, sys: 12.7 ms, total: 14.6 s
Wall time: 14.6 s


In [4]:
global_mapping = pd.concat(continent_mappings, ignore_index=True)
len(global_mapping)

234

In [6]:
global_mapping[global_mapping['alpha_2'].isnull().values]

Unnamed: 0,continent,english_name,native_names,alpha_2
17,europe,alsace,"[alsace, elsàss, elsass / elsaß]",
19,europe,languedoc-roussillon,"[languedoc-roussillon, lengadòc-rosselhon, lle...",
119,asia,burma / myanmar,"[မြန်မာ, ဗမာ, ပြည်ထောင်စု သမ္မတ မြန်မာနိုင်ငံတ...",
123,asia,macau,"[澳門, (ngoumún,, àomén), macau, 澳門特別行政區, (ngoum...",


In [11]:
pycountry.countries.search_fuzzy('myanmar')

[Country(alpha_2='MM', alpha_3='MMR', name='Myanmar', numeric='104', official_name='Republic of Myanmar')]

In [12]:
pycountry.countries.search_fuzzy('macao')

[Country(alpha_2='MO', alpha_3='MAC', name='Macao', numeric='446', official_name='Macao Special Administrative Region of China'),
 Country(alpha_2='CN', alpha_3='CHN', name='China', numeric='156', official_name="People's Republic of China")]

In [15]:
global_mapping.iloc[119]['alpha_2'] = 'MM'
global_mapping.iloc[123]['alpha_2'] = 'MO'

In [19]:
global_mapping.to_csv('./countries-translation/global_names.csv')