In [1]:
import numpy as np
import pandas as pd
import pycountry as pc
import ast
import re

In [21]:
# CSV of country names (English plus native spellings) with their alpha-2 codes.
translation_file = './countries-translation/global_names.csv'

# Only the empty string is treated as missing: pandas' default NA markers
# include the literal "NA", which also appears as an alpha-2 code in this data.
translation_df = pd.read_csv(
    translation_file,
    na_values=[''],
    keep_default_na=False,
)
translation_df.head()


Unnamed: 0.1,Unnamed: 0,continent,english_name,native_names,alpha_2
0,0,europe,albania,"['shqipëria', 'republika e shqipërisë', 'repub...",AL
1,1,europe,andorra,"['andorra', ""principat d'andorra"", 'principali...",AD
2,2,europe,austria,"['österreich', 'republik österreich', 'republi...",AT
3,3,europe,azerbaijan,"['azərbaycan', 'азәрбајҹан', 'آذربايجان', 'azə...",AZ
4,4,europe,belarus,"['biełaruśбеларусь', 'respublika biełaruś', 'р...",BY


In [22]:
# Rows without an alpha-2 code; CountryLookup filters these out when it builds its dictionary.
translation_df[translation_df['alpha_2'].isna()]

Unnamed: 0.1,Unnamed: 0,continent,english_name,native_names,alpha_2
17,17,europe,alsace,"['alsace', 'elsàss', 'elsass / elsaß']",
19,19,europe,languedoc-roussillon,"['languedoc-roussillon', 'lengadòc-rosselhon',...",


In [4]:
# `native_names` holds the string repr of a Python list; parse the first row's value.
ast.literal_eval(translation_df['native_names'].iloc[0])

['shqipëria', 'republika e shqipërisë', 'republic of albania']

In [48]:
class CountryLookup:
    """Resolve free-text location strings to alpha-2 country codes.

    The lookup table is built from a translation CSV (columns
    ``english_name``, ``native_names`` and ``alpha_2``) and extended with the
    ``name`` and, when present, ``official_name`` of the pycountry record
    found for each alpha-2 code.

    Attributes:
        translation_dict: dict mapping a normalised (lower-cased,
            whitespace-collapsed) country name to its alpha-2 code. When the
            same name occurs for several countries, the last one processed
            wins.
    """

    def __init__(self, translation_file):
        """Build the name -> alpha-2 dictionary.

        Args:
            translation_file: path (or anything ``pd.read_csv`` accepts) of
                the translation CSV. Rows with an empty ``alpha_2`` cell are
                ignored.

        Raises:
            LookupError: propagated from ``pc.countries.lookup`` when an
                alpha-2 code in the CSV is unknown to pycountry.
        """
        # Keep the literal "NA" as a value instead of letting pandas read it
        # as missing; only empty cells count as NaN.
        translation_df = pd.read_csv(translation_file, keep_default_na=False, na_values=[''])
        clean_df = translation_df[translation_df['alpha_2'].notna()]
        translation_dict = {}

        for _, row in clean_df.iterrows():
            alpha_2 = row['alpha_2']
            names = [row['english_name']]

            # `native_names` is the string repr of a list; an empty cell is
            # read as NaN and simply contributes no names.
            raw_native = row['native_names']
            if isinstance(raw_native, str):
                names.extend(ast.literal_eval(raw_native))

            pc_record = pc.countries.lookup(alpha_2)
            # The lower-cased alpha-2 code itself is deliberately NOT added as a key.
            names.append(pc_record.name)
            # Not every pycountry record carries an official name.
            official_name = getattr(pc_record, 'official_name', None)
            if official_name is not None:
                names.append(official_name)

            # Store keys in the same normalised form that __getitem__ queries
            # with, otherwise mixed-case or oddly spaced names are unreachable.
            for name in names:
                normalized = self._normalize(name)
                if normalized:
                    translation_dict[normalized] = alpha_2

        self.translation_dict = translation_dict

    @staticmethod
    def _normalize(text):
        """Lower-case `text` and collapse whitespace runs; non-strings give ''."""
        if not isinstance(text, str):
            return ''
        return ' '.join(text.lower().split())

    def __getitem__(self, key):
        """Return the alpha-2 code for the first recognisable name in `key`.

        Candidates are tried in this order: the whole string, then, left to
        right, each segment delimited by ``?``, ``.`` or ``,`` followed by
        that segment's hyphen-separated pieces. Trying the longer form first
        lets names that themselves contain a separator (e.g. hyphenated or
        "x, republic of" style names) match before one of their fragments.

        Args:
            key: free-text location string.

        Returns:
            The alpha-2 code as a string, or None when nothing matches or
            `key` is not a string (e.g. NaN from a DataFrame column).
        """
        if not isinstance(key, str):
            return None

        candidates = [key]
        for segment in re.split(r'[?.,]', key):
            candidates.append(segment)
            candidates.extend(segment.split('-'))

        for candidate in candidates:
            normalized = self._normalize(candidate)
            if not normalized:
                continue
            code = self.translation_dict.get(normalized)
            if code is not None:
                return code

        return None

In [49]:
# Build the lookup from the translation CSV and inspect the resulting name -> alpha-2 mapping.
lookup = CountryLookup(translation_file)
lookup.translation_dict

{'albania': 'AL',
 'shqipëria': 'AL',
 'republika e shqipërisë': 'AL',
 'republic of albania': 'AL',
 'andorra': 'AD',
 "principat d'andorra": 'AD',
 'principality of andorra': 'AD',
 'austria': 'AT',
 'österreich': 'AT',
 'republik österreich': 'AT',
 'republic of austria': 'AT',
 'azerbaijan': 'AZ',
 'azərbaycan': 'AZ',
 'азәрбајҹан': 'AZ',
 'آذربايجان': 'AZ',
 'azərbaycan respublikası': 'AZ',
 'азәрбајҹан республикаси': 'AZ',
 'azerbaijani republic': 'AZ',
 'republic of azerbaijan': 'AZ',
 'belarus': 'BY',
 'biełaruśбеларусь': 'BY',
 'respublika biełaruś': 'BY',
 'рэспублика беларусь': 'BY',
 'republic of belarus': 'BY',
 'belgium': 'BE',
 'belgië': 'BE',
 'belgique': 'BE',
 'belgien': 'BE',
 'koninkrijk belgië': 'BE',
 'royaume de belgique': 'BE',
 'königreich belgien': 'BE',
 'kingdom of belgium': 'BE',
 'bosnia and herzegovina': 'BA',
 'bosna i hercegovina': 'BA',
 'босна и херцеговина': 'BA',
 'republic of bosnia and herzegovina': 'BA',
 'bulgaria': 'BG',
 'българия': 'BG',
 'ре

In [51]:
# Spot-check a single plain country name.
lookup['pakistan']

'PK'

In [11]:
# Free-text location strings used to exercise the lookup: a mix of
# "city, country", US "city, state", bare countries, and non-location noise.
samples = [
    'Chatburn, England. ',
    ' ✈',
    'Namibia, Windhoek',
    'England',
    'Jacksonville, FL',
    'Destroying autotune rap music',
    'Philadelphia, PA',
    'These are personal comments.',
    'm ( a r t ) a',
    'Straight outta Comte ',
    'Ile-de-France, France',
    'Pie Barm',
    'Baltimore, Maryland',
    'SW Michigan',
    'Ikorudu, Nigeria',
    'In the Eye of the Storm',
    'Pietermaritzburg, South Africa',
    'Lehigh Valley PA',
    'Los Angeles, CA',
    'Mansehra/Islamabad, Pakistan',
    'Germany',
    'vmin’s pockets',
    'Uk',
    'Udupi | Mumbai | Dubai | ',
    'Brussels',
    'Shenandoah Valley, Virginia',
    'Australia',
    'Perth, Western Australia',
    'New Jersey',
    'L.A. - MANILA',
    'Tennessee',
    'London',
    'Gwangju, Korea',
    'United States',
    'England, United Kingdom',
    'New Delhi, India',
    'London, England',
    'Cramlington, England',
    'Karachi, Pakistan',
    'London, UK',
]

In [12]:
# Show each sample next to the code the lookup resolves it to (None = no match).
# NOTE(review): in the recorded output 'Gwangju, Korea' resolves to KP, and
# 'Uk' / 'London, UK' resolve to None -- worth checking the translation data.
for location in samples:
    print(location, lookup[location])

Chatburn, England.  GB
 ✈ None
Namibia, Windhoek NA
England GB
Jacksonville, FL None
Destroying autotune rap music None
Philadelphia, PA None
These are personal comments. None
m ( a r t ) a None
Straight outta Comte  None
Ile-de-France, France FR
Pie Barm None
Baltimore, Maryland None
SW Michigan None
Ikorudu, Nigeria NG
In the Eye of the Storm None
Pietermaritzburg, South Africa ZA
Lehigh Valley PA None
Los Angeles, CA None
Mansehra/Islamabad, Pakistan PK
Germany DE
vmin’s pockets None
Uk None
Udupi | Mumbai | Dubai |  None
Brussels None
Shenandoah Valley, Virginia None
Australia AU
Perth, Western Australia None
New Jersey None
L.A. - MANILA None
Tennessee None
London None
Gwangju, Korea KP
United States US
England, United Kingdom GB
New Delhi, India IN
London, England GB
Cramlington, England GB
Karachi, Pakistan PK
London, UK None
