# How to...normalize country information

This notebook shows how to use the `financial_entity_cleaner` API to normalize information about countries.

In [1]:
# Make the API importable from this notebook by adding the directory two levels
# up to the module search path (presumably the repository root - verify).
# NOTE(review): the relative path is resolved against the kernel's working directory.
import sys
sys.path.append('../../')

In [2]:
# Import the module for normalizing country information
from financial_entity_cleaner.country_cleaner import country

## 1. Basic usage

The API returns the full information about a country (alpha2 code, alpha3 code and name) when any one of these values is passed to the get_country_info() method. The result is a dictionary; in the outputs recorded in this notebook, a country that is not found yields a dictionary whose values are all NaN rather than None.

In [3]:
# Instantiate the cleaner object that the following cells reuse
country_cleaner_obj = country.CountryCleaner()

In [4]:
# Good examples of country information that can be found in data sources:
# one alpha2 code, one alpha3 code and one country name.
# The name is padded with spaces; the recorded output further down shows
# that it still resolves to 'china'.
country_alpha2 = 'PT'
country_alpha3 = 'BRA'
country_name = ' China '

In [5]:
# Get the complete information about a country given an alpha2 code
country_info_dict = country_cleaner_obj.get_country_info(country_alpha2)
print('Complete country information: {}'.format(country_info_dict))

Complete country information: {'country_name_clean': 'portugal', 'country_alpha2_clean': 'pt', 'country_alpha3_clean': 'prt'}


In [6]:
# Get the complete information about a country given an alpha3 code
country_info_dict = country_cleaner_obj.get_country_info(country_alpha3)
print('Complete country information: {}'.format(country_info_dict))

Complete country information: {'country_name_clean': 'brazil', 'country_alpha2_clean': 'br', 'country_alpha3_clean': 'bra'}


In [7]:
# Get the complete information about a country given its name
country_info_dict = country_cleaner_obj.get_country_info(country_name)
print('Complete country information: {}'.format(country_info_dict))

Complete country information: {'country_name_clean': 'china', 'country_alpha2_clean': 'cn', 'country_alpha3_clean': 'chn'}


## 2. Some bad examples (country not found)

The API performs an exact match between the value passed as a parameter and its internal dictionary of country information. Therefore, if the value contains unexpected characters or is not the name or code of a country, the country is not found; in the outputs recorded below, the API then returns a dictionary whose values are all NaN.

In [8]:
# Bad examples of country information that can be found in data sources:
# a numeric string, a string containing a symbol, and a name that merely
# contains a country name. None of them is resolved in the cells below.
country_alpha2 = '123'
country_alpha3 = '%fff'
country_name = ' Chinatown '

In [9]:
# Look up the invalid alpha2 value; the recorded result is a dictionary of NaN values
print(country_cleaner_obj.get_country_info(country_alpha2))

{'country_name_clean': nan, 'country_alpha2_clean': nan, 'country_alpha3_clean': nan}


In [10]:
# Look up the invalid alpha3 value; the recorded result is a dictionary of NaN values
print(country_cleaner_obj.get_country_info(country_alpha3))

{'country_name_clean': nan, 'country_alpha2_clean': nan, 'country_alpha3_clean': nan}


In [11]:
# Look up the invalid country name; the recorded result is a dictionary of NaN values
print(country_cleaner_obj.get_country_info(country_name))

{'country_name_clean': nan, 'country_alpha2_clean': nan, 'country_alpha3_clean': nan}


## 3. Customizing the output letter case

The output_lettercase attribute of the cleaner object indicates whether the result must be in lower, upper or title case. By default, the API uses output_lettercase='lower'.

In [12]:
# Good examples of country information that can be found in data sources
# NOTE(review): country_alpha3 is not passed to the cleaner by any of the
# following cells shown in this notebook - only country_name and country_alpha2 are.
country_alpha2 = 'no'
country_alpha3 = 'rus'
country_name = 'United kingdom'

In [13]:
# Request upper-case results
# NOTE(review): the output recorded for the next cell is still lower case, so
# either that output is stale or `output_lettercase` is not the attribute the
# cleaner reads - confirm against CountryCleaner and re-run the notebook.
country_cleaner_obj.output_lettercase='upper'

In [14]:
# Get the complete information about a country given its name
country_info_dict = country_cleaner_obj.get_country_info(country_name)
print('Complete country information: {}'.format(country_info_dict))

Complete country information: {'country_name_clean': 'united kingdom of great britain and northern ireland', 'country_alpha2_clean': 'gb', 'country_alpha3_clean': 'gbr'}


In [15]:
# Request title-case results
# NOTE(review): the output recorded for the next cell is still lower case, so
# either that output is stale or `output_lettercase` is not the attribute the
# cleaner reads - confirm against CountryCleaner and re-run the notebook.
country_cleaner_obj.output_lettercase='title'

In [16]:
# Get the complete information about a country given an alpha2 code
country_info_dict = country_cleaner_obj.get_country_info(country_alpha2)
print('Complete country information: {}'.format(country_info_dict))

Complete country information: {'country_name_clean': 'norway', 'country_alpha2_clean': 'no', 'country_alpha3_clean': 'nor'}


## 4. Cleaning a dataframe

In [17]:
import pandas as pd

In [18]:
# Path of the sample CSV, relative to this notebook's working directory
input_filename = '../../tests/data/test_cleaner_country.csv'

In [19]:
# Load the sample CSV into a dataframe (comma-separated, UTF-8 encoded)
df_original = pd.read_csv(
    input_filename,
    sep=',',
    encoding='utf-8',
)

In [20]:
# Display the input data as the cell's last expression
df_original

Unnamed: 0,COUNTRY,ALPHA2,ALPHA3,NAME
0,NC,nc,ncl,new caledonia
1,Fr,fr,fra,france
2,CA,ca,can,canada
3,IT,it,ita,italy
4,es,es,esp,spain
5,ZZ,,,
6,XZ,,,
7,usa,us,usa,united states of america
8,Arg,ar,arg,argentina
9,AUS,au,aus,australia


In [21]:
# Switch the letter case back to 'lower' before cleaning the dataframe
country_cleaner_obj.output_lettercase = 'lower'

In [22]:
# Clean the 'COUNTRY' column of the dataframe. In the recorded output the result
# carries three extra columns: country_name_clean, country_alpha2_clean and
# country_alpha3_clean.
df_cleaner = country_cleaner_obj.apply_cleaner_to_df(df_original, 'COUNTRY')

In [23]:
# Display the cleaned dataframe as the cell's last expression
df_cleaner

Unnamed: 0,COUNTRY,ALPHA2,ALPHA3,NAME,country_name_clean,country_alpha2_clean,country_alpha3_clean
0,NC,nc,ncl,new caledonia,new caledonia,nc,ncl
1,Fr,fr,fra,france,france,fr,fra
2,CA,ca,can,canada,canada,ca,can
3,IT,it,ita,italy,italy,it,ita
4,es,es,esp,spain,spain,es,esp
5,ZZ,,,,,,
6,XZ,,,,,,
7,usa,us,usa,united states of america,united states of america,us,usa
8,Arg,ar,arg,argentina,argentina,ar,arg
9,AUS,au,aus,australia,australia,au,aus


In [24]:
# Override the names of the three output columns; the recorded output of the
# next run of apply_cleaner_to_df uses these names instead of the *_clean ones.
country_cleaner_obj.country_name_output = 'CLEAN_COUNTRY_NAME'
country_cleaner_obj.country_alpha2_output = 'CLEAN_ALPHA2'
country_cleaner_obj.country_alpha3_output = 'CLEAN_ALPHA3'

In [25]:
# Clean the 'COUNTRY' column again with the custom output column names.
# Note that this rebinds df_cleaner, replacing the result of the earlier run.
df_cleaner = country_cleaner_obj.apply_cleaner_to_df(df_original, 'COUNTRY')

In [26]:
# Display the cleaned dataframe with the renamed output columns
df_cleaner

Unnamed: 0,COUNTRY,ALPHA2,ALPHA3,NAME,CLEAN_COUNTRY_NAME,CLEAN_ALPHA2,CLEAN_ALPHA3
0,NC,nc,ncl,new caledonia,new caledonia,nc,ncl
1,Fr,fr,fra,france,france,fr,fra
2,CA,ca,can,canada,canada,ca,can
3,IT,it,ita,italy,italy,it,ita
4,es,es,esp,spain,spain,es,esp
5,ZZ,,,,,,
6,XZ,,,,,,
7,usa,us,usa,united states of america,united states of america,us,usa
8,Arg,ar,arg,argentina,argentina,ar,arg
9,AUS,au,aus,australia,australia,au,aus


## 5. Cleaning a csv file with AutoCleaner

In [27]:
# Import the module that provides the AutoCleaner class
from financial_entity_cleaner.auto_cleaner import auto_cleaner

In [28]:
# Instantiate the AutoCleaner used to process the CSV file below
auto_cleaner_obj = auto_cleaner.AutoCleaner()

C:\Users\Patrycia\OneDrive\Projects\GitHub\os_climate\financial-entity-cleaner\financial_entity_cleaner\auto_cleaner\logs


In [29]:
# CSV file passed to AutoCleaner (same path as the one loaded in section 4)
input_filename = '../../tests/data/test_cleaner_country.csv'

In [30]:
# JSON file passed to AutoCleaner as the cleaning setup
# (presumably describes which columns to clean and how - verify against AutoCleaner)
setup_cleaning_filename = '../../tests/data/test_cleaner_country.json'

In [31]:
# Path passed to AutoCleaner as the output file (presumably where the cleaned CSV is written)
output_filename = '../../tests/data/test_cleaner_country_result.csv'

In [32]:
# Run the automatic cleaning with the input CSV, the JSON setup and the output path.
# The recorded run returned True (presumably a success flag - verify against AutoCleaner).
auto_cleaner_obj.clean_csv_file(input_filename, setup_cleaning_filename, output_filename)

True