# Cleaning WRI pre-processed file

This notebook reads the pre-processed WRI dataset from local data folder and performs the cleaning of country and company's names. The final result is saved locally and in the WRI S3 bucket.

In [3]:
import pandas as pd
import os

from financial_entity_cleaner.location import CountryCleaner
from financial_entity_cleaner.company import CompanyNameCleaner
from financial_entity_cleaner.id import BankingIdCleaner
pd.set_option('display.max_columns',50)
from dotenv import dotenv_values, load_dotenv

In [4]:
env_var = dotenv_values('../../.env')

In [6]:
import boto3
s3_resource = boto3.resource(
    service_name="s3",
    endpoint_url=env_var['S3_ENDPOINT'],
    aws_access_key_id=env_var['S3_ACCESS_KEY'],
    aws_secret_access_key=env_var['S3_SECRET_KEY'],
)
bucket_name = env_var['S3_BUCKET']
bucket = s3_resource.Bucket(bucket_name)

## 1. Read WRI pre-processed file

In [11]:
wri = bucket.Object('WRI/pre_processed/wri_pre_processed.csv').get()['Body']
wri_df = pd.read_csv(wri, encoding='utf-8', delimiter=',', low_memory=False)

In [12]:
wri_df.head()

Unnamed: 0,company_name,country
0,SociÃ©te AlgÃ©rienne de Production de l\'Elect...,DZA
1,SociÃ©tÃ© AlgÃ©rienne de Production de lâ€™Ele...,DZA
2,Sonelgaz,DZA
3,Sonelgaz Production de lâ€™Electricite,DZA
4,Sharikat Kahraba Hadjret En-Nouss,DZA


## 2. Cleaning County information

In [13]:
# Create an object based on CountryCleaner() class
country_cleaner_obj=CountryCleaner()

In [14]:
country_cleaner_obj.letter_case='lower'
country_cleaner_obj.output_info = [CountryCleaner.ATTRIBUTE_ALPHA2,
                                   CountryCleaner.ATTRIBUTE_SHORT_NAME]

In [15]:
wri_df = country_cleaner_obj.clean_df(df=wri_df, cols=['country'], output_names_as= 'suffix')

Normalizing countries...100%|██████████████████████████████████████████████████| 10181/10181 [00:01<00:00, 5588.54it/s]


In [16]:
wri_df.drop('country', axis=1, inplace=True)

In [17]:
wri_df.head()

Unnamed: 0,company_name,country_short_name,country_alpha2
0,SociÃ©te AlgÃ©rienne de Production de l\'Elect...,algeria,dz
1,SociÃ©tÃ© AlgÃ©rienne de Production de lâ€™Ele...,algeria,dz
2,Sonelgaz,algeria,dz
3,Sonelgaz Production de lâ€™Electricite,algeria,dz
4,Sharikat Kahraba Hadjret En-Nouss,algeria,dz


## 3. Cleaning Company's names

In [18]:
# Create a CompanyNameCleaner object
company_cleaner_obj = CompanyNameCleaner()

In [19]:
# Set cleaner properties
company_cleaner_obj.normalize_legal_terms = True
company_cleaner_obj.letter_case="lower"
company_cleaner_obj.remove_accents = True
company_cleaner_obj.default_cleaning_rules = ['place_word_the_at_the_beginning',
                                                'remove_words_in_asterisk', 
                                                'remove_words_in_parentheses',
                                                'remove_question_marks_in_parentheses', 
                                                'replace_hyphen_by_space', 
                                                'replace_underscore_by_space', 
                                                'remove_text_puctuation_except_dot', 
                                                'remove_math_symbols', 
                                                'remove_parentheses', 
                                                'remove_brackets', 
                                                'remove_curly_brackets', 
                                                'remove_single_quote_next_character', 
                                                'remove_double_quote', 
                                                'enforce_single_space_between_words']
company_cleaner_obj.post_cleaning_rules = ['remove_all_punctuation']

In [20]:
wri_df = company_cleaner_obj.clean_df(wri_df, 'company_name', 'company_name_clean', 'country_alpha2', 'True')

Cleaning company name...100%|██████████████████████████████████████████████████| 92/92 [00:03<00:00, 30.66it/s]


In [21]:
wri_df.reset_index(inplace=True)
wri_df = wri_df.rename(columns = {'index':'company_id'})

In [22]:
wri_df.head()

Unnamed: 0,company_id,company_name,country_short_name,country_alpha2,company_name_clean
0,0,SociÃ©te AlgÃ©rienne de Production de l\'Elect...,algeria,dz,socia c te alga c rienne de production de l c
1,1,SociÃ©tÃ© AlgÃ©rienne de Production de lâ€™Ele...,algeria,dz,socia c ta c alga c rienne de production de la...
2,2,Sonelgaz,algeria,dz,sonelgaz
3,3,Sonelgaz Production de lâ€™Electricite,algeria,dz,sonelgaz production de laEUR tm electricite
4,4,Sharikat Kahraba Hadjret En-Nouss,algeria,dz,sharikat kahraba hadjret en nouss


In [23]:
print('Total companies in WRI {}'.format(wri_df.shape[0]))

Total companies in WRI 10181


## 4. Save cleaned WRI

In [24]:
# Save locally to "data" folder
saved_path = "../../../dataset/cleaned/"
filename = "wri_cleaned.csv"
wri_filename = os.path.join(saved_path, filename)
wri_df.to_csv(wri_filename, header=True, index=False)

In [25]:
# Save final results to S3
s3_filename = 'WRI/cleaned/wri_cleaned.csv'
s3_resource.meta.client.upload_file(Filename=wri_filename, Bucket=bucket_name, Key=s3_filename)