# Cleaning WIKI pre-processed file

This notebook reads the pre-processed WIKI dataset from the local data folder and cleans the country, LEI and company name columns. The final result is saved locally and in the WIKI S3 bucket.

In [1]:
import pandas as pd
import os

In [2]:
# Import CountryCleaner
from financial_entity_cleaner.location import CountryCleaner

In [3]:
# Import the module for cleaning company's name
from financial_entity_cleaner.company import CompanyNameCleaner

In [4]:
# Import BankingIdCleaner
from financial_entity_cleaner.id import BankingIdCleaner

## 1. Read WIKI pre-processed file

In [5]:
# Location of the pre-processed WIKI file inside the local "data" folder.
# The path is assembled with os.path.join so it contains no backslash escape
# sequences and resolves with the correct separator on every operating system.
saved_path = os.path.join("..", "..", "data", "pre_processed")
filename = "wiki_pre_processed.csv"

In [6]:
# Load the pre-processed WIKI dataset into a DataFrame
wiki_input_file = os.path.join(saved_path, filename)
wiki_df = pd.read_csv(wiki_input_file)

In [7]:
# Preview the first five rows of the raw input
wiki_df.head(n=5)

Unnamed: 0,short_name,aliases,company_name,country,lei
0,Boeing,"['The Boeing Company', 'Boeing Company']",THE BOEING COMPANY,United States of America,RVHJWBXLJ1RFUBSY1F30
1,Airbus,['Airbus Commercial Aircraft'],Airbus SE,France,529900FCMZ4LKXFD0R69
2,Google,"['Google Inc.', 'Google LLC']",Google LLC,United States of America,7ZW8QJWVPR4P1J1KQY45
3,Intel,"['Intel Corporation', 'N M Electronics', 'Inte...",Intel Corporation,United States of America,KNX4USFCNGPY45LOCE31
4,Apple Inc.,"['Apple Computer, Inc.', 'Apple Computer Inc',...",Apple Inc.,United States of America,HWUPKR0MPOU8FGXBT394


## 2. Cleaning Country information

In [8]:
# Instantiate the country cleaner used for the 'country' column
country_cleaner_obj = CountryCleaner()

In [9]:
# Configure the country cleaner: lower-case output, and only the alpha2 code
# and short name attributes are requested as output information.
country_cleaner_obj.letter_case = 'lower'
requested_country_info = [
    CountryCleaner.ATTRIBUTE_ALPHA2,
    CountryCleaner.ATTRIBUTE_SHORT_NAME,
]
country_cleaner_obj.output_info = requested_country_info

In [10]:
# Run the country cleaner over the 'country' column. With output_names_as='suffix'
# the displayed result gains the columns country_short_name and country_alpha2.
wiki_df = country_cleaner_obj.clean_df(
    df=wiki_df,
    cols=['country'],
    output_names_as='suffix',
)

Cleaning column [country]:  85%|██████████████████████████████████████████▎       | 168133/198509 [11:25<02:34, 197.02it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

Cleaning column [country]:  92%|██████████████████████████████████████████████    | 182705/198509 [12:16<00:59, 264.99it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

Cleaning column [country]:  94%|███████████████████████████████████████████████   | 186771/198509 [12:32

In [15]:
# Preview the first five rows after cleaning the country column
wiki_df.head(n=5)

Unnamed: 0,short_name,aliases,company_name,country,lei,country_short_name,country_alpha2
0,Boeing,"['The Boeing Company', 'Boeing Company']",THE BOEING COMPANY,United States of America,RVHJWBXLJ1RFUBSY1F30,united states,us
1,Airbus,['Airbus Commercial Aircraft'],Airbus SE,France,529900FCMZ4LKXFD0R69,france,fr
2,Google,"['Google Inc.', 'Google LLC']",Google LLC,United States of America,7ZW8QJWVPR4P1J1KQY45,united states,us
3,Intel,"['Intel Corporation', 'N M Electronics', 'Inte...",Intel Corporation,United States of America,KNX4USFCNGPY45LOCE31,united states,us
4,Apple Inc.,"['Apple Computer, Inc.', 'Apple Computer Inc',...",Apple Inc.,United States of America,HWUPKR0MPOU8FGXBT394,united states,us


## 3. Cleaning IDs

In [13]:
# Instantiate the banking-ID cleaner used for the 'lei' column
id_cleaner_obj = BankingIdCleaner()

In [14]:
# Setup cleaning properties
# NOTE(review): the two flags below are read from their names only - presumably
# invalid IDs become NaN and the validation result is emitted as a categorical
# flag (the displayed lei_isvalid_id column holds 1.0); confirm against the library.
id_cleaner_obj.invalid_ids_as_nan = True
id_cleaner_obj.validation_as_categorical = True
# The letter case used to be assigned the quoted text
# "BankingIdCleaner.UPPER_LETTER_CASE", i.e. a string naming a constant rather
# than a letter-case value. Use a plain letter-case value instead, mirroring the
# 'lower' value given to the country and company cleaners in this notebook.
# NOTE(review): assumes "upper" is the accepted value for upper case - confirm
# against the BankingIdCleaner documentation.
id_cleaner_obj.output_lettercase = "upper"

In [16]:
# Validate and clean the 'lei' column as LEI identifiers. In the displayed result
# the original 'lei' column is replaced by lei_cleaned_id and lei_isvalid_id.
wiki_df = id_cleaner_obj.clean_df(
    wiki_df,
    cols=['lei'],
    remove_cols=True,
    output_names_as='suffix',
    types=['lei'],
)

Column [lei] Type [lei] : 100%|██████████████████████████████████████████████████| 198509/198509 [11:41<00:00, 283.01it/s]


In [25]:
# Preview the first five rows after cleaning the LEI column
wiki_df.head(n=5)

Unnamed: 0,short_name,aliases,company_name,country,country_short_name,country_alpha2,lei_cleaned_id,lei_isvalid_id,company_name_clean
0,Boeing,"['The Boeing Company', 'Boeing Company']",THE BOEING COMPANY,United States of America,united states,us,RVHJWBXLJ1RFUBSY1F30,1.0,the boeing company
1,Airbus,['Airbus Commercial Aircraft'],Airbus SE,France,france,fr,529900FCMZ4LKXFD0R69,1.0,airbus se
2,Google,"['Google Inc.', 'Google LLC']",Google LLC,United States of America,united states,us,7ZW8QJWVPR4P1J1KQY45,1.0,google limited liability company
3,Intel,"['Intel Corporation', 'N M Electronics', 'Inte...",Intel Corporation,United States of America,united states,us,KNX4USFCNGPY45LOCE31,1.0,intel corporation
4,Apple Inc.,"['Apple Computer, Inc.', 'Apple Computer Inc',...",Apple Inc.,United States of America,united states,us,HWUPKR0MPOU8FGXBT394,1.0,apple incorporated


## 5. Cleaning Company's names

In [26]:
# Instantiate the cleaner used for the 'company_name' column
company_cleaner_obj = CompanyNameCleaner()

In [27]:
# Cleaning rules handed to the company-name cleaner, in the order listed.
# The rule names are passed through verbatim (including the spelling
# 'puctuation'); presumably the library looks them up by name, so do not
# "correct" them without checking the library's rule catalogue.
company_cleaning_rules = [
    'place_word_the_at_the_beginning',
    'remove_words_in_asterisk',
    'remove_words_in_parentheses',
    'remove_question_marks_in_parentheses',
    'replace_hyphen_by_space',
    'replace_underscore_by_space',
    'remove_text_puctuation_except_dot',
    'remove_math_symbols',
    'remove_parentheses',
    'remove_brackets',
    'remove_curly_brackets',
    'remove_single_quote_next_character',
    'remove_double_quote',
    'enforce_single_space_between_words',
]

# Cleaner configuration: normalized legal terms, lower-case output, no accents
company_cleaner_obj.normalize_legal_terms = True
company_cleaner_obj.letter_case = "lower"
company_cleaner_obj.remove_accents = True
company_cleaner_obj.default_cleaning_rules = company_cleaning_rules

In [24]:
# Clean 'company_name' into the new column 'company_name_clean', passing the
# cleaned 'country_alpha2' column as the country attribute.
# The last positional argument is now the boolean True; it used to be the string
# 'True', which is truthy only by accident (the string 'False' would be truthy too).
# NOTE(review): the meaning of this flag is not visible in this notebook - confirm
# against the CompanyNameCleaner.clean_df signature.
wiki_df = company_cleaner_obj.clean_df(wiki_df, 'company_name', 'company_name_clean', 'country_alpha2', True)

In [34]:
# Turn the row index into a regular column and expose it as 'company_id'
wiki_df = wiki_df.reset_index().rename(columns={'index': 'company_id'})

In [35]:
# Preview the first five rows of the fully cleaned dataset
wiki_df.head(n=5)

Unnamed: 0,company_id,short_name,aliases,company_name,country,country_short_name,country_alpha2,lei_cleaned_id,lei_isvalid_id,company_name_clean
0,0,Boeing,"['The Boeing Company', 'Boeing Company']",THE BOEING COMPANY,United States of America,united states,us,RVHJWBXLJ1RFUBSY1F30,1.0,the boeing company
1,1,Airbus,['Airbus Commercial Aircraft'],Airbus SE,France,france,fr,529900FCMZ4LKXFD0R69,1.0,airbus se
2,2,Google,"['Google Inc.', 'Google LLC']",Google LLC,United States of America,united states,us,7ZW8QJWVPR4P1J1KQY45,1.0,google limited liability company
3,3,Intel,"['Intel Corporation', 'N M Electronics', 'Inte...",Intel Corporation,United States of America,united states,us,KNX4USFCNGPY45LOCE31,1.0,intel corporation
4,4,Apple Inc.,"['Apple Computer, Inc.', 'Apple Computer Inc',...",Apple Inc.,United States of America,united states,us,HWUPKR0MPOU8FGXBT394,1.0,apple incorporated


In [36]:
# Report how many rows (companies) the cleaned dataset holds
company_count = len(wiki_df.index)
print('Total companies in WIKI {}'.format(company_count))

Total companies in WIKI 198509


## 6. Save cleaned WIKI

In [37]:
# Save the cleaned dataset locally under the "data" folder.
# The path is assembled with os.path.join so it contains no backslash escape
# sequences and resolves with the correct separator on every operating system.
saved_path = os.path.join("..", "..", "data", "cleaned")
filename = "wiki_cleaned.csv"
wiki_filename = os.path.join(saved_path, filename)

# Create the target folder on first use so to_csv does not fail on a fresh checkout
os.makedirs(saved_path, exist_ok=True)

# Write the header row but not the DataFrame index
wiki_df.to_csv(wiki_filename, header=True, index=False)

In [38]:
# Credentials setup:
#   1. On the AWS Account page, open "Command Line or Programmatic Access" for the
#      appropriate role and copy the export scripts.
#   2. Paste the copied text into a file named credentials.env.
# The file is looked up in the directory named by CREDENTIAL_DOTENV_DIR, falling
# back to PWD and finally to '/opt/app-root/src'.

from dotenv import dotenv_values, load_dotenv
import os
import pathlib

dotenv_dir = os.environ.get(
    'CREDENTIAL_DOTENV_DIR',
    os.environ.get('PWD', '/opt/app-root/src'),
)
dotenv_path = pathlib.Path(dotenv_dir).joinpath('credentials.env')

# Only load the file when it exists; values from the file override any
# variables already present in the environment.
if os.path.exists(dotenv_path):
    load_dotenv(dotenv_path=dotenv_path, override=True)

In [39]:
import boto3

# Connection settings for the landing S3 endpoint, all read from the environment
s3_connection_settings = {
    "service_name": "s3",
    "endpoint_url": os.environ['S3_LANDING_ENDPOINT'],
    "aws_access_key_id": os.environ['S3_LANDING_ACCESS_KEY'],
    "aws_secret_access_key": os.environ['S3_LANDING_SECRET_KEY'],
}
s3_resource = boto3.resource(**s3_connection_settings)

# Handle to the landing bucket named in the environment
bucket_name = os.environ['S3_LANDING_BUCKET']
bucket = s3_resource.Bucket(bucket_name)

In [40]:
# Upload the locally saved cleaned file to the landing bucket
s3_filename = 'WIKI/cleaned/wiki_cleaned.csv'
s3_client = s3_resource.meta.client
s3_client.upload_file(
    Filename=wiki_filename,
    Bucket=bucket_name,
    Key=s3_filename,
)