# Cleaning CORPWATCH pre-processed file

This notebook reads the pre-processed CORPWATCH dataset from the CORPWATCH S3 bucket and cleans the country information, the LEI identifiers and the company names. The final result is saved locally and in the CORPWATCH S3 bucket.

In [1]:
import os

import boto3
import pandas as pd
from dotenv import dotenv_values, load_dotenv

from financial_entity_cleaner.company import CompanyNameCleaner
from financial_entity_cleaner.id import BankingIdCleaner
from financial_entity_cleaner.location import CountryCleaner

pd.set_option('display.max_columns',50)

In [2]:
# Parse the project-level .env file into a dictionary of settings
env_var = dotenv_values(dotenv_path='../../.env')

In [3]:
# Connect to the S3 bucket that stores the CORPWATCH files.
# Every connection setting is read from `env_var`; check them up front so that a
# missing entry is reported by name instead of surfacing as a bare KeyError.
required_settings = ('S3_ENDPOINT', 'S3_ACCESS_KEY', 'S3_SECRET_KEY', 'S3_BUCKET')
missing_settings = [name for name in required_settings if name not in env_var]
if missing_settings:
    raise KeyError('Missing S3 settings in .env file: {}'.format(', '.join(missing_settings)))

s3_resource = boto3.resource(
    service_name="s3",
    endpoint_url=env_var['S3_ENDPOINT'],
    aws_access_key_id=env_var['S3_ACCESS_KEY'],
    aws_secret_access_key=env_var['S3_SECRET_KEY'],
)
bucket_name = env_var['S3_BUCKET']
bucket = s3_resource.Bucket(bucket_name)

## 1. Read CORPWATCH pre-processed file

In [4]:
# Stream the pre-processed CSV from the bucket straight into a DataFrame
corpwatch_key = 'CorpWatch/pre_processed/corpwatch_pre_processed.csv'
corpwatch = bucket.Object(corpwatch_key).get()['Body']
# NOTE(review): pandas treats the text "NA" as missing by default, so a
# country_code of "NA" (Namibia) would load as NaN - confirm whether the file
# contains such rows and, if so, consider keep_default_na=False.
corpwatch_df = pd.read_csv(corpwatch, sep=',', encoding='utf-8', low_memory=False)

In [5]:
# Preview the first five rows of the freshly loaded frame
corpwatch_df.head(n=5)

Unnamed: 0,cw_id,cik,company_name,city,country_code,company_name_lei,lei
0,781188.0,13.0,CORPORATE INCOME FUND SEVENTY NINTH SHORT TERM...,NEW YORK,US,,
1,1.0,20.0,K TRON INTERNATIONAL INC,PITMAN,US,,
2,1249859.0,63.0,FNW BANCORP INC,ELGIN,US,,
3,2.0,1750.0,AAR CORP,WOOD DALE,US,Aar Corp,MP76T5YQX3YK5VVAQ802
4,3.0,1800.0,ABBOTT LABORATORIES,ABBOTT PARK,US,Abbott Laboratories,549300ZYY7524VF6JP88


## 2. Cleaning Country information

In [6]:
# Instantiate the cleaner used to normalize the country column
country_cleaner_obj = CountryCleaner()

In [7]:
# Configure the cleaner: 'lower' letter case and two output attributes (alpha2
# code and short name). The preview after cleaning shows lower-case
# country_code_alpha2 / country_code_short_name columns.
country_cleaner_obj.letter_case = 'lower'
country_cleaner_obj.output_info = [
    CountryCleaner.ATTRIBUTE_ALPHA2,
    CountryCleaner.ATTRIBUTE_SHORT_NAME,
]

In [8]:
# Normalize the 'country_code' column; new columns are named with a suffix.
# NOTE(review): the stored output of this cell ends in KeyboardInterrupt at 44%,
# so re-run it to completion before relying on the cells below.
corpwatch_df = country_cleaner_obj.clean_df(
    df=corpwatch_df,
    cols=['country_code'],
    output_names_as='suffix',
)

Normalizing countries... 44%|█████████████████████▉                            | 315884/719598 [00:56<01:12, 5598.46it/s]


KeyboardInterrupt: 

In [12]:
# Drop the raw code column now that the cleaned country_code_* columns exist.
# Reassigning instead of inplace=True avoids mutating a frame that an earlier
# cell already displayed.
corpwatch_df = corpwatch_df.drop(columns=['country_code'])

In [13]:
# Check that country_code was replaced by country_code_short_name / country_code_alpha2
corpwatch_df.head(n=5)

Unnamed: 0,cw_id,cik,company_name,city,company_name_lei,lei,country_code_short_name,country_code_alpha2
0,781188.0,13.0,CORPORATE INCOME FUND SEVENTY NINTH SHORT TERM...,NEW YORK,,,united states,us
1,1.0,20.0,K TRON INTERNATIONAL INC,PITMAN,,,united states,us
2,1249859.0,63.0,FNW BANCORP INC,ELGIN,,,united states,us
3,2.0,1750.0,AAR CORP,WOOD DALE,Aar Corp,MP76T5YQX3YK5VVAQ802,united states,us
4,3.0,1800.0,ABBOTT LABORATORIES,ABBOTT PARK,Abbott Laboratories,549300ZYY7524VF6JP88,united states,us


## 3. Cleaning LEI

In [15]:
# Instantiate the cleaner used to validate and normalize the LEI column
id_cleaner_obj = BankingIdCleaner()

In [17]:
# Setup cleaning properties
id_cleaner_obj.invalid_ids_as_nan = True
id_cleaner_obj.validation_as_categorical = True
# NOTE(review): this assigns the literal text "BankingIdCleaner.UPPER_LETTER_CASE",
# not the value of a class constant - presumably the unquoted constant (or a plain
# letter-case string) was intended; confirm against the BankingIdCleaner API.
id_cleaner_obj.output_lettercase = "BankingIdCleaner.UPPER_LETTER_CASE"

In [18]:
# Clean the 'lei' column as LEI identifiers; the source column is removed and the
# results land in suffixed columns (lei_cleaned_id, lei_isvalid_id).
# The stored run took roughly 39 minutes for 719,598 rows.
corpwatch_df = id_cleaner_obj.clean_df(
    corpwatch_df,
    cols=['lei'],
    types=['lei'],
    remove_cols=True,
    output_names_as='suffix',
)

Column [lei] Type [lei] : 100%|██████████████████████████████████████████████████| 719598/719598 [38:54<00:00, 308.19it/s]


In [21]:
# Drop the validation flag column and keep only the cleaned identifier.
# Reassigning instead of inplace=True avoids mutating a frame that an earlier
# cell already displayed.
corpwatch_df = corpwatch_df.drop(columns=['lei_isvalid_id'])

In [22]:
# Check that the lei column was replaced by lei_cleaned_id
corpwatch_df.head(n=5)

Unnamed: 0,cw_id,cik,company_name,city,company_name_lei,country_code_short_name,country_code_alpha2,lei_cleaned_id
0,781188.0,13.0,CORPORATE INCOME FUND SEVENTY NINTH SHORT TERM...,NEW YORK,,united states,us,
1,1.0,20.0,K TRON INTERNATIONAL INC,PITMAN,,united states,us,
2,1249859.0,63.0,FNW BANCORP INC,ELGIN,,united states,us,
3,2.0,1750.0,AAR CORP,WOOD DALE,Aar Corp,united states,us,MP76T5YQX3YK5VVAQ802
4,3.0,1800.0,ABBOTT LABORATORIES,ABBOTT PARK,Abbott Laboratories,united states,us,549300ZYY7524VF6JP88


## 4. Cleaning Company's names

In [23]:
# Instantiate the cleaner used to normalize company names
company_cleaner_obj = CompanyNameCleaner()

In [24]:
# Configure the company-name cleaner: legal-term normalization, lower-case
# output and accent removal, followed by the rule sets applied to each name.
company_cleaner_obj.normalize_legal_terms = True
company_cleaner_obj.letter_case = "lower"
company_cleaner_obj.remove_accents = True

# Rule names are passed to the cleaner verbatim as strings
default_rules = [
    'place_word_the_at_the_beginning',
    'remove_words_in_asterisk',
    'remove_words_in_parentheses',
    'remove_question_marks_in_parentheses',
    'replace_hyphen_by_space',
    'replace_underscore_by_space',
    'remove_text_puctuation_except_dot',
    'remove_math_symbols',
    'remove_parentheses',
    'remove_brackets',
    'remove_curly_brackets',
    'remove_single_quote_next_character',
    'remove_double_quote',
    'enforce_single_space_between_words',
]
post_rules = ['remove_all_punctuation']

company_cleaner_obj.default_cleaning_rules = default_rules
company_cleaner_obj.post_cleaning_rules = post_rules

In [25]:
# Clean 'company_name' into a new 'company_name_clean' column, passing the
# 'country_code_alpha2' column along with each name.
# NOTE(review): the last argument is the string 'True', not the boolean True -
# confirm which one clean_df expects.
corpwatch_df = company_cleaner_obj.clean_df(corpwatch_df, 'company_name', 'company_name_clean', 'country_code_alpha2', 'True')

In [26]:
# Clean 'company_name_lei' into a new 'company_name_lei_clean' column, passing
# the 'country_code_alpha2' column along with each name.
# NOTE(review): the last argument is the string 'True', not the boolean True -
# confirm which one clean_df expects.
corpwatch_df = company_cleaner_obj.clean_df(corpwatch_df, 'company_name_lei', 'company_name_lei_clean', 'country_code_alpha2', 'True')

In [42]:
# Turn the current row index into an explicit 'company_id' column.
# Chained reassignment replaces the in-place reset_index, so the frame shown by
# earlier cells is not mutated behind the reader's back.
corpwatch_df = (
    corpwatch_df
    .reset_index()
    .rename(columns={'index': 'company_id'})
)

In [45]:
# Inspect the final layout: company_id plus the *_clean name columns
corpwatch_df.head(n=5)

Unnamed: 0,company_id,cw_id,cik,company_name,city,company_name_lei,country_code_short_name,country_code_alpha2,lei_cleaned_id,company_name_clean,company_name_lei_clean
0,0,781188.0,13.0,CORPORATE INCOME FUND SEVENTY NINTH SHORT TERM...,NEW YORK,,united states,us,,corporate income fund seventy ninth short term...,
1,1,1.0,20.0,K TRON INTERNATIONAL INC,PITMAN,,united states,us,,k tron international incorporated,
2,2,1249859.0,63.0,FNW BANCORP INC,ELGIN,,united states,us,,fnw bancorp incorporated,
3,3,2.0,1750.0,AAR CORP,WOOD DALE,Aar Corp,united states,us,MP76T5YQX3YK5VVAQ802,aar corporation,aar corporation
4,4,3.0,1800.0,ABBOTT LABORATORIES,ABBOTT PARK,Abbott Laboratories,united states,us,549300ZYY7524VF6JP88,abbott laboratories,abbott laboratories


In [46]:
# Report how many rows (companies) the cleaned frame holds
print('Total companies in CORPWATCH {}'.format(len(corpwatch_df)))

Total companies in CORPWATCH 719598


## 5. Save cleaned CORPWATCH

In [47]:
# Save locally to "dataset" folder
saved_path = "../../../dataset/cleaned"
filename = "corpwatch_cleaned.csv"
# to_csv does not create missing directories, so make sure the target folder
# exists (e.g. on a fresh checkout) before writing.
os.makedirs(saved_path, exist_ok=True)
corpwatch_filename = os.path.join(saved_path, filename)
corpwatch_df.to_csv(corpwatch_filename, header=True, index=False)

In [None]:
# Upload the locally saved CSV to the cleaned area of the bucket
s3_filename = 'CorpWatch/cleaned/corpwatch_cleaned.csv'
s3_client = s3_resource.meta.client
s3_client.upload_file(
    Filename=corpwatch_filename,
    Bucket=bucket_name,
    Key=s3_filename,
)