# Cleaning RMI pre-processed file

...

In [51]:
import pandas as pd
import os
from dotenv import dotenv_values, load_dotenv

# Import CountryCleaner
from financial_entity_cleaner.location import CountryCleaner

# Import the module for cleaning company's name
from financial_entity_cleaner.company import CompanyNameCleaner

# Import BankingIdCleaner
from financial_entity_cleaner.id import BankingIdCleaner

In [52]:
# Read the settings stored in the .env file two directories up; the S3 parameters
# used in the following cell are looked up in this mapping by key.
env_var = dotenv_values('../../.env')

In [53]:
import boto3

# Build the S3 resource from the endpoint and credentials held in `env_var`
s3_resource = boto3.resource(
    "s3",
    endpoint_url=env_var['S3_ENDPOINT'],
    aws_access_key_id=env_var['S3_ACCESS_KEY'],
    aws_secret_access_key=env_var['S3_SECRET_KEY'],
)

# Handle on the bucket named in the environment settings
bucket_name = env_var['S3_BUCKET']
bucket = s3_resource.Bucket(bucket_name)

## 1. Read the RMI pre-processed file

In [54]:
# Local "dataset" folder and file name of the RMI pre-processed CSV to be loaded
filename = "utilities_pre_processed.csv"
saved_path = "../../../dataset/pre_processed/"

In [55]:
# Load the pre-processed RMI utilities file into a DataFrame.
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column, which looks
# like a row index written by an earlier export — confirm whether it should be kept.
pre_processed_csv = os.path.join(saved_path, filename)
utility_df = pd.read_csv(pre_processed_csv, low_memory=False)

In [56]:
# Preview the first rows of the freshly loaded frame
utility_df.head()

Unnamed: 0,company_id,parent_name,parent_lei,ticker,isin,utility_name,utility_id_ferc1,utility_id_ferc1_dbf,utility_id_ferc1_xbrl,utility_id_eia,utility_lei,fraction_owned_utility,entity_type_eia,utility_type_rmi,public_private_unmapped,duplicate_utility_id_eia,country
0,0,"American Electric Power Co., Inc.",1B4S6S7G0TW5EE83BO58,AEP,US0255371017,AEP Generating Co.,342.0,1.0,C003184,343.0,,1.0,Investor Owned,Independent Power Producer,public,False,USA
1,1,Southern Co.,549300FC3G3YU2FBZD92,SO,US8425871071,Alabama Power Co.,294.0,2.0,C001552,195.0,0RL818ELFOHP5JHOFU19,1.0,Investor Owned,Vertically Integrated,public,False,USA
2,2,Avista Corp.,Q0IK63NITJD6RJ47SW96,AVA,US05379B1070,Alaska Electric Light & Power Co.,394.0,3.0,C011150,213.0,,1.0,Investor Owned,Vertically Integrated,public,False,USA
3,3,Alcoa Corp.,549300T12EZ1F6PWWU29,AA,US0138721065,Alcoa Generating Corp.,349.0,4.0,C003448,,,1.0,,Independent Power Producer,public,False,USA
4,4,FirstEnergy Corp.,549300SVYJS666PQJH88,FE,US3379321074,The Allegheny Generating Co.,239.0,5.0,C000911,6458.0,,1.0,Retail Power Marketer,Independent Power Producer,public,False,USA


In [57]:
# Number of distinct values in the `utility_name` column
utility_df['utility_name'].nunique()

13797

## 2. Cleaning country information

In [58]:
# Instantiate the cleaner that normalizes the `country` column
country_cleaner_obj = CountryCleaner()

In [59]:
# Ask for lower-case output and for two country attributes:
# the alpha-2 code and the short name
country_cleaner_obj.letter_case = 'lower'
country_cleaner_obj.output_info = [
    CountryCleaner.ATTRIBUTE_ALPHA2,
    CountryCleaner.ATTRIBUTE_SHORT_NAME,
]

In [60]:
# Normalize the `country` column; the new columns are named by suffixing the
# source column name (e.g. `country_alpha2`, `country_short_name`)
utility_df = country_cleaner_obj.clean_df(
    df=utility_df,
    cols=['country'],
    output_names_as='suffix',
)

Normalizing countries...100%|██████████████████████████████████████████████████| 14321/14321 [00:02<00:00, 5517.26it/s]


In [61]:
# Preview the frame with the added country columns
utility_df.head()

Unnamed: 0,company_id,parent_name,parent_lei,ticker,isin,utility_name,utility_id_ferc1,utility_id_ferc1_dbf,utility_id_ferc1_xbrl,utility_id_eia,utility_lei,fraction_owned_utility,entity_type_eia,utility_type_rmi,public_private_unmapped,duplicate_utility_id_eia,country,country_short_name,country_alpha2
0,0,"American Electric Power Co., Inc.",1B4S6S7G0TW5EE83BO58,AEP,US0255371017,AEP Generating Co.,342.0,1.0,C003184,343.0,,1.0,Investor Owned,Independent Power Producer,public,False,USA,united states,us
1,1,Southern Co.,549300FC3G3YU2FBZD92,SO,US8425871071,Alabama Power Co.,294.0,2.0,C001552,195.0,0RL818ELFOHP5JHOFU19,1.0,Investor Owned,Vertically Integrated,public,False,USA,united states,us
2,2,Avista Corp.,Q0IK63NITJD6RJ47SW96,AVA,US05379B1070,Alaska Electric Light & Power Co.,394.0,3.0,C011150,213.0,,1.0,Investor Owned,Vertically Integrated,public,False,USA,united states,us
3,3,Alcoa Corp.,549300T12EZ1F6PWWU29,AA,US0138721065,Alcoa Generating Corp.,349.0,4.0,C003448,,,1.0,,Independent Power Producer,public,False,USA,united states,us
4,4,FirstEnergy Corp.,549300SVYJS666PQJH88,FE,US3379321074,The Allegheny Generating Co.,239.0,5.0,C000911,6458.0,,1.0,Retail Power Marketer,Independent Power Producer,public,False,USA,united states,us


## 3. Cleaning LEI & ISIN

In [62]:
# Instantiate the cleaner applied below to the LEI and ISIN columns
id_cleaner_obj = BankingIdCleaner()

In [63]:
# Setup cleaning properties
id_cleaner_obj.invalid_ids_as_nan = True
id_cleaner_obj.validation_as_categorical = True
# Request upper-case identifiers. The value is a plain letter-case keyword, in the
# same form as the 'lower' setting given to the other cleaners in this notebook;
# a quoted constant name such as "BankingIdCleaner.UPPER_LETTER_CASE" is just an
# arbitrary string and does not name a letter case.
# NOTE(review): assumes "upper" is the spelling the library expects — confirm.
id_cleaner_obj.output_lettercase = "upper"

In [64]:
# Preview the frame before the ID cleaning step (no change since the country step)
utility_df.head()

Unnamed: 0,company_id,parent_name,parent_lei,ticker,isin,utility_name,utility_id_ferc1,utility_id_ferc1_dbf,utility_id_ferc1_xbrl,utility_id_eia,utility_lei,fraction_owned_utility,entity_type_eia,utility_type_rmi,public_private_unmapped,duplicate_utility_id_eia,country,country_short_name,country_alpha2
0,0,"American Electric Power Co., Inc.",1B4S6S7G0TW5EE83BO58,AEP,US0255371017,AEP Generating Co.,342.0,1.0,C003184,343.0,,1.0,Investor Owned,Independent Power Producer,public,False,USA,united states,us
1,1,Southern Co.,549300FC3G3YU2FBZD92,SO,US8425871071,Alabama Power Co.,294.0,2.0,C001552,195.0,0RL818ELFOHP5JHOFU19,1.0,Investor Owned,Vertically Integrated,public,False,USA,united states,us
2,2,Avista Corp.,Q0IK63NITJD6RJ47SW96,AVA,US05379B1070,Alaska Electric Light & Power Co.,394.0,3.0,C011150,213.0,,1.0,Investor Owned,Vertically Integrated,public,False,USA,united states,us
3,3,Alcoa Corp.,549300T12EZ1F6PWWU29,AA,US0138721065,Alcoa Generating Corp.,349.0,4.0,C003448,,,1.0,,Independent Power Producer,public,False,USA,united states,us
4,4,FirstEnergy Corp.,549300SVYJS666PQJH88,FE,US3379321074,The Allegheny Generating Co.,239.0,5.0,C000911,6458.0,,1.0,Retail Power Marketer,Independent Power Producer,public,False,USA,united states,us


In [65]:
# Clean the three identifier columns. Each entry of `id_types` presumably describes
# the column at the same position in `id_columns` — verify against the library.
# The source columns are removed and the results are added as suffixed columns.
id_columns = ['parent_lei', 'isin', 'utility_lei']
id_types = ['lei', 'isin', 'lei']
utility_df = id_cleaner_obj.clean_df(
    utility_df,
    cols=id_columns,
    remove_cols=True,
    output_names_as='suffix',
    types=id_types,
)

Normalizing IDs...100%|██████████████████████████████████████████████████| 14321/14321 [00:06<00:00, 2175.26it/s]


In [66]:
# Preview the frame with the cleaned-ID and validation columns
utility_df.head()

Unnamed: 0,company_id,parent_name,ticker,utility_name,utility_id_ferc1,utility_id_ferc1_dbf,utility_id_ferc1_xbrl,utility_id_eia,fraction_owned_utility,entity_type_eia,...,duplicate_utility_id_eia,country,country_short_name,country_alpha2,parent_lei_cleaned_id,parent_lei_isvalid_id,isin_cleaned_id,isin_isvalid_id,utility_lei_cleaned_id,utility_lei_isvalid_id
0,0,"American Electric Power Co., Inc.",AEP,AEP Generating Co.,342.0,1.0,C003184,343.0,1.0,Investor Owned,...,False,USA,united states,us,1B4S6S7G0TW5EE83BO58,1.0,US0255371017,1.0,,
1,1,Southern Co.,SO,Alabama Power Co.,294.0,2.0,C001552,195.0,1.0,Investor Owned,...,False,USA,united states,us,549300FC3G3YU2FBZD92,1.0,US8425871071,1.0,0RL818ELFOHP5JHOFU19,1.0
2,2,Avista Corp.,AVA,Alaska Electric Light & Power Co.,394.0,3.0,C011150,213.0,1.0,Investor Owned,...,False,USA,united states,us,Q0IK63NITJD6RJ47SW96,1.0,US05379B1070,1.0,,
3,3,Alcoa Corp.,AA,Alcoa Generating Corp.,349.0,4.0,C003448,,1.0,,...,False,USA,united states,us,549300T12EZ1F6PWWU29,1.0,US0138721065,1.0,,
4,4,FirstEnergy Corp.,FE,The Allegheny Generating Co.,239.0,5.0,C000911,6458.0,1.0,Retail Power Marketer,...,False,USA,united states,us,549300SVYJS666PQJH88,1.0,US3379321074,1.0,,


In [67]:
# Keep only the cleaned IDs: discard the per-ID validation flag columns
validation_flag_columns = [
    'parent_lei_isvalid_id',
    'isin_isvalid_id',
    'utility_lei_isvalid_id',
]
utility_df = utility_df.drop(columns=validation_flag_columns)

In [68]:
# Preview the frame after dropping the validation flag columns
utility_df.head()

Unnamed: 0,company_id,parent_name,ticker,utility_name,utility_id_ferc1,utility_id_ferc1_dbf,utility_id_ferc1_xbrl,utility_id_eia,fraction_owned_utility,entity_type_eia,utility_type_rmi,public_private_unmapped,duplicate_utility_id_eia,country,country_short_name,country_alpha2,parent_lei_cleaned_id,isin_cleaned_id,utility_lei_cleaned_id
0,0,"American Electric Power Co., Inc.",AEP,AEP Generating Co.,342.0,1.0,C003184,343.0,1.0,Investor Owned,Independent Power Producer,public,False,USA,united states,us,1B4S6S7G0TW5EE83BO58,US0255371017,
1,1,Southern Co.,SO,Alabama Power Co.,294.0,2.0,C001552,195.0,1.0,Investor Owned,Vertically Integrated,public,False,USA,united states,us,549300FC3G3YU2FBZD92,US8425871071,0RL818ELFOHP5JHOFU19
2,2,Avista Corp.,AVA,Alaska Electric Light & Power Co.,394.0,3.0,C011150,213.0,1.0,Investor Owned,Vertically Integrated,public,False,USA,united states,us,Q0IK63NITJD6RJ47SW96,US05379B1070,
3,3,Alcoa Corp.,AA,Alcoa Generating Corp.,349.0,4.0,C003448,,1.0,,Independent Power Producer,public,False,USA,united states,us,549300T12EZ1F6PWWU29,US0138721065,
4,4,FirstEnergy Corp.,FE,The Allegheny Generating Co.,239.0,5.0,C000911,6458.0,1.0,Retail Power Marketer,Independent Power Producer,public,False,USA,united states,us,549300SVYJS666PQJH88,US3379321074,


## 4. Cleaning company names

In [69]:
# Instantiate the cleaner applied below to the parent and utility name columns
company_cleaner_obj = CompanyNameCleaner()

In [70]:
# Configure the company-name cleaner: legal-term normalization, lower-case
# output and accent removal
company_cleaner_obj.normalize_legal_terms = True
company_cleaner_obj.letter_case = "lower"
company_cleaner_obj.remove_accents = True

# Rule keys are passed to the library verbatim.
# NOTE(review): 'remove_text_puctuation_except_dot' is spelled exactly as in the
# original cell; presumably it matches the library's rule key — do not "fix" it
# without checking.
company_cleaner_obj.default_cleaning_rules = [
    'place_word_the_at_the_beginning',
    'remove_words_in_asterisk',
    'remove_words_in_parentheses',
    'remove_question_marks_in_parentheses',
    'replace_hyphen_by_space',
    'replace_underscore_by_space',
    'remove_text_puctuation_except_dot',
    'remove_math_symbols',
    'remove_parentheses',
    'remove_brackets',
    'remove_curly_brackets',
    'remove_single_quote_next_character',
    'remove_double_quote',
    'enforce_single_space_between_words',
]

# Rule list assigned to the cleaner's post-cleaning stage
company_cleaner_obj.post_cleaning_rules = ['remove_all_punctuation']

In [71]:
# Preview the frame before the name cleaning step (no change since the ID step)
utility_df.head()

Unnamed: 0,company_id,parent_name,ticker,utility_name,utility_id_ferc1,utility_id_ferc1_dbf,utility_id_ferc1_xbrl,utility_id_eia,fraction_owned_utility,entity_type_eia,utility_type_rmi,public_private_unmapped,duplicate_utility_id_eia,country,country_short_name,country_alpha2,parent_lei_cleaned_id,isin_cleaned_id,utility_lei_cleaned_id
0,0,"American Electric Power Co., Inc.",AEP,AEP Generating Co.,342.0,1.0,C003184,343.0,1.0,Investor Owned,Independent Power Producer,public,False,USA,united states,us,1B4S6S7G0TW5EE83BO58,US0255371017,
1,1,Southern Co.,SO,Alabama Power Co.,294.0,2.0,C001552,195.0,1.0,Investor Owned,Vertically Integrated,public,False,USA,united states,us,549300FC3G3YU2FBZD92,US8425871071,0RL818ELFOHP5JHOFU19
2,2,Avista Corp.,AVA,Alaska Electric Light & Power Co.,394.0,3.0,C011150,213.0,1.0,Investor Owned,Vertically Integrated,public,False,USA,united states,us,Q0IK63NITJD6RJ47SW96,US05379B1070,
3,3,Alcoa Corp.,AA,Alcoa Generating Corp.,349.0,4.0,C003448,,1.0,,Independent Power Producer,public,False,USA,united states,us,549300T12EZ1F6PWWU29,US0138721065,
4,4,FirstEnergy Corp.,FE,The Allegheny Generating Co.,239.0,5.0,C000911,6458.0,1.0,Retail Power Marketer,Independent Power Producer,public,False,USA,united states,us,549300SVYJS666PQJH88,US3379321074,


In [72]:
# Clean `parent_name` into a new `parent_name_clean` column, passing
# `country_alpha2` as the country column.
# The final positional argument is a flag: pass the boolean True, not the string
# 'True' (any non-empty string, even 'False', is truthy, so a string cannot
# switch the option off).
# NOTE(review): the flag's exact meaning is defined by the library — confirm.
utility_df = company_cleaner_obj.clean_df(utility_df,
                                          'parent_name',
                                          'parent_name_clean',
                                          'country_alpha2',
                                          True)

In [73]:
# Clean `utility_name` into a new `utility_name_clean` column, passing
# `country_alpha2` as the country column.
# The final positional argument is a flag: pass the boolean True, not the string
# 'True' (any non-empty string, even 'False', is truthy, so a string cannot
# switch the option off).
# NOTE(review): the flag's exact meaning is defined by the library — confirm.
utility_df = company_cleaner_obj.clean_df(utility_df,
                                          'utility_name',
                                          'utility_name_clean',
                                          'country_alpha2',
                                          True)

In [74]:
# Preview the frame with the two cleaned-name columns
utility_df.head()

Unnamed: 0,company_id,parent_name,ticker,utility_name,utility_id_ferc1,utility_id_ferc1_dbf,utility_id_ferc1_xbrl,utility_id_eia,fraction_owned_utility,entity_type_eia,...,public_private_unmapped,duplicate_utility_id_eia,country,country_short_name,country_alpha2,parent_lei_cleaned_id,isin_cleaned_id,utility_lei_cleaned_id,parent_name_clean,utility_name_clean
0,0,"American Electric Power Co., Inc.",AEP,AEP Generating Co.,342.0,1.0,C003184,343.0,1.0,Investor Owned,...,public,False,USA,united states,us,1B4S6S7G0TW5EE83BO58,US0255371017,,american electric power co incorporated,aep generating company
1,1,Southern Co.,SO,Alabama Power Co.,294.0,2.0,C001552,195.0,1.0,Investor Owned,...,public,False,USA,united states,us,549300FC3G3YU2FBZD92,US8425871071,0RL818ELFOHP5JHOFU19,southern company,alabama power company
2,2,Avista Corp.,AVA,Alaska Electric Light & Power Co.,394.0,3.0,C011150,213.0,1.0,Investor Owned,...,public,False,USA,united states,us,Q0IK63NITJD6RJ47SW96,US05379B1070,,avista corporation,alaska electric light power company
3,3,Alcoa Corp.,AA,Alcoa Generating Corp.,349.0,4.0,C003448,,1.0,,...,public,False,USA,united states,us,549300T12EZ1F6PWWU29,US0138721065,,alcoa corporation,alcoa generating corporation
4,4,FirstEnergy Corp.,FE,The Allegheny Generating Co.,239.0,5.0,C000911,6458.0,1.0,Retail Power Marketer,...,public,False,USA,united states,us,549300SVYJS666PQJH88,US3379321074,,firstenergy corporation,the allegheny generating company


## 5. Check and save

In [75]:
# Report how many rows the cleaned frame contains
total_rows = len(utility_df)
print(f'Total company in RMI : {total_rows}')

Total company in RMI : 14321


In [None]:
# Destination of the cleaned dataset.
# NOTE: `saved_path` and `filename` are the same names used by the load step, so
# re-running the read cell after this one would read the cleaned file instead.
saved_path = "../../../dataset/cleaned/"
filename = "utilities_cleaned.csv"

# Create the output folder if it is missing so that to_csv does not fail on it
os.makedirs(saved_path, exist_ok=True)

# Write the cleaned frame as UTF-8 CSV with a header row and without the index
utility_filename = os.path.join(saved_path, filename)
utility_df.to_csv(utility_filename, encoding='utf-8', header=True, index=False)

In [50]:
# Object key under which the cleaned CSV is stored in the bucket
s3_filename = 'RMI/cleaned/utilities_cleaned.csv'

# Upload the local cleaned file through the resource's underlying client;
# `bucket_name` holds the same env_var['S3_BUCKET'] value set up with the S3 resource
s3_client = s3_resource.meta.client
s3_client.upload_file(
    Filename=utility_filename,
    Bucket=bucket_name,
    Key=s3_filename,
)