# Cleaning RMI pre-processed file

...

In [1]:
import pandas as pd
import os
from dotenv import dotenv_values, load_dotenv

# Import CountryCleaner
from financial_entity_cleaner.location import CountryCleaner

# Import the module for cleaning company's name
from financial_entity_cleaner.company import CompanyNameCleaner

# Import BankingIdCleaner
from financial_entity_cleaner.id import BankingIdCleaner

In [2]:
# Read the settings stored in ../../.env; the S3 connection cell below looks up
# its endpoint, access keys and bucket name in this mapping.
env_var = dotenv_values('../../.env')

In [3]:
# NOTE(review): boto3 is imported in this cell rather than in the import cell at the top.
import boto3
# S3 resource handle. Endpoint and credentials are taken from the .env values,
# so no secret is written in the notebook itself.
s3_resource = boto3.resource(
    service_name="s3",
    endpoint_url=env_var['S3_ENDPOINT'],
    aws_access_key_id=env_var['S3_ACCESS_KEY'],
    aws_secret_access_key=env_var['S3_SECRET_KEY'],
)
# Name of the target bucket and a Bucket object bound to it.
bucket_name = env_var['S3_BUCKET']
bucket = s3_resource.Bucket(bucket_name)

## 1. Read the RMI pre-processed file

In [4]:
# Local folder and file name of the RMI pre-processed CSV that this notebook
# reads as its input (the next cell joins them and loads the file).
saved_path = "../../../dataset/pre_processed/"
filename = "utilities_pre_processed.csv"

In [5]:
# Build the full path of the pre-processed file, then load it into a DataFrame.
pre_processed_csv = os.path.join(saved_path, filename)
utility_df = pd.read_csv(pre_processed_csv, low_memory=False)

In [6]:
# Preview the first rows to check that the file loaded with the expected columns.
utility_df.head()

Unnamed: 0,parent_company,parent_ticker,parent_ISIN,parent_LEI,utility_name,respondent_id,utility_id_eia,entity_type_EIA,utility_type_RMI,parent_id,parent_name,ticker,country
0,"American Electric Power Co., Inc.",AEP,US0255371017,1B4S6S7G0TW5EE83BO58,AEP Generating Co.,1,343.0,Investor-Owned Utility,Independent Power Producer,10.0,"American Electric Power Co., Inc.",AEP,USA
1,Southern Co.,SO,US8425871071,549300FC3G3YU2FBZD92,Alabama Power Co.,2,195.0,Investor-Owned Utility,Vertically Integrated,142.0,Southern Co.,SO,USA
2,Avista Corp.,AVA,US05379B1070,Q0IK63NITJD6RJ47SW96,Alaska Electric Light & Power Co.,3,213.0,Investor-Owned Utility,Vertically Integrated,16.0,Avista Corp.,AVA,USA
3,Alcoa Corp.,AA,US0138721065,549300T12EZ1F6PWWU29,Alcoa Generating Corp.,4,,Industrial,Other,4.0,Alcoa Corp.,AA,USA
4,FirstEnergy Corp.,FE,US3379321074,549300SVYJS666PQJH88,The Allegheny Generating Co.,5,6458.0,Investor-Owned Utility,Independent Power Producer,61.0,FirstEnergy Corp.,FE,USA


## 2. Cleaning country information

In [7]:
# Create the cleaner used to normalise the 'country' column.
country_cleaner_obj = CountryCleaner()

In [8]:
# Configure the cleaner: 'lower' letter case, and request two output attributes
# (the alpha-2 code and the short country name).
country_cleaner_obj.letter_case = 'lower'
country_cleaner_obj.output_info = [
    CountryCleaner.ATTRIBUTE_ALPHA2,
    CountryCleaner.ATTRIBUTE_SHORT_NAME,
]

In [9]:
# Normalise the 'country' column. As the preview below shows, the results come
# back as extra columns named with the original column as prefix
# (country_short_name, country_alpha2).
utility_df = country_cleaner_obj.clean_df(
    df=utility_df,
    cols=['country'],
    output_names_as='suffix',
)

Normalizing countries...100%|██████████████████████████████████████████████████| 370/370 [00:00<00:00, 2041.28it/s]


In [10]:
# Preview the frame to check the new country_short_name / country_alpha2 columns.
utility_df.head()

Unnamed: 0,parent_company,parent_ticker,parent_ISIN,parent_LEI,utility_name,respondent_id,utility_id_eia,entity_type_EIA,utility_type_RMI,parent_id,parent_name,ticker,country,country_short_name,country_alpha2
0,"American Electric Power Co., Inc.",AEP,US0255371017,1B4S6S7G0TW5EE83BO58,AEP Generating Co.,1,343.0,Investor-Owned Utility,Independent Power Producer,10.0,"American Electric Power Co., Inc.",AEP,USA,united states,us
1,Southern Co.,SO,US8425871071,549300FC3G3YU2FBZD92,Alabama Power Co.,2,195.0,Investor-Owned Utility,Vertically Integrated,142.0,Southern Co.,SO,USA,united states,us
2,Avista Corp.,AVA,US05379B1070,Q0IK63NITJD6RJ47SW96,Alaska Electric Light & Power Co.,3,213.0,Investor-Owned Utility,Vertically Integrated,16.0,Avista Corp.,AVA,USA,united states,us
3,Alcoa Corp.,AA,US0138721065,549300T12EZ1F6PWWU29,Alcoa Generating Corp.,4,,Industrial,Other,4.0,Alcoa Corp.,AA,USA,united states,us
4,FirstEnergy Corp.,FE,US3379321074,549300SVYJS666PQJH88,The Allegheny Generating Co.,5,6458.0,Investor-Owned Utility,Independent Power Producer,61.0,FirstEnergy Corp.,FE,USA,united states,us


## 3. Cleaning LEI & ISIN

In [11]:
# Create an object based on the BankingIdCleaner() class; it is configured in the
# next cell and then applied to the parent_LEI and parent_ISIN columns.
id_cleaner_obj = BankingIdCleaner()

In [12]:
# Setup cleaning properties
# Presumably: invalid IDs become NaN and the validity result is reported as a
# categorical code (the later preview shows 1.0 for valid rows) — TODO confirm
# against the library documentation.
id_cleaner_obj.invalid_ids_as_nan = True
id_cleaner_obj.validation_as_categorical = True
# NOTE(review): this assigns the literal text "BankingIdCleaner.UPPER_LETTER_CASE",
# not a constant of the class. If upper-case output was intended, the value
# probably needs to be the constant itself or a plain value such as "upper" —
# confirm which values the cleaner accepts before changing it.
id_cleaner_obj.output_lettercase = "BankingIdCleaner.UPPER_LETTER_CASE"

In [13]:
# Preview the frame before cleaning the IDs. No cell since the previous preview
# reassigns utility_df, so this output repeats it.
utility_df.head()

Unnamed: 0,parent_company,parent_ticker,parent_ISIN,parent_LEI,utility_name,respondent_id,utility_id_eia,entity_type_EIA,utility_type_RMI,parent_id,parent_name,ticker,country,country_short_name,country_alpha2
0,"American Electric Power Co., Inc.",AEP,US0255371017,1B4S6S7G0TW5EE83BO58,AEP Generating Co.,1,343.0,Investor-Owned Utility,Independent Power Producer,10.0,"American Electric Power Co., Inc.",AEP,USA,united states,us
1,Southern Co.,SO,US8425871071,549300FC3G3YU2FBZD92,Alabama Power Co.,2,195.0,Investor-Owned Utility,Vertically Integrated,142.0,Southern Co.,SO,USA,united states,us
2,Avista Corp.,AVA,US05379B1070,Q0IK63NITJD6RJ47SW96,Alaska Electric Light & Power Co.,3,213.0,Investor-Owned Utility,Vertically Integrated,16.0,Avista Corp.,AVA,USA,united states,us
3,Alcoa Corp.,AA,US0138721065,549300T12EZ1F6PWWU29,Alcoa Generating Corp.,4,,Industrial,Other,4.0,Alcoa Corp.,AA,USA,united states,us
4,FirstEnergy Corp.,FE,US3379321074,549300SVYJS666PQJH88,The Allegheny Generating Co.,5,6458.0,Investor-Owned Utility,Independent Power Producer,61.0,FirstEnergy Corp.,FE,USA,united states,us


In [14]:
# Clean the LEI and ISIN columns, declaring the ID type of each one. As the
# preview below shows, the source columns are replaced by *_cleaned_id and
# *_isvalid_id columns.
utility_df = id_cleaner_obj.clean_df(
    utility_df,
    cols=['parent_LEI', 'parent_ISIN'],
    remove_cols=True,
    output_names_as='suffix',
    types=['lei', 'isin'],
)

Normalizing IDs...100%|██████████████████████████████████████████████████| 370/370 [00:00<00:00, 2357.99it/s]


In [15]:
# Preview the frame to check the cleaned ID columns and their validity flags.
utility_df.head()

Unnamed: 0,parent_company,parent_ticker,utility_name,respondent_id,utility_id_eia,entity_type_EIA,utility_type_RMI,parent_id,parent_name,ticker,country,country_short_name,country_alpha2,parent_LEI_cleaned_id,parent_LEI_isvalid_id,parent_ISIN_cleaned_id,parent_ISIN_isvalid_id
0,"American Electric Power Co., Inc.",AEP,AEP Generating Co.,1,343.0,Investor-Owned Utility,Independent Power Producer,10.0,"American Electric Power Co., Inc.",AEP,USA,united states,us,1B4S6S7G0TW5EE83BO58,1.0,US0255371017,1.0
1,Southern Co.,SO,Alabama Power Co.,2,195.0,Investor-Owned Utility,Vertically Integrated,142.0,Southern Co.,SO,USA,united states,us,549300FC3G3YU2FBZD92,1.0,US8425871071,1.0
2,Avista Corp.,AVA,Alaska Electric Light & Power Co.,3,213.0,Investor-Owned Utility,Vertically Integrated,16.0,Avista Corp.,AVA,USA,united states,us,Q0IK63NITJD6RJ47SW96,1.0,US05379B1070,1.0
3,Alcoa Corp.,AA,Alcoa Generating Corp.,4,,Industrial,Other,4.0,Alcoa Corp.,AA,USA,united states,us,549300T12EZ1F6PWWU29,1.0,US0138721065,1.0
4,FirstEnergy Corp.,FE,The Allegheny Generating Co.,5,6458.0,Investor-Owned Utility,Independent Power Producer,61.0,FirstEnergy Corp.,FE,USA,united states,us,549300SVYJS666PQJH88,1.0,US3379321074,1.0


In [16]:
# Drop the two validity-flag columns and parent_id, which are not kept in the
# cleaned output. The result is reassigned instead of using inplace=True, so the
# frame object displayed by earlier cells is not mutated behind the reader's back.
utility_df = utility_df.drop(columns=['parent_LEI_isvalid_id',
                                      'parent_ISIN_isvalid_id',
                                      'parent_id'])

In [17]:
# Preview the frame to confirm the three columns were dropped.
utility_df.head()

Unnamed: 0,parent_company,parent_ticker,utility_name,respondent_id,utility_id_eia,entity_type_EIA,utility_type_RMI,parent_name,ticker,country,country_short_name,country_alpha2,parent_LEI_cleaned_id,parent_ISIN_cleaned_id
0,"American Electric Power Co., Inc.",AEP,AEP Generating Co.,1,343.0,Investor-Owned Utility,Independent Power Producer,"American Electric Power Co., Inc.",AEP,USA,united states,us,1B4S6S7G0TW5EE83BO58,US0255371017
1,Southern Co.,SO,Alabama Power Co.,2,195.0,Investor-Owned Utility,Vertically Integrated,Southern Co.,SO,USA,united states,us,549300FC3G3YU2FBZD92,US8425871071
2,Avista Corp.,AVA,Alaska Electric Light & Power Co.,3,213.0,Investor-Owned Utility,Vertically Integrated,Avista Corp.,AVA,USA,united states,us,Q0IK63NITJD6RJ47SW96,US05379B1070
3,Alcoa Corp.,AA,Alcoa Generating Corp.,4,,Industrial,Other,Alcoa Corp.,AA,USA,united states,us,549300T12EZ1F6PWWU29,US0138721065
4,FirstEnergy Corp.,FE,The Allegheny Generating Co.,5,6458.0,Investor-Owned Utility,Independent Power Producer,FirstEnergy Corp.,FE,USA,united states,us,549300SVYJS666PQJH88,US3379321074


## 4. Cleaning company names

In [18]:
# Create a CompanyNameCleaner object; it is configured in the next cell and then
# applied to the parent_company and utility_name columns.
company_cleaner_obj = CompanyNameCleaner()

In [19]:
# Set cleaner properties
# The cleaned names previewed further down are lower-cased, have legal terms
# expanded (e.g. "Corp." -> "corporation") and contain no punctuation.
company_cleaner_obj.normalize_legal_terms = True
company_cleaner_obj.letter_case="lower"
company_cleaner_obj.remove_accents = True
# Rule names are string keys interpreted by the library; presumably they are
# applied in the order listed — TODO confirm.
# NOTE(review): 'remove_text_puctuation_except_dot' is spelled without the "n";
# presumably this matches the library's own rule key — confirm before "fixing" it.
company_cleaner_obj.default_cleaning_rules = ['place_word_the_at_the_beginning',
                                                'remove_words_in_asterisk', 
                                                'remove_words_in_parentheses',
                                                'remove_question_marks_in_parentheses', 
                                                'replace_hyphen_by_space', 
                                                'replace_underscore_by_space', 
                                                'remove_text_puctuation_except_dot', 
                                                'remove_math_symbols', 
                                                'remove_parentheses', 
                                                'remove_brackets', 
                                                'remove_curly_brackets', 
                                                'remove_single_quote_next_character', 
                                                'remove_double_quote', 
                                                'enforce_single_space_between_words']
# Rules for a final pass; presumably run after the default rules and the
# legal-term normalisation — verify against the library documentation.
company_cleaner_obj.post_cleaning_rules = ['remove_all_punctuation']

In [20]:
# Preview the frame before cleaning the names. No cell since the previous preview
# reassigns utility_df, so this output repeats it.
utility_df.head()

Unnamed: 0,parent_company,parent_ticker,utility_name,respondent_id,utility_id_eia,entity_type_EIA,utility_type_RMI,parent_name,ticker,country,country_short_name,country_alpha2,parent_LEI_cleaned_id,parent_ISIN_cleaned_id
0,"American Electric Power Co., Inc.",AEP,AEP Generating Co.,1,343.0,Investor-Owned Utility,Independent Power Producer,"American Electric Power Co., Inc.",AEP,USA,united states,us,1B4S6S7G0TW5EE83BO58,US0255371017
1,Southern Co.,SO,Alabama Power Co.,2,195.0,Investor-Owned Utility,Vertically Integrated,Southern Co.,SO,USA,united states,us,549300FC3G3YU2FBZD92,US8425871071
2,Avista Corp.,AVA,Alaska Electric Light & Power Co.,3,213.0,Investor-Owned Utility,Vertically Integrated,Avista Corp.,AVA,USA,united states,us,Q0IK63NITJD6RJ47SW96,US05379B1070
3,Alcoa Corp.,AA,Alcoa Generating Corp.,4,,Industrial,Other,Alcoa Corp.,AA,USA,united states,us,549300T12EZ1F6PWWU29,US0138721065
4,FirstEnergy Corp.,FE,The Allegheny Generating Co.,5,6458.0,Investor-Owned Utility,Independent Power Producer,FirstEnergy Corp.,FE,USA,united states,us,549300SVYJS666PQJH88,US3379321074


In [21]:
# Clean the 'parent_company' names into a new 'parent_company_clean' column,
# passing 'country_alpha2' as the country column.
# The final flag is passed as a real boolean: a string such as 'True' (or even
# 'False') is merely truthy and only works as a flag by accident.
# NOTE(review): the last positional argument is presumably the "merge legal
# terms" switch — confirm against CompanyNameCleaner.clean_df.
utility_df = company_cleaner_obj.clean_df(utility_df,
                                          'parent_company',
                                          'parent_company_clean',
                                          'country_alpha2',
                                          True)

In [22]:
# Clean the 'utility_name' names into a new 'utility_name_clean' column,
# passing 'country_alpha2' as the country column.
# The final flag is passed as a real boolean: a string such as 'True' (or even
# 'False') is merely truthy and only works as a flag by accident.
# NOTE(review): the last positional argument is presumably the "merge legal
# terms" switch — confirm against CompanyNameCleaner.clean_df.
utility_df = company_cleaner_obj.clean_df(utility_df,
                                          'utility_name',
                                          'utility_name_clean',
                                          'country_alpha2',
                                          True)

In [23]:
# Preview the frame to check the new parent_company_clean / utility_name_clean columns.
utility_df.head()

Unnamed: 0,parent_company,parent_ticker,utility_name,respondent_id,utility_id_eia,entity_type_EIA,utility_type_RMI,parent_name,ticker,country,country_short_name,country_alpha2,parent_LEI_cleaned_id,parent_ISIN_cleaned_id,parent_company_clean,utility_name_clean
0,"American Electric Power Co., Inc.",AEP,AEP Generating Co.,1,343.0,Investor-Owned Utility,Independent Power Producer,"American Electric Power Co., Inc.",AEP,USA,united states,us,1B4S6S7G0TW5EE83BO58,US0255371017,american electric power co incorporated,aep generating company
1,Southern Co.,SO,Alabama Power Co.,2,195.0,Investor-Owned Utility,Vertically Integrated,Southern Co.,SO,USA,united states,us,549300FC3G3YU2FBZD92,US8425871071,southern company,alabama power company
2,Avista Corp.,AVA,Alaska Electric Light & Power Co.,3,213.0,Investor-Owned Utility,Vertically Integrated,Avista Corp.,AVA,USA,united states,us,Q0IK63NITJD6RJ47SW96,US05379B1070,avista corporation,alaska electric light power company
3,Alcoa Corp.,AA,Alcoa Generating Corp.,4,,Industrial,Other,Alcoa Corp.,AA,USA,united states,us,549300T12EZ1F6PWWU29,US0138721065,alcoa corporation,alcoa generating corporation
4,FirstEnergy Corp.,FE,The Allegheny Generating Co.,5,6458.0,Investor-Owned Utility,Independent Power Producer,FirstEnergy Corp.,FE,USA,united states,us,549300SVYJS666PQJH88,US3379321074,firstenergy corporation,the allegheny generating company


## 5. Check and save

In [24]:
# Report how many rows (companies) the cleaned frame contains.
print(f'Total company in RMI : {len(utility_df)}')

Total company in RMI : 370


In [25]:
# Write the cleaned frame to the local "cleaned" dataset folder.
# Dedicated names are used for the output location so that saved_path / filename
# keep pointing at the pre-processed input; re-running the load cell after this
# one therefore still reads the input file, not the cleaned one.
cleaned_dir = "../../../dataset/cleaned/"
cleaned_filename = "utilities_cleaned.csv"

# Create the output folder if it does not exist yet, so to_csv cannot fail on a
# missing directory.
os.makedirs(cleaned_dir, exist_ok=True)

# utility_filename is also the local file uploaded to S3 in the next cell.
utility_filename = os.path.join(cleaned_dir, cleaned_filename)
utility_df.to_csv(utility_filename, encoding='utf-8', header=True, index=False)

In [26]:
# Upload the locally saved CSV to the configured bucket under the RMI/cleaned/ key.
s3_filename = 'RMI/cleaned/utilities_cleaned.csv'
s3_client = s3_resource.meta.client
s3_client.upload_file(
    Filename=utility_filename,
    Bucket=env_var['S3_BUCKET'],
    Key=s3_filename,
)