# RMI Pre-processing

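This notebook loads the RMI Utility Transition Hub CSV files from S3, takes a first look at each table, and pre-processes the utility information table before saving it locally and uploading it back to S3.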

In [10]:
import pandas as pd
import os
pd.set_option('display.max_columns',50)
from dotenv import dotenv_values, load_dotenv

## 1. Set up to read the RMI data from S3

In [11]:
# Parse the .env file located two directories above this notebook.
# The result is kept as a mapping that later cells index by key
# (S3_ENDPOINT, S3_ACCESS_KEY, S3_SECRET_KEY, S3_BUCKET).
env_var = dotenv_values('../../.env')

In [12]:
import boto3

# Connection settings for the S3-compatible store, all taken from the
# .env mapping loaded in the previous cell.
s3_settings = {
    "service_name": "s3",
    "endpoint_url": env_var['S3_ENDPOINT'],
    "aws_access_key_id": env_var['S3_ACCESS_KEY'],
    "aws_secret_access_key": env_var['S3_SECRET_KEY'],
}
s3_resource = boto3.resource(**s3_settings)

# Handle on the project bucket; every later cell reads from / writes to it.
bucket_name = env_var['S3_BUCKET']
bucket = s3_resource.Bucket(bucket_name)

## 2. EDA

* assets_earnings  => detailed breakdown of utility assets in electric rate base, and earnings on these assets.

* operations_emissions => capacity, generation, capacity factor, and emissions of CO2, NOx, and SOx for each portion of each plant owned by each utility.

* emissions_targets => CO2 emissions and projections, as well as electricity generation and projections and comparison to RMI's 1.5C decarbonization pathway for the US electricity sector.

* customer_bills => detailed breakdown of electric revenues and the average residential customer bill for each utility.

* utility_information => utility identifiers such as name, ID numbers from various sources, and utility type.

#### a. Asset earning
* 43602 rows | 11 cols
* 274 unique respondent_id values

In [15]:
# Stream assets_earnings.csv from the bucket and parse it into a DataFrame.
assets_key = 'RMI/RMI Utility Transition Hub Data/assets_earnings.csv'
assets_earnings = bucket.Object(assets_key).get()['Body']
asset_df = pd.read_csv(
    assets_earnings,
    encoding='utf-8',
    delimiter=',',
    low_memory=False,
)

In [16]:
# Size of the assets/earnings table as (rows, columns).
asset_df.shape

(43602, 11)

In [17]:
# Preview the first rows of the assets/earnings table.
asset_df.head()

Unnamed: 0,parent_company,utility_name,respondent_id,year,asset,sub_asset,asset_value,equity_ratio,ROE,ROR,earnings_value
0,"American Electric Power Co., Inc.",AEP Generating Co.,1,2005,other,AROs,-1370143.0,0.529609,0.0,0.040655,-0.0
1,"American Electric Power Co., Inc.",AEP Generating Co.,1,2005,other,electric_plant_held_for_future_use,1112267.0,0.529609,0.0,0.040655,0.0
2,"American Electric Power Co., Inc.",AEP Generating Co.,1,2005,other,electric_plant_leased_to_others,12227414.0,0.529609,0.0,0.040655,0.0
3,"American Electric Power Co., Inc.",AEP Generating Co.,1,2005,other,general_plant,646114.0,0.529609,0.0,0.040655,0.0
4,"American Electric Power Co., Inc.",AEP Generating Co.,1,2005,other,intangible_plant,1155941.0,0.529609,0.0,0.040655,0.0


In [18]:
# Number of distinct respondent_id values in the assets/earnings table.
asset_df['respondent_id'].nunique()

274

#### b. Emissions targets
* 10061 rows | 15 cols
* 192 unique respondent_id values

In [19]:
# Stream emissions_targets.csv from the bucket and parse it into a DataFrame.
emission_tgt_key = 'RMI/RMI Utility Transition Hub Data/emissions_targets.csv'
emission_tgt = bucket.Object(emission_tgt_key).get()['Body']
emission_tgt_df = pd.read_csv(
    emission_tgt,
    encoding='utf-8',
    delimiter=',',
    low_memory=False,
)

In [20]:
# Size of the emissions-targets table as (rows, columns).
emission_tgt_df.shape

(10061, 15)

In [21]:
# Preview the first rows of the emissions-targets table.
emission_tgt_df.head()

Unnamed: 0,parent_company,utility_name,respondent_id,year,CO2_historical,CO2_target,CO2_target_all_years,CO2_1point5C,generation_historical,generation_projected,generation_1point5C,CO2_intensity_historical,CO2_intensity_target,CO2_intensity_target_all_years,CO2_intensity_1point5C
0,"American Electric Power Co., Inc.",AEP Generating Co.,1.0,2000.0,0.324787,,,4.034557,7.276847,,3.555941,0.044633,,,1.134596
1,"American Electric Power Co., Inc.",AEP Generating Co.,1.0,2001.0,0.0,,,3.96904,1.0,,3.494718,0.0,,,1.135725
2,"American Electric Power Co., Inc.",AEP Generating Co.,1.0,2002.0,0.0,,,3.995914,1.0,,3.60864,0.0,,,1.107318
3,"American Electric Power Co., Inc.",AEP Generating Co.,1.0,2003.0,0.0,,,4.050336,1.0,,3.631772,0.0,,,1.115251
4,"American Electric Power Co., Inc.",AEP Generating Co.,1.0,2004.0,0.0,,,4.104741,1.0,,3.713485,0.0,,,1.105361


In [22]:
# Number of distinct respondent_id values in the emissions-targets table.
emission_tgt_df['respondent_id'].nunique()

192

#### c. Operations emissions
* 352072 rows | 23 cols
* 199 unique respondent_id values
* Latitude and Longitude are filled in for 94% of the rows

In [23]:
# Stream operations_emissions.csv from the bucket and parse it into a DataFrame.
emission_op_key = 'RMI/RMI Utility Transition Hub Data/operations_emissions.csv'
emission_op = bucket.Object(emission_op_key).get()['Body']
emission_op_df = pd.read_csv(
    emission_op,
    encoding='utf-8',
    delimiter=',',
    low_memory=False,
)

In [24]:
# Size of the operations/emissions table as (rows, columns).
emission_op_df.shape

(352072, 23)

In [25]:
# Preview the first rows of the operations/emissions table.
emission_op_df.head()

Unnamed: 0,parent_company,utility_name,respondent_id,plant_id_eia,generator_id,owned_or_total,Latitude,Longitude,state,NERC Region,Balancing Authority Code,Balancing Authority Name,year,status,technology_EIA,technology_RMI,fuel_type_code,fuel_type_category,capacity,generation,potential_generation,capacity_factor,emissions_CO2
0,"American Electric Power Co., Inc.",AEP Generating Co.,1,6166.0,1.0,owned,,,IN,,,,2019.0,OP,Conventional Steam Coal,Steam,SUB,coal,0.455,1.173559,3.9858,0.294435,1.171257
1,"American Electric Power Co., Inc.",AEP Generating Co.,1,6166.0,1.0,owned,,,IN,,,,2019.0,OP,Conventional Steam Coal,Steam,BIT,coal,0.455,0.226424,3.9858,0.056808,0.21695
2,"American Electric Power Co., Inc.",AEP Generating Co.,1,6166.0,2.0,owned,,,IN,,,,2019.0,OP,Conventional Steam Coal,Steam,SUB,coal,0.455,1.216699,3.9858,0.305259,1.214313
3,"American Electric Power Co., Inc.",AEP Generating Co.,1,6166.0,2.0,owned,,,IN,,,,2019.0,OP,Conventional Steam Coal,Steam,BIT,coal,0.455,0.234748,3.9858,0.058896,0.224925
4,"American Electric Power Co., Inc.",AEP Generating Co.,1,,,total,,,,,,,2019.0,OP,Energy Efficiency,EE and DR,,,,,,,


In [26]:
# Show only the rows whose Latitude is filled in.
has_latitude = emission_op_df['Latitude'].notna()
emission_op_df[has_latitude]

Unnamed: 0,parent_company,utility_name,respondent_id,plant_id_eia,generator_id,owned_or_total,Latitude,Longitude,state,NERC Region,Balancing Authority Code,Balancing Authority Name,year,status,technology_EIA,technology_RMI,fuel_type_code,fuel_type_category,capacity,generation,potential_generation,capacity_factor,emissions_CO2
31,"American Electric Power Co., Inc.",AEP Generating Co.,1,55502.0,0100,owned,39.0911,-84.8669,IN,RFC,PJM,"PJM Interconnection, LLC",2016.0,OP,Natural Gas Fired Combined Cycle,Other Fossil,NG,gas,0.2680,1.246452,2.354112,0.529479,0.000000
32,"American Electric Power Co., Inc.",AEP Generating Co.,1,55502.0,0100,owned,39.0911,-84.8669,IN,RFC,PJM,"PJM Interconnection, LLC",2016.0,OP,Natural Gas Fired Combined Cycle,Other Fossil,NG,gas,0.2680,1.246452,2.354112,0.529479,0.000000
33,"American Electric Power Co., Inc.",AEP Generating Co.,1,55502.0,0100,owned,39.0911,-84.8669,IN,RFC,PJM,"PJM Interconnection, LLC",2016.0,OP,Natural Gas Fired Combined Cycle,Other Fossil,NG,gas,0.2680,1.246452,2.354112,0.529479,0.000000
34,"American Electric Power Co., Inc.",AEP Generating Co.,1,55502.0,0100,owned,39.0911,-84.8669,IN,RFC,PJM,"PJM Interconnection, LLC",2016.0,OP,Natural Gas Fired Combined Cycle,Other Fossil,NG,gas,0.2680,1.246452,2.354112,0.529479,0.000000
35,"American Electric Power Co., Inc.",AEP Generating Co.,1,55502.0,0200,owned,39.0911,-84.8669,IN,RFC,PJM,"PJM Interconnection, LLC",2016.0,OP,Natural Gas Fired Combined Cycle,Other Fossil,NG,gas,0.2680,1.246452,2.354112,0.529479,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
352062,Basin Electric Power Coop.,Basin Electric Power Cooperative,531,8030.0,3,owned,43.7219,-105.7689,WY,WECC,WACM,Western Area Power Administration - Rocky Moun...,2005.0,OP,Natural Gas Fired Combustion Turbine,Other Fossil,NG,gas,0.0075,0.000051,0.065700,0.000776,0.000039
352063,Basin Electric Power Coop.,Basin Electric Power Cooperative,531,55995.0,MWP,owned,48.021271,-101.280517,ND,MRO,MISO,Midcontinent Independent Transmission System O...,2005.0,OP,Onshore Wind Turbine,Renewables and Storage,WND,wind,0.0026,0.005251,0.022776,0.230550,0.000000
352064,Basin Electric Power Coop.,Basin Electric Power Cooperative,531,55995.0,MWP,owned,48.021271,-101.280517,ND,MRO,MISO,Midcontinent Independent Transmission System O...,2005.0,OP,Onshore Wind Turbine,Renewables and Storage,WND,wind,0.0026,0.005251,0.022776,0.230550,0.000000
352065,Basin Electric Power Coop.,Basin Electric Power Cooperative,531,55995.0,MWP,owned,48.021271,-101.280517,ND,MRO,SWPP,Southwest Power Pool,2005.0,OP,Onshore Wind Turbine,Renewables and Storage,WND,wind,0.0026,0.005251,0.022776,0.230550,0.000000


In [27]:
# Number of distinct respondent_id values in the operations/emissions table.
emission_op_df['respondent_id'].nunique()

199

#### d. Revenues bills

In [28]:
# Stream revenues_bills.csv from the bucket and parse it into a DataFrame.
revenues_key = 'RMI/RMI Utility Transition Hub Data/revenues_bills.csv'
revenues = bucket.Object(revenues_key).get()['Body']
revenues_df = pd.read_csv(
    revenues,
    encoding='utf-8',
    delimiter=',',
    low_memory=False,
)

In [29]:
# Size of the revenues/bills table as (rows, columns).
revenues_df.shape

(96144, 11)

In [30]:
# Preview the first rows of the revenues/bills table.
revenues_df.head()

Unnamed: 0,parent_company,utility_name,respondent_id,year,revenue_component,revenue_sub_component,revenue_value,sales_residential,sales_total,customers_residential,residential_bill_value
0,"American Electric Power Co., Inc.",AEP Generating Co.,1,2019.0,other,depreciation_expense,19045.0,,306947627.0,,
1,"American Electric Power Co., Inc.",AEP Generating Co.,1,2019.0,other,maintenance_expenses,420810.0,,306947627.0,,
2,"American Electric Power Co., Inc.",AEP Generating Co.,1,2019.0,other,operation_expenses,2847277.0,,306947627.0,,
3,"American Electric Power Co., Inc.",AEP Generating Co.,1,2019.0,other,returns,7234716.0,,306947627.0,,
4,"American Electric Power Co., Inc.",AEP Generating Co.,1,2019.0,steam,depreciation_expense,50435490.0,,306947627.0,,


In [31]:
# Number of distinct respondent_id values in the revenues/bills table.
revenues_df['respondent_id'].nunique()

339

In [32]:
# Number of distinct utility_name values in the revenues/bills table.
revenues_df['utility_name'].nunique()

336

#### e. Utility information
* Holds all the identifiers (IDs) for each utility

In [52]:
# Stream utility_information.csv from the bucket and parse it into a DataFrame.
utility_key = 'RMI/RMI Utility Transition Hub Data/utility_information.csv'
utility = bucket.Object(utility_key).get()['Body']
utility_df = pd.read_csv(
    utility,
    encoding='utf-8',
    delimiter=',',
    low_memory=False,
)

In [34]:
# Size of the utility-information table as (rows, columns).
utility_df.shape

(375, 12)

In [35]:
# Preview the first rows of the utility-information table.
utility_df.head()

Unnamed: 0,parent_company,parent_ticker,parent_ISIN,parent_LEI,utility_name,respondent_id,utility_id_eia,entity_type_EIA,utility_type_RMI,parent_id,parent_name,ticker
0,"American Electric Power Co., Inc.",AEP,US0255371017,1B4S6S7G0TW5EE83BO58,AEP Generating Co.,1,343.0,Investor-Owned Utility,Independent Power Producer,10.0,"American Electric Power Co., Inc.",AEP
1,Southern Co.,SO,US8425871071,549300FC3G3YU2FBZD92,Alabama Power Co.,2,195.0,Investor-Owned Utility,Vertically Integrated,142.0,Southern Co.,SO
2,Avista Corp.,AVA,US05379B1070,Q0IK63NITJD6RJ47SW96,Alaska Electric Light & Power Co.,3,213.0,Investor-Owned Utility,Vertically Integrated,16.0,Avista Corp.,AVA
3,Alcoa Corp.,AA,US0138721065,549300T12EZ1F6PWWU29,Alcoa Generating Corp.,4,,Industrial,Other,4.0,Alcoa Corp.,AA
4,FirstEnergy Corp.,FE,US3379321074,549300SVYJS666PQJH88,The Allegheny Generating Co.,5,6458.0,Investor-Owned Utility,Independent Power Producer,61.0,FirstEnergy Corp.,FE


#### f. Data dictionary

In [36]:
# Fetch the data dictionary workbook from the bucket and keep the 'Body' of the response.
# NOTE(review): data_dict is not used again in the visible cells; the next
# cell downloads the same object to a local file instead.
data_dict = bucket.Object('RMI/RMI Utility Transition Hub Data/RMI Utility Transition Hub Data Dictionary.xlsx').get()['Body']

In [None]:
# Download the data dictionary workbook from the bucket to a local file.
# Reuse the client attached to `s3_resource` so the endpoint and credentials
# read from .env are applied. A bare boto3.client('s3') ignores them and
# falls back to the default AWS endpoint and credential chain.
s3 = s3_resource.meta.client
s3.download_file(env_var['S3_BUCKET'],
                'RMI/RMI Utility Transition Hub Data/RMI Utility Transition Hub Data Dictionary.xlsx',
                '../../../dico.xlsx')

## 3. Preprocessing 

* The utility table contains duplicate rows for the same utility_name.
* RMI covers the USA, so we can add a country column set to 'USA'.

In [53]:
# Tag every utility with its country: the whole table gets the constant 'USA'.
utility_df = utility_df.assign(country='USA')

In [20]:
# Inspect the first utilities that belong to one parent company.
utility_df.query('parent_company == "American Electric Power Co., Inc."').head()

Unnamed: 0,parent_company,parent_ticker,parent_ISIN,parent_LEI,utility_name,respondent_id,utility_id_eia,entity_type_EIA,utility_type_RMI,parent_id,parent_name,ticker,country
0,"American Electric Power Co., Inc.",AEP,US0255371017,1B4S6S7G0TW5EE83BO58,AEP Generating Co.,1,343.0,Investor-Owned Utility,Independent Power Producer,10.0,"American Electric Power Co., Inc.",AEP,USA
5,"American Electric Power Co., Inc.",AEP,US0255371017,1B4S6S7G0TW5EE83BO58,Appalachian Power Co.,6,733.0,Investor-Owned Utility,Vertically Integrated,10.0,"American Electric Power Co., Inc.",AEP,USA
23,"American Electric Power Co., Inc.",AEP,US0255371017,1B4S6S7G0TW5EE83BO58,AEP Texas Central Co.,24,3278.0,Investor-Owned Utility,Wires Only,10.0,"American Electric Power Co., Inc.",AEP,USA
30,"American Electric Power Co., Inc.",AEP,US0255371017,1B4S6S7G0TW5EE83BO58,Columbus Southern Power Co.,31,4062.0,Investor-Owned Utility,Restructured,10.0,"American Electric Power Co., Inc.",AEP,USA
72,"American Electric Power Co., Inc.",AEP,US0255371017,1B4S6S7G0TW5EE83BO58,Indiana Michigan Power Co.,73,9324.0,Investor-Owned Utility,Vertically Integrated,10.0,"American Electric Power Co., Inc.",AEP,USA


In [54]:
# Rows sharing the same parent company, parent LEI and utility name are
# treated as duplicates; only the first occurrence of each is kept.
dup_key = ['parent_company', 'parent_LEI', "utility_name"]
utility_df = utility_df.drop_duplicates(subset=dup_key)

In [1]:
### Create Folder
#s3 = boto3.client('s3')
#bucket_name = env_var['S3_BUCKET']
#folder_name = "RMI/raw"
#
#s3.put_object(Bucket=bucket_name, Key=(folder_name+'/'))

In [55]:
# Save locally
saved_path = "../../../dataset/pre_processed/"
filename = "utilities_pre_processed.csv"

# Make sure the output directory exists first: to_csv does not create
# missing parent directories and would fail on a fresh checkout.
os.makedirs(saved_path, exist_ok=True)

utility_filename = os.path.join(saved_path, filename)
# Write with a header row and without the DataFrame index.
utility_df.to_csv(utility_filename, encoding='utf-8', header=True, index=False)

In [15]:
# Print the key of every object stored under the CorpWatch/ prefix.
corpwatch_objects = bucket.objects.filter(Prefix="CorpWatch/")
for s3_object in corpwatch_objects:
    print(s3_object.key)

CorpWatch/
CorpWatch/README.txt
CorpWatch/cleaned/
CorpWatch/cleaned/corpwatch_cleaned.csv
CorpWatch/corpwatch_api_tables_csv.tar.gz
CorpWatch/pre_processed/
CorpWatch/pre_processed/corpwatch_pre_processed.csv
CorpWatch/raw/
CorpWatch/raw/cik_lei.csv
CorpWatch/raw/companies.csv
CorpWatch/raw/company_locations.csv
CorpWatch/raw/company_names.csv


In [None]:
# Upload the locally saved CSV to the bucket under the RMI/pre_processed/ prefix,
# using the client that belongs to the configured S3 resource.
s3_filename = 'RMI/pre_processed/utilities_pre_processed.csv'
upload_client = s3_resource.meta.client
upload_client.upload_file(
    Filename=utility_filename,
    Bucket=env_var['S3_BUCKET'],
    Key=s3_filename,
)