# ITR Data Pipeline

* Global Parameters
* Industry Data (Sector Projections)

* Portfolio Data
* Company Data
* Automation
* Temperature Scoring

## Environment variables and dot-env

The following cell looks for a "dot-env" file in some standard locations,
and loads its contents into `os.environ`.

In [None]:
import os
import pathlib
from dotenv import load_dotenv

# Optional credentials file: merge <dir>/credentials.env into os.environ when it exists.
# The directory comes from CREDENTIAL_DOTENV_DIR, falling back to PWD and then to
# /opt/app-root/src. A missing file is not an error, so the same variables can be
# supplied through any other mechanism.
dotenv_dir = os.getenv('CREDENTIAL_DOTENV_DIR', os.getenv('PWD', '/opt/app-root/src'))
dotenv_path = pathlib.Path(dotenv_dir).joinpath('credentials.env')
if os.path.exists(dotenv_path):
    load_dotenv(dotenv_path=dotenv_path, override=True)

## S3 and boto3

In [None]:
import boto3

# Handle on the landing object store, configured entirely from S3_LANDING_* variables
# (a missing variable raises KeyError).
s3_source = boto3.resource(
    "s3",
    endpoint_url=os.environ['S3_LANDING_ENDPOINT'],
    aws_access_key_id=os.environ['S3_LANDING_ACCESS_KEY'],
    aws_secret_access_key=os.environ['S3_LANDING_SECRET_KEY'],
)
# Bucket that holds the raw source files.
source_bucket = s3_source.Bucket(os.environ['S3_LANDING_BUCKET'])

In [None]:
import osc_ingest_trino as osc

# Handle on the development object store, configured from S3_DEV_* variables.
s3 = boto3.resource(
    "s3",
    endpoint_url=os.environ["S3_DEV_ENDPOINT"],
    aws_access_key_id=os.environ["S3_DEV_ACCESS_KEY"],
    aws_secret_access_key=os.environ["S3_DEV_SECRET_KEY"],
)
# Bucket object obtained from the osc_ingest_trino helper for the "S3_DEV" prefix;
# used below for parquet uploads and object deletes.
trino_bucket = osc.attach_s3_bucket("S3_DEV")

## Connecting to Trino with sqlalchemy

In [None]:
import trino
from sqlalchemy.engine import create_engine

# SQLAlchemy engine for Trino, built from TRINO_* environment variables.
# TRINO_PASSWD is handed to JWTAuthentication, i.e. it is used as a JWT token.
sqlstring = f"trino://{os.environ['TRINO_USER']}@{os.environ['TRINO_HOST']}:{os.environ['TRINO_PORT']}/"
sqlargs = dict(
    auth=trino.auth.JWTAuthentication(os.environ['TRINO_PASSWD']),
    http_scheme='https',
    catalog='osc_datacommons_dev',
    schema='itr_mdt',
)
engine = create_engine(sqlstring, connect_args=sqlargs)
connection = engine.connect()

# Schema that this notebook drops, recreates and fills.
ingest_schema = 'itr_mdt'

In [None]:
import numpy as np
import pandas as pd
import io

### Global Parameters

These parameters are set/selected by the ITR tool.

### Industry Data (Sector Projections)

In [None]:
# Read every sheet of each benchmark workbook, tag the two projection sheets with
# their projection type and scenario, and stack everything into one frame.
scenarios = {}
for scenario in ['TPI', 'TPI_below_2', 'OECM']:
    # Only the OECM workbook carries the "_v2" file-name suffix.
    suffix = '_v2' if scenario == 'OECM' else ''
    workbook = os.environ.get('PWD')+f"/itr-data-pipeline/data/external/{scenario}_EI_and_production_benchmarks{suffix}.xlsx"
    df_dict = pd.read_excel(workbook, sheet_name=None)
    for projtype in ('projected_production', 'projected_ei_in_Wh'):
        sheet = df_dict[projtype]
        sheet['projection'] = projtype
        sheet['scenario'] = scenario
    scenarios[scenario] = pd.concat(df_dict.values())
df = pd.concat(scenarios, ignore_index=True)

# Move the two tag columns (added last) to the front.
cols = df.columns.tolist()
cols = cols[-2:] + cols[:-2]
df = df[cols]

In [None]:
# Wide -> long: the first four columns identify a series, every remaining column
# becomes one row labelled by 'year'.
sector_projections = pd.melt(
    df,
    id_vars=cols[:4],
    value_vars=cols[4:],
    var_name='year',
)
sector_projections

### Portfolio Data

Get RMI LEI/ISIN data

In [None]:
# Parent-level identifiers from the RMI utility table; rmi_dict maps parent LEI -> parent ISIN
# (for a repeated LEI the last row wins).
rmi_lei_isin = pd.read_sql(
    'select parent_name, parent_lei, parent_isin from rmi_20211120.utility_information',
    engine,
)
rmi_dict = {lei: isin for lei, isin in zip(rmi_lei_isin['parent_lei'], rmi_lei_isin['parent_isin'])}
rmi_lei_isin

Prepare GLEIF matching data

In [None]:
# Fetch the name/LEI match file from the landing bucket to a local scratch path,
# load every column as text, and index LEI by name (for a repeated name the last row wins).
gleif_file = s3_source.Object(os.environ['S3_LANDING_BUCKET'], 'mtiemann-GLEIF/DERA-matches.csv')
gleif_file.download_file('/tmp/dera-gleif.csv')
gleif_df = pd.read_csv('/tmp/dera-gleif.csv', header=0, sep=',', dtype=str, engine='c')
gleif_dict = {name: lei for name, lei in zip(gleif_df['name'], gleif_df['LEI'])}

In [None]:
# Spot check: look up one known company by its upper-cased name and display its LEI
# (raises KeyError if the name is not a key of gleif_dict).
gleif_dict['Old Dominion Electric Cooperative'.upper()]

Create a very simple entity matcher

In [None]:
# Hand-maintained aliases: register alternative (upper-cased) spellings of company names
# under the LEI of an entry already present in gleif_dict. Each right-hand lookup raises
# KeyError if that name is absent from gleif_dict.
# gleif_dict['Basin Electric Power Coop'.upper()] = gleif_dict['BASIN ELECTRIC POWER COOPERATIVE']
# gleif_dict['Big Rivers Electric Corp'.upper()] = gleif_dict['BIG RIVERS ELECTRIC CORPORATION']
gleif_dict['Cleco Partners LP'.upper()] = gleif_dict['CLECO CORPORATE HOLDINGS LLC']
# gleif_dict['Golden Spread Electric Coop., Inc'.upper()] = gleif_dict['GOLDEN SPREAD ELECTRIC COOPERATIVE, INC.']
# This LEI is supplied directly rather than copied from an existing entry.
gleif_dict['MIDWEST ENERGY INC'] = '549300O4B5CVWMKUES27'
gleif_dict['OG&E Energy'.upper()] = gleif_dict['OGE ENERGY CORP.']
# gleif_dict['Ohio Valley Electric Corp'.upper()] = gleif_dict['OHIO VALLEY ELECTRIC CORPORATION']
gleif_dict['Old Dominion Electric Coop'.upper()] = gleif_dict['OLD DOMINION ELECTRIC COOPERATIVE']
# 'PG&E Corp'.upper() is the same string as the key on the right, so this re-assigns an
# existing entry to itself (and fails with KeyError if that entry is missing).
gleif_dict['PG&E Corp'.upper()] = gleif_dict['PG&E CORP']
gleif_dict['Tri-State Generation & Transmission Association'.upper()] = gleif_dict['TRI-STATE GENERATION & TRANSMISSION ASSOCIATION, INC.']

# Fallback indexes keyed on the first word / first two words of the text before the first
# comma of each name. Names sharing a prefix collapse onto one key; the entry iterated
# last in gleif_dict wins, so insertion order above matters.
gleif_1 = { k.split(',')[0].split(' ')[0]:v for k,v in gleif_dict.items() }
gleif_2 = { ' '.join(k.split(',')[0].split(' ')[0:2]):v for k,v in gleif_dict.items() }

def gleif_match(x):
    """Return the LEI matching company name ``x``, or ``None`` when nothing matches.

    The name is cut at its first comma and stripped of periods, then tried in turn as:
    an exact key of ``gleif_dict``; its first two words as a key of ``gleif_2``; and,
    only when it is a single word, as a key of ``gleif_1``.

    Callers pass upper-cased names, matching the upper-case keys of the lookup tables.

    Parameters
    ----------
    x : str
        Company name. Any non-string value (``None``, ``NaN``, ``pd.NA`` — which
        ``Series.map`` passes through for missing names) yields ``None`` instead of
        raising ``AttributeError``.

    Returns
    -------
    str or None
        The matched LEI, or ``None``.
    """
    # Missing names arrive as float NaN / None from Series.map; they cannot match anything.
    if not isinstance(x, str):
        return None
    x = x.split(',')[0]
    x = x.replace('.','')
    if x in gleif_dict:
        return gleif_dict[x]
    x2 = ' '.join(x.split(' ')[0:2])
    if x2 in gleif_2:
        return gleif_2[x2]
    # The single-word index is only consulted for single-word names.
    if ' ' not in x and x in gleif_1:
        return gleif_1[x]
    return None

In [None]:
# Load the example portfolio, attach an LEI matched from the upper-cased company name,
# replace company_id with the ISIN that RMI records for that LEI, and drop the ISIN column
# that came with the file.
portfolio_df = pd.read_csv(
    f"{os.environ.get('PWD')}/itr-data-pipeline/data/external/example_portfolio_rmi_v3.csv",
    sep=';',
)
portfolio_df.insert(1, 'company_lei', portfolio_df['company_name'].str.upper().map(gleif_match))
portfolio_df.company_id = portfolio_df['company_lei'].map(rmi_dict)
portfolio_df = portfolio_df.drop(columns='company_isin')
# Show the companies for which no LEI could be matched.
display(portfolio_df.loc[portfolio_df['company_lei'].isna()])

In [None]:
# Keep only rows with no missing value in any column, then let pandas pick
# nullable dtypes for what remains.
portfolio_df = portfolio_df.dropna(how='any').convert_dtypes()
print(portfolio_df.shape[0])
portfolio_df.head(40)

In [None]:
# Reset the ingest schema to an empty state. DESTRUCTIVE: every table currently listed
# in the schema is dropped, then the schema itself is dropped and recreated.
# Make sure the schema exists so that "show tables" below cannot fail on a missing schema.
engine.execute(f"create schema if not exists {ingest_schema}").fetchall()

qres = engine.execute(f"show tables in {ingest_schema}")
l = qres.fetchall()
for x in l:
    # x[0] is the first column of each "show tables" row, used here as the table name.
    qres = engine.execute(f"drop table {ingest_schema}.{x[0]}")
    display(qres.fetchall())

engine.execute(f"drop schema {ingest_schema}").fetchall()
engine.execute(f"create schema {ingest_schema}").fetchall()

In [None]:
# Serialize the portfolio to parquet in memory and upload it under the table's prefix.
buf = io.BytesIO()
portfolio_df.to_parquet(path=buf)
buf.seek(0)  # rewind so the upload reads from the start of the buffer
trino_bucket.upload_fileobj(
    Fileobj=buf,
    Key=f'trino/{ingest_schema}/portfolio/data.parquet',
)

# Declare an external parquet table whose location is the prefix just written to.
# The column list is produced by the osc_ingest_trino helpers from the frame.
columnschema = osc.create_table_schema_pairs(osc.enforce_sql_column_names(portfolio_df))
tabledef = f"""
create table if not exists {ingest_schema}.portfolio (
{columnschema}
) with (
format = 'parquet',
external_location = 's3a://{trino_bucket.name}/trino/{ingest_schema}/portfolio/'
)
"""
print(tabledef)

table_create = engine.execute(tabledef)
for row in table_create.fetchall():
    print(row)

### Company Data

In [None]:
# We have no S3 emissions in RMI data.

# Display every row of sec_dera.sic_isic, the table joined as S2I further down to
# translate F.sic into an ISIC code.
engine.execute("select * from sec_dera.sic_isic").fetchall()

### Capture a list of the companies for which we have good financial info

In [None]:
# Diagnostic query: for each portfolio company (joined on LEI) put 2019 SEC DERA financials
# next to RMI aggregates for the same parent and year. Amounts are shown in millions,
# followed by SEC/RMI ratios, and the trailing c_* columns report how many RMI rows fed each
# aggregate. Customers/sales and asset aggregates are inner-joined (a company missing either
# is excluded); revenue-by-tech and ticker are left-joined (their columns may be NULL).
qres = engine.execute(f"""
select F.name, F.lei, T.tname, U2.parent_ticker, F.sic, F.ddate,
       'revenue' as rl, round (F.revenue_usd/1000000.0, 1), round (RT2.fy_revenue_total/1000000.0, 1), round (CS2.fy_revenues/1000000.0, 1), round (F.revenue_usd/RT2.fy_revenue_total, 1), round (F.revenue_usd/CS2.fy_revenues, 1),
       'market_cap' as fl, round (F.market_cap_usd/1000000.0, 1),
       'EV' as el, round ((F.market_cap_usd+F.debt_usd-F.cash_usd)/1000000, 1),
       'assets' as al, round (F.assets_usd/1000000.0, 1), round (AEI2.asset_value/1000000.0, 1), round (F.assets_usd/AEI2.asset_value, 1),
       'cash' as cc, round (F.cash_usd/1000000.0, 1),
       -- 'income' as il, F.income_usd/1000000.0, AEI2.fy_earnings_value/1000000.0, F.income_usd/AEI2.fy_earnings_value,
       'counts: ulei, aei, rt, cs' as legend, c_ulei, c_aei, c_rt, c_cs
from {ingest_schema}.portfolio as P
     join (select count (*) as c_ulei, U.parent_name, U.parent_lei, U.parent_ticker
           from rmi_20211120.utility_information as U
           group by U.parent_name, U.parent_lei, U.parent_ticker) as U2 on U2.parent_lei=P.company_lei
     join sec_dera.financials_by_lei as F on F.lei=P.company_lei
     join (select count (*) as c_cs, CS.parent_name, CS.year, sum(revenues) as fy_revenues
           from rmi_20211120.customers_sales as CS
           group by CS.parent_name, CS.year) as CS2 on CS2.parent_name=U2.parent_name and year(CS2.year)=year(F.ddate)
     join (select count (*) as c_aei, AEI.parent_name, AEI.year, sum(AEI.asset_value) as asset_value, sum(AEI.earnings_value) as fy_earnings_value
           from rmi_20211120.assets_earnings_investments as AEI
           group by AEI.parent_name, AEI.year) as AEI2 on AEI2.parent_name=U2.parent_name and year(AEI2.year)=year(F.ddate)
     left join (select count (*) as c_rt, RT.parent_name, RT.year, sum(RT.revenue_total) as fy_revenue_total
           from rmi_20211120.revenue_by_tech as RT
           group by RT.parent_name, RT.year) as RT2 on RT2.parent_name=U2.parent_name and year(RT2.year)=year(F.ddate)
     left join sec_dera.ticker T on F.cik=T.cik and upper(T.tname)=U2.parent_ticker
where year(F.ddate)=2019
order by F.name
""")

# Materialize the result set and report how many rows came back.
l = qres.fetchall()
print(len(l))
# Display the rows in which at least one selected column is NULL.
# (The previous test, `any(x) is None`, compared a bool with None and so never matched.)
[x for x in l if any(v is None for v in x)]

### Print that list...

In [None]:
# Show every row held in `l` (the result list fetched by the preceding query cell).
display(l)

### Capture and print a list of companies with both financial info and emissions info

In [None]:
# The first statement has no FROM clause and selects quoted names, i.e. string literals,
# so it returns a single row of labels: a printable header for the query below.
qres = engine.execute(f"""
select DISTINCT 'P.company_name', 'P.company_lei', 'P.company_id', 'S2I.isic',
       'F.country', 'UN.region', 'sector', 'production', 's1s2_co2', 's3_co2', 's1s2_ei', 's3_ei',
       'F.revenue_usd', 'F.market_cap_usd', 'ev', 'F.assets_usd', 'F.cash_usd',
       'target_probability'
""")
l = qres.fetchall()
print(l)

# Per-company base-year query: one row per portfolio company with its 2019 SEC DERA
# financials, country/region, ISIC code, and generation/emissions summed over the RMI
# respondents that share the company's LEI. Scope 3 columns are NULL placeholders and
# target_probability is a constant 0.5. Kept in `sql` because a later cell re-runs it
# through pd.read_sql.
# NOTE(review): the 265/1000000.0 factor applied to emissions_nox looks like a
#   warming-potential and unit conversion — confirm the units of both emission columns.
# NOTE(review): F is left-joined, but the following inner joins on F.country / I.alpha_3
#   discard rows where F is NULL, so companies without 2019 financials drop out — confirm
#   that this is intended.
# NOTE(review): the GROUP BY literal 'Electric Utilties' differs from the selected
#   'Electricity Utilities'; both are constants, so this looks cosmetic — confirm.
sql = f"""
select DISTINCT P.company_name, P.company_lei, P.company_id, S2I.isic,
       F.country, UN.region_ar6_10 as region,
       'Electricity Utilities' as sector, sum(E.generation) as production, sum(E.emissions_co2 + (265/1000000.0)*coalesce(E.emissions_nox, 0)) as s1s2_co2, NULL as s3_co2,
       sum(E.emissions_co2 + (265/1000000.0)*coalesce(E.emissions_nox, 0)) / sum(E.generation) as s1s2_ei, NULL as s3_ei,
       F.revenue_usd as company_revenue,
       F.market_cap_usd as company_market_cap,
       F.market_cap_usd+F.debt_usd-F.cash_usd as company_enterprise_value,
       F.assets_usd as company_total_assets,
       F.cash_usd as company_cash_equivalents,
       cast(0.5 as real) as target_probability
from {ingest_schema}.portfolio as P
     left join sec_dera.financials_by_lei as F on F.lei=P.company_lei and F.ddate>=DATE('2019-01-01') and F.ddate<DATE('2020-01-01')
     join iso3166.countries as I on F.country=I.alpha_2
     join essd.regions as UN on I.alpha_3=UN.iso
     -- join sec_dera.sub as S on S.cik=F.cik
     left join rmi_20211120.utility_information as U on U.parent_lei=P.company_lei
     -- left join gleif_mdt.gleif_isin_lei G on G.lei=P.lei and G.isin=U.parent_isin
     left join sec_dera.sic_isic as S2I on S2I.sic=F.sic
     left join rmi_20211120.operations_emissions_by_fuel as E on U.respondent_id=E.respondent_id and year(E.year)=year(F.ddate)
-- where E.owned_or_total='owned'
group by P.company_name, P.company_lei, P.company_id, S2I.isic,
       F.country, UN.region_ar6_10, 'Electric Utilties', NULL, NULL,
       F.revenue_usd, F.market_cap_usd, F.market_cap_usd+F.debt_usd-F.cash_usd, F.assets_usd, F.cash_usd,
       cast(0.5 as real)
order by P.company_name
limit 200
"""

# Run it once directly to report the row count and preview the first 40 rows.
qres = engine.execute(sql)
l = qres.fetchall()
print(len(l))
display(l[0:40])

### `financial_df` contains all the base year (2019) financial, production, and emissions data

In [None]:
# Re-run the per-company query held in `sql`, this time into a DataFrame, and report its row count.
financial_df = pd.read_sql(sql, engine)
print(len(financial_df))

### Load emissions target data

In [None]:
# Display the column definitions of the RMI emissions_targets table queried in the next section.
engine.execute("describe rmi_20211120.emissions_targets").fetchall()

### `targets_df` has all the historical and target emissions data (which can be interpreted to provide trajectory data as well)

We also preserve RMI's 1.5 degree target info, which can be presented as a trajectory to compare/contrast corporate targets with RMI's best policy recommendations

In [None]:
# Pull historical, target and 1.5C-pathway series per parent company and year.
# The SQL reduces `year` to an integer and renames generation_* to production_*.
targets_df = pd.read_sql(
    f"""
select ET.parent_name as company_name, year(year) as year,
       co2_intensity_historical, co2_intensity_target_all_years, co2_intensity_1point5C,
       co2_historical, co2_target_all_years, co2_1point5C,
       generation_historical as production_historical, generation_projected as production_projected, generation_1point5C as production_1point5C
from rmi_20211120.emissions_targets ET
     -- left join (select parent_name, parent_lei from rmi_20211120.utility_information group by parent_name, parent_lei) U
     --      on ET.parent_name=U.parent_name
""",
    engine,
)  # parse_dates=['year']

# Attach identifiers the same way as for the portfolio: LEI from the upper-cased name,
# then company_id (ISIN) from the LEI.
targets_df.insert(1, 'company_lei', targets_df['company_name'].str.upper().map(gleif_match))
targets_df.insert(2, 'company_id', targets_df['company_lei'].map(rmi_dict))

print(f"len(targets_df) = {len(targets_df)}")

In [None]:
# Inspect the rows for 2019 (the base year used for the financial data).
targets_df.loc[targets_df.year==2019]

Fix some inconsistencies in the data: cases where retrospective target information is null although historical data is available, and cases where the retrospective target is zero while emissions grow up to the present date and only then shrink again.

In [None]:
# For years before 2020, overwrite each target/projection column with the row-wise
# maximum of itself and its companion columns (missing values are ignored).
pre_2020 = targets_df.year < 2020
for target_col, source_cols in [
    ('co2_intensity_target_all_years',
     ['co2_intensity_historical', 'co2_intensity_target_all_years', 'co2_intensity_1point5C']),
    ('co2_target_all_years',
     ['co2_historical', 'co2_target_all_years', 'co2_1point5C']),
    ('production_projected',
     ['production_historical', 'production_projected']),
]:
    targets_df.loc[pre_2020, target_col] = targets_df.loc[pre_2020, source_cols].max(skipna=True, axis=1)

In [None]:
def compute_sums_and_wavg(x):
    """Collapse one group of target rows into three totals.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows with columns ``co2_target_all_years``, ``production_projected`` and
        ``co2_intensity_target_all_years``.

    Returns
    -------
    pandas.Series
        Indexed, in order, by ``co2_intensity_target_by_year`` (intensity averaged with
        production as the weight, or NaN when total production is zero),
        ``co2_target_by_year`` (summed CO2 target) and ``production_by_year``
        (summed production).
    """
    total_co2 = x['co2_target_all_years'].sum()
    total_production = x['production_projected'].sum()
    if not total_production:
        # No production to weight by: the average is undefined.
        weighted_intensity = np.nan
    else:
        weighted_sum = (x['production_projected'] * x['co2_intensity_target_all_years']).sum()
        weighted_intensity = weighted_sum / total_production
    totals = {
        'co2_target_by_year': total_co2,
        'production_by_year': total_production,
        'co2_intensity_target_by_year': weighted_intensity,
    }
    return pd.Series(totals, index=['co2_intensity_target_by_year', 'co2_target_by_year', 'production_by_year'])

# Aggregate targets from 2014 onwards into one row per (company, LEI, id, year), with
# each company's most distant year first.
# `.ffill()` replaces the deprecated `fillna(method='pad')`; the result is the same.
# NOTE(review): the forward fill runs over the whole frame before grouping, so a gap is
#   filled from whatever row precedes it — including company_lei / company_id, which are
#   missing when no LEI matched — and that row may belong to a different company.
#   Confirm whether the fill should be limited to the value columns within each company.
df = (targets_df[targets_df.year>=2014]
      .ffill().groupby(['company_name', 'company_lei', 'company_id', 'year'])
      .apply(compute_sums_and_wavg)
      .sort_values(['company_name', 'year'], ascending=[True, False])
     )

In [None]:
# Remove any earlier version of the output tables: drop each table definition, then
# delete the objects stored under its data prefix in the bucket.
for table in ['rmi_company_data', 'rmi_intensity_data', 'rmi_emissions_data', 'rmi_production_data', 'rmi_trajectory_data']:
    qres = engine.execute(f"drop table if exists {ingest_schema}.{table}")
    print(qres.fetchall())
    dres = (
        trino_bucket.objects
        .filter(Prefix=f'data/{ingest_schema}.db/{table}/')
        .delete()
    )
    print(dres)

In [None]:
# Write the base-year company data (replacing the table if present) in 200-row
# multi-value inserts, then read a few rows back as a check.
financial_df.to_sql(
    'rmi_company_data',
    engine,
    schema=ingest_schema,
    if_exists='replace',
    index=False,
    chunksize=200,
    method='multi',
)
print(engine.execute("select * from rmi_company_data limit 10").fetchall())

In [None]:
# Write each aggregated measure to its own table; reset_index() turns the
# (company_name, company_lei, company_id, year) group keys back into columns.
for value_column, table_name in [
    ('co2_intensity_target_by_year', 'rmi_intensity_data'),
    ('co2_target_by_year', 'rmi_emissions_data'),
    ('production_by_year', 'rmi_production_data'),
]:
    df[value_column].reset_index().to_sql(
        table_name, engine, schema=ingest_schema, index=False, chunksize=200, method='multi'
    )

print(engine.execute("select * from rmi_intensity_data limit 10").fetchall())

In [None]:
# Spot check one (company_name, company_lei, company_id, year) group of the aggregated frame;
# raises KeyError if that key is absent.
df.loc[('AES Corp.', '2NUNNB7D43COUIRE5295', 'US00130H1059', 2045)]

In [None]:
# Pivot the aggregated frame to one row per company, with the years spread across
# columns under each measure.
xdf = df.reset_index()
pdf = xdf.pivot(
    index=['company_name', 'company_lei', 'company_id'],
    columns='year',
).reset_index()
# Clear the names of both column levels.
pdf.columns = pdf.columns.set_names([None, None])
pdf

In [None]:
# Flat table of intensity targets: the three identifier columns followed by the
# per-year columns of the intensity measure.
co2_ei_df = pd.concat(
    [
        pdf['company_name'],
        pdf['company_lei'],
        pdf['company_id'],
        pdf['co2_intensity_target_by_year'].reset_index(),
    ],
    axis=1,
).drop(columns='index')
co2_ei_df

In [None]:
# Overwrite the 4th column (position 3, the first column after the three identifiers)
# with a straight-line extrapolation from the next two columns: 2*next - next-but-one.
# NOTE(review): this is position-based, so it must run exactly once and before the
#   'scope' column is inserted below; re-running the cell shifts the positions.
co2_ei_df.iloc[:, 3] = 2*co2_ei_df.iloc[:, 4] - co2_ei_df.iloc[:, 5]
# Keep only companies that have a company_id.
co2_ei_df = co2_ei_df[co2_ei_df.company_id.notna()]
# Label every row with the constant scope 'S1+S2', placed right after the identifiers.
co2_ei_df.insert(3, 'scope', 'S1+S2')
co2_ei_df.head(10)

In [None]:
# Remove any earlier trajectory table and the objects stored under its data prefix.
qres = engine.execute(f"drop table if exists {ingest_schema}.rmi_trajectory_data")
print(qres.fetchall())
dres = trino_bucket.objects \
        .filter(Prefix = f'data/{ingest_schema}.db/rmi_trajectory_data/') \
        .delete()
print(dres)

# Average of 1.0 and the 2019/2014 intensity ratio, i.e. the observed change is halved.
historic_progress = (1.0 + co2_ei_df[2019] / co2_ei_df[2014]) / 2

# Keep only values in (0, 1] and convert them to a per-year factor over the 5-year span.
# Rows outside that range are dropped here, so they get no 2050 value below.
annualized_progress = historic_progress.where(historic_progress>0).where(historic_progress<=1).dropna() ** (1/(2019-2014))

# Build the trajectory: blank out 2021..2049, set 2050 from the 2019 intensity compounded
# by the annual factor, then interpolate along each row between 2020 and 2050.
# NOTE(review): the exponent is (2050-2020) = 30 although the base value is the 2019
#   column — confirm whether 31 years was intended.
traj_df = co2_ei_df.copy()
traj_df.loc[:, 2021:2049]=np.nan
traj_df[2050] = co2_ei_df[2019] * annualized_progress ** (2050-2020)
traj_df.loc[:, 2020:2050] = traj_df.loc[:, 2020:2050].interpolate(axis=1)
# Back to long form (one row per company, scope and year) and write to the ingest schema.
traj_mdf = traj_df.melt(id_vars=['company_name','company_lei','company_id','scope'], var_name='year')
traj_mdf.rename(columns={'value':'co2_intensity_trajectory_by_year'}, inplace=True)
traj_mdf.to_sql('rmi_trajectory_data', engine, schema=ingest_schema, index=False, chunksize=200, method='multi')

In [None]:
# Flat table of absolute CO2 targets: identifiers, a constant 'S1+S2' scope label,
# then the per-year columns. Companies without a company_id are left out.
co2_df = pd.concat(
    [
        pdf['company_name'],
        pdf['company_lei'],
        pdf['company_id'],
        pdf['co2_target_by_year'].reset_index(),
    ],
    axis=1,
).drop(columns='index')
co2_df = co2_df.loc[co2_df['company_id'].notna()]
co2_df.insert(3, 'scope', 'S1+S2')
co2_df.head()

In [None]:
# Flat table of production: identifiers followed by the per-year production columns.
gen_df = pd.concat(
    [
        pdf['company_name'],
        pdf['company_lei'],
        pdf['company_id'],
        pdf['production_by_year'].reset_index(),
    ],
    axis=1,
).drop(columns='index')
# Position-based: replace the first column after the identifiers with a straight-line
# extrapolation from the next two columns. Must run before the insert below.
gen_df.iloc[:, 3] = 2*gen_df.iloc[:, 4] - gen_df.iloc[:, 5]
gen_df = gen_df.loc[gen_df['company_id'].notna()]
# Constant column named 'production' holding the text 'TWh'.
gen_df.insert(3, 'production', 'TWh')
gen_df.head()

In [None]:
# Export the four result frames to one workbook, one sheet each, without the index.
with pd.ExcelWriter("rmi-20211120-output.xlsx", datetime_format="YYYY") as writer:
    for sheet_name, frame in [
        ("fundamental_data", financial_df),
        ("projected_ei_in_Wh", co2_ei_df),
        ("projected_production", gen_df),
        ("projected_co2", co2_df),
    ]:
        frame.to_excel(writer, sheet_name=sheet_name, index=False)


In [None]:
# Two variants of the portfolio with target_probability forced to 0.0 and to 1.0.
# NOTE(review): neither variant is used or written in this cell; only portfolio_df is
#   exported below — confirm whether they were meant to be saved as well.
portfolio_zero = portfolio_df.copy()
portfolio_zero.target_probability = 0.0
portfolio_one = portfolio_df.copy()
portfolio_one.target_probability = 1.0

# Write the cleaned portfolio as a ';'-separated CSV without the index.
portfolio_df.to_csv("rmi-20211120-portfolio.csv", sep=';', index=False)

In [None]:
# Spot check: show the base-year rows for three specific company_id values.
financial_df.loc[financial_df.company_id.isin(["US00130H1059", "US0255371017", "US6362744095"])]

In [None]:
# Republish the RMI tables: read each one back through the engine, normalize its dtypes,
# and recreate it in the demo schema as a parquet-backed table via the osc_ingest_trino helpers.
ingest_catalog='osc_datacommons_dev'
demo_schema='demo'
# NOTE(review): the original loop special-cased 'rmi_emission_data' (no "s"), a name that
#   is not in this list, so the guard never fired and all five tables were republished.
#   That dead guard is removed here without changing which tables are processed; if
#   'rmi_emissions_data' was meant to be skipped, reinstate the check with the corrected name.
for t in ['rmi_company_data', 'rmi_emissions_data', 'rmi_intensity_data', 'rmi_production_data', 'rmi_trajectory_data']:
    df = pd.read_sql(t, engine)
    df = df.convert_dtypes()
    # Column-specific dtype overrides applied after convert_dtypes.
    if 'isic' in df.columns:
        df.isic = df.isic.astype('Int16')
    # The scope 3 columns are NULL placeholders in the company query; store them as strings.
    if 's3_co2' in df.columns:
        df.s3_co2 = df.s3_co2.astype('string')
    if 's3_ei' in df.columns:
        df.s3_ei = df.s3_ei.astype('string')
    print(t)
    try:
        # Remove the previous table definition and data before writing the new version.
        osc.drop_unmanaged_table(ingest_catalog, demo_schema, t, engine, trino_bucket)
        osc.drop_unmanaged_data(demo_schema, t, trino_bucket)
        osc.ingest_unmanaged_parquet(df, demo_schema, t, trino_bucket)
        qres = engine.execute(osc.unmanaged_parquet_tabledef(df, ingest_catalog, demo_schema, t, trino_bucket, typemap={'Int16':'smallint'}))
        print(qres.fetchall())
    except ValueError as e:
        # Best effort: keep going with the remaining tables, but report what failed
        # together with the dtypes the helper was given.
        print(f"{t}: {e}")
        print(df.dtypes)
