# ITR Data Pipeline

* Global Parameters
* Industry Data (Sector Projections)

* Portfolio Data
* Company Data
* Automation
* Temperature Scoring

## Environment variables and dot-env

The following cell looks for a "dot-env" file in some standard locations,
and loads its contents into `os.environ`.

In [None]:
import os
import pathlib
from dotenv import load_dotenv

# Look for a credentials.env file and, when present, load it into os.environ.
# Search order for the directory: CREDENTIAL_DOTENV_DIR, then PWD, then the
# default app root. A missing file is not an error, so the same variables can
# instead be supplied to the process in some other way.
default_dir = os.environ.get('PWD', '/opt/app-root/src')
dotenv_dir = os.environ.get('CREDENTIAL_DOTENV_DIR', default_dir)
dotenv_path = pathlib.Path(dotenv_dir).joinpath('credentials.env')
if os.path.exists(dotenv_path):
    # override=True is passed so that values from the file take precedence
    # over variables already set (see the python-dotenv documentation).
    load_dotenv(dotenv_path=dotenv_path, override=True)

## S3 and boto3

In [None]:
import boto3

# Connect to the landing-zone S3 endpoint. Every setting is read from an
# S3_LANDING_* environment variable; a missing variable raises KeyError.
landing_settings = {
    "endpoint_url": os.environ['S3_LANDING_ENDPOINT'],
    "aws_access_key_id": os.environ['S3_LANDING_ACCESS_KEY'],
    "aws_secret_access_key": os.environ['S3_LANDING_SECRET_KEY'],
}
s3_source = boto3.resource(service_name="s3", **landing_settings)

# Handle on the landing bucket itself.
landing_bucket_name = os.environ['S3_LANDING_BUCKET']
source_bucket = s3_source.Bucket(landing_bucket_name)

In [None]:
import boto3
# attach_s3_bucket is imported explicitly in this cell: the notebook's general
# `from osc_ingest_trino import *` only runs in a later cell, so without this
# line a fresh "Restart Kernel -> Run All" stops here with a NameError.
from osc_ingest_trino import attach_s3_bucket

# Resource handle for the development S3 endpoint, configured from the
# S3_DEV_* environment variables (a missing variable raises KeyError).
s3 = boto3.resource(
    service_name="s3",
    endpoint_url=os.environ["S3_DEV_ENDPOINT"],
    aws_access_key_id=os.environ["S3_DEV_ACCESS_KEY"],
    aws_secret_access_key=os.environ["S3_DEV_SECRET_KEY"],
)
# Bucket used for the Trino external-table data written further down.
# NOTE(review): attach_s3_bucket is given only the "S3_DEV" prefix, so it
# presumably reads the S3_DEV_* variables itself -- confirm in osc-ingest-tools.
trino_bucket = attach_s3_bucket("S3_DEV")

## Connecting to Trino with sqlalchemy

In [None]:
import trino
from sqlalchemy.engine import create_engine

# Build the Trino connection URL from TRINO_USER / TRINO_HOST / TRINO_PORT.
trino_user = os.environ['TRINO_USER']
trino_host = os.environ['TRINO_HOST']
trino_port = os.environ['TRINO_PORT']
sqlstring = f"trino://{trino_user}@{trino_host}:{trino_port}/"

# Connection arguments: JWT authentication using the token held in
# TRINO_PASSWD, HTTPS transport, and a fixed default catalog.
sqlargs = dict(
    auth=trino.auth.JWTAuthentication(os.environ['TRINO_PASSWD']),
    http_scheme='https',
    catalog='osc_datacommons_dev',
)
engine = create_engine(sqlstring, connect_args=sqlargs)
connection = engine.connect()

# Schema that receives the tables created by this notebook.
ingest_schema = 'itr_mdt'

In [None]:
import pandas as pd
import io
from osc_ingest_trino import *

### Global Parameters

These parameters are set/selected by the ITR tool.

### Industry Data (Sector Projections)

In [None]:
# Directory the repository is checked out under. The fallback matches the one
# used when locating credentials.env, so an unset PWD no longer fails with
# "unsupported operand type(s) for +: 'NoneType' and 'str'".
base_dir = os.environ.get('PWD', '/opt/app-root/src')

# Load the benchmark workbook of every scenario and tag its projection sheets.
scenarios = {}
for scenario in ['TPI', 'TPI_below_2', 'OECM']:
    # Only the OECM workbook carries a "_v2" suffix in its file name.
    suffix = '_v2' if scenario == 'OECM' else ''
    workbook = f"{base_dir}/itr-data-pipeline/data/external/{scenario}_EI_and_production_benchmarks{suffix}.xlsx"
    # sheet_name=None returns a dict of {sheet name: DataFrame} for all sheets.
    df_dict = pd.read_excel(workbook, sheet_name=None)
    for projtype in ['projected_production', 'projected_ei_in_Wh']:
        # Record on each row which sheet and which scenario it came from.
        df_dict[projtype]['projection'] = projtype
        df_dict[projtype]['scenario'] = scenario
    # Every sheet of the workbook is concatenated, not only the two tagged above.
    scenarios[scenario] = pd.concat(df_dict.values())
df = pd.concat(scenarios, ignore_index=True)

# Move the last two columns to the front.
# NOTE(review): assumes these are 'projection' and 'scenario', which holds
# when no later sheet introduces extra columns -- confirm against the workbooks.
cols = df.columns.tolist()
cols = cols[-2:] + cols[:-2]
df = df[cols]
df

In [None]:
# Reshape from wide to long: the first four columns stay as identifiers and
# every remaining column becomes one row, with its label stored under 'year'.
id_columns, year_columns = cols[:4], cols[4:]
sector_projections = df.melt(id_vars=id_columns, value_vars=year_columns, var_name='year')
sector_projections

### Portfolio Data

Prepare GLEIF matching data

In [None]:
# Download the DERA/GLEIF match table from the landing bucket to a local
# scratch file, then build a {company name: LEI} lookup from it.
gleif_local_path = '/tmp/dera-gleif.csv'
gleif_file = s3_source.Object(os.environ['S3_LANDING_BUCKET'], 'mtiemann-GLEIF/DERA-matches.csv')
gleif_file.download_file(gleif_local_path)

# dtype=str reads every column as text.
gleif_df = pd.read_csv(gleif_local_path, header=0, sep=',', dtype=str, engine='c')
gleif_names, gleif_leis = gleif_df.name, gleif_df.LEI
# When a name occurs more than once, the last occurrence wins.
gleif_dict = dict(zip(gleif_names, gleif_leis))

In [None]:
# Fallback lookups keyed on a shortened name: the text before the first comma
# is reduced to its first word (gleif_1) or its first two words (gleif_2).
# Several full names can share a shortened key; the last one seen wins.
gleif_1 = {}
gleif_2 = {}
for full_name, lei_code in gleif_dict.items():
    leading_words = full_name.split(',')[0].split(' ')
    gleif_1[leading_words[0]] = lei_code
    gleif_2[' '.join(leading_words[0:2])] = lei_code

def gleif_match(x):
    """Return the LEI matching company name ``x``, or None if nothing matches.

    Lookups are tried from most to least specific:

    1. ``x`` exactly as given, in ``gleif_dict``;
    2. the text before the first comma with periods removed, in ``gleif_dict``;
    3. the first two words of that text, in ``gleif_2``;
    4. that text when it is a single word, in ``gleif_1``.

    Step 1 exists because the later steps strip the comma suffix and periods
    from ``x`` but not from the dictionary keys, so a key such as
    "FOO, INC." could otherwise only be reached through the looser prefix
    tables.

    Parameters:
        x: company name; anything that is not a ``str`` (for example the NaN
           that ``Series.map`` passes for a missing name) yields None.

    Returns:
        The matching LEI, or None.
    """
    if not isinstance(x, str):
        return None
    # Most specific: the full name is itself a key.
    if x in gleif_dict:
        return gleif_dict[x]
    x = x.split(',')[0]
    x = x.replace('.','')
    if x in gleif_dict:
        return gleif_dict[x]
    x2 = ' '.join(x.split(' ')[0:2])
    if x2 in gleif_2:
        return gleif_2[x2]
    # The one-word table is only consulted for one-word names.
    if ' ' not in x and x in gleif_1:
        return gleif_1[x]
    return None

# Directory the repository is checked out under. The fallback matches the one
# used when locating credentials.env, so an unset PWD no longer fails with a
# TypeError from None + str.
base_dir = os.environ.get('PWD', '/opt/app-root/src')

# Load the example portfolio (semicolon-delimited) and attach an LEI to each
# company by matching its upper-cased name against the GLEIF lookups.
portfolio_df = pd.read_csv(base_dir + "/itr-data-pipeline/data/external/example_portfolio_rmi_v2.csv",
                           delimiter=';')
portfolio_df['LEI'] = portfolio_df['company_name'].str.upper().map(gleif_match)
# To inspect the names that found no LEI, run at this point:
#     display(portfolio_df[portfolio_df.LEI.isna()])
# dropna(how='any') removes a row when ANY column is missing, so companies
# without an LEI are dropped -- and so is any row with a gap in another column.
portfolio_df = portfolio_df.dropna(how='any').convert_dtypes()

In [None]:
# Display the portfolio after LEI matching and removal of incomplete rows.
portfolio_df

In [None]:
# Reset the ingest schema to an empty state.
# It is created first (if absent) so that listing its tables cannot fail;
# then every table is dropped, and the schema itself is dropped and re-created.
engine.execute(f"create schema if not exists {ingest_schema}").fetchall()

existing_tables = engine.execute(f"show tables in {ingest_schema}").fetchall()
for table_row in existing_tables:
    # table_row[0] is the table name.
    drop_result = engine.execute(f"drop table {ingest_schema}.{table_row[0]}")
    display(drop_result.fetchall())

engine.execute(f"drop schema {ingest_schema}").fetchall()
engine.execute(f"create schema {ingest_schema}").fetchall()

In [None]:
# Normalise the column names once and use the resulting frame for BOTH the
# Parquet file and the table definition. Previously the file was written from
# the un-normalised frame while the DDL used the normalised names, so any
# column renamed by enforce_sql_column_names would differ between the two.
portfolio_sql_df = enforce_sql_column_names(portfolio_df)

# Serialise to Parquet in memory and upload it under the table's location.
buf = io.BytesIO()
portfolio_sql_df.to_parquet(path=buf)
buf.seek(0)  # rewind so the upload reads from the start of the buffer
trino_bucket.upload_fileobj(Fileobj=buf,
                            Key=f'trino/{ingest_schema}/portfolio/data.parquet')

# Declare an external Parquet table over the directory just written to.
columnschema = create_table_schema_pairs(portfolio_sql_df)
tabledef = f"""
create table if not exists {ingest_schema}.portfolio (
{columnschema}
) with (
format = 'parquet',
external_location = 's3a://{trino_bucket.name}/trino/{ingest_schema}/portfolio/'
)
"""
print(tabledef)

table_create = engine.execute(tabledef)
for row in table_create.fetchall():
    print(row)



### Company Data

In [None]:
# We have no S3 emissions in RMI data.
# NOTE(review): "S3" here presumably means Scope 3 emissions rather than the
# S3 object store used earlier in this notebook -- confirm.

# Fetch every row of sec_dera.sic_isic.
engine.execute("select * from sec_dera.sic_isic").fetchall()

In [None]:
# Inspect the column layout of rmi_20211120.revenue_by_tech.
engine.execute("describe rmi_20211120.revenue_by_tech").fetchall()

In [None]:
# Inspect the column layout of sec_dera.financials_by_lei.
engine.execute("describe sec_dera.financials_by_lei").fetchall()

In [None]:
# Spot check: the LEI stored under the exact key "MIDWEST ENERGY"
# (raises KeyError when no such key exists).
gleif_dict['Midwest Energy'.upper()]

In [None]:
# Spot check: the LEI the two-word prefix table returns for "MIDWEST ENERGY".
gleif_2['MIDWEST ENERGY']

In [None]:
# LEI of the company used for the spot checks in this section.
lei = 'TWSEY0NEDUDCKS27AH81'
# Count the emissions-by-fuel rows of respondent 213.
# NOTE(review): `lei` is not used by this query and the f-string has no
# placeholders; respondent 213 is presumably the utility behind `lei` -- confirm.
qres=engine.execute(f"select count(*) from rmi_20211120.operations_emissions_by_fuel where respondent_id=213")
qres.fetchall()

In [None]:
# All SEC DERA financial rows for the spot-check company. The LEI is taken
# from `lei` rather than repeated as a literal, so changing `lei` in the cell
# above changes this query as well.
qres = engine.execute(f"select * from sec_dera.financials_by_lei where lei = '{lei}'")
qres.fetchall()

In [None]:
# Portfolio rows for the spot-check company. Schema and LEI come from
# `ingest_schema` and `lei` so this check follows the table actually written
# by this notebook instead of a hard-coded 'itr_mdt'.
qres = engine.execute(f"select * from {ingest_schema}.portfolio where lei = '{lei}'")
qres.fetchall()

In [None]:
# For one parent LEI and the year 2019-01-01, sum asset and earnings values
# per (parent name, parent LEI, respondent, year) by joining
# assets_earnings_investments to utility_information on respondent_id.
# The LEI literal in the query is the same value assigned to `lei` earlier.
qres = engine.execute("""
select AEI.parent_name, U.parent_lei, U.respondent_id, AEI.year, sum(AEI.asset_value) as fy_asset_value, sum(AEI.earnings_value) as fy_earnings_value
from rmi_20211120.assets_earnings_investments as AEI join rmi_20211120.utility_information U on AEI.respondent_id=U.respondent_id
where U.parent_lei='TWSEY0NEDUDCKS27AH81' and AEI.year=DATE('2019-01-01')
group by AEI.parent_name, U.parent_lei, U.respondent_id, AEI.year
""")
l = qres.fetchall()
# Number of result rows (one per group).
print(len(l))

In [None]:
# Preview the first ten rows of the result list `l` fetched in the previous cell.
l[:10]

In [None]:
# Side-by-side comparison, for every portfolio company and fiscal year
# 2019-01-01, of the SEC DERA financials (F) against totals aggregated from
# the RMI tables. Everything is joined to the portfolio (P) on the LEI:
#   U2   - utility_information grouped by parent LEI / ticker (c_ulei = row count)
#   F    - sec_dera.financials_by_lei; the pair of coalesce() calls in the join
#          requires the tickers to agree when both are present and lets a NULL
#          on either side match
#   AEI2 - asset and earnings sums per parent LEI and year (c_aei = row count)
#   RT2  - total-revenue sums per parent LEI and year (c_rt = row count)
#   EM2  - CO2 emission sums per parent LEI and year (c_em = row count)
# The year of AEI2/RT2/EM2 must equal F.fy. Monetary columns are divided by
# 1,000,000. The quoted literals ('CO2e', 'revenue', 'assets', 'income',
# 'counts: ...') are constant label columns separating the groups of values in
# each output row. All joins are inner joins, so a company missing from any
# one of the sources does not appear in the result.
qres = engine.execute(f"""
select F.name, F.lei, F.tname, U2.parent_ticker, F.sic,
       'CO2e' as el, EM2.fy_emissions,
       'revenue' as rl, F.revenue_usd/1000000.0, RT2.fy_revenue_total/1000000.0,
       'assets' as al, F.assets_usd/1000000.0, AEI2.fy_asset_value/1000000.0,
       'income' as il, F.income_usd/1000000.0, AEI2.fy_earnings_value/1000000.0,
       'counts: ulei, aei, rt, em' as legend, c_ulei, c_aei, c_rt, c_em
from {ingest_schema}.portfolio as P
     join (select count (*) as c_ulei, U.parent_lei, U.parent_ticker
           from rmi_20211120.utility_information as U
           group by U.parent_lei, U.parent_ticker) as U2 on U2.parent_lei=P.lei
     join sec_dera.financials_by_lei as F on F.lei=P.lei and coalesce(upper(F.tname),U2.parent_ticker,'None')=coalesce(U2.parent_ticker,upper(F.tname),'None')
     join (select count (*) as c_aei, U.parent_lei, AEI.year, sum(AEI.asset_value) as fy_asset_value, sum(AEI.earnings_value) as fy_earnings_value
           from rmi_20211120.assets_earnings_investments as AEI join rmi_20211120.utility_information U on AEI.respondent_id=U.respondent_id
           group by U.parent_lei, AEI.year) as AEI2 on AEI2.parent_lei=P.lei and AEI2.year=F.fy
     join (select count (*) as c_rt, U.parent_lei, RT.year, sum(RT.revenue_total) as fy_revenue_total
           from rmi_20211120.revenue_by_tech as RT join rmi_20211120.utility_information U on RT.respondent_id=U.respondent_id
           group by U.parent_lei, RT.year) as RT2 on RT2.parent_lei=P.lei and RT2.year=F.fy
     join (select count(*) as c_em, U.parent_lei, EM.year, sum(EM.emissions_co2) as fy_emissions
           from rmi_20211120.operations_emissions_by_fuel as EM join rmi_20211120.utility_information U on EM.respondent_id=U.respondent_id
           group by U.parent_lei, EM.year) as EM2 on EM2.parent_lei=P.lei and EM2.year=F.fy
where fy=DATE('2019-01-01')
order by F.name
""")

l = qres.fetchall()
# Number of result rows.
print(len(l))

In [None]:
# Show every row of the result list `l` fetched in the previous cell.
l

In [None]:
# Survey every table in the rmi_20211120 schema: show its row count, then
# its column description.
qres = engine.execute("show tables in rmi_20211120")
l = qres.fetchall()
for x in l:
    # x[0] is the table name (first column of each `show tables` row).
    qres = engine.execute(f"select count(*) from rmi_20211120.{x[0]}")
    display(f"{x[0]}: {qres.fetchall()[0][0]}")
    qres = engine.execute(f"describe rmi_20211120.{x[0]}")
    display(qres.fetchall())