# ITR Data Pipeline

* Global Parameters
* Industry Data (Sector Projections)

* Portfolio Data
* Company Data
* Automation
* Temperature Scoring

## Environment variables and dot-env

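The following cell looks for a "dot-env" file named `credentials.env` in the directory given by `CREDENTIAL_DOTENV_DIR`
(falling back to `PWD`, then to `/opt/app-root/src`) and, if the file exists, loads its contents into `os.environ`, overriding variables that are already set.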

In [1]:
import os
import pathlib
from dotenv import load_dotenv

# Merge an optional dot-env file into os.environ.
# The directory is taken from CREDENTIAL_DOTENV_DIR, else PWD, else '/opt/app-root/src'.
# A missing file is silently skipped, so the same variables can be supplied
# by the surrounding environment instead.
dotenv_dir = os.environ.get('CREDENTIAL_DOTENV_DIR')
if dotenv_dir is None:
    dotenv_dir = os.environ.get('PWD', '/opt/app-root/src')
dotenv_path = pathlib.Path(dotenv_dir, 'credentials.env')
if os.path.exists(dotenv_path):
    # override=True: values from the file win over variables that are already set.
    load_dotenv(dotenv_path=dotenv_path, override=True)

## S3 and boto3

In [43]:
import boto3

# Resource handle for the "landing" S3 service. Endpoint and credentials come
# from the S3_LANDING_* environment variables; an unset variable raises KeyError.
s3_source = boto3.resource(
    "s3",
    endpoint_url=os.environ["S3_LANDING_ENDPOINT"],
    aws_access_key_id=os.environ["S3_LANDING_ACCESS_KEY"],
    aws_secret_access_key=os.environ["S3_LANDING_SECRET_KEY"],
)
# Bucket the source files are read from.
landing_bucket_name = os.environ["S3_LANDING_BUCKET"]
source_bucket = s3_source.Bucket(landing_bucket_name)

In [2]:
import boto3
# Resource handle for the "dev" S3 service. Endpoint and credentials come
# from the S3_DEV_* environment variables; an unset variable raises KeyError.
s3 = boto3.resource(
    "s3",
    endpoint_url=os.environ['S3_DEV_ENDPOINT'],
    aws_access_key_id=os.environ['S3_DEV_ACCESS_KEY'],
    aws_secret_access_key=os.environ['S3_DEV_SECRET_KEY'],
)
# Bucket handle named in S3_DEV_BUCKET.
dev_bucket_name = os.environ['S3_DEV_BUCKET']
trino_bucket = s3.Bucket(dev_bucket_name)

## Connecting to Trino with sqlalchemy

In [4]:
import trino
from sqlalchemy.engine import create_engine

# SQLAlchemy URL of the form trino://<user>@<host>:<port>/ built from the
# TRINO_USER, TRINO_HOST and TRINO_PORT environment variables.
sqlstring = f"trino://{os.environ['TRINO_USER']}@{os.environ['TRINO_HOST']}:{os.environ['TRINO_PORT']}/"

# Driver-level options: the value of TRINO_PASSWD is handed to JWTAuthentication,
# the connection uses https, and queries default to the dev catalog.
sqlargs = dict(
    auth=trino.auth.JWTAuthentication(os.environ['TRINO_PASSWD']),
    http_scheme='https',
    catalog='osc_datacommons_dev',
)

engine = create_engine(sqlstring, connect_args=sqlargs)
connection = engine.connect()

In [5]:
import pandas as pd

### Global Parameters

These parameters are set/selected by the ITR tool.

### Industry Data (Sector Projections)

In [34]:
# Read the benchmark workbook of every scenario and stack all of their sheets
# into one wide frame, tagging rows with the `projection` and `scenario` they belong to.

# Same PWD fallback as the dot-env cell, so an unset PWD no longer fails with
# "unsupported operand type(s) for +: 'NoneType' and 'str'".
benchmark_dir = pathlib.Path(os.environ.get('PWD', '/opt/app-root/src')) / 'itr-data-pipeline' / 'data' / 'external'

scenarios = {}
for scenario in ['TPI', 'TPI_below_2', 'OECM']:
    # Only the OECM workbook carries a "_v2" suffix in its file name.
    suffix = '_v2' if scenario == 'OECM' else ''
    # sheet_name=None loads every sheet: {sheet name: DataFrame}.
    df_dict = pd.read_excel(benchmark_dir / f"{scenario}_EI_and_production_benchmarks{suffix}.xlsx", sheet_name=None)
    for projtype in ['projected_production', 'projected_ei_in_Wh']:
        df_dict[projtype]['projection'] = projtype
        df_dict[projtype]['scenario'] = scenario
    scenarios[scenario] = pd.concat(df_dict.values())
df = pd.concat(scenarios, ignore_index=True)
# Move the last two columns to the front; these are expected to be the
# `projection` and `scenario` tags added above.
cols = df.columns.tolist()
cols = cols[-2:] + cols[0:-2]
df = df[cols]
df

Unnamed: 0,projection,scenario,region,sector,2019,2020,2021,2022,2023,2024,...,2041,2042,2043,2044,2045,2046,2047,2048,2049,2050
0,projected_ei_in_Wh,TPI,Global,Steel,0.60756,0.457,0.4376,0.4182,0.3988,0.3794,...,0.0888,0.0806,0.0724,0.0642,0.056,0.0528,0.0496,0.0464,0.0432,0.04
1,projected_ei_in_Wh,TPI,Global,Electricity Utilities,1.669,1.498,1.4718,1.4456,1.4194,1.3932,...,0.7858,0.7586,0.7314,0.7042,0.677,0.6658,0.6546,0.6434,0.6322,0.621
2,projected_production,TPI,Global,Steel,0.0,0.015,0.015,0.015,0.015,0.015,...,0.015,0.015,0.015,0.015,0.015,0.015,0.015,0.015,0.015,0.015
3,projected_production,TPI,Europe,Steel,0.0,0.015,0.015,0.015,0.015,0.015,...,0.015,0.015,0.015,0.015,0.015,0.015,0.015,0.015,0.015,0.015
4,projected_production,TPI,North America,Steel,0.0,0.015,0.015,0.015,0.015,0.015,...,0.015,0.015,0.015,0.015,0.015,0.015,0.015,0.015,0.015,0.015
5,projected_production,TPI,Global,Electricity Utilities,0.0,-0.076399,0.059958,0.059958,0.059958,0.059958,...,0.025848,0.025848,0.025848,0.025848,0.025848,0.011913,0.011913,0.011913,0.011913,0.011913
6,projected_production,TPI,Europe,Electricity Utilities,0.0,-0.076445,0.025801,0.025801,0.025801,0.025801,...,0.012046,0.012046,0.012046,0.012046,0.012046,0.00636,0.00636,0.00636,0.00636,0.00636
7,projected_production,TPI,North America,Electricity Utilities,0.0,-0.07581,0.019813,0.019813,0.019813,0.019813,...,0.014293,0.014293,0.014293,0.014293,0.014293,0.003227,0.003227,0.003227,0.003227,0.003227
8,projected_ei_in_Wh,TPI_below_2,Global,Steel,0.60756,0.44,0.418,0.396,0.374,0.352,...,0.0616,0.0512,0.0408,0.0304,0.02,0.0144,0.0088,0.0032,-0.0024,-0.008
9,projected_ei_in_Wh,TPI_below_2,Global,Electricity Utilities,1.669,1.325,1.2692,1.2134,1.1576,1.1018,...,0.4566,0.4362,0.4158,0.3954,0.375,0.3526,0.3302,0.3078,0.2854,0.263


In [37]:
# Reshape from wide to long: the first four columns identify a series, and every
# remaining column is unpivoted into a ('year', 'value') pair per row.
id_columns, year_columns = cols[:4], cols[4:]
sector_projections = df.melt(id_vars=id_columns, value_vars=year_columns, var_name='year')
sector_projections

Unnamed: 0,projection,scenario,region,sector,year,value
0,projected_ei_in_Wh,TPI,Global,Steel,2019,0.607560
1,projected_ei_in_Wh,TPI,Global,Electricity Utilities,2019,1.669000
2,projected_production,TPI,Global,Steel,2019,0.000000
3,projected_production,TPI,Europe,Steel,2019,0.000000
4,projected_production,TPI,North America,Steel,2019,0.000000
...,...,...,...,...,...,...
891,projected_production,OECM,Europe,Steel,2050,0.015000
892,projected_production,OECM,North America,Steel,2050,0.015000
893,projected_production,OECM,Global,Electricity Utilities,2050,0.011913
894,projected_production,OECM,Europe,Electricity Utilities,2050,0.006360


### Portfolio Data

Prepare GLEIF matching data

In [44]:
# Download the DERA/GLEIF match table from the landing bucket to a local scratch
# file and build a {name: LEI} lookup from it.
gleif_local_path = '/tmp/dera-gleif.csv'
gleif_file = s3_source.Object(os.environ['S3_LANDING_BUCKET'], 'mtiemann-GLEIF/DERA-matches.csv')
gleif_file.download_file(gleif_local_path)
# dtype=str keeps every column as text; empty cells still come back as NaN.
gleif_df = pd.read_csv(gleif_local_path, header=0, sep=',', dtype=str, engine='c')
# Bracket access avoids any clash between column names and DataFrame attributes.
# Rows without a name are skipped: a NaN key cannot be matched against and would
# break the string splitting applied to the keys when the prefix tables are built.
gleif_dict = {
    name: lei
    for name, lei in zip(gleif_df['name'], gleif_df['LEI'])
    if isinstance(name, str)
}

In [79]:
# Looser lookup tables keyed on the first word (gleif_1) and on the first two
# words (gleif_2) of every name in gleif_dict. When several names share a
# prefix, the one that comes last in gleif_dict wins.
gleif_1, gleif_2 = {}, {}
for name, lei in gleif_dict.items():
    words = name.split(' ')
    gleif_1[words[0]] = lei
    gleif_2[' '.join(words[:2])] = lei

def gleif_match(x):
    """Return the LEI recorded for company name `x`, or None when nothing matches.

    Every ', ' in `x` is first replaced by a single space. The lookup then tries,
    in order:
      1. the whole name in `gleif_dict`;
      2. the first two space-separated words in `gleif_2`;
      3. the first word in `gleif_1`.

    Non-string input (for example the NaN that `Series.map` passes for a missing
    company name) yields None instead of raising AttributeError.
    """
    if not isinstance(x, str):
        return None
    x = x.replace(', ', ' ')
    if x in gleif_dict:
        return gleif_dict[x]
    words = x.split(' ')
    x2 = ' '.join(words[0:2])
    if x2 in gleif_2:
        return gleif_2[x2]
    x1 = words[0]
    if x1 in gleif_1:
        return gleif_1[x1]
    return None

# Load the example portfolio (semicolon-separated) and attach an LEI to every
# holding by matching its upper-cased company name with gleif_match.
# Same PWD fallback as the dot-env cell, so an unset PWD no longer fails with
# "unsupported operand type(s) for +: 'NoneType' and 'str'".
portfolio_path = (pathlib.Path(os.environ.get('PWD', '/opt/app-root/src'))
                  / 'itr-data-pipeline' / 'data' / 'external' / 'example_portfolio_rmi_v2.csv')
portfolio_df = pd.read_csv(portfolio_path, delimiter=';')
portfolio_df['LEI'] = portfolio_df['company_name'].str.upper().map(gleif_match)
# Uncomment to inspect the holdings that could not be matched:
# display(portfolio_df[portfolio_df.LEI.isna()])
# NOTE(review): how='any' drops a row when ANY column is missing, not only LEI;
# confirm that is intended.
portfolio_df = portfolio_df.dropna(how='any')
# portfolio_df

### Company Data

In [None]:
# We have no S3 (Scope 3) emissions in RMI data; "S3" here is not the S3 object storage used above.

