# Data Vault Demo

The basic concept of the Data Vault is that when users authenticate themselves, they receive an engine that gives them access to all the data (rows, columns, tables, schemas, etc.) for which they are authorized.  Users who can authenticate themselves for multiple roles can use those roles simultaneously.  Data accessed via such engines retains its data lineage, so that users can prove they are using authorized data.

The steps of this demo are:

1. Authenticate and acquire an engine
    1. Dev engine sees all
    2. Quant engine can do temp scoring but cannot see fundamental company info
    3. User engine can use temp scoring but cannot see cumulative emissions or overshoot info
2. Construct Vaults for:
    1. Fundamental corporate financial information
    2. Corporate emissions data (base year, historical)
    3. Corporate target data (start year, end year, target start value, target end value)
    4. Sector benchmark data (production, CO2e intensity)
3. Dev Engine: Visualize projected emissions (targets and trajectories)
4. Quant Engine: Using calculated cumulative emissions values, visualize per-company trajectory and target temperature scores
5. User Engine: Using consensus probability scoring
    1. Publish per-company temperature alignment score
    2. Based on aggregate portfolio information, produce weighting scores to yield overall portfolio alignment score

In [1]:
import os
import pathlib
from dotenv import load_dotenv

# Optionally seed the environment from a 'credentials.env' dot-env file.
# The directory is taken from CREDENTIAL_DOTENV_DIR, then PWD, then the
# hard-coded '/opt/app-root/src'.  A missing file is silently skipped so the
# variables can be supplied some other way.
_fallback_dir = os.environ.get('PWD', '/opt/app-root/src')
dotenv_dir = os.environ.get('CREDENTIAL_DOTENV_DIR', _fallback_dir)
dotenv_path = pathlib.Path(dotenv_dir).joinpath('credentials.env')
if os.path.exists(dotenv_path):
    load_dotenv(dotenv_path=dotenv_path, override=True)

In [2]:
import trino
from sqlalchemy.engine import create_engine

# Connection URL for the default Trino user; user, host and port all come
# from the environment (a missing variable raises KeyError here).
sqlstring = 'trino://{user}@{host}:{port}/'.format(
    user=os.environ['TRINO_USER'],
    host=os.environ['TRINO_HOST'],
    port=os.environ['TRINO_PORT'],
)

# Driver-level options: JWT auth over HTTPS, with a default catalog and
# schema set for the session.
sqlargs = dict(
    auth=trino.auth.JWTAuthentication(os.environ['TRINO_PASSWD']),
    http_scheme='https',
    catalog='osc_datacommons_dev',
    schema='itr_mdt',
)

engine = create_engine(sqlstring, connect_args=sqlargs)
connection = engine.connect()

# Name of the schema used for ingested tables in later cells.
ingest_schema = 'itr_mdt'

In [3]:
import json
import os
import pandas as pd
from numpy.testing import assert_array_equal
import ITR

# from ITR.portfolio_aggregation import PortfolioAggregationMethod
# from ITR.temperature_score import TemperatureScore
# from ITR.configs import ColumnsConfig, TemperatureScoreConfig
from ITR.data.data_warehouse import DataWarehouse
from ITR.data.vault_providers import VaultCompanyDataProvider, VaultProviderProductionBenchmark, \
    VaultProviderIntensityBenchmark, DataVaultWarehouse
from ITR.interfaces import ICompanyData, EScope, ETimeFrames, PortfolioCompany, IEmissionIntensityBenchmarkScopes, \
    IProductionBenchmarkScopes

# Schema name handed to VaultCompanyDataProvider further down.
# NOTE(review): the same value is already assigned in the connection cell
# above -- presumably repeated so this cell can run on its own; confirm.
ingest_schema = 'itr_mdt'

In [4]:
# Second connection, this time with the USER1 credentials from the environment.
# Rebinds the sqlstring/sqlargs names used for the default engine above.
# NOTE(review): the result is called engine_dev although it is built from the
# *_USER1 variables, and no catalog/schema is set here -- confirm both are intended.
sqlstring = 'trino://{user}@{host}:{port}/'.format(
    user=os.environ['TRINO_USER_USER1'],
    host=os.environ['TRINO_HOST'],
    port=os.environ['TRINO_PORT'],
)
sqlargs = dict(
    auth=trino.auth.JWTAuthentication(os.environ['TRINO_PASSWD_USER1']),
    http_scheme='https',
)

engine_dev = create_engine(sqlstring, connect_args=sqlargs)
# Echo the engine before connecting.
print("connecting with engine " + str(engine_dev))
connection_dev = engine_dev.connect()

connecting with engine Engine(trino://os-climate-user1@trino-secure-odh-trino.apps.odh-cl1.apps.os-climate.org:443/)


In [None]:
# Resolve the directory that contains the ITR test "inputs" folder, then
# point at the two OECM benchmark JSON files beneath it.
_inputs_dir = os.path.abspath("/opt/app-root/src/ITR/test/inputs")
root = os.path.dirname(_inputs_dir)
benchmark_prod_json = os.path.join(root, "inputs", "json", "benchmark_production_OECM.json")
benchmark_EI_json = os.path.join(root, "inputs", "json", "benchmark_EI_OECM.json")

# Production benchmarks: parse the JSON into the interface model and wrap it
# in a vault provider bound to engine_dev.
with open(benchmark_prod_json) as json_file:
    parsed_json = json.load(json_file)
prod_bms = IProductionBenchmarkScopes.parse_obj(parsed_json)
vault_production_bm = VaultProviderProductionBenchmark(
    engine=engine_dev,
    benchmark_name="benchmark_prod",
    production_benchmarks=prod_bms,
)

# Emission-intensity benchmarks: same pattern as above.
# NOTE(review): unlike the production benchmark, this provider (and the
# company-data provider below) is built without an engine argument -- confirm
# they obtain their connection some other way.
with open(benchmark_EI_json) as json_file:
    parsed_json = json.load(json_file)
ei_bms = IEmissionIntensityBenchmarkScopes.parse_obj(parsed_json)
vault_EI_bm = VaultProviderIntensityBenchmark(
    benchmark_name="benchmark_ei",
    EI_benchmarks=ei_bms,
)

# Company data comes from the "rmi_company_data" table.
# TODO: ISIC code should read as int, not float
vault_company_data = VaultCompanyDataProvider(ingest_schema, "rmi_company_data")

# Combine company data with both benchmarks into a single warehouse object.
vault_warehouse = DataVaultWarehouse(vault_company_data, vault_production_bm, vault_EI_bm)

# Still to be demonstrated:
# Show projections for emissions trajectories, production, and emission targets (N0 only)
# Show cumulative emissions (trajectory, target) and budget (N1 can also see)

def test_N1_temp_scores(self):
    """Placeholder for the N1 temperature-score steps; the body is a no-op.

    The comments below list what the step is meant to show.

    NOTE(review): defined at module level yet takes ``self`` -- looks like it
    was lifted from a test class; confirm before calling it directly.
    """
    # Show cumulative emissions (trajectory, target) and budget (N1 can see)
    # Show overshoot ratios (trajectory, target) (N1 can see)
    # Show trajectory and target temp scores (N2 can also see)
    pass

def test_N2_portfolio(self):
    """Placeholder for the N2 portfolio-scoring steps; the body is a no-op.

    The comments below list what the step is meant to show.

    NOTE(review): defined at module level yet takes ``self`` -- looks like it
    was lifted from a test class; confirm before calling it directly.
    """
    # Show weighted temp score over portfolio (N2 can see)
    # Different weighting types give different coefficients
    pass

In [None]:
# Read the whole "rmi_emission_data" table into a DataFrame through the
# default engine, whose connect_args set the catalog and schema.
# The table name is a plain literal: the original f-string had no placeholders.
df = pd.read_sql_table("rmi_emission_data", engine)

In [None]:
# Order rows by company and then by year, and renumber the index from zero
# so it no longer reflects the pre-sort row order.
_sort_keys = ['company_name', 'year']
df = df.sort_values(by=_sort_keys)
df = df.reset_index(drop=True)

In [None]:
# Bare expression as the last line of the cell, so the notebook renders the DataFrame.
df

In [None]:
# Reshape to one row per year and one column per company, then turn 'year'
# back into an ordinary column so that it sits at position 0.
_targets_by_year = df.pivot(index='year', columns='company_name', values='co2_target_by_year').reset_index()

# Positions to plot: 'year' plus the first two companies, then every third
# column below position 90 (presumably to thin out the lines on the chart).
_wanted_positions = [*range(0, 3), *range(3, 90, 3)]
# Drop positions past the last column: .iloc raises IndexError on an
# out-of-bounds list indexer, which would break this cell for data sets with
# fewer companies.  With 88 or more columns the selection is unchanged.
_plot_positions = [pos for pos in _wanted_positions if pos < _targets_by_year.shape[1]]

_targets_by_year.iloc[:, _plot_positions].plot(x='year', kind='line', figsize=(24,10))