# TPI Benchmark Data Pipeline

The Benchmark data pipelines organize and assemble benchmark data needed for the ITR tool.  This pipeline supports several TPI Benchmark scenarios (published 20 October 2022).


### Environment variables and dot-env

The following cell looks for a "dot-env" file in some standard locations,
and loads its contents into `os.environ`.

In [None]:
from dotenv import dotenv_values, load_dotenv
import os
import pathlib
import numpy as np
import pandas as pd
import trino
from sqlalchemy.engine import create_engine
import osc_ingest_trino as osc

# import python_pachyderm

Define Environment and Execution Variables

In [None]:
# Load environment variables from credentials.env
# NOTE(review): the helper is provided by osc_ingest_trino (imported as `osc`);
# the notebook text says a dot-env file is searched for in "standard locations"
# and loaded into os.environ -- confirm the exact search path in that package.
osc.load_credentials_dotenv()

In [None]:
import io
import json
from math import log10
import itertools

In [None]:
# See data-platform-demo/pint-demo.ipynb for quantify/dequantify functions

import warnings  # needed until quantile behaves better with Pint quantities in arrays
from pint import set_application_registry, Quantity
from pint_pandas import PintArray, PintType
from openscm_units import unit_registry
from common_units import ureg

# Short aliases for the unit-handling types imported above.
Q_ = ureg.Quantity  # Quantity attribute of the `ureg` registry imported from common_units
PA_ = PintArray  # PintArray type imported from pint_pandas

### S3 and boto3

### Connecting to Trino with sqlalchemy

In the context of the Data Vault, this pipeline operates with full visibility into all the data it prepares for the ITR tool.  When the data is output, it is labeled so that the Data Vault can enforce its data management access rules.

In [None]:
# Trino catalog, schema names and table prefixes.
ingest_catalog = "osc_datacommons_dev"

# All source schemas currently resolve to the same sandbox schema.  Each keeps
# its own name so that any one of them can be redirected independently.
sandbox_schema = "sandbox"
ingest_schema = sandbox_schema
dera_schema = sandbox_schema
gleif_schema = sandbox_schema
rmi_schema = sandbox_schema
iso3166_schema = sandbox_schema
essd_schema = sandbox_schema

# Table-name prefixes.
dera_prefix = "dera_"
essd_prefix = "essd_"

demo_schema = "demo_dv"

# The Trino engine attachment is currently disabled.
# engine = osc.attach_trino_engine(verbose=True, catalog=ingest_catalog)

In [None]:
# The TPI benchmark is delivered as a CSV that maps directly onto a DataFrame.
# Load it here; the tidying happens in the cells that follow.

benchmark_TPI_dir = os.path.abspath("../data/external/TPI 20221022")
benchmark_csv_path = pathlib.Path(benchmark_TPI_dir) / "Sector_Benchmarks_20102022.csv"

csv_df = pd.read_csv(benchmark_csv_path)
# Parse the release dates as day-first so they can be compared as timestamps.
csv_df["Release date"] = pd.to_datetime(csv_df["Release date"], dayfirst=True)

In [None]:
# Build one tidied DataFrame per TPI scenario, keyed by scenario name.
bm_dict = {}

# (benchmark temperature, global carbon budget) for the explicitly recognised scenarios.
scenario_targets = {
    "1.5 Degrees": (1.5, 396),  # budget: 66% probability; 500 Gt 50% probability
    "Below 2 Degrees": (1.65, 646),  # temperature: 66% probability
}
# Every other (non-pledge) scenario falls back to the 2 degree values.
default_targets = (2.0, 1229)  # starting from 1.5 @ 66% prob, plus 0.5C at 0.0006 tcre

# Literal substitutions that turn TPI's verbose unit text into compact unit strings.
# They are applied strictly in this order: later entries rely on earlier ones
# (e.g. "metric tonnes of" must be consumed before "tonnes of").
unit_substitutions = (
    ("Carbon intensity ", ""),
    ("Emissions intensity ", ""),
    ("metric tonnes of", "t"),
    ("CO2e", "CO2"),
    ("gCO2", "g CO2"),
    ("tonnes of", "t"),
    ("t-km", "tkm"),
    ("RTK", "tkm"),
    ("/ t aluminium", "/(t Aluminum)"),
    (" per tonne of cementitious product", "/(t Cement)"),
    ("tonne copper equivalent", "(t Copper)"),
    (" per tonne of steel", "/(t Steel)"),
    (" per MWh electricity generation", "/MWh"),
    (" per tonne of pulp, paper and paperboard", "/(t Paper)"),
    ("tonne ", "t "),
    ("tCO2", "t CO2"),
)

for scenario_name in csv_df["Scenario name"].unique():
    # Until we know the temperature targets of the pledges, don't deal with those as benchmarks per se
    if "Pledges" in scenario_name:
        continue
    benchmark_temperature, benchmark_global_budget = scenario_targets.get(scenario_name, default_targets)

    # Keep only the most recent release for each (sector, region) pair of this scenario.
    scenario_rows = csv_df[csv_df["Scenario name"].eq(scenario_name)]
    latest_release = scenario_rows.groupby(["Sector name", "Region"])["Release date"].transform("max")
    df = scenario_rows.loc[latest_release == scenario_rows["Release date"]].copy()

    df["benchmark_temperature"] = benchmark_temperature
    df["benchmark_global_budget"] = benchmark_global_budget

    units = df.Unit
    for verbose_text, compact_text in unit_substitutions:
        units = units.str.replace(verbose_text, compact_text)
    # Drop the first and last character of what remains
    # (presumably enclosing brackets in the source text -- confirm against the CSV).
    df.Unit = units.map(lambda text: text[1:-1])

    df.Region = df.Region.str.replace("North-America", "North America")
    bm_dict[scenario_name] = df

print(bm_dict.keys())
display(bm_dict["2 Degrees"])

In [None]:
# Preview a handful of columns from the 1.5 degree scenario.
# The bare expression on the last line is what the notebook renders.
df = bm_dict["1.5 Degrees"]

preview_columns = ["Sector name", "Region", "Unit", "2019", "2030", "2050"]
df[preview_columns]

### Construct JSON benchmark structures

0.  TPI provides annual benchmark values so no need to interpolate
1.  TPI defines region-specific benchmarks for Electricity Utilities, all others Global
2.  Different sectors have different scopes for benchmarks (S1, S1S2, S1S2S3)
3.  Only emit the latest version of the benchmark
4.  There are several potential global carbon budgets:
    a.  50/50 chance of 1.5C
    b.  66% chance of 1.5C
    c.  Below 2 degrees == 1.65C
    d.  2 degrees (Shift-improve, High-efficiency, Default)

In [None]:
# https://til.simonwillison.net/python/json-floating-point
# Modified to blend the concept of "precision after the decimal point" with "significant figures" (SF).
# For numbers in (-1,1), gives PRECISION=3 sig figs.  For numbers outside that range, but within (-10,10), an additional SF.
# Will provide up to PRECISION-1 additional SFs (default 2) for larger absolute magnitudes.


# from math import log10
def round_floats(o, precision=3):
    """Recursively round the floats inside a JSON-like structure.

    Args:
        o: The value to process.  Floats are rounded; dicts, lists and tuples
            are walked recursively; ``pd.Timestamp`` values are turned into
            strings; anything else is returned untouched.
        precision: Base number of digits to keep (default 3).  The number of
            decimal places actually used shrinks as the magnitude grows.

    Returns:
        * float: ``0`` for zero, NaN, or magnitudes too small to be
          represented at the requested precision; the value unchanged for
          +/-infinity; a plain ``round(o)`` for very large magnitudes;
          otherwise ``round(o, precision - lo)`` where ``lo`` is derived from
          the decimal exponent of ``o``.
        * dict: a new dict with the same keys and processed values.
        * list / tuple: a new list of processed items.
        * pd.Timestamp: the date part only when the time is midnight,
          otherwise the full string form.
        * anything else: ``o`` itself.
    """
    if isinstance(o, float):
        if o == 0 or np.isnan(o):
            return 0
        if np.isinf(o):
            # int(log10(inf)) would raise OverflowError; there is nothing to round.
            return o
        # Decimal exponent of the magnitude, reduced by one above 10 so that
        # larger numbers keep an extra significant figure.
        lo = int(log10(abs(o))) - (abs(o) > 10)
        if precision + lo < 0:
            # Too small to show anything at this precision.
            return 0
        if precision * 2 < lo:
            # Large enough that fractional digits are irrelevant.
            return round(o)
        return round(o, precision - lo)
    if isinstance(o, dict):
        return {k: round_floats(v, precision) for k, v in o.items()}
    if isinstance(o, (list, tuple)):
        return [round_floats(x, precision) for x in o]
    if isinstance(o, pd.Timestamp):
        dt, hms = str(o).split(" ")
        if hms == "00:00:00":
            return dt
        return str(o)
    return o

In [None]:
# Emissions scope used for each sector's benchmark.
# Both spellings of aluminium are listed so either one resolves.
ei_sectors_scope = dict(
    [
        ("Electricity Utilities", "S1"),
        ("Oil & Gas", "S1S2S3"),
        ("Autos", "S3"),
        ("Airlines", "S1"),
        ("Shipping", "S1"),
        ("Cement", "S1"),
        ("Diversified Mining", "S1S2S3"),
        ("Steel", "S1S2"),
        ("Aluminum", "S1S2"),
        ("Aluminium", "S1S2"),
        ("Paper", "S1S2"),
    ]
)

In [None]:
# Assemble the per-scenario benchmark structure:
#   scenario -> header fields + one {"benchmarks": [...]} entry per non-empty scope.
ei_bms = {}

scope_order = ["S1", "S1S2", "S1S2S3", "S3"]
projection_years = range(2019, 2051)

for scenario_name, df in bm_dict.items():
    try:
        # The scenario-level values are the same on every row; read them from the first.
        ei_bms[scenario_name] = {
            "benchmark_temperature": f"{df.iloc[0].benchmark_temperature} delta_degC",
            "benchmark_global_budget": f"{df.iloc[0].benchmark_global_budget} Gt CO2",
            "is_AFOLU_included": False,
        }
    except IndexError:
        # An empty frame has no first row: report it and carry on.  With no
        # rows, no scope entry is produced below either.
        print(df)
        print(scenario_name)

    for scope in scope_order:
        benchmarks = []
        for _, row in df.iterrows():
            # Only rows whose sector is benchmarked at this scope belong here.
            if ei_sectors_scope[row["Sector name"]] != scope:
                continue
            benchmarks.append(
                {
                    "sector": row["Sector name"],
                    "region": row["Region"],
                    "benchmark_metric": row["Unit"],
                    "scenario name": f"TPI {scenario_name}",
                    "release date": str(row["Release date"]).split(" ")[0],
                    "projections_nounits": [
                        {"year": year, "value": row[str(year)]} for year in projection_years
                    ],
                }
            )
        # Scopes without any benchmark are omitted from the output.
        if benchmarks:
            ei_bms[scenario_name][scope] = {"benchmarks": benchmarks}

### Emit Sector Benchmark Data

In [None]:
# Directory that receives the generated benchmark JSON files; created
# (including missing parents) if it does not exist yet.
# NOTE(review): this folder is dated 20220504 whereas the input is read from
# "TPI 20221022" -- confirm the older date is intentional.
output_datadir = os.path.abspath("../data/processed/TPI 20220504")
os.makedirs(output_datadir, exist_ok=True)

In [None]:
# Write one JSON file per scenario.  The file name is the lower-cased scenario
# name with spaces, dots and hyphens turned into underscores and parentheses removed.
filename_table = str.maketrans(" .-", "___", "()")

for scenario_name, bm in ei_bms.items():
    path_name = scenario_name.translate(filename_table).lower()
    target_path = pathlib.Path(output_datadir) / f"benchmark_EI_TPI_{path_name}.json"
    with open(target_path, "w") as f:
        json.dump(round_floats(bm), f, indent=2, sort_keys=False)
        # Finish the file with a trailing newline.
        f.write("\n")