# OECM Benchmark Data Pipeline

The Benchmark data pipelines organize and assemble benchmark data needed for the ITR tool.  This pipeline supports the OECM Benchmark version 2 (published 4 May 2022).


### Environment variables and dot-env

The following cell looks for a "dot-env" file in some standard locations,
and loads its contents into `os.environ`.

In [1]:
import os
import pathlib
import json

import numpy as np
import pandas as pd
from math import log10

# import trino
# import boto3
# from sqlalchemy.engine import create_engine
# import osc_ingest_trino as osc
# import python_pachyderm

# See data-platform-demo/pint-demo.ipynb for quantify/dequantify functions
import warnings  # needed until quantile behaves better with Pint quantities in arrays
from pint import Context
from pint_pandas import PintArray
from common_units import ureg

Q_ = ureg.Quantity
PA_ = PintArray

Initializing common units...


### If we are connecting to the Data Commons, we need credentials and other initializations

### Definitions and dictionaries for reading from / writing to the outside world

In [2]:
# Row labels matched against the second worksheet column to locate the
# road-transport summary rows (see the Transport_UNPRI handling further down).
transport_elements = [
    "Subsector",
    "Total CO2 Emissions",
    "Emission Intensity",
    "Energy Intensity",
]

# Row labels of interest on the Buildings worksheet.
bldgs_elements = [
    "Parameter",
    "Residential Buildings",
    "Commercial Buildings",
    "Construction: Residential and Commercial Building - Economic value",
]

# Empty (all-NaN) float series with one slot per benchmark year, 2019..2050
# inclusive.  It is combined with the sparse worksheet data so that every
# year has a row that can later be interpolated.
benchmark_years = pd.Series(
    index=pd.Index(list(range(2019, 2051)), name="Year"),
    name="Production",
    dtype="float64",
)

# Maps each OECM Sector (really Sub-Sector) name to the information needed to
# find and interpret its rows in the regional workbooks.
#
# Every value is a 7-element list, consumed positionally by the processing code:
#   [0] header tag of the label column ("Parameter" or "Subsector")
#   [1] worksheet name
#   [2] sector label used as the prefix of the "<label> - Scope N: ..." rows ("Aggregates as")
#   [3] label of the production row ("Aggregates to")
#   [4] CO2 label appended after "Scope N:" ("" when the sheet has no such suffix)
#   [5] production units
#   [6] emissions-intensity units
oecm_dict = {
    # Subsector: Parameter / Subsector tag; Sheet; Aggregates as; Aggregates to; CO2 label; Production Units; Intensity Units
    "Materials / Steel": [
        "Parameter",
        "Steel",
        "Materials / Steel",
        "Annual production volume- Iron & Steel Industry",
        "Total CO2 equivalent",
        "Mt Steel",
        "t CO2e/(t Steel)",
    ],
    "Power Utilities": [
        "Subsector",
        "Utilities",
        "Power Utilities",
        "Total public power generation (incl. CHP, excluding auto producers, losses)",
        "Total CO2 equivalent",
        "TWh",
        "t CO2e/MWh",
    ],
    "Gas Utilities": [
        "Subsector",
        "Utilities",
        "Gas Utilities",
        "Total Energy transport & distribution (gas, synthetic fuels & hydrogen)",
        "Total CO2 equivalent",
        "PJ",
        "t CO2e/GJ",
    ],
    "Utilities": [
        "Subsector",
        "Utilities",
        "Utilities",
        "Total Energy Production (power + gas/fuels)",
        "Total CO2 equivalent",
        "PJ",
        "t CO2e/GJ",
    ],
    "Coal": [
        "Subsector",
        "Energy",
        "Coal",
        "Coal: Gross Production (for regional energy demand - incl. non-energy-use)",
        "",
        "Mt Coal",
        "t CO2e/(t Coal)",
    ],
    # Note we have to convert from /d to /a (done when the Oil subsector is processed)
    "Oil": [
        "Subsector",
        "Energy",
        "Oil",
        "Oil: Gross Production (for regional energy demand - incl. non-energy-use)",
        "",
        "MMbbl/d",
        "t CO2e/Mbbl",
    ],
    "Gas": [
        "Subsector",
        "Energy",
        "Gas",
        "Gas: Gross Production (for regional energy demand - incl. non-energy-use)",
        "",
        "bcm CH4",
        "Mt CO2e/(bcm CH4)",
    ],
    "Energy Industry": [
        "Subsector",
        "Energy",
        "Energy Industry",
        "Total Energy Production - Energy, Gas, Oil &Coal Sector",
        "Total CO2 equivalent",
        "PJ",
        "t CO2e/GJ",
    ],
    # Both road subsectors read the same "Road Transport" rows; they differ only in units
    "Road: LDV / Passenger Transport": [
        "Subsector",
        "Transport_UNPRI",
        "Road Transport",
        "Road Transport (excluding vehicle manufacturing)",
        "",
        "pkm",
        "g CO2e/pkm",
    ],
    "Road: Trucks / Freight Transport": [
        "Subsector",
        "Transport_UNPRI",
        "Road Transport",
        "Road Transport (excluding vehicle manufacturing)",
        "",
        "tkm",
        "g CO2e/tkm",
    ],
    "Aluminium Industry": [
        "Parameter",
        "Alu",
        "Aluminium Industry",
        "Annual production volume- aluminium Industry",
        "Total CO2 equivalent",
        "Mt Aluminum",
        "t CO2e/(t Aluminum)",
    ],
    "Materials / Cement": [
        "Parameter",
        "Cement",
        "Materials / Cement",
        "Cement - production volume in mega tonnes per year",
        "Total CO2 equivalent",
        "Mt Cement",
        "t CO2e/(t Cement)",
    ],
    "Construction Buildings": [
        "Parameter",
        "Buildings",
        "Construction Buildings",
        "Construction: Residential and Commercial Building - Economic value",
        "Total CO2 equivalent",
        "billion USD",
        "t CO2e/(million USD)",
    ],
    "Residential Buildings": [
        "Parameter",
        "Buildings",
        "Residential Buildings",
        "Residential Buildings",
        "Total CO2 equivalent",
        "billion m**2",
        "t CO2e/(million m**2)",
    ],
    "Commercial Buildings": [
        "Parameter",
        "Buildings",
        "Commercial Buildings",
        "Commercial Buildings",
        "Total CO2 equivalent",
        "billion m**2",
        "t CO2e/(million m**2)",
    ],
    "Chemical Industry": [
        "Parameter",
        "Chemical Industry",
        "Chemical Industry",
        "Total Chemical Industry",
        "Total CO2 equivalent",
        "billion USD",
        "kg CO2e/USD",
    ],
    # The chemical subsectors below carry a trailing "   -" in field [2]
    "Pharmaceutical Industry": [
        "Parameter",
        "Chemical Industry",
        "Pharmaceutical Industry   -",
        "Pharmaceutical Industry - Economic value",
        "",
        "billion USD",
        "kg CO2e/USD",
    ],
    "Agricultural Chemicals": [
        "Parameter",
        "Chemical Industry",
        "Agricultural Chemicals   -",
        "Agricultural Chemicals - Economic value",
        "",
        "billion USD",
        "kg CO2e/USD",
    ],
    "Inorganic Chemicals and Consumer Products": [
        "Parameter",
        "Chemical Industry",
        "Inorganic Chemicals and Consumer Products   -",
        "Inorganic Chemicals and Consumer Products - Economic value",
        "",
        "billion USD",
        "kg CO2e/USD",
    ],
    "Manufactured Fibres & Synthetic Rubber": [
        "Parameter",
        "Chemical Industry",
        "Manufactured Fibres & Synthetic Rubber   -",
        "Manufactured Fibres & Synthetic Rubber - Economic value",
        "",
        "billion USD",
        "kg CO2e/USD",
    ],
    "Bulk Petrochemicals & Intermediates, Plastic Resins": [
        "Parameter",
        "Chemical Industry",
        "Bulk Petrochemicals & Intermediates, Plastic Resins   -",
        "Bulk Petrochemicals & Intermediates, Plastic Resins - Economic value",
        "",
        "billion USD",
        "kg CO2e/USD",
    ],
    "Textile & Leather": [
        "Parameter",
        "Tex & Lea",
        "Textile & Leather",
        "Total Textile & Leather",
        "Total CO2 equivalent",
        "billion USD",
        "kg CO2e/USD",
    ],
}

# From OECM (Sub-)Sector name to ITR Sector Name.  Keys MUST BE UNIQUE
# The keys are the same subsector names used as keys of `oecm_dict`; the values
# are written into the "sector" field of the generated benchmark entries.
itr_dict = {
    "Materials / Steel": "Steel",
    "Power Utilities": "Electricity Utilities",
    "Gas Utilities": "Gas Utilities",
    "Utilities": "Utilities",
    "Coal": "Coal",
    "Oil": "Oil",
    "Gas": "Gas",
    "Energy Industry": "Energy",
    "Road: LDV / Passenger Transport": "Autos",
    "Road: Trucks / Freight Transport": "Trucking",
    "Aluminium Industry": "Aluminum",
    "Materials / Cement": "Cement",
    "Construction Buildings": "Construction Buildings",
    "Residential Buildings": "Residential Buildings",
    "Commercial Buildings": "Commercial Buildings",
    "Chemical Industry": "Chemicals",
    "Pharmaceutical Industry": "Pharmaceuticals",
    "Agricultural Chemicals": "Ag Chem",
    "Inorganic Chemicals and Consumer Products": "Consumer Products",
    "Manufactured Fibres & Synthetic Rubber": "Fiber & Rubber",
    "Bulk Petrochemicals & Intermediates, Plastic Resins": "Petrochem & Plastics",
    "Textile & Leather": "Textiles",
}

### Interpolation Function

Production is CAGR-based; Emissions are CAGR-based if the ratio of start/finish <= 2.

When start/finish gets too high, the curve gets a pronounced drop in the first year

When finish is zero, the curve can only approach it asymptotically, which is also problematic.
Instead, use linear interpolation when it's time to drive the curve down to zero

In [3]:
# Interpolate missing benchmark values for Production and Emissions, then compute Emissions Intensities (EI)


def interpolate_benchmark(
    df, ei_unit, first_year=2019, last_year=2050, production_centric=None
):
    """Fill the gaps between benchmark anchor years and derive intensities.

    Starting at ``first_year``, the function repeatedly looks for the next
    year with a valid ``Production`` value and fills every year in between,
    column by column:

    * linear interpolation when the end value is zero or the start/end ratio
      exceeds 2 (CAGR would produce a sharp first-year drop, or could never
      reach zero);
    * CAGR (constant annual growth rate) interpolation otherwise.

    Afterwards the emissions-intensity columns (``EI_*``) and the
    year-over-year production change (``d_Production``) are added.

    Parameters
    ----------
    df : pandas.DataFrame
        Indexed by year; the interpolated columns hold pint quantities
        (their ``.m`` magnitude is read).  ``df`` is modified in place.
    ei_unit : str
        Pint unit string the ``EI_*`` columns are converted to.
    first_year, last_year : int
        Bounds of the interpolation walk.
    production_centric : bool or None
        When true, only ``Production``, ``S1``, ``S2`` and ``S1S2`` are
        interpolated and no ``EI_S3`` / ``EI_S1S2S3`` columns are produced.
        When ``None`` (the default) the mode is inferred from ``df``: the
        frame is treated as production-centric when it has no ``S1S2S3``
        column.

    Returns
    -------
    pandas.DataFrame
        A new frame in which NaN and infinite magnitudes of every PintArray
        column have been replaced by 0.0.

    Raises
    ------
    AssertionError
        If a column would need CAGR interpolation from a zero start value to
        a non-zero end value.
    """
    if production_centric is None:
        # Do not depend on a module-level loop variable: the combined S1S2S3
        # column only exists for the non-production-centric benchmark.
        production_centric = "S1S2S3" not in df.columns

    # Interpolate all missing Production and Scope emissions, except Scope 3 remains zero
    # in the production-centric case until we change benchmarks
    if production_centric:
        columns = ["Production", "S1", "S2", "S1S2"]
    else:
        columns = ["Production", "S1", "S2", "S1S2", "S3", "S1S2S3"]

    i = first_year
    while i < last_year:
        idx1 = i
        # Next anchor year: the first later year that has a Production value
        idx2 = df[df.index > i].Production.first_valid_index()
        if idx2 is None:
            break

        nth_root = 1 / (idx2 - idx1)
        for col in columns:
            if df.loc[idx2, col] == 0 or (df.loc[idx1, col] / df.loc[idx2, col]).m > 2:
                # Linear interpolation
                delta = (df.loc[idx2, col] - df.loc[idx1, col]) / (idx2 - idx1)
                for j in range(idx1, idx2):
                    df.loc[j + 1, col] = df.loc[j, col] + delta
            else:
                if df.loc[idx1, col].m == 0 and df.loc[idx2, col].m != 0.0:
                    # Explicit raise (rather than `assert False`) so the check
                    # survives `python -O`.
                    raise AssertionError(
                        f"cannot CAGR-interpolate {col} from zero in {idx1} "
                        f"to a non-zero value in {idx2}"
                    )
                # CAGR interpolation
                multiplier = ((df.loc[idx2, col] / df.loc[idx1, col]) ** nth_root).m
                for j in range(idx1, idx2):
                    df.loc[j + 1, col] = df.loc[j, col] * multiplier
        i = idx2

    df["EI_S1"] = (df.S1 / df.Production).astype(f"pint[{ei_unit}]")
    df["EI_S2"] = (df.S2 / df.Production).astype(f"pint[{ei_unit}]")
    df["EI_S1S2"] = (df.S1S2 / df.Production).astype(f"pint[{ei_unit}]")
    if not production_centric:
        df["EI_S3"] = (df.S3 / df.Production).astype(f"pint[{ei_unit}]")
        df["EI_S1S2S3"] = (df.S1S2S3 / df.Production).astype(f"pint[{ei_unit}]")

    # By convention, the d_ column is zero at the start of the series.
    # Each subsequent value is the fractional year-over-year change, i.e.
    # Production[t] = Production[t-1] * (1 + d_Production[t])
    df["d_Production"] = [0] + [
        yoy.m - 1
        for yoy in (df.Production.values[1:] / df.Production.values[:-1]).tolist()
    ]

    # When production goes to zero (a net-zero goal!) treat 0/0 as 0, not np.inf.
    # Non-pint columns (e.g. plain-object or float columns) pass through untouched.
    df_normalized = df.apply(
        lambda col: (
            PA_(col.pint.m.replace([np.nan, np.inf], 0.0), dtype=col.dtype)
            if isinstance(col.values, PintArray)
            else col
        )
    )
    return df_normalized

### Principal processing function

Start with dataframe containing "messy" data from Spreadsheet, then clean it up to a standard format

In [4]:
# Subsectors of the "Energy" sheet whose scope rows are labelled
# "<sector> Scope N: ..." (no " - " separator), so they get their own row labels.
energy_subsectors = ["Coal", "Oil", "Gas"]
# Subsectors of the "Chemical Industry" sheet that are processed individually
# (as opposed to the "Chemical Industry" total).
# NOTE(review): five names are listed here, while the processing code spreads the
# non-energy GHG over 4.0 sub-sectors -- confirm which is intended.
chemical_subsectors = [
    "Pharmaceutical Industry",
    "Agricultural Chemicals",
    "Inorganic Chemicals and Consumer Products",
    "Manufactured Fibres & Synthetic Rubber",
    "Bulk Petrochemicals & Intermediates, Plastic Resins",
]


def process_sector_benchmark(
    sector_benchmark, subsector, region, sector_elements, production_centric=True
):
    """Turn one "messy" worksheet into a tidy, interpolated benchmark frame.

    Parameters
    ----------
    sector_benchmark : pandas.DataFrame
        One worksheet of a regional OECM workbook.  Row labels are looked up
        in its second column; cell values are compared against strings such
        as ``"2050"``, so the sheet is expected to have been read as text.
    subsector : str
        Key of ``oecm_dict`` / ``itr_dict`` being processed.
    region : str
        Region name, stored verbatim in the ``Region`` column of the result.
    sector_elements : list
        The ``oecm_dict`` entry for ``subsector``: label-column tag, sheet,
        sector label, production row label, CO2 label, production units,
        intensity units.
    production_centric : bool
        Selects which block of scope rows is picked from the sheet.  For
        non-construction Buildings subsectors, Scope 3 is also folded into
        Scope 1 when this is true.

    Returns
    -------
    pandas.DataFrame
        The result of ``interpolate_benchmark``: one row per benchmark year
        with ``Region``, ``Sector``, ``Production``, scope emissions and the
        derived ``EI_*`` / ``d_Production`` columns.
    """
    # Column of row labels used to locate the rows we need
    s = sector_benchmark.iloc[:, 1]
    sheet = sector_elements[1]
    sector = sector_elements[2]
    # Transport_UNPRI doesn't have 'Total CO2 equivalent' in its scope strings...
    # Energy is both lacking a '-' separator and has various elaborations of scope categories
    if sheet == "Energy" and subsector in energy_subsectors:
        # df_elements is built in the sheet-specific branch below
        pass
    elif sheet == "Chemical Industry" and subsector in chemical_subsectors:
        # df_elements is built in the sheet-specific branch below
        pass
    else:
        df_elements = [
            sector_elements[0],
            sector_elements[3],
            " ".join([f"{sector} - Scope 1:", sector_elements[4]]).rstrip(),
            " ".join([f"{sector} - Scope 2:", sector_elements[4]]).rstrip(),
            " ".join([f"{sector} - Scope 3:", sector_elements[4]]).rstrip(),
        ]

    # Hand-adjust the rows and columns we'll be processing.  A few sectors are unique in their shape/data.
    # Some sheets have extra years of data, which pushes 2050 to the right.  We allocate a generous number
    # of columns so that we capture 2050, and then we drop the columns we don't need, either from middle or the right
    # In every branch the boolean list selects ROWS of the matched label rows.
    if sheet == "Chemical Industry":
        if subsector in chemical_subsectors:
            df_elements = [
                sector_elements[0],
                (
                    "Specialties, Inorganic Chemicals, Consumer Products - Economic value"
                    if sector_elements[3].startswith(
                        "Inorganic Chemicals and Consumer Products"
                    )
                    else sector_elements[3]
                ),
                f'{subsector}   - Scope 1"',
                f'{subsector}   - Scope 2"',
                f'{subsector}   - Scope 3"',  # Alas we presently have no per-subsector Scope 3 data, so this is always NULL
            ]

            df = sector_benchmark.iloc[
                s.loc[s.isin(df_elements).fillna(False)].index, 1:11
            ]
            ghg_s3 = sector_benchmark.iloc[
                s.loc[s.eq("Chemical Industry total non-energy GHG")].index, 1:11
            ].squeeze()
            # Evenly distribute refrigeration among all four sub-sectors
            # NOTE(review): chemical_subsectors lists five names but the divisor is 4.0 -- confirm
            df.iloc[4, 3:] = ghg_s3.iloc[3:].astype("float") / 4.0
            if production_centric:
                # Fold the Scope 3 share into Scope 1 and zero out Scope 3
                df.iloc[2, 3:] = df.iloc[[2, 4], 3:].astype("float").sum()
                df.iloc[4, 3:] = 0.0
            df.iloc[:, 1] = df.iloc[:, 1].replace("million t ", "Mt ", regex=True)
        else:
            df = sector_benchmark.iloc[
                s.loc[s.isin(df_elements).fillna(False)].index, 1:14
            ][
                [True] * 2
                + [False] * 3
                + [not production_centric] * 3
                + [production_centric] * 3
            ]
    elif sheet == "Utilities" and subsector == "Power Utilities":
        # In both S3 and Production-Centric cases, we use Production-Centric data for Power Utilities
        df = sector_benchmark.iloc[
            s.loc[s.isin(df_elements).fillna(False)].index, 1:14
        ][[True] * 2 + [False] * 3 + [True] * 3]
    elif sheet == "Tex & Lea":
        df = sector_benchmark.iloc[
            s.loc[s.isin(df_elements).fillna(False)].index, 1:14
        ][
            [True] * 2
            + [False]
            + [not production_centric] * 3
            + [production_centric] * 3
        ]
    elif sheet == "Buildings":
        # Note this is built from `subsector` not `sector`
        df_elements = [
            sector_elements[0],
            sector_elements[3],
            " ".join([f"{subsector} - Scope 1:", sector_elements[4]]).rstrip(),
            " ".join([f"{subsector} - Scope 2:", sector_elements[4]]).rstrip(),
            " ".join([f"{subsector} - Scope 3:", sector_elements[4]]).rstrip(),
        ]
        # We create our own benchmark data from piece-parts
        df = sector_benchmark.iloc[
            s.loc[s.isin(df_elements).fillna(False)].index, 1:14
        ][[True] * (1 + ("Construction" not in subsector)) + [True] * 3]
        # Need to create Scope 3 for Building Construction
        if "Construction" in subsector:
            scope2_label = df.iloc[-1, 0]
            scope3_label = scope2_label.replace("Scope 2", "Scope 3")
            # Synthetic all-zero Scope 3 row modelled on the Scope 2 row
            scope3_row = pd.Series(
                [scope3_label, df.iloc[-1, 1], df.iloc[-1, 2]]
                + [0.0] * len(df.iloc[-1, 3:]),
                index=df.columns,
                name=str(int(df.iloc[-1].name) + 2),
            )
            df = pd.concat([df, scope3_row.to_frame().T], axis=0, ignore_index=True)
    elif sheet == "Energy" and subsector in energy_subsectors:
        df_elements = [
            sector_elements[0],
            sector_elements[3],
            f"{sector} Scope 1:",
            f"{sector} Scope 2: Electricity - own sector use",
            f"{sector} Scope 3: Total CO2 equivalent",
        ]
        df = sector_benchmark.iloc[
            s.loc[s.isin(df_elements).fillna(False)].index, 1:14
        ][
            [True] * 2
            + [not production_centric] * 3
            + [False] * 3
            + [production_centric] * 3
            + [False] * 3
        ]
    else:
        df = sector_benchmark.iloc[
            s.loc[s.isin(df_elements).fillna(False)].index, 1:14
        ][[True] * 2 + [not production_centric] * 3 + [production_centric] * 3]
    # Trim trailing columns until the header row ends at "2050"
    while df.iloc[0, -1] != "2050":
        df = df.drop(columns=df.columns[-1])

    # Column 'D' is either empty or contains notes to self...drop in either case
    df = df.drop(columns=df.columns[2])
    # Drop empty columns and transpose so that years are in rows
    df = df.dropna(how="all", axis=1).T

    # Now ready to build the DataFrame...
    df.columns = ["Year", "Production", "S1", "S2", "S3"]
    df.S3 = df.S3.fillna(0)
    # Second row holds bracketed units such as "[Mt CO2 equiv./a]": strip the
    # brackets and keep only the numerator
    units = df.iloc[1, 1:].map(
        lambda x: x[1:-1].split("/")[0].replace("Mt CO2 equiv.", "Mt CO2e"),
        na_action="ignore",
    )
    # Series.replace returns a new Series; keep the result (it was previously discarded)
    units = units.replace("bn $ GDP", "billion USD")
    units.Production = sector_elements[5]
    df = (
        df.iloc[2:]
        .astype(
            {
                "Year": "int",
                "Production": "float",
                "S1": "float",
                "S2": "float",
                "S3": "float",
            }
        )
        .set_index("Year")
    )

    # Note that we have three main transport types: Aviation, Shipping, Road, and two main carriage types: Passenger and Freight
    # For now, we only handle Road Transport
    if sheet == "Transport_UNPRI":
        if not production_centric:
            # Scope 3 emissions units wrongly entered as '0' rather than [Mt CO2e]
            units.iloc[-1] = units.iloc[-2]
        # Need to proportionalize total sector emissions vs. passenger-only and then feed back into total
        # Only look for the road summary labels in the first 8 rows and from row 87 onwards
        s = pd.concat([sector_benchmark.iloc[:8, 1], sector_benchmark.iloc[87:, 1]])
        road = sector_benchmark.iloc[
            s.loc[s.isin(transport_elements).fillna(False)].index, 1:14
        ]
        while road.iloc[0, -1] != "2050":
            road = road.drop(columns=road.columns[-1])
        # Rows 1-3 are used for passenger transport, rows 4-6 for freight
        if subsector == "Road: LDV / Passenger Transport":
            road = road.dropna(how="all", axis=1)[1:4].T
        else:
            road = road.dropna(how="all", axis=1)[4:7].T
        road.columns = road.iloc[0]
        road_units = road.iloc[1].map(
            lambda x: x[1:-1].split("/")[0].replace("Mt CO2 equiv.", "Mt CO2e"),
            na_action="ignore",
        )
        road_km = "pkm" if subsector == "Road: LDV / Passenger Transport" else "tkm"
        # The parsing above kept only the numerator; restore the per-km denominator
        for unit in road_units.index:
            if "Intensity" in unit:
                road_units[unit] = f"{road_units[unit]} / {road_km}"
        # Production (distance travelled) = total emissions / emission intensity
        units.Production = (
            (
                ureg(road_units["Total CO2 Emissions"])
                / ureg(road_units["Emission Intensity"])
            )
            .to(f"giga{road_km}")
            .u
        )
        road = road.iloc[2:].astype("float64")
        road.index = df.index
        # Slice out old data columns so that everything starts at 2019
        df = df.drop([2017, 2018], errors="ignore")
        road = road.drop([2017, 2018], errors="ignore")
        df = pd.concat([df, road], axis=1)
        with warnings.catch_warnings():
            # pd.DataFrame.__init__ (in pandas/core/frame.py) ignores the beautiful dtype information adorning the pd.Series list elements we are providing.  Sad!
            warnings.simplefilter("ignore")
            # A zero/absent intensity yields NaN, which is then forward-filled.
            # Series.ffill() replaces the deprecated fillna(method="ffill").
            df.Production = df.apply(
                lambda x: (
                    Q_(x["Total CO2 Emissions"], road_units["Total CO2 Emissions"])
                    / Q_(x["Emission Intensity"], road_units["Emission Intensity"])
                    if x["Emission Intensity"]
                    else np.nan
                ),
                axis=1,
            ).ffill()
        # Rescale each scope so the scopes sum to the road-only total emissions
        scopes = ["S1", "S2", "S3"]
        total_co2 = df[scopes].sum(axis=1)
        for scope in scopes:
            df[scope] = (df[scope] * df["Total CO2 Emissions"] / total_co2).replace(
                np.nan, 0
            )
        df = df.drop(columns=transport_elements[1:])
    elif sheet == "Buildings":
        # Here we get to construct our very own benchmark data!
        # We note that OECM Buildings benchmark is just the sum of Residential and Commercial Sub-Benchmarks, so subsector has already selected
        # If we do production-centric, we just need to add S3 emissions to S1 and set S3 to zero
        if "Construction" in subsector:
            units.Production = ureg("billions USD").u
        else:
            units.Production = ureg("billions m**2").u
            if production_centric:
                df.S1 = df.S1 + df.S3
                df.S3 = 0
    elif sheet == "Energy":
        # Change benchmark basis of Oil from b/d to something easier to parse/use
        if subsector == "Oil":
            assert units.Production.endswith("/d")
            units.Production = units.Production.replace("/d", "")
            # It looks backwards to convert annual to daily, but it's 1/d -> 1/a so backwards it is!
            df.Production = df.Production * ureg("a").to("d").m

    # Now insert all the missing years we need to interpolate
    df = pd.DataFrame(benchmark_years).combine_first(df)
    # Change type at the end, as the addition of np.nan values can mess with the dtype (making it dtype 'object')
    for col in df.columns:
        df[col] = df[col].astype(f"pint[{units[col]}]")
    df.insert(0, "Sector", subsector)
    df.insert(0, "Region", region)
    df["S1S2"] = df.S1 + df.S2
    if not production_centric:
        df["S1S2S3"] = df.S1S2 + df.S3
    # production_centric is not forwarded here; interpolate_benchmark determines the mode on its own
    return interpolate_benchmark(df, sector_elements[6])

### Construct JSON benchmark structures

1.  Load Regional Workbook
2.  Process each Sector in the Workbook
3.  Convert resulting dataframe to dictionary structure
4.  Merge each Region/Sector dictionary into main benchmark dictionary

Note that we use linear interpolation when the overall interpolation is more than a 2:1 ratio start to finish
CAGR gets wonky both as the endpoint approaches zero (ratio becomes infinite); but it's also funky when slope is steep (though not infinitely steep)

In [5]:
# Metadata shared by every benchmark document produced below
bm_seed = {
    "benchmark_temperature": "1.5 delta_degC",
    "benchmark_global_budget": "396 Gt CO2",
    "is_AFOLU_included": False,
}

# Copy (rather than alias) the seed so that merging production benchmarks
# into production_bm does not also mutate bm_seed.
production_bm = bm_seed.copy()
# OECM defines both 'OECM' and 'Production-Centric' benchmarks.
# The list is indexed by the boolean production_centric:
# ei_bms[False] is the OECM benchmark, ei_bms[True] the production-centric one.
ei_bms = [bm_seed.copy(), bm_seed.copy()]

# Region name -> workbook file name (without the .xlsx extension)
region_dict = {
    "Global": "OECM_Global_2022_04_22_Results",
    "Europe": "OECM_OECD_Europe_2022_04_22_results",
    "North America": "OECM_OECD_North_America_2022_04_22_results_0",
}


def merge_bm_dicts(main, new):
    """Merge the per-scope benchmark entries of ``new`` into ``main`` in place.

    For a scope that ``main`` does not yet hold (or holds as a falsy value),
    the entry from ``new`` is adopted as-is (same object, not a copy).
    Otherwise only the first element of the new entry's ``"benchmarks"`` list
    is appended to the existing list.  Nothing is returned.
    """
    for scope, incoming in new.items():
        if main.get(scope):
            main[scope]["benchmarks"].append(incoming["benchmarks"][0])
        else:
            main[scope] = incoming


# Directory holding the regional OECM workbooks
benchmark_OECM_dir = os.path.abspath("../data/external/OECM 20220504")

# (subsector, region, production_centric) -> processed frame; only Oil and Gas are kept
oil_and_gas_dict = {}

for subsector, sector_elements in oecm_dict.items():
    sheet, ei_unit = sector_elements[1], sector_elements[6]
    for region, filename in region_dict.items():
        # Read everything as text; numeric conversion happens during processing
        df = pd.read_excel(
            pathlib.Path(benchmark_OECM_dir, f"{filename}.xlsx"),
            sheet_name=sheet,
            dtype=str,
        )
        if (sheet, subsector, region) == ("Energy", "Gas", "North America"):
            print("Correcting...")
            # Correct a typo in input data (North American production-centric data miscalculated for Energy:Gas)
            # The values are printed before and after the correction.
            print(
                f"df.iloc[121, 9] = {df.iloc[121, 9]}; df.iloc[127, 9] = {df.iloc[127, 9]}"
            )
            df.iloc[121, 9] = str(float(df.iloc[108, 9]) + float(df.iloc[112, 9]))
            df.iloc[127, 9] = str(float(df.iloc[114, 9]) + float(df.iloc[118, 9]))
            print(
                f"df.iloc[121, 9] = {df.iloc[121, 9]}; df.iloc[127, 9] = {df.iloc[127, 9]}"
            )
        # Strip trailing whitespace from every non-null cell
        orig_df = df.map(lambda x: x.rstrip(), na_action="ignore")
        print(f"Sector {subsector} Region {region}")

        # NOTE: the loop variable must stay a module-level name called
        # `production_centric`; other code in this file may read it.
        for production_centric in (True, False):
            df = process_sector_benchmark(
                orig_df, subsector, region, sector_elements, production_centric
            )
            if subsector in ("Oil", "Gas"):
                oil_and_gas_dict[(subsector, region, production_centric)] = df
            # It's tempting to concatenate these DataFrames, but doing so wrecks the nice PintArrays created for Production and EI
            # So instead, build up the respective dictionaries with each dataframe we process

            # One entry per scope whose EI column exists; "S3" comes last so the
            # key order matches building it separately after the other scopes.
            bm_ei_scopes = {
                scope: {
                    "production_centric": production_centric,
                    "benchmarks": [
                        {
                            "sector": itr_dict[subsector],
                            "region": region,
                            "benchmark_metric": ei_unit,
                            "scenario name": "OECM 1.5 Degrees",
                            "release date": "2022",
                            "projections_nounits": [
                                {"year": year, "value": value.m}
                                for year, value in zip(df.index, df[f"EI_{scope}"])
                            ],
                        }
                    ],
                }
                for scope in ["S1", "S2", "S1S2", "S1S2S3", "S3"]
                if f"EI_{scope}" in df.columns
            }

            merge_bm_dicts(ei_bms[production_centric], bm_ei_scopes)

        # Production is not conditioned on scope--we shouldn't even need it!  It's also not dependent on "Production-centric"
        # (`df` here is the frame from the last pass of the loop above.)
        new_prod_bm = {
            "AnyScope": {
                "benchmarks": [
                    {
                        "sector": itr_dict[subsector],
                        "region": region,
                        "benchmark_metric": "dimensionless",
                        "scenario name": "OECM 1.5 Degrees",
                        "release date": "2022",
                        "base_year_production": str(df.Production.values[0]),
                        "projections_nounits": [
                            {"year": year, "value": value}
                            for year, value in zip(df.index, df.d_Production)
                        ],
                    }
                ]
            }
        }
        merge_bm_dicts(production_bm, new_prod_bm)

Sector Materials / Steel Region Global
Sector Materials / Steel Region Europe
Sector Materials / Steel Region North America
Sector Power Utilities Region Global
Sector Power Utilities Region Europe
Sector Power Utilities Region North America
Sector Gas Utilities Region Global
Sector Gas Utilities Region Europe
Sector Gas Utilities Region North America
Sector Utilities Region Global
Sector Utilities Region Europe
Sector Utilities Region North America
Sector Coal Region Global
Sector Coal Region Europe
Sector Coal Region North America
Sector Oil Region Global
Sector Oil Region Europe
Sector Oil Region North America
Sector Gas Region Global
Sector Gas Region Europe
Correcting...
df.iloc[121, 9] = 46.656926192680956; df.iloc[127, 9] = 46.656926192680956
df.iloc[121, 9] = 479.2800288658678; df.iloc[127, 9] = 479.2800288658678
Sector Gas Region North America
Sector Energy Industry Region Global
Sector Energy Industry Region Europe
Sector Energy Industry Region North America
Sector Road: LDV 

### Oil & Gas

OECM separates `Energy` into `Coal`, `Oil`, and `Gas`.  But many users of the benchmark characterize companies as `Oil & Gas` companies, which is consistent with other benchmarks.  We synthesize an `Oil & Gas` sector for OECM here.

In [6]:
# Unit definitions and pint Contexts needed to combine Oil and Gas benchmarks.
# From ITR.data.osc_units
# ureg.define("boe = 5.712 GJ")
# Barrel of oil equivalent expressed as energy, plus barrel and standard-cubic-foot multiples
ureg.define("boe = 6.1178632 GJ = BoE")
ureg.define("Mbbl = 1e3 bbl")
ureg.define("MMbbl = 1e6 bbl")
ureg.define("scf = ft**3")
ureg.define("mscf = 1000 scf = Mscf")
ureg.define("mmscf = 1000000 scf = MMscf")

# "oil" context: lets pint convert between barrels (a volume) and boe (an energy),
# and between the corresponding carbon quantities/intensities.
# The dimensional signatures below are spelled out in base dimensions.
oil = Context("oil")
oil.add_transformation(
    "[carbon] * [mass] ** 2 / [length] / [time] ** 2",
    "[carbon] * [mass]",
    lambda ureg, x: x * ureg("bbl/boe").to_reduced_units(),
)
# oil.add_transformation('boe', 'kg CO2e', lambda ureg, x: x * ureg('431.87 kg CO2e / boe')
oil.add_transformation("bbl", "boe", lambda ureg, x: x * ureg("boe") / ureg("bbl"))
oil.add_transformation("boe", "bbl", lambda ureg, x: x * ureg("bbl") / ureg("boe"))
# Converting intensity t CO2/bbl -> t CO2/boe
oil.add_transformation(
    "[carbon] * [mass] / [length] ** 3",
    "[carbon] * [time] ** 2 / [length] ** 2",
    lambda ureg, x: (x * ureg("bbl/boe")).to_reduced_units(),
)
# ...and the reverse direction
oil.add_transformation(
    "[carbon] * [time] ** 2 / [length] ** 2",
    "[carbon] * [mass] / [length] ** 3",
    lambda ureg, x: (x * ureg("boe/bbl")).to_reduced_units(),
)
ureg.add_context(oil)

# "ngas" context: natural-gas conversions between volume, mass and energy,
# based on an assumed density and specific energy of CH4.
ureg.define("bcm = 1000000000 m**3")
NG_DENS = 0.7046 * ureg("kg CH4/(m**3 CH4)")  # 0.657
NG_SE = 54.84 * ureg("MJ/(kg CH4)")  # specific energy (energy per mass); range is 50-55
ng = Context("ngas")
ng.add_transformation("[volume] CH4", "[mass] CH4", lambda ureg, x: x * NG_DENS)
ng.add_transformation("[mass] CH4", "[volume] CH4", lambda ureg, x: x / NG_DENS)
# NOTE(review): the source dimension string below has a trailing space -- confirm it is harmless
ng.add_transformation("[volume] CH4 ", "[energy]", lambda ureg, x: x * NG_DENS * NG_SE)
ng.add_transformation("[energy]", "[volume] CH4", lambda ureg, x: x / (NG_DENS * NG_SE))
ng.add_transformation(
    "[carbon] * [length] * [methane] * [time] ** 2",
    "[carbon] * [mass]",
    lambda ureg, x: x * NG_DENS * NG_SE,
)
# Carbon intensity per gas volume -> carbon intensity per energy
ng.add_transformation(
    "[carbon] * [mass] / [volume] / [methane]",
    "[carbon] * [mass] / [energy]",
    lambda ureg, x: x / (NG_DENS * NG_SE),
)
# Carbon intensity per energy -> carbon intensity per gas volume
ng.add_transformation(
    "[carbon] * [time] ** 2 / [length] ** 2",
    "[carbon] * [mass] / [length] ** 3 / [methane]",
    lambda ureg, x: x * NG_DENS * NG_SE,
)

# Cannot convert from 'megawatt_hour / CH4 / mmscf' ([mass] / [length] / [methane] / [time] ** 2) to 'dimensionless' (dimensionless)
# conversion to dimensionless throws key error on '' in ureg

# Fixed emission factors: CO2e per Mscf of gas, and CO2e per unit mass of CH4
ng.add_transformation(
    "Mscf CH4", "kg CO2e", lambda ureg, x: x * ureg("54.87 kg CO2e / (Mscf CH4)")
)
ng.add_transformation(
    "g CH4", "g CO2e", lambda ureg, x: x * ureg("44 g CO2e / (16 g CH4)")
)
ureg.add_context(ng)

# Activate both contexts on the registry for all subsequent conversions
ureg.enable_contexts("oil", "ngas")

In [7]:
# Build combined "Oil & Gas" benchmarks from the separate "Oil" and "Gas" rows,
# once for each value of the production-centric flag.
for production_centric in [True, False]:
    # Element 2 of each oil_and_gas_dict key is compared against the flag.
    selected_frames = [
        frame
        for key, frame in oil_and_gas_dict.items()
        if key[2] is production_centric
    ]
    df_all = pd.concat(selected_frames)

    for region in df_all.Region.unique():
        df = df_all[df_all.Region == region]

        # Cast each Production column to a pint dtype in the units of its first value.
        df_oil = df[df.Sector == "Oil"].copy()
        oil_prod_unit = str(df_oil.Production.values[0].u)
        df_oil.Production = df_oil.Production.astype(f"pint[{oil_prod_unit}]")

        df_gas = df[df.Sector == "Gas"].copy()
        gas_prod_unit = str(df_gas.Production.values[0].u)
        df_gas.Production = df_gas.Production.astype(f"pint[{gas_prod_unit}]")

        o_and_g_em_unit = "t CO2e"
        o_and_g_prod_unit = "TJ"
        o_and_g_bm_unit = f"{o_and_g_em_unit}/{o_and_g_prod_unit}"

        # One entry per scope that has an "EI_<scope>" column.  "S3" is listed
        # last so that it is inserted after the other scopes.
        bm_ei_scopes = {}
        for scope in ["S1", "S2", "S1S2", "S1S2S3", "S3"]:
            if f"EI_{scope}" not in df.columns:
                continue

            # Combined intensity per year: (oil + gas emissions) / (oil + gas production),
            # as plain magnitudes in the benchmark units.
            projections = []
            for year, oil_em, oil_prod, gas_em, gas_prod in zip(
                df_oil.index,
                df_oil[scope],
                df_oil.Production,
                df_gas[scope],
                df_gas.Production,
            ):
                total_em = oil_em.m_as(o_and_g_em_unit) + gas_em.m_as(o_and_g_em_unit)
                total_prod = oil_prod.m_as(o_and_g_prod_unit) + gas_prod.m_as(
                    o_and_g_prod_unit
                )
                projections.append({"year": year, "value": total_em / total_prod})

            bm_ei_scopes[scope] = {
                "production_centric": production_centric,
                "benchmarks": [
                    {
                        "sector": "Oil & Gas",
                        "region": region,
                        "benchmark_metric": o_and_g_bm_unit,
                        "scenario name": "OECM 1.5 Degrees",
                        "release date": "2022",
                        "projections_nounits": projections,
                    }
                ],
            }

        merge_bm_dicts(ei_bms[production_centric], bm_ei_scopes)

        # Production is not conditioned on scope--we shouldn't even need it!  It's also not dependent on "Production-centric", despite appearances
        # Alas, we have to re-synthesize the year-over-year growth rate based on the sum PJ of the two components

        # Production benchmarks are emitted only on the non-production-centric pass.
        if production_centric:
            continue

        oil_first_pj = df_oil.Production.values[0].m_as("PJ")
        gas_first_pj = df_gas.Production.values[0].m_as("PJ")
        base_prod = oil_first_pj + gas_first_pj

        oil_pj = df_oil.Production.pint.m_as("PJ")
        gas_pj = df_gas.Production.pint.m_as("PJ")
        prod_series = oil_pj.add(gas_pj) / base_prod

        # Year-over-year growth rate; the first year has no predecessor, so it is set to 0.
        prod_delta = prod_series / prod_series.shift(1)
        prod_delta.iloc[0] = 1.0
        prod_delta = prod_delta.sub(1.0)

        new_prod_bm = {
            "AnyScope": {
                "benchmarks": [
                    {
                        "sector": "Oil & Gas",
                        "region": region,
                        "benchmark_metric": "dimensionless",
                        "scenario name": "OECM 1.5 Degrees",
                        "release date": "2022",
                        "base_year_production": f"{base_prod} PJ",
                        "projections_nounits": [
                            {"year": year, "value": value}
                            for year, value in zip(prod_delta.index, prod_delta.values)
                        ],
                    }
                ]
            }
        }
        merge_bm_dicts(production_bm, new_prod_bm)

### Emit Sector Benchmark Data

In [8]:
# Absolute path of the output directory; it is created (with any missing
# parents) if it does not exist yet.
output_datadir = os.path.abspath("../data/processed/OECM 20220504")
os.makedirs(output_datadir, exist_ok=True)

In [9]:
# https://til.simonwillison.net/python/json-floating-point
# Modified to blend the concept of "precision after the decimal point" with "significant figures" (SF).
# For numbers in (-1,1), gives PRECISION=3 sig figs.  For numbers outside that range, but within (-10,10), an additional SF.
# Will provide up to PRECISION-1 additional SFs (default 2) for larger absolute magnitudes.


# from math import log10
def round_floats(o, precision=3):
    """Return a copy of *o* with every float rounded for compact JSON output.

    Dicts are rebuilt with rounded values, and lists and tuples become lists of
    rounded items.  Anything that is not a float, dict, list or tuple is
    returned unchanged.

    For a float:
    - zero is returned as the integer 0;
    - infinities and NaN are returned as their string form;
    - otherwise the number of decimals kept is ``precision`` minus a magnitude
      exponent, where that exponent is the truncated base-10 logarithm of the
      absolute value, reduced by one when the absolute value exceeds 10;
    - values whose exponent is below ``-precision`` collapse to the integer 0;
    - values whose exponent exceeds ``2 * precision`` are rounded to an integer.
    """
    if isinstance(o, dict):
        return {key: round_floats(val, precision) for key, val in o.items()}
    if isinstance(o, (list, tuple)):
        return [round_floats(item, precision) for item in o]
    if not isinstance(o, float):
        return o

    if o == 0:
        return 0
    if not np.isfinite(o):
        return f"{o}"

    magnitude = abs(o)
    lo = int(log10(magnitude))
    if magnitude > 10:
        lo -= 1

    if lo < -precision:
        # Too small to show at this precision.
        return 0
    if lo > 2 * precision:
        # Large enough that no fractional digits are kept.
        return round(o)
    return round(o, precision - lo)


# Write each benchmark dictionary to its own JSON file in output_datadir:
# floats are rounded first, and a trailing newline follows the JSON text.
for json_filename, benchmark_data in (
    ("benchmark_production_OECM.json", production_bm),
    ("benchmark_EI_OECM_S3.json", ei_bms[False]),
    ("benchmark_EI_OECM_PC.json", ei_bms[True]),
):
    with open(pathlib.Path(output_datadir, json_filename), "w") as f:
        json.dump(round_floats(benchmark_data), sort_keys=False, indent=2, fp=f)
        print("", file=f)

In [10]:
# One index row per production benchmark: (sector, region, metric, scope, position in list).
production_rows = [
    (
        bm_entry["sector"],
        bm_entry["region"],
        bm_entry["benchmark_metric"],
        scope,
        bm,
    )
    for scope in ["AnyScope"]
    for bm, bm_entry in enumerate(production_bm[scope]["benchmarks"])
]
production_index = pd.MultiIndex.from_tuples(
    production_rows,
    names=["sector", "region", "benchmark_metric", "scope", "bm_idx"],
)

# Each row maps year -> value, taken from the benchmark's "projections_nounits" list.
production_records = {}
for sector, region, metric, scope, bm in production_index:
    production_records[(sector, region, metric, scope)] = {
        projection["year"]: projection["value"]
        for projection in production_bm[scope]["benchmarks"][bm]["projections_nounits"]
    }
df_production = pd.DataFrame.from_dict(production_records, orient="index")

# Replace the inferred index with the named one (minus the list position), then sort.
df_production.index = production_index.droplevel("bm_idx")
df_production.sort_index(inplace=True)

In [11]:
# Scope lists indexed by the production_centric flag: position 0 (False) is the
# list that includes S3 scopes, position 1 (True) is the S1/S2-only list.
benchmark_scopes = [["S1", "S2", "S1S2", "S3", "S1S2S3"], ["S1", "S2", "S1S2"]]

for wb_filename, production_centric in [
    ("benchmark_OECM_S3", False),
    ("benchmark_OECM_PC", True),
]:
    scoped_bms = ei_bms[production_centric]

    # One index row per EI benchmark: (sector, region, metric, scope, position in list).
    ei_rows = [
        (
            bm_entry["sector"],
            bm_entry["region"],
            bm_entry["benchmark_metric"],
            scope,
            bm,
        )
        for scope in benchmark_scopes[production_centric]
        for bm, bm_entry in enumerate(scoped_bms[scope]["benchmarks"])
    ]
    ei_index = pd.MultiIndex.from_tuples(
        ei_rows,
        names=["sector", "region", "benchmark_metric", "scope", "bm_idx"],
    )

    # Each row maps year -> value, taken from the benchmark's "projections_nounits" list.
    ei_records = {}
    for sector, region, metric, scope, bm in ei_index:
        ei_records[(sector, region, metric, scope)] = {
            projection["year"]: projection["value"]
            for projection in scoped_bms[scope]["benchmarks"][bm]["projections_nounits"]
        }
    df_ei = pd.DataFrame.from_dict(ei_records, orient="index")
    df_ei.index = ei_index.droplevel("bm_idx")
    df_ei.sort_index(inplace=True)

    # Sheet name -> frame, written in this order to one workbook per variant.
    wb_sheets = {"projected_production": df_production, "projected_ei": df_ei}

    with pd.ExcelWriter(pathlib.Path(output_datadir, f"{wb_filename}.xlsx")) as writer:
        for sheet_name, sheet_df in wb_sheets.items():
            sheet_df.to_excel(writer, sheet_name=sheet_name)

In [12]:
# wb = pd.read_excel(pathlib.Path(output_datadir, 'benchmark_OECM_S3.xlsx'), sheet_name=None)

In [13]:
# wb['projected_ei'].fillna(method="ffill")